Source code for neurom.stats

# Copyright (c) 2015, Ecole Polytechnique Federale de Lausanne, Blue Brain Project
# All rights reserved.
#
# This file is part of NeuroM <https://github.com/BlueBrain/NeuroM>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     1. Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#     2. Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#     3. Neither the name of the copyright holder nor the names of
#        its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

'''Statistical analysis helper functions

Nothing fancy. Just commonly used functions using scipy functionality.'''

from collections import namedtuple
from scipy import stats as _st
import numpy as np
from enum import Enum, unique
from neurom._compat import OrderedDict

FitResults = namedtuple('FitResults', ['params', 'errs', 'type'])


@unique
[docs]class StatTests(Enum): '''Enum representing valid statistical tests of scipy''' ks = 1 wilcoxon = 2 ttest = 3
[docs]def get_test(stest): '''Returns the correct stat test''' sts = {StatTests.ks: 'ks_2samp', StatTests.wilcoxon: 'wilcoxon', StatTests.ttest: 'ttest_ind'} if stest in StatTests: return sts[stest] else: raise TypeError('Statistical test not recognized. Choose from ks, wilcoxon, ttest.')
[docs]def fit_results_to_dict(fit_results, min_bound=None, max_bound=None): '''Create a JSON-comparible dict from a FitResults object Parameters: fit_results (FitResults): object containing fit parameters,\ errors and type min_bound: optional min value to add to dictionary if min isn't\ a fit parameter. max_bound: optional max value to add to dictionary if max isn't\ a fit parameter. Returns: JSON-compatible dictionary with fit results Note: Supported fit types: 'norm', 'expon', 'uniform' ''' type_map = {'norm': 'normal', 'expon': 'exponential', 'uniform': 'uniform'} param_map = {'uniform': lambda p: [('min', p[0]), ('max', p[0] + p[1])], 'norm': lambda p: [('mu', p[0]), ('sigma', p[1])], 'expon': lambda p: [('lambda', 1.0 / p[1])]} d = OrderedDict({'type': type_map[fit_results.type]}) d.update(param_map[fit_results.type](fit_results.params)) if min_bound is not None and 'min' not in d: d['min'] = min_bound if max_bound is not None and 'max' not in d: d['max'] = max_bound return d
[docs]def fit(data, distribution='norm'): '''Calculate the parameters of a fit of a distribution to a data set Parameters: data: array of data points to be fitted Options: distribution (str): type of distribution to fit. Default 'norm'. Returns: FitResults object with fitted parameters, errors and distrubution type Note: Uses Kolmogorov-Smirnov test to estimate distance and p-value. ''' params = getattr(_st, distribution).fit(data) return FitResults(params, _st.kstest(data, distribution, params), distribution)
[docs]def optimal_distribution(data, distr_to_check=('norm', 'expon', 'uniform')): '''Calculate the parameters of a fit of different distributions to a data set and returns the distribution of the minimal ks-distance. Parameters: data: array of data points to be fitted Options: distr_to_check: tuple of distributions to be checked Returns: FitResults object with fitted parameters, errors and distrubution type\ of the fit with the smallest fit distance Note: Uses Kolmogorov-Smirnov test to estimate distance and p-value. ''' fit_results = [fit(data, d) for d in distr_to_check] return min(fit_results, key=lambda fit: fit.errs[0])
[docs]def scalar_stats(data, functions=('min', 'max', 'mean', 'std')): '''Calculate the stats from the given numpy functions Parameters: data: array of data points to be used for the stats Options: functions: tuple of numpy stat functions to apply on data Returns: Dictionary with tha name of the function as key and the result as the respective value ''' stats = {} for func in functions: stats[func] = getattr(np, func)(data) return stats
[docs]def compare_two(data1, data2, test=StatTests.ks): '''Compares two distributions of data and assess two scores: a distance between them and a probability they are drawn from the same distribution. Parameters: data1: numpy array of dataset 1 data2: numpy array of dataset 2 test: Stat_tests\ Defines the statistical test to be used, based\ on the scipy available modules.\ Accepted tests: ks_2samp, wilcoxon, ttest Returns: dist: float\ High numbers define high dissimilarity between the two datasets p-value: float\ Small numbers define high probability the data come from\ same dataset. ''' results = getattr(_st, get_test(test))(data1, data2) Stats = namedtuple('Stats', ['dist', 'pvalue']) return Stats(*results)
[docs]def total_score(paired_dats, p=2, test=StatTests.ks): '''Calculates the p-norm of the distances that have been calculated from the statistical test that has been applied on all the paired datasets. Parameters: paired_dats: a list of tuples or where each tuple contains the paired data lists from two datasets Options: p : integer that defines the order of p-norm test: Stat_tests\ Defines the statistical test to be used, based\ on the scipy available modules.\ Accepted tests: ks_2samp, wilcoxon, ttest Returns: A float corresponding to the p-norm of the distances that have been calculated. 0 corresponds to high similarity while 1 to low. ''' scores = np.array([compare_two(fL1, fL2, test=test).dist for fL1, fL2 in paired_dats]) return np.linalg.norm(scores, p)