Source code for multi_locus_analysis.stats.distributions

"""
Tools for analysis of distributional statistics of trajectories.

Anything that's not a moment of the trajectory that you want to analyze, you
typically have to look at distributionally (e.g. displacement distributions,
waiting time distributions, etc.). This module contains code that
facilitates working with probability and cumulative distribution functions,
bootstrapping, etc., as well as various miscellaneous distributional tools like
normality tests.
"""
from functools import partial
import warnings

import numpy as np
import pandas as pd
import scipy
import scipy.stats
from scipy.signal import savgol_filter
import statsmodels.stats.proportion as binom

from .moments import pos_to_all_vel
from ..util import mark_code_only


def power_law_slope_mle(x, xmin, N=None):
    if N is None:
        N = len(x)
    return 1 + N / np.sum(np.log(x/xmin))


[docs]def ecdf(y, y_allowed=None, auto_pad_left=False, pad_left_at_x=None):
    """Compute empirical cumulative distribution function (eCDF) from data.

    Parameters
    ----------
    y : (N,) array_like
        Values of the data.
    y_allowed : (M,) array_like
        Unique values that the data can take. Mostly useful for adding
        eCDF values at locations where data could or should have been observed
        but none was recorded.
    auto_pad_left : bool
        If left False, the data will not have a data value at the point where
        the eCDF equals zero. Use mean inter-data spacing to automatically
        generate an aesthetically reasonable such point.
    pad_left_at_x : bool
        If ``auto_pad_left`` is False, you may explicitly specify the value at
        which to add the leftmost extra point.

    Returns
    -------
    x : (M,) array_like
        The values at which the eCDF was computed. By default
        :code:`np.sort(np.unique(y))`.
    cdf : (M,) array_like
        Values of the eCDF at each x.

    Notes
    -----
    If using ``y_allowed``, the *pad_left* parameters are redundant, and should
    typically be left False/None.
    """
    y = np.array(y)
    y.sort()
    if y_allowed is not None:
        x = np.unique(y_allowed)
    else:
        x = np.unique(y)
    x.sort()
    if auto_pad_left:
        dx = np.mean(np.diff(x))
        x = np.insert(x, 0, x[0] - dx)
    elif pad_left_at_x is not None:
        if not np.isclose(pad_left_at_x, x[0]):
            if x[0] < pad_left_at_x:
                warnings.warn('pad_left_at_x not left of x in ecdf_windowed! '
                              'Ignoring...')
            else:
                x = np.insert(x, 0, pad_left_at_x)
    num_obs = len(y)
    cdf = np.zeros(x.shape, dtype=np.dtype('float'))
    i = 0
    for xi, xx in enumerate(x):
        while i < num_obs and y[i] <= xx:
            i += 1
        cdf[xi] = i/float(num_obs)
    return x, cdf


def _double_up(x):
    """[1,2,3] to [1,1,2,2,3,3]"""
    return np.tile(x, (2, 1)).T.flatten()


def bars_given_hist(y, bins):
    return _double_up(bins)[1:-1], _double_up(y)


[docs]def bars_given_discrete_cdf(x, cdf):
    """like bars_given_cdf for when you've used ecdf's times_allowed arg."""
    x_mid = (x[1:] + x[:-1]) / 2
    real_cdf = cdf[:-1]
    X, Y = bars_given_cdf(x_mid, real_cdf)
    return np.insert(X, 0, [0, 0]), np.insert(Y, 0, [0, 0])


[docs]def bars_given_cdf(x, cdf):
    """takes x, cdf from cdf_exact* functions and makes a plottable histogram
    by tracing out the PDF. Works well for CDFs that come from observations on
    a fixed grid, and not well for continuous observations. (i.e.
    discrete_trajectory_to_wait_times output will work well, but not
    state_changes_to_wait_times)."""
    if np.any(np.diff(x) == 0):
        raise ValueError('Values in x repeated!')
    pmf = np.diff(cdf)/np.diff(x)
    # tested by inspection
    return _double_up(x)[1:-1], _double_up(pmf)


[docs]def smooth_pdf(x, cdf, bw_method=None):
    """Takes x, cdf from cdf_exact* and returns a kernel density estimator that
    can be evaluated at any X to get an estimate of pdf(X).

    use bw_method to specify the way scipy.stats.gaussian_kde should determine
    the bandwidth of the gaussian. """
    Dcdf = np.diff(cdf)
    Dx = x[1:]
    return scipy.stats.gaussian_kde(Dx, weights=Dcdf,
                                    bw_method=bw_method)


def simultaneous_confint_from_cdf(alpha, n_samples, x, cdf):
    return binom.multinomial_proportions_confint(
        n_samples*np.diff(cdf), alpha=alpha, method='sison-glaz'
    )


def pointwise_confint_from_cdf(alpha, n_samples, x, cdf, bonferroni=True):
    n_bins = len(x) - 1
    if bonferroni:
        alpha = alpha/n_bins
    # pmf = np.diff(cdf)*n_samples
    # WARNING: not clear that this code ever worked, what is it supposed to be
    # doing?
    return binom.proportion_confint(cdf, n_samples, alpha)


[docs]def bars_given_confint(x, confint):
    """takes x, confint from cdf_exact*, binom.multinomial_proportions_confint
    (respectively), and rescales confint to correctly fit around the output of
    bars_given_cdf in an aesthetic way.
    """
    # binom speaks in counts, but bars_given_cdf speaks in density, so we need
    # to translate to densities by scaling by dx
    if np.any(np.diff(x) == 0):
        raise ValueError('Values in x repeated!')
    confint = confint/np.tile(np.diff(x), (2, 1)).T
    conf_bar_y = np.stack((
        _double_up(confint[:, 0]),
        _double_up(confint[:, 1])
    ))
    return _double_up(x)[1:-1], conf_bar_y


[docs]def sample_from_cdf(n, x, cdf):
    """Takes a sample count, cdf in the form x,cdf, like from output of
    cdf_exact* functions [i.e. pairs of (x, P(X<=x))]. Samples from the
    empirical distribution function at the maximum "x" resolution allowed by x.

    Returns fraction of the resampled data that fell into each bin. In other
    words, it returns pmf, as if one had done:

    >>> pmf, x = np.histogram(samples, bins=x)
    """
    r = np.random.rand(n)
    i = np.searchsorted(cdf, r)
    pmf = np.bincount(i)[1:]
    pmf = pmf/np.sum(pmf)
    extra_zeros = len(x) - len(pmf) - 1
    return np.concatenate([pmf, np.zeros((extra_zeros,))])


[docs]def bootstrapped_pmf_confint(n_samples, alpha, x, cdf, num_bootstraps=1000,
                             bonferroni=True):
    r"""Given an empirical cdf (x, cdf), this function generates bootstrapped
    error bars that represent, pointwise, the area that a second observation of
    n_samples (need not equal the number of samples used to generate (x, cdf))
    would lie between with probability 1-alpha if it had a true CDF given by
    the (continuous, linear interpolation of the) empircal CDF.

    Parameters
    ----------
    n_samples : int
        How many samples the secondary measurement has. This is the number of
        data points drawn in each bootstrap iteration.
    alpha : float \in [0,1]
        1 - confidence level desired
    x : (N,) array_like
        Values at which the empirical CDF was measured
    cdf : (N,) array_like
        Values of the empirical CDF
    num_bootstraps : (optional) int
        Number of bootstrapping interations to perform. WARNING: scales the
        memory required for now.
    bonferroni : (optional) bool
        Whether to scale alpha based on the number of bins so that the plot can
        be used to visually assert pointwise statistical significance at the
        requested alpha.

    Returns
    -------
    confint : (2,N-1) array_like
        Upper and lower bounds of the confidence interval calculated.
    """
    n_bins = len(x) - 1
    if bonferroni:
        alpha = alpha/n_bins
    bootstrapped_pmfs = np.zeros((n_bins, num_bootstraps))
    for i in range(num_bootstraps):
        bootstrapped_pmfs[:, i] = sample_from_cdf(n_samples, x, cdf)
    # axis arg is index of array's shape that will be "deleted"
    # we pass two alphas, this new dimension will be prepended
    return np.quantile(bootstrapped_pmfs, [alpha, 1-alpha],
                       overwrite_input=True, axis=1)


[docs]def bootstrapped_pmf_confint_bars(n_samples, x, cdf, num_bootstraps=1000):
    """same as non-bars version, but returns the actual samples as pmf bars,
    ready to plot."""
    n_bins = len(x) - 1
    bootstrapped_pmfs = np.zeros((n_bins, num_bootstraps))
    for i in range(num_bootstraps):
        bootstrapped_pmfs[:, i] = sample_from_cdf(n_samples, x, cdf)
    bootstrapped_cdfs = np.cumsum(bootstrapped_pmfs, axis=0)
    bootstrapped_cdfs = np.concatenate(
        [np.zeros((1, num_bootstraps)), bootstrapped_cdfs]
    )
    pmf_bars = np.zeros((2*n_bins, num_bootstraps))
    for i in range(num_bootstraps):
        _, pmf_bars[:, i] = bars_given_cdf(x, bootstrapped_cdfs[:, i])
    x_bars, _ = bars_given_cdf(x, bootstrapped_cdfs[:, 0])
    return x_bars, pmf_bars


[docs]def bootstrapped_pmf_from_waits_(
        n_samples, num_bootstraps, times, window_sizes, times_allowed,
        progress_bar=False, **kwargs):
    """Does shared work of creating non-bar pmfs, used by other
    bootstrapped_pmf_from_waits_* functions."""
    from ..finite_window import ecdf_windowed
    # the most common bootstrap thing is to draw samples your own size
    if n_samples is None:
        n_samples = len(times)
    n_bins = len(times_allowed) - 1
    num_waits = len(times)
    bootstrapped_pmfs = np.zeros((n_bins, num_bootstraps))
    for i in range(num_bootstraps):
        if progress_bar:
            print('\r{x:02d}%\r'.format(x=int(float(i)/num_bootstraps*100)),
                  end='')
        samples = np.floor(np.random.rand(n_samples)*num_waits).astype(int)
        t = times[samples]
        w = window_sizes[samples]
        _, cdf = ecdf_windowed(t, w, times_allowed, **kwargs)
        bootstrapped_pmfs[:, i] = np.diff(cdf)
    return bootstrapped_pmfs


[docs]def bootstrapped_pmf_from_waits(
        times, window_sizes, times_allowed, n_samples=None, alpha=0.05,
        num_bootstraps=1000, bonferroni=True, progress_bar=False, **kwargs):
    """Takes n_samples, num_bootstraps (# iterations), and calculates the pmf
    of the data num_bootsraps times using n_samples-sized samples drawn with
    replacement from the wait_times/windows that were passed.

    The internal call to cdf_exact_given_windows_quinn needs you to use the
    times_allowed argument, but all kwargs are forwarded to that function just
    in case."""
    n_bins = len(times_allowed) - 1
    if bonferroni:
        alpha = alpha/n_bins
    bootstrapped_pmfs = bootstrapped_pmf_from_waits_(
        n_samples, num_bootstraps, times, window_sizes, times_allowed,
        progress_bar=progress_bar, **kwargs)
    return np.quantile(bootstrapped_pmfs, [alpha, 1-alpha],
                       overwrite_input=True, axis=1)


[docs]def bootstrapped_pmf_from_waits_bars(
        times, window_sizes, times_allowed, n_samples=None,
        num_bootstraps=1000, **kwargs):
    """Same as bootstrapped_pmf_from_waits, but returns bars_given_cdf, ready to
    plot results."""
    n_bins = len(times_allowed) - 1
    bootstrapped_pmfs = bootstrapped_pmf_from_waits_(
        n_samples, num_bootstraps, times, window_sizes, times_allowed, **kwargs
    )
    bootstrapped_cdfs = np.cumsum(bootstrapped_pmfs, axis=0)
    bootstrapped_cdfs = np.concatenate([
        np.zeros((1, num_bootstraps)), bootstrapped_cdfs
    ])
    pmf_bars = np.zeros((2*n_bins, num_bootstraps))
    for i in range(num_bootstraps):
        _, pmf_bars[:, i] = bars_given_cdf(
            times_allowed, bootstrapped_cdfs[:, i]
        )
    x_bars, _ = bars_given_cdf(times_allowed, bootstrapped_cdfs[:, 0])
    return x_bars, pmf_bars


@mark_code_only
def scale_and_test_normality(vx):
    """Should be applied to a vector of velocities, vx"""
    vx = vx[np.isfinite(vx)]
    vx -= np.mean(vx)
    std = np.std(vx)
    if std > 0:
        vx /= np.std(vx)
    num_disps = vx.size
    ksstat, ks_pval = scipy.stats.kstest(vx, 'norm', mode='asymp')
    ngp = np.mean(np.power(vx, 4)) \
        / (3*np.power(np.mean(np.power(vx, 2)), 2)) - 1
    try:
        shapiro_stat, shapiro_pval = scipy.stats.shapiro(vx)
    except Exception:  # shut up linter. don't remember what this raises
        shapiro_stat = shapiro_pval = np.nan
    is_shapiro_pval_accurate = num_disps < 5000  # from scipy docs, v 19.1
    anderson_stat, crit_vals, crit_levels = scipy.stats.anderson(vx)
    if np.isfinite(anderson_stat):
        crit_vals = np.concatenate(([0], crit_vals, [np.inf]))
        crit_alphas = np.concatenate(([100], crit_levels, [0]))/100
        ihigh = np.where(crit_vals < anderson_stat)[0][0]
        ilow = ihigh + 1
    else:
        ihigh = ilow = 0
        crit_vals = [np.nan]
        crit_alphas = [np.nan]
    return {'NGP': ngp, 'KS Statistic': ksstat, 'KS p-value': ks_pval,
            'Shapiro-Wilk Statistic': shapiro_stat,
            'Shapiro-Wilk p-value': shapiro_pval,
            'Shapiro-Wilk p-value is accurate': is_shapiro_pval_accurate,
            'Anderson-Darling Statistic': anderson_stat,
            'Anderson-Darling alpha lower bound': crit_alphas[ilow],
            'Anderson-Darling alpha upper bound': crit_alphas[ihigh],
            'Anderson-Darling upper crit value': crit_vals[ihigh],
            'Anderson-Darling lower crit value': crit_vals[ilow]
            }


@mark_code_only
def add_savgol(traj, window_size, order=3):
    s = 'savgol' + str(window_size) + '_'
    if len(traj['x']) <= window_size:
        traj[s + 'x'] = np.nan
        traj[s + 'y'] = np.nan
        traj[s + 'x_fluct'] = np.nan
        traj[s + 'y_fluct'] = np.nan
    else:
        traj[s + 'x'] = savgol_filter(traj['x'], window_size, order)
        traj[s + 'y'] = savgol_filter(traj['y'], window_size, order)
        traj[s + 'x_fluct'] = traj['x'] - traj[s + 'x']
        traj[s + 'y_fluct'] = traj['y'] - traj[s + 'y']
    return traj


@mark_code_only
def add_savgol_window(data, window_sizes):
    for window_size in window_sizes:
        data = data.groupby(
            ['experiment', 'movie name', 'molecule id']
        ).apply(partial(add_savgol, window_size=window_size))
    return data


@mark_code_only
def get_savgol_vels(data, window_sizes):
    # we're gonna do the same thing twice, basically get vels for each window
    # size
    savgol_vels = []
    for window_size in window_sizes:
        fluct_vels = data.groupby(
            ['experiment', 'movie name', 'molecule id']
        ).apply(partial(
            pos_to_all_vel, delta=1,
            xcol='savgol'+str(window_size)+'_x',
            ycol='savgol'+str(window_size)+'_y')
        )
        fluct_vels['window_size'] = window_size
        savgol_vels.append(fluct_vels)
    savgol_vels = [vel.reset_index().set_index(
        ['experiment', 'movie name', 'molecule id', 'window_size', 'ti',
         'delta']
    ) for vel in savgol_vels]
    savgol_vels = pd.concat(savgol_vels)
# now for _fluct columns
    corrected_vels = []
    for window_size in window_sizes:
        fluct_vels = data.groupby(
            ['experiment', 'movie name', 'molecule id']
        ).apply(partial(
            pos_to_all_vel, delta=1,
            xcol='savgol'+str(window_size)+'_x_fluct',
            ycol='savgol'+str(window_size)+'_y_fluct'
        ))
        fluct_vels['window_size'] = window_size
        corrected_vels.append(fluct_vels)
    corrected_vels = [vel.reset_index().set_index(
        ['experiment', 'movie name', 'molecule id', 'window_size', 'ti',
         'delta']
    ) for vel in corrected_vels]
    return savgol_vels, pd.concat(corrected_vels)


@mark_code_only
def get_savgol_msds(savgol_data, window_sizes):
    msds = []
    for window_size in window_sizes:
        savgol_disps = savgol_data.groupby(
            ['experiment', 'movie name', 'molecule id']
        ).apply(partial(
            pos_to_all_vel, absolute_time=True,
            xcol='savgol'+str(window_size)+'_x_fluct',
            ycol='savgol'+str(window_size)+'_y_fluct'
        ))
        sq_disps = pd.DataFrame(index=savgol_disps.index)
        sq_disps['sq_disp'] = np.power(savgol_disps['vx'], 2) \
            + np.power(savgol_disps['vy'], 2)
        sq_disps['delta_abs'] = savgol_disps['delta_abs'].round(3)
        savgol_msds = sq_disps \
            .groupby(['delta_abs'])['sq_disp'] \
            .agg(['mean', 'std', 'count'])
        savgol_msds['window_size'] = window_size
        savgol_msds = savgol_msds.reset_index().set_index(['window_size',
                                                           'delta_abs'])
        msds.append(savgol_msds)
    return pd.concat(msds)