Source code for multi_locus_analysis.finite_window.stats

import warnings

import numpy as np
import pandas as pd
import scipy.optimize

from ..stats import ecdf


def average_lifetime(obs, traj_cols=['replicate']):
    """
    Estimate the true mean lifetime of each state of the process.

    Doesn't require removing any censoring, because the process is
    stationary.
    """
    true_ends = obs[~np.isclose(obs['end_time'], obs['window_size'])]
    # the numbers of start times of each state are highly correlated
    # estimators of the total number of "start times" of the full process
    # that cycles. we combine them all to average out edge effects, but
    # don't gain much from it
    ave_num_ends = true_ends.groupby('state')['end_time'] \
        .count() \
        .mean()
    total_window = obs \
        .groupby(traj_cols)['window_size'].first() \
        .sum()
    total_mean_est = total_window / ave_num_ends
    # # this also works, but is obviously less robust
    # start_state = obs.groupby(traj_cols)['state'].first()
    # start_counts = start_state.value_counts()
    # mean_est = {
    #     name: total_mean_est*(count/np.sum(start_counts))
    #     for name, count in start_counts.items()
    # }
    total_state_times = obs.groupby('state')['wait_time'].sum()
    mean_est = {
        name: total_mean_est*(time/np.sum(total_state_times))
        for name, time in total_state_times.items()
    }
    return pd.Series(mean_est)
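
# A minimal usage sketch for ``average_lifetime`` (not part of the original
# module). It assumes ``obs`` has one row per wait, with the columns used
# above ('replicate', 'state', 'wait_time', 'end_time', 'window_size'); the
# numbers below are made up purely for illustration. Waits whose 'end_time'
# equals 'window_size' are the ones treated as censored at the window edge.
def _example_average_lifetime():
    obs = pd.DataFrame({
        'replicate':   [0, 0, 0, 1, 1, 1],
        'state':       ['A', 'B', 'A', 'B', 'A', 'B'],
        'wait_time':   [1.2, 0.8, 1.5, 0.9, 1.1, 0.6],
        'end_time':    [1.2, 2.0, 3.5, 0.9, 2.0, 2.6],
        'window_size': [3.5, 3.5, 3.5, 2.6, 2.6, 2.6],
    })
    # returns a pd.Series indexed by state ('A', 'B') holding the
    # estimated mean lifetime of each state
    return average_lifetime(obs, traj_cols=['replicate'])
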
def window_sf(obs, traj_cols):
    """Survival function of the window sizes, one window per trajectory."""
    window_sizes = obs.groupby(traj_cols)['window_size'].first().values
    # now sorted
    window_sizes, window_cdf = ecdf(window_sizes, pad_left_at_x=0)
    return 1 - window_cdf


def ecdf_combined(exterior, interior, window_sizes, ext_bins='auto',
                  **kwargs):
    """Inverse-variance weighted combination of the interior and exterior
    estimates of the waiting-time CDF, evaluated at histogram bin centers."""
    all_times, cdf_int, cdf_ext, Z_X, F_T = ecdf_ext_int(
        exterior, interior, window_sizes, **kwargs
    )
    y, t_bins = np.histogram(exterior, bins=ext_bins)
    bin_centers = (t_bins[:-1] + t_bins[1:]) / 2
    Z_hist = np.sum(y*np.diff(t_bins))
    # exterior estimator
    ext_est = (1 - y/Z_hist/Z_X)
    # standard error of mean, Normal approximation.
    ext_var = y / (Z_hist*Z_X)**2
    ext_weight = 1/ext_var
    # interior estimator
    int_est = np.interp(bin_centers, all_times, cdf_int*F_T)
    # Dvoretzky-Kiefer-Wolfowitz lower bound on the error at confidence
    # level alpha
    alpha = 0.32  # to match 1 sigma (std dev)
    N_int = len(interior)
    int_var = 1/(2*N_int) * np.log(2/alpha)
    int_weight = 1/int_var
    # final estimate is just the weighted average
    final_est = (
        int_est*int_weight + ext_est*ext_weight
    ) / (
        int_weight + ext_weight
    )
    return bin_centers, final_est


def _ccdf_int(x_int, cdf_int):
    """Cumulative integral of the interior CDF w.r.t. time, on the grid
    ``x_int``."""
    if not np.isclose(cdf_int[0], 0):
        raise ValueError("Need to integrate interior CDF, but values "
                         "start >0")
    ccdf_int = np.zeros_like(cdf_int)
    ccdf_int[1:] = np.cumsum(cdf_int[1:] * np.diff(x_int))
    return ccdf_int


def ecdf_ext_int(exterior, interior, window_sizes, window_sf=None,
                 times_allowed=None, pad_left_at_x=0):
    """Compute the interior and exterior eCDFs on a common time grid, along
    with the normalization Z_X and interior CDF value F_T that best
    reconcile them."""
    if len(exterior) < 5 or len(interior) < 5:
        raise ValueError("Need enough exterior *and* interior times to "
                         "perform correction.")
    x_ext, cdf_ext = ecdf(exterior, times_allowed,
                          pad_left_at_x=pad_left_at_x)
    x_int, cdf_int = ecdf_windowed(
        interior, window_sizes, window_sf, times_allowed,
        pad_left_at_x=pad_left_at_x,
    )
    # integral of CDF w.r.t. t
    ccdf_int = _ccdf_int(x_int, cdf_int)
    if times_allowed is None:
        times_allowed = np.sort(np.concatenate((x_int, x_ext)))
    cdf_ext = np.interp(times_allowed, x_ext, cdf_ext)
    ccdf_int = np.interp(times_allowed, x_int, ccdf_int)
    cdf_int = np.interp(times_allowed, x_int, cdf_int)

    def err_f(ab, t, integrated_cdf, exterior_cdf):
        return np.linalg.norm(
            ab[0] * t + ab[1] * integrated_cdf - exterior_cdf
        )

    opt_out = scipy.optimize.minimize(
        err_f, x0=[1, -1], args=(times_allowed, ccdf_int, cdf_ext),
        bounds=((0, np.inf), (-np.inf, 0)),
    )
    if not opt_out.success:
        raise ValueError("Unable to compute F_X(T)!")
    Z_X, b = opt_out.x
    F_T = -b/Z_X
    return times_allowed, cdf_int, cdf_ext, Z_X, F_T
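
# Read directly off ``err_f`` above: ``ecdf_ext_int`` fits constants
# Z_X >= 0 and b <= 0 such that
#
#     cdf_ext(t) ~= Z_X * t + b * (integral from 0 to t of cdf_int(s) ds),
#
# and reports F_T = -b/Z_X, which ``ecdf_combined`` then uses to rescale the
# (normalized-to-1) interior eCDF. Below is a minimal end-to-end sketch on
# synthetic data; the simulation (exponential waits in a fixed window, all
# names and sizes arbitrary) is illustrative only and not part of the
# original module. Each trajectory is a renewal process truncated to [0, T]:
# its first and last (censored) spans are "exterior" times, and fully
# observed waits are "interior" times.
def _example_ecdf_combined(n_traj=2000, T=2.0, mean_wait=1.0, seed=42):
    rng = np.random.default_rng(seed)
    exterior, interior = [], []
    for _ in range(n_traj):
        t, renewals = 0.0, []
        while t < T:
            t += rng.exponential(mean_wait)
            renewals.append(min(t, T))
        if len(renewals) == 1:
            # no renewal inside the window: one fully censored span
            exterior.append(T)
            continue
        exterior.append(renewals[0])                  # left-censored wait
        exterior.append(renewals[-1] - renewals[-2])  # right-censored wait
        interior.extend(np.diff(renewals[:-1]))       # fully observed waits
    # constant window size, so a scalar suffices for window_sizes
    return ecdf_combined(np.array(exterior), np.array(interior), T)
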
def ecdf_windowed(
        times, window_sizes, window_sf=None, times_allowed=None,
        auto_pad_left=None, pad_left_at_x=0, normalize=True,
        skip_times_allowed_check=False):
    """
    Empirical cumulative distribution for windowed observations.

    Parameters
    ----------
    times : (N,) array_like
        "Interior" waiting times.
    window_sizes : float or (N,) array_like
        The window size used. If a single value is passed, the window size
        is assumed to be constant.
    window_sf : (M,) array_like of float
        For each unique window size in *window_sizes*, the fraction of
        trajectories with *at least* that window size. If not specified, it
        is assumed that each unique value of window size corresponds to a
        unique trajectory. For the case of constant window size, this
        option is ignored.
    times_allowed : (M,) array_like
        Unique values that the data can take. Mostly useful for adding eCDF
        values at locations where data could or should have been observed,
        but none was recorded (e.g. if a movie was taken with a given
        framerate but not all possible window lengths were observed).
    auto_pad_left : bool
        Deprecated. It makes more sense to default to left padding at zero
        for a renewal process. If left False, the data will not have a
        value at the point where the eCDF equals zero. If True, the mean
        inter-data spacing is used to automatically generate an
        aesthetically reasonable such point. You *must* pass
        ``pad_left_at_x=None`` manually for this to work as expected.
    pad_left_at_x : float, default: 0
        Same as ``auto_pad_left``, but specify the point at which to add
        the leftmost point.
    normalize : bool, default: True
        Divide the output by its final value, so that the eCDF ends at 1.
    skip_times_allowed_check : bool, default: False
        Skip the (quadratic-cost) check that every element of *times*
        appears in *times_allowed*.

    Returns
    -------
    x : (M,) array_like
        The values at which the eCDF was computed. By default
        :code:`np.sort(np.unique(times))`.
    cdf : (M,) array_like
        Values of the eCDF at each x.

    Notes
    -----
    If using ``times_allowed``, the *pad_left* parameters are redundant.
    """
    times = np.array(times)
    window_sizes = np.array(window_sizes)
    if times_allowed is not None and not skip_times_allowed_check:
        # again, not many resources in numpy for dealing with sorted
        # arrays, so we can't just take advantage of that and do it
        # later...
        close_to_allowed = np.any(
            np.isclose(times[None, :], times_allowed[:, None]),
            axis=0  # one for each in times
        )
        if not np.all(close_to_allowed):
            # you'd have to modify the core algorithm to allow things to
            # not compare is_close
            raise ValueError("times_allowed passed does not contain all "
                             "elements of times passed (not np.is_close).")
    # unique doesn't take advantage of sorting anyway, so just do this here
    times_allowed = np.unique(times) \
        if times_allowed is None else np.array(times_allowed)
    times_allowed.sort()
    # flags only needed for multiple window sizes
    ignore_window_sf = False
    # allow providing single window size
    if window_sizes.size == 1:
        # and skip the extra pass where we calculate window_sf in that case
        ignore_window_sf = True
        window_sizes = window_sizes*np.ones_like(times)
    # algorithm is basically to sort then count, as in mla.stats.ecdf, but
    # with weights
    i = np.argsort(times)
    times = times[i]
    window_sizes = window_sizes[i]
    if auto_pad_left:
        dx = np.mean(np.diff(times_allowed))
        times_allowed = np.insert(times_allowed, 0, times_allowed[0] - dx)
    elif pad_left_at_x is not None:
        # remove this test for now, as weibull/pareto variables can
        # generate VERY small waits
        if not times_allowed[0] == pad_left_at_x:
            if times_allowed[0] < pad_left_at_x:
                warnings.warn('pad_left_at_x not left of x in '
                              'ecdf_windowed! Ignoring...')
            else:
                times_allowed = np.insert(times_allowed, 0, pad_left_at_x)
    # base weights due to interior censoring
    weights = (window_sizes - times)
    if not ignore_window_sf:
        if window_sf is None:
            # get fraction of windows that are at *least* of each width
            traj_window, window_ecdf = ecdf(window_sizes, pad_left_at_x=0)
            window_sf = 1 - window_ecdf
        else:
            # make sure to pad_left_at_x=0
            traj_window = np.insert(np.unique(window_sizes), 0, 0)
            if not np.isclose(window_sf[0], 1.0):
                raise ValueError("window_sf must be pad_left_at_x=0.")
        # for each observed time, we can get number of windows in which it
        # can have been observed
        # minus 1 because of how searchsorted returns indices
        window_i = np.searchsorted(traj_window, times) - 1
        # scale by fraction of trajectories where each time was observable
        weights = weights*window_sf[window_i]
    # TODO probably numba compile just this for loop
    i = 0
    num_obs = len(times)
    # weight contributed by each time. need to simply combine weights due
    # to "isclose" times to get final cdf output.
    full_cdf = np.cumsum(1/weights)
    # we loop over the allowed times instead of full_cdf itself so that we
    # can catch "empty" buckets, introduced either by times_allowed or
    # pad_left*
    cdf = np.zeros(times_allowed.shape, dtype=np.dtype('float'))
    for cdf_i, t in enumerate(times_allowed):
        # if the times_allowed extend beyond any observations, just pad
        # out the result in a vectorized way
        if i >= num_obs - 1:
            cdf[cdf_i:] = full_cdf[-1]
            break
        # this triggers both if t is smaller than any observation
        # (i == 0), or if this particular cdf_i should have the same value
        # as the previous one (because there are no observations in this
        # bucket)
        if times[i] > t and not np.isclose(times[i], t):
            if i == 0:
                # we're already zero-initialized
                continue
            # otherwise, just use the previous value, since there's
            # guaranteed to be at least one if we've already advanced i
            cdf[cdf_i] = cdf[cdf_i - 1]
        # otherwise, there is some amount of weight in this bucket.
        # advance i until we've counted all the weight
        while i + 1 < num_obs and np.isclose(times[i+1], t):
            i += 1
        cdf[cdf_i] = full_cdf[i]
    if normalize:
        cdf = cdf/cdf[-1]
    # some heavy-tailed dists can generate VERY small times (isclose to 0)
    if pad_left_at_x == 0:
        cdf[0] = 0
    return times_allowed, cdf
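
# A direct usage sketch for ``ecdf_windowed`` on synthetic data (not part
# of the original module; all values illustrative). Placing each wait
# uniformly in a window of size T and keeping only waits that fit inside it
# reproduces the interior-censoring bias (P(observed) proportional to
# T - t) that the 1/weights above undo.
def _example_ecdf_windowed(n=2000, T=3.0, seed=0):
    rng = np.random.default_rng(seed)
    waits = rng.exponential(1.0, size=n)
    starts = rng.uniform(0, T, size=n)
    # keep only waits that start and end inside the window
    observed = waits[starts + waits < T]
    # constant window size, so passing a scalar is enough
    return ecdf_windowed(observed, T)
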
def ecdf_simple(waits, T, pad_left_at_x=0):
    """cdf of interior times (ts > 0) observed in window of size T"""
    ts, counts = np.unique(waits, return_counts=True)
    i = np.argsort(ts)
    ts = ts[i]
    counts = counts[i]
    weights = T/(T - ts)
    return np.insert(ts, 0, pad_left_at_x), \
        np.insert(np.cumsum(counts*weights), 0, 0)/len(weights)
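
# For a constant window size, ``ecdf_simple`` applies the same reweighting
# idea as ``ecdf_windowed`` with far less bookkeeping: its per-observation
# weight T/(T - t) differs from the 1/(T - t) used above only by the
# constant factor T. A hypothetical call (values illustrative only):
#
#     x, cdf = ecdf_simple(np.array([0.3, 0.5, 0.5, 1.2]), T=2.0)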