Source code for multi_locus_analysis.dataframes

"""Utilities for massaging DataFrames"""

import re
import json
import pandas as pd
import numpy as np

[docs]def array_from_numpy_string(s): """For unserializing np.array's after DataFrame.to_csv Notes ----- As of 2019-03-13, looks like instead of commas separating the numpy array elements, *two* spaces are printed. We just replace these spaces with commas after removing all other spaces and load the string as json.""" s = re.sub(r' *\[ +', '[', s) s = re.sub(' ([-0-9 ])', '\\1,', s) s = re.sub('\n', ',', s) return np.array(json.loads(s))
[docs]def pivot_loci(df, pivot_cols=['x', 'y', 'z'], spot_col='spot'): """Move between "long" and "short" forms for the spot id column. Simply put, we want to be able to transform between the following two dataframes:: Condensed form X1 Y1 Z1 X2 Y2 Z2 t foci locus genotype exp.rep meiosis cell frame HET5 WT 2 t0 1 1 1.6 2.4 3.1 2.1 1.5 3.1 0 unp 2 1.9 1.5 3.1 1.9 2.5 3.1 30 unp 3 2.0 1.8 3.0 1.5 2.5 3.4 60 unp 4 2.1 1.9 3.0 1.4 2.2 3.4 90 unp 5 2.2 1.8 3.0 1.5 2.4 3.4 120 unp and :: "Long" form X Y Z t foci locus genotype exp.rep meiosis cell frame spot HET5 WT 2 t0 1 1 1 1.6 2.4 3.1 0 unp 2 1 1.9 1.5 3.1 30 unp 3 1 2.0 1.8 3.0 60 unp ... 1 2 2.1 1.5 3.1 0 unp 2 2 1.9 2.5 3.1 30 unp 3 2 1.5 2.5 3.4 60 unp This function can infer which direction to pivot. Because of this, I have found using this function much more convenient (and a smaller cognitive load) than using a multiindex for the column names and using e.g. pd.unstack and friends. Parameters ---------- pivot_cols : List<str> The names of the columns over which to pivot (without their numerical suffixes, these will be inferred). spot_col : str The name of the column that holds (or will hold) the spot id. Returns ------- df : pd.DataFrame The pivot-ed DataFrame. """ cols = list(df.columns) rs = [re.compile(col+'([0-9]+)') for col in pivot_cols] cols_to_pivot = [col for col in cols if any([r.match(col) for r in rs])] # if we are creating the numbered columns if spot_col in df.index.names and len(cols_to_pivot) == 0 \ and all(col in df.columns for col in pivot_cols): extra_cols = list(set(cols) - set(pivot_cols)) def rename_cols(data): data = data.copy() spot_id = str(data.index.get_level_values(spot_col)[0]) for col in df.columns: if col in pivot_cols: data[col+spot_id] = data[col].copy() del data[col] data.index = data.index.droplevel(spot_col) return data # make one dataframe per spot_id, with just that data in it dfs = [rename_cols(data) for _, data in df.groupby(spot_col)] # now copy out the non-traj data (that wasn't split apart in prev line) df = df[extra_cols].copy() # flatten across the "spot_id" dimension by removing that index and # then deleting redundant index values (this assumes that extra_cols # are always constant as you change spot_id), if they are not you # should be including them in the "traj" cols. Otherwise, only the # "spot==1" value is kept here df.index = df.index.droplevel(spot_col) df = df[~df.index.duplicated(keep='first')] # now add back in the columns we manually split apart for data in dfs: df[data.columns] = data # if we are creating a spot column from numbered columns elif spot_col not in df.index.names and len(cols_to_pivot) > 0: extra_cols = list(set(cols) - set(cols_to_pivot)) # loci id => [existing column names] spot_cols = {} for col in cols_to_pivot: for r in rs: if r.match(col): spot_id = int(r.match(col).groups()[0]) if spot_id in spot_cols: spot_cols[spot_id].append(col) else: spot_cols[spot_id] = [col] spot_dfs = {} for spot_id in spot_cols: # trasnform loci id => [existing columns names] # to loci id => df with only columns from that spot id spot_dfs[spot_id] = df[spot_cols[spot_id]].copy() spot_dfs[spot_id].columns = [col[:-len(str(spot_id))] for col in spot_dfs[spot_id].columns] # copy over non-index, non-pivot columns as-is for col in extra_cols: spot_dfs[spot_id][col] = df[col] # now add the correct spot value to a new column for each new # dataframe we've created spot_dfs[spot_id][spot_col] = spot_id df = pd.concat(list(spot_dfs.values())) df = df.set_index(spot_col, append=True) else: raise ValueError('''Could not determine which way to pivot. Either your pivot_cols must exist as numbered columns or your spot_col column should exist, but not neither or both.''') return df