Source code for multi_locus_analysis.dataframes

"""Utilities for massaging DataFrames"""

import re
import json
import pandas as pd
import numpy as np

def _numpy_token_to_json(token):
    """Translate one scalar token printed by numpy into its JSON spelling.

    Tokens that cannot be interpreted are returned unchanged so that the
    subsequent ``json.loads`` call reports the malformed input.
    """
    if token == 'True':
        return 'true'
    if token == 'False':
        return 'false'
    try:
        return json.dumps(int(token))
    except ValueError:
        pass
    try:
        # float() understands numpy spellings that JSON rejects ('3.', '1.e-05',
        # 'nan', 'inf'); json.dumps re-emits them as '3.0', '1e-05', 'NaN',
        # 'Infinity', all of which json.loads accepts.
        return json.dumps(float(token))
    except ValueError:
        return token


def array_from_numpy_string(s):
    """For unserializing np.array's after DataFrame.to_csv

    Parameters
    ----------
    s : str
        The string form of a numeric (or boolean) array, e.g.
        ``'[1.5 2.5 3. ]'`` or ``'[[1 2]\\n [3 4]]'``.

    Returns
    -------
    arr : np.ndarray
        The array described by `s`.

    Raises
    ------
    json.JSONDecodeError
        If `s` is not a well-formed (possibly nested) bracketed list of
        scalars, e.g. unbalanced brackets or a summarized array containing
        ``...``.

    Notes
    -----

    numpy prints array elements separated by whitespace (one or more spaces,
    and newlines between rows) rather than commas. We split the string into
    brackets and scalar tokens, rewrite each scalar in JSON syntax, put
    commas back between neighbouring items and load the result as json.
    Commas already present in the input are treated as separators too."""

    tokens = re.findall(r'[\[\]]|[^\s\[\],]+', s)
    pieces = []
    previous = None
    for token in tokens:
        if token not in ('[', ']'):
            token = _numpy_token_to_json(token)
        # a comma is needed between any two items, except right after an
        # opening bracket or right before a closing one
        if previous is not None and previous != '[' and token != ']':
            pieces.append(',')
        pieces.append(token)
        previous = token
    return np.array(json.loads(''.join(pieces)))

def pivot_loci(df, pivot_cols=('x', 'y', 'z'), spot_col='spot'):
    """Move between "long" and "short" forms for the spot id column.

    Simply put, we want to be able to transform between the following two
    dataframes::

        Condensed form
                                                   X1   Y1   Z1   X2   Y2   Z2     t foci
        locus genotype exp.rep meiosis cell frame
        HET5  WT       2       t0      1    1      1.6  2.4  3.1  2.1  1.5  3.1    0  unp
                                            2      1.9  1.5  3.1  1.9  2.5  3.1   30  unp
                                            3      2.0  1.8  3.0  1.5  2.5  3.4   60  unp
                                            4      2.1  1.9  3.0  1.4  2.2  3.4   90  unp
                                            5      2.2  1.8  3.0  1.5  2.4  3.4  120  unp

    and
    ::

        "Long" form
                                                        X    Y    Z      t foci
        locus genotype exp.rep meiosis cell frame spot
        HET5  WT       2       t0      1    1     1     1.6  2.4  3.1    0  unp
                                            2     1     1.9  1.5  3.1   30  unp
                                            3     1     2.0  1.8  3.0   60  unp
                                            ...
                                            1     2     2.1  1.5  3.1    0  unp
                                            2     2     1.9  2.5  3.1   30  unp
                                            3     2     1.5  2.5  3.4   60  unp

    This function can infer which direction to pivot. Because of this, I have
    found using this function much more convenient (and a smaller cognitive
    load) than using a multiindex for the column names and using e.g. pd.unstack
    and friends.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to pivot. It is not modified. To go from "long" to
        condensed form, `spot_col` must be one of its index levels and every
        name in `pivot_cols` must be a column. To go from condensed to "long"
        form, `spot_col` must not be an index level and at least one column
        must be named ``<pivot col><digits>``.
    pivot_cols : sequence of str
        The names of the columns over which to pivot (without their numerical
        suffixes, these will be inferred).
    spot_col : str
        The name of the column that holds (or will hold) the spot id.

    Returns
    -------
    df : pd.DataFrame
        The pivot-ed DataFrame. Columns that are not pivoted keep their
        original relative order.

    Raises
    ------
    ValueError
        If the direction of the pivot cannot be inferred from `df`.
    """
    cols = list(df.columns)
    # The whole column name must be "<pivot col><digits>": escaping keeps
    # regex metacharacters in a pivot name literal, and fullmatch keeps
    # columns such as "x1_err" from being mistaken for a numbered column.
    patterns = [(base, re.compile(re.escape(base) + '([0-9]+)'))
                for base in pivot_cols]
    # numbered column name => (pivot column name, spot id)
    numbered = {}
    for col in cols:
        if not isinstance(col, str):
            continue
        for base, pattern in patterns:
            found = pattern.fullmatch(col)
            if found:
                numbered[col] = (base, int(found.group(1)))
                break
    cols_to_pivot = [col for col in cols if col in numbered]
    has_spot_level = spot_col in df.index.names

    # if we are creating the numbered columns
    if (has_spot_level and not cols_to_pivot
            and all(col in df.columns for col in pivot_cols)):
        traj_cols = [col for col in cols if col in pivot_cols]
        # list comprehension (not a set difference) so that the order of the
        # output columns is reproducible
        extra_cols = [col for col in cols if col not in pivot_cols]
        # make one dataframe per spot_id, with just that spot's data in it,
        # its pivot columns suffixed with the spot id
        spot_frames = []
        for _, data in df.groupby(spot_col):
            spot_id = str(data.index.get_level_values(spot_col)[0])
            data = data[traj_cols].copy()
            data.columns = [col + spot_id for col in traj_cols]
            data.index = data.index.droplevel(spot_col)
            spot_frames.append(data)
        # now copy out the non-traj data (that wasn't split apart above)
        df = df[extra_cols].copy()
        # flatten across the "spot_id" dimension by removing that index and
        # then deleting redundant index values (this assumes that extra_cols
        # are always constant as you change spot_id), if they are not you
        # should be including them in the "traj" cols. Otherwise, only the
        # first value encountered for each remaining index entry is kept
        df.index = df.index.droplevel(spot_col)
        # explicit copy so the assignments below act on an independent frame
        df = df[~df.index.duplicated(keep='first')].copy()
        # now add back in the columns we manually split apart; assignment
        # aligns each column on the (spot-less) index
        for data in spot_frames:
            for col in data.columns:
                df[col] = data[col]
    # if we are creating a spot column from numbered columns
    elif not has_spot_level and cols_to_pivot:
        extra_cols = [col for col in cols if col not in numbered]
        # spot id => {existing column name: un-numbered column name}
        spot_cols = {}
        for col in cols_to_pivot:
            base, spot_id = numbered[col]
            spot_cols.setdefault(spot_id, {})[col] = base
        spot_frames = []
        for spot_id, renames in spot_cols.items():
            # transform spot id => existing column names
            # to spot id => df with that spot's columns (un-numbered) plus
            # the non-index, non-pivot columns copied over as-is
            spot_df = df[list(renames) + extra_cols].rename(columns=renames)
            # now add the correct spot value to a new column for each new
            # dataframe we've created
            spot_df[spot_col] = spot_id
            spot_frames.append(spot_df)
        df = pd.concat(spot_frames)
        df = df.set_index(spot_col, append=True)
    else:
        raise ValueError('''Could not determine which way to pivot.
Either your pivot_cols must exist as numbered columns or your spot_col column
should exist, but not neither or both.''')
    return df