diff crossSampleOverview.py @ 2:a64ece32a01a draft default tip

"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/cs_overview commit a46097db0b6056e1125237393eb6974cfd51eb41"
author azomics
date Tue, 28 Jul 2020 08:32:36 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/crossSampleOverview.py	Tue Jul 28 08:32:36 2020 -0400
@@ -0,0 +1,212 @@
+#!/usr/bin/env python
+
+######################################################################
+#                  Copyright (c) 2016 Northrop Grumman.
+#                          All rights reserved.
+######################################################################
+
+# version 1.1 -- August 2017
+# added checks for consistency between input files
+# and upper limit on nb of cluster to look at
+
+from __future__ import print_function
+import sys
+import os
+import logging
+import fileinput
+import pandas as pd
+from argparse import ArgumentParser
+from jinja2 import Environment, FileSystemLoader
+from shutil import copyfile
+from collections import defaultdict
+
+
+def check_pops(mfi_file, stat1):
+    df = pd.read_table(mfi_file)
+    df1 = pd.read_table(stat1)
+    nb_pop = len(set(df.Population))
+    nb_pop1 = len(df1.columns) - 2
+    if (nb_pop > 40):
+        sys.stderr.write("There are " + str(nb_pop) + " in the input file.")
+        sys.exit(1)
+    if (nb_pop != nb_pop1):
+        sys.exit(2)
+
+
+def panel_to_json_string(df):
+    # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs
+    def __merge_stream(key, stream):
+        return '"' + key + '"' + ': ' + stream + ', '
+    try:
+        if 'Unnamed: 0' in df.columns:
+            df = df.drop(['Unnamed: 0'], axis=1)
+        stream = '{'
+        for index, subdf in df.groupby(level=0):
+            stream += __merge_stream(index, df.loc[index, :, :].droplevel(0).to_json())
+        # take out extra last comma
+        stream = stream[:-2]
+        # add the final paren
+        stream += '}'
+    except:
+        logging.exception('Panel Encoding did not work')
+    return stream
+
+
+def get_outliers(group, upper, lower):
+    cat = group.name
+    out = {}
+    for marker in group:
+        # skip population since upper and lower don't contain it, since it was made after a group by Population
+        if marker != 'Population':
+            out[marker] = group[(group[marker] > upper.loc[cat][marker]) | (group[marker] < lower.loc[cat][marker])][marker]
+    return out
+
+
+def get_boxplot_stats(all_data, mfi_file, output_json):
+    # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html
+    # Get initial MFI values
+    mfi = pd.read_table(mfi_file)
+    mfi = mfi.set_index('Population')
+
+    df = pd.read_table(all_data)
+    # check if ever some pops not in cs_files
+    missing_pop = [x for x in mfi.index if x not in set(df.Population)]
+
+    if (missing_pop):
+        zeros = {}
+        for m in df.columns:
+            zeros[m] = [0 for x in missing_pop]
+        tmpdf = pd.DataFrame(zeros)
+        tmpdf.Population = missing_pop
+        df = df.append(tmpdf)
+
+    pops = df.groupby('Population')
+    q1 = pops.quantile(q=0.25)
+    q2 = pops.quantile(q=0.5)
+    q3 = pops.quantile(q=0.75)
+    iqr = q3 - q1
+    upper = q3 + 1.5*iqr
+    lower = q1 - 1.5*iqr
+    resampled = False
+    # get outliers
+    out = pops.apply(get_outliers, upper, lower).dropna()
+    outliers = defaultdict(dict)
+    for population in set(df.Population):
+        for marker in df.columns:
+            if marker != 'Population':
+                tmp_outliers = list(out[population][marker])
+                if (len(list(out[population][marker])) > 100):
+                    tmp_outliers = list(out[population][marker].sample(n=100))
+                    resampled = True
+                outliers[population][marker] = tmp_outliers
+    outdf = pd.DataFrame(outliers)
+
+    data = pd.concat({'q1': q1,
+            'q2': q2,
+            'q3': q3,
+            'upper': upper,
+            'lower': lower,
+            'outliers': outdf.T,
+            'mfi': mfi}, keys=['q1','q2','q3','upper','lower','outliers','mfi'])
+
+    with open(output_json, "w") as js_all:
+        js_all.write(panel_to_json_string(data))
+
+    return resampled
+
+
+def cs_overview(input_file, input_mfi, init_mfi, output_file, output_dir, tools_dir, cs_files):
+    os.mkdir(output_dir)
+
+    env = Environment(loader=FileSystemLoader(tools_dir + "/templates"))
+    template = env.get_template("csOverview.template")
+
+    real_directory = output_dir.replace("/job_working_directory", "")
+    context = {'outputDirectory': real_directory}
+    overview = template.render(**context)
+    with open(output_file, "w") as outf:
+        outf.write(overview)
+
+    cs_overview_file = output_dir + "/csOverview.tsv"
+    copyfile(input_file, cs_overview_file)
+
+    cs_overview_mfis = output_dir + "/csAllMFIs.tsv"
+    copyfile(input_mfi, cs_overview_mfis)
+
+    # Get all the data to calculate quantiles, IRC and outliers.
+    tmp_all_data = "csAllData.tsv"
+    with open(tmp_all_data, "a") as alldata:
+        # assumes that the files have ran through flock and CS and therefore have the same headers
+        df1 = pd.read_table(cs_files[0])
+        df1.to_csv(alldata, sep="\t", header=True, index=False)
+        for i in range(1, len(cs_files)):
+            df = pd.read_table(cs_files[i])
+            df.to_csv(alldata, sep="\t", header=False, index=False)
+
+    cs_boxplot_data = output_dir + "/csBoxplotData.json"
+    resampled = get_boxplot_stats(tmp_all_data, init_mfi, cs_boxplot_data)
+    if resampled:
+        to_find = '<div id="outlierWarning" style="display:none;">'
+        to_replace = '<div id="outlierWarning">'
+        ## yay python 2.7
+        ro = fileinput.input(output_file, inplace=True, backup=".bak")
+        for roline in ro:
+            print(roline.replace(to_find, to_replace), end='')
+        ro.close()
+
+    return
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(
+             prog="csOverview",
+             description="Generate an overview plot of crossSample results.")
+
+    parser.add_argument(
+            '-i',
+            dest="input_file",
+            required=True,
+            help="File location for the summary statistics from CrossSample.")
+
+    parser.add_argument(
+            '-I',
+            dest="input_mfi",
+            required=True,
+            help="File location for the MFI summary statistics from CrossSample.")
+
+    parser.add_argument(
+            '-s',
+            dest="cs_outputs",
+            required=True,
+            action='append',
+            help="File location for the CrossSample output files.")
+
+    parser.add_argument(
+            '-o',
+            dest="output_file",
+            required=True,
+            help="File location for the HTML output file.")
+
+    parser.add_argument(
+            '-m',
+            dest="mfi",
+            required=True,
+            help="File location for the MFI from FLOCK.")
+
+    parser.add_argument(
+            '-d',
+            dest="output_directory",
+            required=True,
+            help="Directory location for the html supporting files.")
+
+    parser.add_argument(
+            '-t',
+            dest="tool_directory",
+            required=True,
+            help="Location of the Tool Directory.")
+
+    args = parser.parse_args()
+
+    cs_files = [f for f in args.cs_outputs]
+    check_pops(args.mfi, args.input_file)
+    cs_overview(args.input_file, args.input_mfi, args.mfi, args.output_file, args.output_directory, args.tool_directory, cs_files)