Mercurial > repos > immport-devteam > cs_overview
comparison crossSampleOverview.py @ 2:a64ece32a01a draft default tip
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/cs_overview commit a46097db0b6056e1125237393eb6974cfd51eb41"
| author | azomics |
|---|---|
| date | Tue, 28 Jul 2020 08:32:36 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:bca68066a957 | 2:a64ece32a01a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 ###################################################################### | |
| 4 # Copyright (c) 2016 Northrop Grumman. | |
| 5 # All rights reserved. | |
| 6 ###################################################################### | |
| 7 | |
| 8 # version 1.1 -- August 2017 | |
| 9 # added checks for consistency between input files | |
| 10 # and upper limit on nb of cluster to look at | |
| 11 | |
| 12 from __future__ import print_function | |
| 13 import sys | |
| 14 import os | |
| 15 import logging | |
| 16 import fileinput | |
| 17 import pandas as pd | |
| 18 from argparse import ArgumentParser | |
| 19 from jinja2 import Environment, FileSystemLoader | |
| 20 from shutil import copyfile | |
| 21 from collections import defaultdict | |
| 22 | |
| 23 | |
| 24 def check_pops(mfi_file, stat1): | |
| 25 df = pd.read_table(mfi_file) | |
| 26 df1 = pd.read_table(stat1) | |
| 27 nb_pop = len(set(df.Population)) | |
| 28 nb_pop1 = len(df1.columns) - 2 | |
| 29 if (nb_pop > 40): | |
| 30 sys.stderr.write("There are " + str(nb_pop) + " in the input file.") | |
| 31 sys.exit(1) | |
| 32 if (nb_pop != nb_pop1): | |
| 33 sys.exit(2) | |
| 34 | |
| 35 | |
| 36 def panel_to_json_string(df): | |
| 37 # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs | |
| 38 def __merge_stream(key, stream): | |
| 39 return '"' + key + '"' + ': ' + stream + ', ' | |
| 40 try: | |
| 41 if 'Unnamed: 0' in df.columns: | |
| 42 df = df.drop(['Unnamed: 0'], axis=1) | |
| 43 stream = '{' | |
| 44 for index, subdf in df.groupby(level=0): | |
| 45 stream += __merge_stream(index, df.loc[index, :, :].droplevel(0).to_json()) | |
| 46 # take out extra last comma | |
| 47 stream = stream[:-2] | |
| 48 # add the final paren | |
| 49 stream += '}' | |
| 50 except: | |
| 51 logging.exception('Panel Encoding did not work') | |
| 52 return stream | |
| 53 | |
| 54 | |
| 55 def get_outliers(group, upper, lower): | |
| 56 cat = group.name | |
| 57 out = {} | |
| 58 for marker in group: | |
| 59 # skip population since upper and lower don't contain it, since it was made after a group by Population | |
| 60 if marker != 'Population': | |
| 61 out[marker] = group[(group[marker] > upper.loc[cat][marker]) | (group[marker] < lower.loc[cat][marker])][marker] | |
| 62 return out | |
| 63 | |
| 64 | |
| 65 def get_boxplot_stats(all_data, mfi_file, output_json): | |
| 66 # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html | |
| 67 # Get initial MFI values | |
| 68 mfi = pd.read_table(mfi_file) | |
| 69 mfi = mfi.set_index('Population') | |
| 70 | |
| 71 df = pd.read_table(all_data) | |
| 72 # check if ever some pops not in cs_files | |
| 73 missing_pop = [x for x in mfi.index if x not in set(df.Population)] | |
| 74 | |
| 75 if (missing_pop): | |
| 76 zeros = {} | |
| 77 for m in df.columns: | |
| 78 zeros[m] = [0 for x in missing_pop] | |
| 79 tmpdf = pd.DataFrame(zeros) | |
| 80 tmpdf.Population = missing_pop | |
| 81 df = df.append(tmpdf) | |
| 82 | |
| 83 pops = df.groupby('Population') | |
| 84 q1 = pops.quantile(q=0.25) | |
| 85 q2 = pops.quantile(q=0.5) | |
| 86 q3 = pops.quantile(q=0.75) | |
| 87 iqr = q3 - q1 | |
| 88 upper = q3 + 1.5*iqr | |
| 89 lower = q1 - 1.5*iqr | |
| 90 resampled = False | |
| 91 # get outliers | |
| 92 out = pops.apply(get_outliers, upper, lower).dropna() | |
| 93 outliers = defaultdict(dict) | |
| 94 for population in set(df.Population): | |
| 95 for marker in df.columns: | |
| 96 if marker != 'Population': | |
| 97 tmp_outliers = list(out[population][marker]) | |
| 98 if (len(list(out[population][marker])) > 100): | |
| 99 tmp_outliers = list(out[population][marker].sample(n=100)) | |
| 100 resampled = True | |
| 101 outliers[population][marker] = tmp_outliers | |
| 102 outdf = pd.DataFrame(outliers) | |
| 103 | |
| 104 data = pd.concat({'q1': q1, | |
| 105 'q2': q2, | |
| 106 'q3': q3, | |
| 107 'upper': upper, | |
| 108 'lower': lower, | |
| 109 'outliers': outdf.T, | |
| 110 'mfi': mfi}, keys=['q1','q2','q3','upper','lower','outliers','mfi']) | |
| 111 | |
| 112 with open(output_json, "w") as js_all: | |
| 113 js_all.write(panel_to_json_string(data)) | |
| 114 | |
| 115 return resampled | |
| 116 | |
| 117 | |
| 118 def cs_overview(input_file, input_mfi, init_mfi, output_file, output_dir, tools_dir, cs_files): | |
| 119 os.mkdir(output_dir) | |
| 120 | |
| 121 env = Environment(loader=FileSystemLoader(tools_dir + "/templates")) | |
| 122 template = env.get_template("csOverview.template") | |
| 123 | |
| 124 real_directory = output_dir.replace("/job_working_directory", "") | |
| 125 context = {'outputDirectory': real_directory} | |
| 126 overview = template.render(**context) | |
| 127 with open(output_file, "w") as outf: | |
| 128 outf.write(overview) | |
| 129 | |
| 130 cs_overview_file = output_dir + "/csOverview.tsv" | |
| 131 copyfile(input_file, cs_overview_file) | |
| 132 | |
| 133 cs_overview_mfis = output_dir + "/csAllMFIs.tsv" | |
| 134 copyfile(input_mfi, cs_overview_mfis) | |
| 135 | |
| 136 # Get all the data to calculate quantiles, IRC and outliers. | |
| 137 tmp_all_data = "csAllData.tsv" | |
| 138 with open(tmp_all_data, "a") as alldata: | |
| 139 # assumes that the files have ran through flock and CS and therefore have the same headers | |
| 140 df1 = pd.read_table(cs_files[0]) | |
| 141 df1.to_csv(alldata, sep="\t", header=True, index=False) | |
| 142 for i in range(1, len(cs_files)): | |
| 143 df = pd.read_table(cs_files[i]) | |
| 144 df.to_csv(alldata, sep="\t", header=False, index=False) | |
| 145 | |
| 146 cs_boxplot_data = output_dir + "/csBoxplotData.json" | |
| 147 resampled = get_boxplot_stats(tmp_all_data, init_mfi, cs_boxplot_data) | |
| 148 if resampled: | |
| 149 to_find = '<div id="outlierWarning" style="display:none;">' | |
| 150 to_replace = '<div id="outlierWarning">' | |
| 151 ## yay python 2.7 | |
| 152 ro = fileinput.input(output_file, inplace=True, backup=".bak") | |
| 153 for roline in ro: | |
| 154 print(roline.replace(to_find, to_replace), end='') | |
| 155 ro.close() | |
| 156 | |
| 157 return | |
| 158 | |
| 159 | |
| 160 if __name__ == "__main__": | |
| 161 parser = ArgumentParser( | |
| 162 prog="csOverview", | |
| 163 description="Generate an overview plot of crossSample results.") | |
| 164 | |
| 165 parser.add_argument( | |
| 166 '-i', | |
| 167 dest="input_file", | |
| 168 required=True, | |
| 169 help="File location for the summary statistics from CrossSample.") | |
| 170 | |
| 171 parser.add_argument( | |
| 172 '-I', | |
| 173 dest="input_mfi", | |
| 174 required=True, | |
| 175 help="File location for the MFI summary statistics from CrossSample.") | |
| 176 | |
| 177 parser.add_argument( | |
| 178 '-s', | |
| 179 dest="cs_outputs", | |
| 180 required=True, | |
| 181 action='append', | |
| 182 help="File location for the CrossSample output files.") | |
| 183 | |
| 184 parser.add_argument( | |
| 185 '-o', | |
| 186 dest="output_file", | |
| 187 required=True, | |
| 188 help="File location for the HTML output file.") | |
| 189 | |
| 190 parser.add_argument( | |
| 191 '-m', | |
| 192 dest="mfi", | |
| 193 required=True, | |
| 194 help="File location for the MFI from FLOCK.") | |
| 195 | |
| 196 parser.add_argument( | |
| 197 '-d', | |
| 198 dest="output_directory", | |
| 199 required=True, | |
| 200 help="Directory location for the html supporting files.") | |
| 201 | |
| 202 parser.add_argument( | |
| 203 '-t', | |
| 204 dest="tool_directory", | |
| 205 required=True, | |
| 206 help="Location of the Tool Directory.") | |
| 207 | |
| 208 args = parser.parse_args() | |
| 209 | |
| 210 cs_files = [f for f in args.cs_outputs] | |
| 211 check_pops(args.mfi, args.input_file) | |
| 212 cs_overview(args.input_file, args.input_mfi, args.mfi, args.output_file, args.output_directory, args.tool_directory, cs_files) |
