comparison crossSampleOverview.py @ 2:a64ece32a01a draft default tip

"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/cs_overview commit a46097db0b6056e1125237393eb6974cfd51eb41"
author azomics
date Tue, 28 Jul 2020 08:32:36 -0400
parents
children
comparison
equal deleted inserted replaced
1:bca68066a957 2:a64ece32a01a
1 #!/usr/bin/env python
2
3 ######################################################################
4 # Copyright (c) 2016 Northrop Grumman.
5 # All rights reserved.
6 ######################################################################
7
8 # version 1.1 -- August 2017
9 # added checks for consistency between input files
10 # and upper limit on nb of cluster to look at
11
12 from __future__ import print_function
13 import sys
14 import os
15 import logging
16 import fileinput
17 import pandas as pd
18 from argparse import ArgumentParser
19 from jinja2 import Environment, FileSystemLoader
20 from shutil import copyfile
21 from collections import defaultdict
22
23
24 def check_pops(mfi_file, stat1):
25 df = pd.read_table(mfi_file)
26 df1 = pd.read_table(stat1)
27 nb_pop = len(set(df.Population))
28 nb_pop1 = len(df1.columns) - 2
29 if (nb_pop > 40):
30 sys.stderr.write("There are " + str(nb_pop) + " in the input file.")
31 sys.exit(1)
32 if (nb_pop != nb_pop1):
33 sys.exit(2)
34
35
36 def panel_to_json_string(df):
37 # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs
38 def __merge_stream(key, stream):
39 return '"' + key + '"' + ': ' + stream + ', '
40 try:
41 if 'Unnamed: 0' in df.columns:
42 df = df.drop(['Unnamed: 0'], axis=1)
43 stream = '{'
44 for index, subdf in df.groupby(level=0):
45 stream += __merge_stream(index, df.loc[index, :, :].droplevel(0).to_json())
46 # take out extra last comma
47 stream = stream[:-2]
48 # add the final paren
49 stream += '}'
50 except:
51 logging.exception('Panel Encoding did not work')
52 return stream
53
54
55 def get_outliers(group, upper, lower):
56 cat = group.name
57 out = {}
58 for marker in group:
59 # skip population since upper and lower don't contain it, since it was made after a group by Population
60 if marker != 'Population':
61 out[marker] = group[(group[marker] > upper.loc[cat][marker]) | (group[marker] < lower.loc[cat][marker])][marker]
62 return out
63
64
65 def get_boxplot_stats(all_data, mfi_file, output_json):
66 # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html
67 # Get initial MFI values
68 mfi = pd.read_table(mfi_file)
69 mfi = mfi.set_index('Population')
70
71 df = pd.read_table(all_data)
72 # check if ever some pops not in cs_files
73 missing_pop = [x for x in mfi.index if x not in set(df.Population)]
74
75 if (missing_pop):
76 zeros = {}
77 for m in df.columns:
78 zeros[m] = [0 for x in missing_pop]
79 tmpdf = pd.DataFrame(zeros)
80 tmpdf.Population = missing_pop
81 df = df.append(tmpdf)
82
83 pops = df.groupby('Population')
84 q1 = pops.quantile(q=0.25)
85 q2 = pops.quantile(q=0.5)
86 q3 = pops.quantile(q=0.75)
87 iqr = q3 - q1
88 upper = q3 + 1.5*iqr
89 lower = q1 - 1.5*iqr
90 resampled = False
91 # get outliers
92 out = pops.apply(get_outliers, upper, lower).dropna()
93 outliers = defaultdict(dict)
94 for population in set(df.Population):
95 for marker in df.columns:
96 if marker != 'Population':
97 tmp_outliers = list(out[population][marker])
98 if (len(list(out[population][marker])) > 100):
99 tmp_outliers = list(out[population][marker].sample(n=100))
100 resampled = True
101 outliers[population][marker] = tmp_outliers
102 outdf = pd.DataFrame(outliers)
103
104 data = pd.concat({'q1': q1,
105 'q2': q2,
106 'q3': q3,
107 'upper': upper,
108 'lower': lower,
109 'outliers': outdf.T,
110 'mfi': mfi}, keys=['q1','q2','q3','upper','lower','outliers','mfi'])
111
112 with open(output_json, "w") as js_all:
113 js_all.write(panel_to_json_string(data))
114
115 return resampled
116
117
118 def cs_overview(input_file, input_mfi, init_mfi, output_file, output_dir, tools_dir, cs_files):
119 os.mkdir(output_dir)
120
121 env = Environment(loader=FileSystemLoader(tools_dir + "/templates"))
122 template = env.get_template("csOverview.template")
123
124 real_directory = output_dir.replace("/job_working_directory", "")
125 context = {'outputDirectory': real_directory}
126 overview = template.render(**context)
127 with open(output_file, "w") as outf:
128 outf.write(overview)
129
130 cs_overview_file = output_dir + "/csOverview.tsv"
131 copyfile(input_file, cs_overview_file)
132
133 cs_overview_mfis = output_dir + "/csAllMFIs.tsv"
134 copyfile(input_mfi, cs_overview_mfis)
135
136 # Get all the data to calculate quantiles, IRC and outliers.
137 tmp_all_data = "csAllData.tsv"
138 with open(tmp_all_data, "a") as alldata:
139 # assumes that the files have ran through flock and CS and therefore have the same headers
140 df1 = pd.read_table(cs_files[0])
141 df1.to_csv(alldata, sep="\t", header=True, index=False)
142 for i in range(1, len(cs_files)):
143 df = pd.read_table(cs_files[i])
144 df.to_csv(alldata, sep="\t", header=False, index=False)
145
146 cs_boxplot_data = output_dir + "/csBoxplotData.json"
147 resampled = get_boxplot_stats(tmp_all_data, init_mfi, cs_boxplot_data)
148 if resampled:
149 to_find = '<div id="outlierWarning" style="display:none;">'
150 to_replace = '<div id="outlierWarning">'
151 ## yay python 2.7
152 ro = fileinput.input(output_file, inplace=True, backup=".bak")
153 for roline in ro:
154 print(roline.replace(to_find, to_replace), end='')
155 ro.close()
156
157 return
158
159
160 if __name__ == "__main__":
161 parser = ArgumentParser(
162 prog="csOverview",
163 description="Generate an overview plot of crossSample results.")
164
165 parser.add_argument(
166 '-i',
167 dest="input_file",
168 required=True,
169 help="File location for the summary statistics from CrossSample.")
170
171 parser.add_argument(
172 '-I',
173 dest="input_mfi",
174 required=True,
175 help="File location for the MFI summary statistics from CrossSample.")
176
177 parser.add_argument(
178 '-s',
179 dest="cs_outputs",
180 required=True,
181 action='append',
182 help="File location for the CrossSample output files.")
183
184 parser.add_argument(
185 '-o',
186 dest="output_file",
187 required=True,
188 help="File location for the HTML output file.")
189
190 parser.add_argument(
191 '-m',
192 dest="mfi",
193 required=True,
194 help="File location for the MFI from FLOCK.")
195
196 parser.add_argument(
197 '-d',
198 dest="output_directory",
199 required=True,
200 help="Directory location for the html supporting files.")
201
202 parser.add_argument(
203 '-t',
204 dest="tool_directory",
205 required=True,
206 help="Location of the Tool Directory.")
207
208 args = parser.parse_args()
209
210 cs_files = [f for f in args.cs_outputs]
211 check_pops(args.mfi, args.input_file)
212 cs_overview(args.input_file, args.input_mfi, args.mfi, args.output_file, args.output_directory, args.tool_directory, cs_files)