Mercurial > repos > immport-devteam > cs_overview
comparison crossSampleOverview.py @ 2:a64ece32a01a draft default tip
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/cs_overview commit a46097db0b6056e1125237393eb6974cfd51eb41"
author | azomics |
---|---|
date | Tue, 28 Jul 2020 08:32:36 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:bca68066a957 | 2:a64ece32a01a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 ###################################################################### | |
4 # Copyright (c) 2016 Northrop Grumman. | |
5 # All rights reserved. | |
6 ###################################################################### | |
7 | |
8 # version 1.1 -- August 2017 | |
9 # added checks for consistency between input files | |
10 # and upper limit on nb of cluster to look at | |
11 | |
12 from __future__ import print_function | |
13 import sys | |
14 import os | |
15 import logging | |
16 import fileinput | |
17 import pandas as pd | |
18 from argparse import ArgumentParser | |
19 from jinja2 import Environment, FileSystemLoader | |
20 from shutil import copyfile | |
21 from collections import defaultdict | |
22 | |
23 | |
def check_pops(mfi_file, stat1):
    """Check consistency between the FLOCK MFI file and the CrossSample stats.

    Exits with status 1 when there are more than 40 populations (upper
    limit for the overview plot) and status 2 when the population counts
    of the two files disagree.

    mfi_file -- TSV with a 'Population' column (MFI from FLOCK)
    stat1 -- TSV summary statistics; all columns but the first two are
             assumed to be one-per-population
    """
    df = pd.read_table(mfi_file)
    df1 = pd.read_table(stat1)
    nb_pop = len(set(df.Population))
    # the first two columns of the stats file are not populations
    nb_pop1 = len(df1.columns) - 2
    if nb_pop > 40:
        sys.stderr.write("There are " + str(nb_pop) + " populations in the input file. The cap for the overview plot is 40.")
        sys.exit(1)
    if nb_pop != nb_pop1:
        sys.stderr.write("The number of populations differs between the MFI file (" + str(nb_pop) + ") and the summary statistics file (" + str(nb_pop1) + ").")
        sys.exit(2)
34 | |
35 | |
def panel_to_json_string(df):
    """Serialize a MultiIndex DataFrame to a JSON object string.

    The outer index level (q1/q2/.../mfi) becomes the top-level JSON key;
    each sub-frame is encoded with DataFrame.to_json. On failure the error
    is logged and the string '{}' is returned instead of raising.
    """
    # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs
    stream = '{}'  # safe fallback so we never return an unbound name
    try:
        if 'Unnamed: 0' in df.columns:
            df = df.drop(['Unnamed: 0'], axis=1)
        parts = []
        # groupby already yields each outer-level sub-frame; drop the
        # outer level so to_json only sees the inner index
        for key, subdf in df.groupby(level=0):
            parts.append('"' + str(key) + '": ' + subdf.droplevel(0).to_json())
        stream = '{' + ', '.join(parts) + '}'
    except Exception:
        # narrow except: don't swallow SystemExit/KeyboardInterrupt
        logging.exception('Panel Encoding did not work')
    return stream
53 | |
54 | |
def get_outliers(group, upper, lower):
    """Return, for one population group, each marker's outlier values.

    group -- one sub-frame from a groupby('Population'); group.name is
             the population label
    upper/lower -- per-population whisker bounds indexed by population
    """
    pop = group.name
    # 'Population' is the grouping key, so upper/lower have no entry for it
    markers = [m for m in group if m != 'Population']
    result = {}
    for m in markers:
        too_high = group[m] > upper.loc[pop][m]
        too_low = group[m] < lower.loc[pop][m]
        result[m] = group[too_high | too_low][m]
    return result
63 | |
64 | |
def get_boxplot_stats(all_data, mfi_file, output_json):
    """Compute per-population boxplot statistics and write them to JSON.

    all_data -- TSV of all concatenated CrossSample outputs
    mfi_file -- TSV of initial MFI values, one row per population
    output_json -- path the combined statistics JSON is written to

    Returns True when any marker's outlier list was downsampled to 100
    points (so the caller can surface a warning in the HTML).
    """
    # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html
    # Get initial MFI values
    mfi = pd.read_table(mfi_file)
    mfi = mfi.set_index('Population')

    df = pd.read_table(all_data)
    # populations in the MFI file but absent from the CS files get
    # all-zero placeholder rows so the groupby covers every population
    missing_pop = [x for x in mfi.index if x not in set(df.Population)]

    if missing_pop:
        zeros = {m: [0] * len(missing_pop) for m in df.columns}
        tmpdf = pd.DataFrame(zeros)
        tmpdf.Population = missing_pop
        # DataFrame.append was removed in pandas 2.0; concat is the API
        df = pd.concat([df, tmpdf])

    pops = df.groupby('Population')
    q1 = pops.quantile(q=0.25)
    q2 = pops.quantile(q=0.5)
    q3 = pops.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr
    resampled = False
    # per-population dict of Series of outlier values
    out = pops.apply(get_outliers, upper, lower).dropna()
    outliers = defaultdict(dict)
    for population in set(df.Population):
        for marker in df.columns:
            if marker != 'Population':
                tmp_outliers = list(out[population][marker])
                # cap the number of outliers shipped to the page at 100
                if len(tmp_outliers) > 100:
                    tmp_outliers = list(out[population][marker].sample(n=100))
                    resampled = True
                outliers[population][marker] = tmp_outliers
    outdf = pd.DataFrame(outliers)

    data = pd.concat({'q1': q1,
                      'q2': q2,
                      'q3': q3,
                      'upper': upper,
                      'lower': lower,
                      'outliers': outdf.T,
                      'mfi': mfi}, keys=['q1', 'q2', 'q3', 'upper', 'lower', 'outliers', 'mfi'])

    with open(output_json, "w") as js_all:
        js_all.write(panel_to_json_string(data))

    return resampled
116 | |
117 | |
def cs_overview(input_file, input_mfi, init_mfi, output_file, output_dir, tools_dir, cs_files):
    """Render the CrossSample overview HTML page and its supporting files.

    input_file -- summary statistics TSV (copied to the output dir)
    input_mfi -- MFI summary TSV (copied to the output dir)
    init_mfi -- MFI file from FLOCK, used for the boxplot statistics
    output_file -- path of the HTML page to write
    output_dir -- directory created here for the supporting files
    tools_dir -- tool directory containing the 'templates' folder
    cs_files -- list of CrossSample output TSVs (identical headers)
    """
    os.mkdir(output_dir)

    env = Environment(loader=FileSystemLoader(tools_dir + "/templates"))
    template = env.get_template("csOverview.template")

    # Galaxy serves the files from outside the job working directory
    real_directory = output_dir.replace("/job_working_directory", "")
    context = {'outputDirectory': real_directory}
    overview = template.render(**context)
    with open(output_file, "w") as outf:
        outf.write(overview)

    cs_overview_file = output_dir + "/csOverview.tsv"
    copyfile(input_file, cs_overview_file)

    cs_overview_mfis = output_dir + "/csAllMFIs.tsv"
    copyfile(input_mfi, cs_overview_mfis)

    # Concatenate all CS files (header written once) to calculate
    # quantiles, IQR and outliers. Assumes the files have run through
    # FLOCK and CS and therefore share the same header.
    tmp_all_data = "csAllData.tsv"
    # "w", not "a": a leftover file from a previous run must not
    # silently duplicate rows in the statistics
    with open(tmp_all_data, "w") as alldata:
        first = pd.read_table(cs_files[0])
        first.to_csv(alldata, sep="\t", header=True, index=False)
        for cs_file in cs_files[1:]:
            df = pd.read_table(cs_file)
            df.to_csv(alldata, sep="\t", header=False, index=False)

    cs_boxplot_data = output_dir + "/csBoxplotData.json"
    resampled = get_boxplot_stats(tmp_all_data, init_mfi, cs_boxplot_data)
    if resampled:
        # un-hide the outlier warning div in the rendered HTML
        to_find = '<div id="outlierWarning" style="display:none;">'
        to_replace = '<div id="outlierWarning">'
        ro = fileinput.input(output_file, inplace=True, backup=".bak")
        for roline in ro:
            print(roline.replace(to_find, to_replace), end='')
        ro.close()

    return
158 | |
159 | |
if __name__ == "__main__":
    parser = ArgumentParser(
        prog="csOverview",
        description="Generate an overview plot of crossSample results.")

    # (flag, dest, extra kwargs, help text) — all options are required
    option_table = [
        ('-i', 'input_file', {},
         "File location for the summary statistics from CrossSample."),
        ('-I', 'input_mfi', {},
         "File location for the MFI summary statistics from CrossSample."),
        ('-s', 'cs_outputs', {'action': 'append'},
         "File location for the CrossSample output files."),
        ('-o', 'output_file', {},
         "File location for the HTML output file."),
        ('-m', 'mfi', {},
         "File location for the MFI from FLOCK."),
        ('-d', 'output_directory', {},
         "Directory location for the html supporting files."),
        ('-t', 'tool_directory', {},
         "Location of the Tool Directory."),
    ]
    for flag, dest, extra, help_text in option_table:
        parser.add_argument(flag, dest=dest, required=True,
                            help=help_text, **extra)

    args = parser.parse_args()

    cs_files = list(args.cs_outputs)
    check_pops(args.mfi, args.input_file)
    cs_overview(args.input_file, args.input_mfi, args.mfi, args.output_file,
                args.output_directory, args.tool_directory, cs_files)