comparison genFlowOverview.py @ 1:b5453d07f740 draft default tip

"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/flow_overview commit 65373effef15809f3db0e5f9603ef808f4110aa3"
author azomics
date Wed, 29 Jul 2020 17:03:53 -0400
parents
children
comparison
equal deleted inserted replaced
0:8283ff163ba6 1:b5453d07f740
1
2 #!/usr/bin/env python
3 ######################################################################
4 # Copyright (c) 2016 Northrop Grumman.
5 # All rights reserved.
6 ######################################################################
7
8 # version 1.1 - August 2017
9 # added upper limit to nb of clusters (40)
10 #
11 import sys
12 import os
13 import pandas as pd
14 import logging
15 import fileinput
16
17 from argparse import ArgumentParser
18 from jinja2 import Environment, FileSystemLoader
19 from collections import defaultdict
20
21 from color_palette import color_palette
22 from flowstatlib import gen_overview_stats
23 import matplotlib as mpl
24 mpl.use('Agg')
25 import matplotlib.pyplot as plt
26
27
28 profile_key = {
29 "1": "-",
30 "2": "lo",
31 "3": "+",
32 "4": "hi"
33 }
34
35
36 # flow CL functions
37 def run_flowCL(phenotype, output_txt, output_pdf, tool):
38 run_command = " ". join(["Rscript --slave --vanilla", tool, output_txt,
39 phenotype])
40 os.system(run_command)
41
42 get_graph = " ".join(["mv flowCL_results/*.pdf", output_pdf])
43 os.system(get_graph)
44 return
45
46
47 def generate_flowCL_query(list_markers, list_types):
48 if (len(list_markers) != len(list_types)):
49 return("pb with headers")
50 query = []
51 # go through both lists, remove fsc/ssc
52 for i in range(1, len(list_markers)):
53 if not list_markers[i].startswith("FSC") and not list_markers[i].startswith("SSC"):
54 query.append(list_markers[i].upper())
55 query.append(profile_key[list_types[i]])
56 # return concatenated string
57 return("".join(query))
58
59
60 def translate_profiles(input_file, tool_dir, html_dir):
61 tool = "/".join([tool_dir, "getOntology.R"])
62 html_table = "".join([html_dir, "/CLprofiles.txt"])
63 score_table = "".join(["cp ", input_file, " ", html_dir, "/scores.txt"])
64 os.system(score_table)
65
66 # read profile
67 with open(input_file, "r") as flock_profiles, open(html_table, "w") as out:
68 headers = flock_profiles.readline()
69 headers = headers.strip()
70 markers = headers.split("\t")[:-2]
71 counter = 0
72
73 out.write("Population\tFlowCL Query\tNb Results\tLink to PDF\t")
74 out.write("Top Result Label\tTop Result Score\tTop Result CL\n")
75 queries = {}
76 # create marker query for each population
77 for lines in flock_profiles:
78 lines = lines.strip("\n")
79 pop_profile = lines.split("\t")[:-2]
80 flowcl_query = generate_flowCL_query(markers, pop_profile)
81 counter += 1
82 nb_results = "0"
83 top_label = "no_match"
84 top_score = "NA"
85 top_CL = "NA"
86 pdf_link = "NA"
87
88 # check if query was run before
89 if flowcl_query not in queries:
90 # create filenames for results & graphs
91 txt = "".join(["flowcl_pop", str(counter).zfill(2), ".txt"])
92 text_result = "/".join([html_dir, txt])
93 graph = "".join(["flowcl_pop", str(counter).zfill(2), ".pdf"])
94 graph_output = "/".join([html_dir, graph])
95 # run flowCL for each marker profile
96 run_flowCL(flowcl_query, text_result, graph_output, tool)
97
98 # test that text file exists if not results are all NAs:
99 if os.path.isfile(text_result):
100 with open(text_result, "r") as res:
101 for line in res:
102 if line.startswith("Score"):
103 data = line.split(") ")
104 top_score = data[2][:-2]
105 tot_results = len(data) - 2
106 nb_results = str(tot_results)
107 if tot_results == 5:
108 if len(data[6].split("+")) > 1:
109 nb_results = "5+"
110 elif line.startswith("Cell ID"):
111 prep_link = line.split(") ")[1][:-2]
112 cl = prep_link.replace("_", ":")
113 link = "".join(['<a href="http://www.immport-labs.org/immport-ontology/public/home/home/', cl, '" target="_blank">'])
114 top_CL = "".join([link, prep_link, "</a>"])
115 elif line.startswith("Cell Label"):
116 top_label = line.split(") ")[1][:-2]
117 pdf_link = "".join(['<a href="', graph, '" target="_blank">PDF</a>'])
118 tmpflowcl_query = "".join(['<a href="', txt, '" target="_blank">', flowcl_query, '</a>'])
119 queries[flowcl_query] = {
120 "query": tmpflowcl_query,
121 "results": nb_results,
122 "pdf": pdf_link,
123 "label": top_label,
124 "score": top_score,
125 "CL": top_CL
126 }
127
128 # write query results to CLprofiles.txt
129 out.write("\t".join([pop_profile[0],
130 queries[flowcl_query]["query"],
131 queries[flowcl_query]["results"],
132 queries[flowcl_query]["pdf"],
133 queries[flowcl_query]["label"],
134 queries[flowcl_query]["score"],
135 queries[flowcl_query]["CL"]]) + "\n")
136
137
138 # boxplots data massaging
139 def panel_to_json_string(df):
140 # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs
141 def __merge_stream(key, stream):
142 return '"' + key + '"' + ': ' + stream + ', '
143 try:
144 if 'Unnamed: 0' in df.columns:
145 df = df.drop(['Unnamed: 0'], axis=1)
146 stream = '{'
147 for index, subdf in df.groupby(level=0):
148 stream += __merge_stream(index, df.loc[index, :, :].droplevel(0).to_json())
149 # take out extra last comma
150 stream = stream[:-2]
151 # add the final paren
152 stream += '}'
153 except:
154 logging.exception('Panel Encoding did not work')
155 return stream
156
157
158 def get_outliers(group, upper, lower):
159 cat = group.name
160 out = {}
161 for marker in group:
162 # skip population since upper and lower don't contain it, since it was made after a group by Population
163 if marker != 'Population':
164 out[marker] = group[(group[marker] > upper.loc[cat][marker]) | (group[marker] < lower.loc[cat][marker])][marker]
165 return out
166
167
168 def get_boxplot_stats(all_data, mfi_file, output_json):
169 # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html
170 # Get initial MFI values
171 mfi = pd.read_table(mfi_file)
172 mfi = mfi.set_index('Population')
173
174 df = pd.read_table(all_data)
175 # check if ever some pops not in cs_files
176 missing_pop = [x for x in mfi.index if x not in set(df.Population)]
177
178 if (missing_pop):
179 zeros = {}
180 for m in df.columns:
181 zeros[m] = [0 for x in missing_pop]
182 tmpdf = pd.DataFrame(zeros)
183 tmpdf.Population = missing_pop
184 df = df.append(tmpdf)
185
186 pops = df.groupby('Population')
187 q1 = pops.quantile(q=0.25)
188 q2 = pops.quantile(q=0.5)
189 q3 = pops.quantile(q=0.75)
190 iqr = q3 - q1
191 upper = q3 + 1.5*iqr
192 lower = q1 - 1.5*iqr
193 resampled = False
194 # get outliers
195 out = pops.apply(get_outliers, upper, lower).dropna()
196 outliers = defaultdict(dict)
197 for population in set(df.Population):
198 for marker in df.columns:
199 if marker != 'Population':
200 tmp_outliers = list(out[population][marker])
201 if (len(list(out[population][marker]))> 100):
202 tmp_outliers = list(out[population][marker].sample(n=100))
203 resampled = True
204 outliers[population][marker] = tmp_outliers
205 outdf = pd.DataFrame(outliers)
206
207 data = pd.concat({'q1': q1,
208 'q2': q2,
209 'q3': q3,
210 'upper': upper,
211 'lower': lower,
212 'outliers': outdf.T,
213 'mfi': mfi}, keys=['q1','q2','q3','upper','lower','outliers','mfi'])
214
215 with open(output_json, "w") as js_all:
216 js_all.write(panel_to_json_string(data))
217
218 return resampled
219
220 # html generation
221 def gen_flow_overview(args):
222 flow_stats = gen_overview_stats(args.input_file)
223 if len(set(flow_stats['population'])) > 40:
224 nbpop = str(len(set(flow_stats['population'])))
225 sys.stderr.write("There are " + nbpop + " in the input file.")
226 sys.exit(3)
227
228 os.mkdir(args.output_directory)
229 html_template = "genOverview.template"
230
231 if args.scores:
232 translate_profiles(args.scores, args.tool_directory, args.output_directory)
233 html_template = "genOverviewCL.template"
234
235 env = Environment(loader=FileSystemLoader(args.tool_directory + "/templates"))
236 template = env.get_template(html_template)
237
238 real_directory = args.output_directory.replace("/job_working_directory", "")
239 context = {'outputDirectory': real_directory}
240 overview = template.render(**context)
241 with open(args.output_file, "w") as f:
242 f.write(overview)
243
244 flow_sample_file_name = args.output_directory + "/flow.sample"
245 with open(flow_sample_file_name, "w") as flow_sample_file:
246 flow_stats['sample'].to_csv(flow_sample_file, sep="\t", index=False, float_format='%.0f')
247
248 flow_mfi_file_name = args.output_directory + "/flow.mfi"
249 with open(flow_mfi_file_name, "w") as flow_mfi_file:
250 flow_stats[args.mfi_calc].to_csv(flow_mfi_file, sep="\t", float_format='%.0f')
251
252 mpop = "_".join([args.mfi_calc, "pop"])
253 flow_mfi_pop_file_name = args.output_directory + "/flow.mfi_pop"
254 with open(flow_mfi_pop_file_name, "w") as flow_mfi_pop_file:
255 flow_stats[mpop].to_csv(flow_mfi_pop_file, sep="\t", index=False, float_format="%.2f")
256
257 # box plot data
258 boxplot_data = args.output_directory + "/boxplotData.json"
259 resampled = get_boxplot_stats(args.input_file, flow_mfi_file_name, boxplot_data)
260
261 # Generate the Images -- eventually we should change that over to D3
262 fcm = flow_stats['sample_data'].values
263 colors = []
264 for i, j in enumerate(flow_stats['sample_population']):
265 colors.append(color_palette[j])
266
267 for i in range(flow_stats['columns']):
268 for j in range(flow_stats['columns']):
269 file_name = "m" + str(i) + "_m" + str(j)
270 ax = plt.subplot(1, 1, 1)
271 plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=1.0, wspace=0.0, hspace=0.0)
272 plt.scatter(fcm[:, i], fcm[:, j], s=1, c=colors, edgecolors='none')
273 plt.axis([0, 1024, 0, 1024])
274 plt.xticks([])
275 plt.yticks([])
276 F = plt.gcf()
277 F.set_size_inches(1, 1)
278 F.set_dpi(90)
279 png_file = file_name + "_90X90.png"
280 F.savefig(args.output_directory + "/" + png_file)
281 plt.clf()
282
283 flow_overview_file_name = args.output_directory + "/flow.overview"
284 with open(flow_overview_file_name, "w") as flow_overview_file:
285 flow_overview_file.write("<table>\n")
286 flow_overview_file.write("<tr><td>&nbsp;</td>\n")
287 for i in range(flow_stats['columns']):
288 flow_overview_file.write("<td>" + flow_stats['markers'][i] + "</td>\n")
289
290 for i in range(flow_stats['columns']):
291 flow_overview_file.write("<tr>\n")
292 flow_overview_file.write("<td>" + flow_stats['markers'][i] + "</td>\n")
293 for j in range(flow_stats['columns']):
294 file_name = "m" + str(j) + "_m" + str(i)
295 image_file = file_name + "_90X90.png"
296 flow_overview_file.write('<td><img src="' + image_file + '"/></td>')
297 flow_overview_file.write("</tr>\n")
298
299 flow_overview_file.write("</table>\n</body>\n<html>\n")
300
301 if resampled:
302 to_find = '<div id="outlierWarning" style="display:none;">'
303 to_replace = '<div id="outlierWarning">'
304 ## yay python 2.7
305 ro = fileinput.input(args.output_file, inplace=True, backup=".bak")
306 for roline in ro:
307 print(roline.replace(to_find, to_replace), end='')
308 ro.close()
309
310
311 if __name__ == "__main__":
312 parser = ArgumentParser(
313 prog="genOverview",
314 description="Generate an overview plot of Flow results.")
315
316 parser.add_argument(
317 '-i',
318 dest="input_file",
319 required=True,
320 help="File location for the Flow Text file.")
321
322 parser.add_argument(
323 '-o',
324 dest="output_file",
325 required=True,
326 help="File location for the HTML output file.")
327
328 parser.add_argument(
329 '-d',
330 dest="output_directory",
331 required=True,
332 help="Directory location for the Flow Plot.")
333
334 parser.add_argument(
335 '-M',
336 dest="mfi_calc",
337 required=True,
338 help="what to calculate for centroids.")
339
340 parser.add_argument(
341 '-p',
342 dest="scores",
343 help="File location for FLOCK population scores.")
344
345 parser.add_argument(
346 '-t',
347 dest="tool_directory",
348 required=True,
349 help="Location of the Tool Directory.")
350
351 args = parser.parse_args()
352
353 gen_flow_overview(args)