Mercurial > repos > immport-devteam > cross_sample
comparison cross_sample/runCrossSample.py @ 0:8d951baf795f draft
Uploaded
author | immport-devteam |
---|---|
date | Mon, 27 Feb 2017 12:47:17 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8d951baf795f |
---|---|
1 #!/usr/bin/env python | |
2 ###################################################################### | |
3 # Copyright (c) 2016 Northrop Grumman. | |
4 # All rights reserved. | |
5 ###################################################################### | |
6 from __future__ import print_function | |
7 import sys | |
8 import os | |
9 from scipy.stats import gmean | |
10 from argparse import ArgumentParser | |
11 from collections import defaultdict | |
12 import pandas as pd | |
13 | |
14 # | |
15 # version 1.1 -- April 2016 -- C. Thomas | |
16 # modified to read in several input files and output to a directory | |
17 # + generates summary statistics | |
18 # also checks before running that input files are consistent with centroid file | |
19 # | |
20 | |
21 | |
22 def compare_MFIs(input_files, f_names, mfi_file): | |
23 header_MFIs = "" | |
24 flag_error = False | |
25 with open(mfi_file, "r") as mfi_check: | |
26 mfi_fl = mfi_check.readline().split("\t") | |
27 header_MFIs = "\t".join([mfi_fl[h] for h in range(1, len(mfi_fl))]) | |
28 | |
29 for hh, files in enumerate(input_files): | |
30 with open(files, "r") as inf: | |
31 hdrs = inf.readline() | |
32 if hdrs != header_MFIs: | |
33 sys.stderr.write(hdrs + "headers in " + f_names[hh] + " are not consistent with FLOCK centroid file:\n" + header_MFIs + "\n") | |
34 flag_error = True | |
35 if flag_error: | |
36 sys.exit(2) | |
37 | |
38 | |
39 def stats_MFIs(cs_df, ctr, mfi_calc): | |
40 if mfi_calc == "mfi": | |
41 MFIs = cs_df.groupby('Population').mean().round(decimals=2) | |
42 elif mfi_calc == "gmfi": | |
43 MFIs = cs_df.groupby('Population').agg(lambda x: gmean(list(x))).round(decimals=2) | |
44 else: | |
45 MFIs = cs_df.groupby('Population').median().round(decimals=2) | |
46 pop_freq = (cs_df.Population.value_counts(normalize=True) * 100).round(decimals=2) | |
47 sorted_pop_freq = pop_freq.sort_index() | |
48 MFIs['Percentage'] = sorted_pop_freq | |
49 MFIs['Population'] = MFIs.index | |
50 MFIs['SampleName'] = "".join(["Sample", str(ctr).zfill(2)]) | |
51 return MFIs | |
52 | |
53 | |
54 def get_pop_prop(input_files, summary_stat, mfi_stats, marker_names, mfi_calc): | |
55 pop_count = defaultdict(dict) | |
56 mrk = marker_names.strip().split("\t") | |
57 markers = "\t".join([mrk[m] for m in range(1, len(mrk))]) | |
58 | |
59 ctr_mfi = 0 | |
60 nb_pop = 0 | |
61 tot = {} | |
62 with open(mfi_stats, "a") as mfis: | |
63 mfis.write("\t".join([markers, "Percentage", "Population", "SampleName"]) + "\n") | |
64 for files in input_files: | |
65 cs = pd.read_table(files) | |
66 tot[files] = len(cs.index) | |
67 for pops in cs.Population: | |
68 if pops in pop_count[files]: | |
69 pop_count[files][pops] += 1 | |
70 else: | |
71 pop_count[files][pops] = 1 | |
72 if (len(pop_count[files]) > nb_pop): | |
73 nb_pop = len(pop_count[files]) | |
74 ctr_mfi += 1 | |
75 cs_stats = stats_MFIs(cs, ctr_mfi, mfi_calc) | |
76 cs_stats.to_csv(mfis, sep="\t", header=False, index=False) | |
77 | |
78 ctr = 0 | |
79 with open(summary_stat, "w") as outf: | |
80 itpop = [str(x) for x in range(1, nb_pop + 1)] | |
81 cols = "\t".join(itpop) | |
82 outf.write("FileID\tSampleName\t" + cols + "\n") | |
83 for eachfile in pop_count: | |
84 tmp = [] | |
85 for num in range(1, nb_pop + 1): | |
86 if num not in pop_count[eachfile]: | |
87 pop_count[eachfile][num] = 0 | |
88 tmp.append(str((pop_count[eachfile][num] / float(tot[eachfile])) * 100)) | |
89 props = "\t".join(tmp) | |
90 ctr += 1 | |
91 sample_name = "".join(["Sample", str(ctr).zfill(2)]) | |
92 outf.write("\t".join([input_files[eachfile], sample_name, props]) + "\n") | |
93 | |
94 | |
95 def run_cross_sample(input_files, f_names, mfi_file, output_dir, summary_stat, | |
96 mfi_stats, tool_directory, mfi_calc): | |
97 markers = "" | |
98 # Strip off Header Line | |
99 with open(mfi_file, "r") as mfi_in, open("mfi.txt", "w") as mfi_out: | |
100 markers = mfi_in.readline().strip("\n") | |
101 for line in mfi_in: | |
102 mfi_out.write(line) | |
103 | |
104 # Create output directory | |
105 if not os.path.exists(output_dir): | |
106 os.makedirs(output_dir) | |
107 | |
108 outputs = {} | |
109 # Run cent_adjust | |
110 for nm, flow_file in enumerate(input_files): | |
111 run_command = tool_directory + "/bin/cent_adjust mfi.txt " + flow_file | |
112 print(run_command) | |
113 os.system(run_command) | |
114 flow_name = os.path.split(flow_file)[1] | |
115 outfile = os.path.join(output_dir, flow_name + ".flowclr") | |
116 outputs[outfile] = f_names[nm] | |
117 with open(flow_file, "r") as flowf, open("population_id.txt", "r") as popf, open(outfile, "w") as outf: | |
118 f_line = flowf.readline() | |
119 f_line = f_line.rstrip() | |
120 f_line = f_line + "\tPopulation\n" | |
121 outf.write(f_line) | |
122 | |
123 for line in flowf: | |
124 line = line.rstrip() | |
125 pop_line = popf.readline() | |
126 pop_line = pop_line.rstrip() | |
127 line = line + "\t" + pop_line + "\n" | |
128 outf.write(line) | |
129 get_pop_prop(outputs, summary_stat, mfi_stats, markers, mfi_calc) | |
130 return | |
131 | |
132 | |
133 def generate_CS_stats(mfi_stats, all_stats): | |
134 df = pd.read_table(mfi_stats) | |
135 means = df.groupby('Population').mean().round(decimals=2) | |
136 medians = df.groupby('Population').median().round(decimals=2) | |
137 stdev = df.groupby('Population').std().round(decimals=2) | |
138 all_markers = [] | |
139 with open(mfi_stats, "r") as ms: | |
140 ms_fl = ms.readline().strip() | |
141 all_markers = ms_fl.split("\t")[0:-2] | |
142 | |
143 with open(all_stats, "w") as mstats: | |
144 hdgs = ["\t".join(["_".join([mrs, "mean"]), "_".join([mrs, "median"]), "_".join([mrs, "stdev"])]) for mrs in all_markers] | |
145 mstats.write("Population\t") | |
146 mstats.write("\t".join(hdgs) + "\n") | |
147 for pops in set(df.Population): | |
148 tmp_line = [] | |
149 for mar in all_markers: | |
150 tmp_line.append("\t".join([str(means.loc[pops, mar]), str(medians.loc[pops, mar]), str(stdev.loc[pops, mar])])) | |
151 mstats.write(str(pops) + "\t") | |
152 mstats.write("\t".join(tmp_line) + "\n") | |
153 | |
154 | |
155 if __name__ == "__main__": | |
156 parser = ArgumentParser( | |
157 prog="runCrossSample", | |
158 description="Run CrossSample on Flow file") | |
159 | |
160 parser.add_argument( | |
161 '-i', | |
162 dest="input_files", | |
163 required=True, | |
164 action='append', | |
165 help="File locations for flow text files.") | |
166 | |
167 parser.add_argument( | |
168 '-n', | |
169 dest="filenames", | |
170 required=True, | |
171 action='append', | |
172 help="Filenames") | |
173 | |
174 parser.add_argument( | |
175 '-m', | |
176 dest="mfi", | |
177 required=True, | |
178 help="File location for the MFI text file.") | |
179 | |
180 parser.add_argument( | |
181 '-o', | |
182 dest="out_path", | |
183 required=True, | |
184 help="Path to the directory for the output files.") | |
185 | |
186 parser.add_argument( | |
187 '-M', | |
188 dest="mfi_calc", | |
189 required=True, | |
190 help="what to calculate for centroids.") | |
191 | |
192 parser.add_argument( | |
193 '-s', | |
194 dest="sstat", | |
195 required=True, | |
196 help="File location for the summary statistics.") | |
197 | |
198 parser.add_argument( | |
199 '-S', | |
200 dest="mfi_stat", | |
201 required=True, | |
202 help="File location for the MFI summary statistics.") | |
203 | |
204 parser.add_argument( | |
205 '-t', | |
206 dest="tool_dir", | |
207 required=True, | |
208 help="File location for cent_adjust.") | |
209 | |
210 parser.add_argument( | |
211 '-a', | |
212 dest="all_stats", | |
213 required=True, | |
214 help="File location for stats on all markers.") | |
215 | |
216 args = parser.parse_args() | |
217 | |
218 input_files = [f for f in args.input_files] | |
219 input_names = [n for n in args.filenames] | |
220 compare_MFIs(input_files, input_names, args.mfi) | |
221 run_cross_sample(input_files, input_names, args.mfi, args.out_path, args.sstat, args.mfi_stat, args.tool_dir, args.mfi_calc) | |
222 generate_CS_stats(args.mfi_stat, args.all_stats) | |
223 | |
224 sys.exit(0) |