0
|
1 #!/usr/bin/env python
|
|
2 ######################################################################
|
|
3 # Copyright (c) 2016 Northrop Grumman.
|
|
4 # All rights reserved.
|
|
5 ######################################################################
|
|
6 from __future__ import print_function
|
|
7 import sys
|
|
8 import os
|
|
9 from scipy.stats import gmean
|
|
10 from argparse import ArgumentParser
|
|
11 from collections import defaultdict
|
|
12 import pandas as pd
|
|
13
|
|
14 #
|
|
15 # version 1.1 -- April 2016 -- C. Thomas
|
|
16 # modified to read in several input files and output to a directory
|
|
17 # + generates summary statistics
|
|
18 # also checks before running that input files are consistent with centroid file
|
|
19 #
|
|
20
|
|
21
|
|
22 def compare_MFIs(input_files, f_names, mfi_file):
|
|
23 header_MFIs = ""
|
|
24 flag_error = False
|
|
25 with open(mfi_file, "r") as mfi_check:
|
|
26 mfi_fl = mfi_check.readline().split("\t")
|
|
27 header_MFIs = "\t".join([mfi_fl[h] for h in range(1, len(mfi_fl))])
|
|
28
|
|
29 for hh, files in enumerate(input_files):
|
|
30 with open(files, "r") as inf:
|
|
31 hdrs = inf.readline()
|
|
32 if hdrs != header_MFIs:
|
|
33 sys.stderr.write(hdrs + "headers in " + f_names[hh] + " are not consistent with FLOCK centroid file:\n" + header_MFIs + "\n")
|
|
34 flag_error = True
|
|
35 if flag_error:
|
|
36 sys.exit(2)
|
|
37
|
|
38
|
|
39 def stats_MFIs(cs_df, ctr, mfi_calc):
|
|
40 if mfi_calc == "mfi":
|
|
41 MFIs = cs_df.groupby('Population').mean().round(decimals=2)
|
|
42 elif mfi_calc == "gmfi":
|
|
43 MFIs = cs_df.groupby('Population').agg(lambda x: gmean(list(x))).round(decimals=2)
|
|
44 else:
|
|
45 MFIs = cs_df.groupby('Population').median().round(decimals=2)
|
|
46 pop_freq = (cs_df.Population.value_counts(normalize=True) * 100).round(decimals=2)
|
|
47 sorted_pop_freq = pop_freq.sort_index()
|
|
48 MFIs['Percentage'] = sorted_pop_freq
|
|
49 MFIs['Population'] = MFIs.index
|
|
50 MFIs['SampleName'] = "".join(["Sample", str(ctr).zfill(2)])
|
|
51 return MFIs
|
|
52
|
|
53
|
|
54 def get_pop_prop(input_files, summary_stat, mfi_stats, marker_names, mfi_calc):
|
|
55 pop_count = defaultdict(dict)
|
|
56 mrk = marker_names.strip().split("\t")
|
|
57 markers = "\t".join([mrk[m] for m in range(1, len(mrk))])
|
|
58
|
|
59 ctr_mfi = 0
|
|
60 nb_pop = 0
|
|
61 tot = {}
|
|
62 with open(mfi_stats, "a") as mfis:
|
|
63 mfis.write("\t".join([markers, "Percentage", "Population", "SampleName"]) + "\n")
|
|
64 for files in input_files:
|
|
65 cs = pd.read_table(files)
|
|
66 tot[files] = len(cs.index)
|
|
67 for pops in cs.Population:
|
|
68 if pops in pop_count[files]:
|
|
69 pop_count[files][pops] += 1
|
|
70 else:
|
|
71 pop_count[files][pops] = 1
|
|
72 if (len(pop_count[files]) > nb_pop):
|
|
73 nb_pop = len(pop_count[files])
|
|
74 ctr_mfi += 1
|
|
75 cs_stats = stats_MFIs(cs, ctr_mfi, mfi_calc)
|
|
76 cs_stats.to_csv(mfis, sep="\t", header=False, index=False)
|
|
77
|
|
78 ctr = 0
|
|
79 with open(summary_stat, "w") as outf:
|
|
80 itpop = [str(x) for x in range(1, nb_pop + 1)]
|
|
81 cols = "\t".join(itpop)
|
|
82 outf.write("FileID\tSampleName\t" + cols + "\n")
|
|
83 for eachfile in pop_count:
|
|
84 tmp = []
|
|
85 for num in range(1, nb_pop + 1):
|
|
86 if num not in pop_count[eachfile]:
|
|
87 pop_count[eachfile][num] = 0
|
|
88 tmp.append(str((pop_count[eachfile][num] / float(tot[eachfile])) * 100))
|
|
89 props = "\t".join(tmp)
|
|
90 ctr += 1
|
|
91 sample_name = "".join(["Sample", str(ctr).zfill(2)])
|
|
92 outf.write("\t".join([input_files[eachfile], sample_name, props]) + "\n")
|
|
93
|
|
94
|
|
95 def run_cross_sample(input_files, f_names, mfi_file, output_dir, summary_stat,
|
|
96 mfi_stats, tool_directory, mfi_calc):
|
|
97 markers = ""
|
|
98 # Strip off Header Line
|
|
99 with open(mfi_file, "r") as mfi_in, open("mfi.txt", "w") as mfi_out:
|
|
100 markers = mfi_in.readline().strip("\n")
|
|
101 for line in mfi_in:
|
|
102 mfi_out.write(line)
|
|
103
|
|
104 # Create output directory
|
|
105 if not os.path.exists(output_dir):
|
|
106 os.makedirs(output_dir)
|
|
107
|
|
108 outputs = {}
|
|
109 # Run cent_adjust
|
|
110 for nm, flow_file in enumerate(input_files):
|
|
111 run_command = tool_directory + "/bin/cent_adjust mfi.txt " + flow_file
|
|
112 print(run_command)
|
|
113 os.system(run_command)
|
|
114 flow_name = os.path.split(flow_file)[1]
|
|
115 outfile = os.path.join(output_dir, flow_name + ".flowclr")
|
|
116 outputs[outfile] = f_names[nm]
|
|
117 with open(flow_file, "r") as flowf, open("population_id.txt", "r") as popf, open(outfile, "w") as outf:
|
|
118 f_line = flowf.readline()
|
|
119 f_line = f_line.rstrip()
|
|
120 f_line = f_line + "\tPopulation\n"
|
|
121 outf.write(f_line)
|
|
122
|
|
123 for line in flowf:
|
|
124 line = line.rstrip()
|
|
125 pop_line = popf.readline()
|
|
126 pop_line = pop_line.rstrip()
|
|
127 line = line + "\t" + pop_line + "\n"
|
|
128 outf.write(line)
|
|
129 get_pop_prop(outputs, summary_stat, mfi_stats, markers, mfi_calc)
|
|
130 return
|
|
131
|
|
132
|
|
133 def generate_CS_stats(mfi_stats, all_stats):
|
|
134 df = pd.read_table(mfi_stats)
|
|
135 means = df.groupby('Population').mean().round(decimals=2)
|
|
136 medians = df.groupby('Population').median().round(decimals=2)
|
|
137 stdev = df.groupby('Population').std().round(decimals=2)
|
|
138 all_markers = []
|
|
139 with open(mfi_stats, "r") as ms:
|
|
140 ms_fl = ms.readline().strip()
|
|
141 all_markers = ms_fl.split("\t")[0:-2]
|
|
142
|
|
143 with open(all_stats, "w") as mstats:
|
|
144 hdgs = ["\t".join(["_".join([mrs, "mean"]), "_".join([mrs, "median"]), "_".join([mrs, "stdev"])]) for mrs in all_markers]
|
|
145 mstats.write("Population\t")
|
|
146 mstats.write("\t".join(hdgs) + "\n")
|
|
147 for pops in set(df.Population):
|
|
148 tmp_line = []
|
|
149 for mar in all_markers:
|
|
150 tmp_line.append("\t".join([str(means.loc[pops, mar]), str(medians.loc[pops, mar]), str(stdev.loc[pops, mar])]))
|
|
151 mstats.write(str(pops) + "\t")
|
|
152 mstats.write("\t".join(tmp_line) + "\n")
|
|
153
|
|
154
|
|
155 if __name__ == "__main__":
|
|
156 parser = ArgumentParser(
|
|
157 prog="runCrossSample",
|
|
158 description="Run CrossSample on Flow file")
|
|
159
|
|
160 parser.add_argument(
|
|
161 '-i',
|
|
162 dest="input_files",
|
|
163 required=True,
|
|
164 action='append',
|
|
165 help="File locations for flow text files.")
|
|
166
|
|
167 parser.add_argument(
|
|
168 '-n',
|
|
169 dest="filenames",
|
|
170 required=True,
|
|
171 action='append',
|
|
172 help="Filenames")
|
|
173
|
|
174 parser.add_argument(
|
|
175 '-m',
|
|
176 dest="mfi",
|
|
177 required=True,
|
|
178 help="File location for the MFI text file.")
|
|
179
|
|
180 parser.add_argument(
|
|
181 '-o',
|
|
182 dest="out_path",
|
|
183 required=True,
|
|
184 help="Path to the directory for the output files.")
|
|
185
|
|
186 parser.add_argument(
|
|
187 '-M',
|
|
188 dest="mfi_calc",
|
|
189 required=True,
|
|
190 help="what to calculate for centroids.")
|
|
191
|
|
192 parser.add_argument(
|
|
193 '-s',
|
|
194 dest="sstat",
|
|
195 required=True,
|
|
196 help="File location for the summary statistics.")
|
|
197
|
|
198 parser.add_argument(
|
|
199 '-S',
|
|
200 dest="mfi_stat",
|
|
201 required=True,
|
|
202 help="File location for the MFI summary statistics.")
|
|
203
|
|
204 parser.add_argument(
|
|
205 '-t',
|
|
206 dest="tool_dir",
|
|
207 required=True,
|
|
208 help="File location for cent_adjust.")
|
|
209
|
|
210 parser.add_argument(
|
|
211 '-a',
|
|
212 dest="all_stats",
|
|
213 required=True,
|
|
214 help="File location for stats on all markers.")
|
|
215
|
|
216 args = parser.parse_args()
|
|
217
|
|
218 input_files = [f for f in args.input_files]
|
|
219 input_names = [n for n in args.filenames]
|
|
220 compare_MFIs(input_files, input_names, args.mfi)
|
|
221 run_cross_sample(input_files, input_names, args.mfi, args.out_path, args.sstat, args.mfi_stat, args.tool_dir, args.mfi_calc)
|
|
222 generate_CS_stats(args.mfi_stat, args.all_stats)
|
|
223
|
|
224 sys.exit(0)
|