runCrossSample.py @ 4:e80b0f62ffb3 (draft, default, tip)

"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/cross_sample commit e7eab2dca0c1f73f580362f61425a78d4c8892ce"
author  azomics
date    Wed, 29 Jul 2020 13:32:17 -0400
#!/usr/bin/env python
######################################################################
# Copyright (c) 2016 Northrop Grumman.
# All rights reserved.
######################################################################
import sys
import os
from scipy.stats import gmean
from argparse import ArgumentParser
from collections import defaultdict
import pandas as pd

#
# version 1.1 -- April 2016 -- C. Thomas
# modified to read in several input files and output to a directory
# + generates summary statistics
# also checks before running that input files are consistent with centroid file
#
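# Example invocation (hypothetical file and directory names; the flags are the
# ones defined in the ArgumentParser at the bottom of this script):
#   runCrossSample.py -i sample1.flowtext -i sample2.flowtext \
#                     -n sample1 -n sample2 \
#                     -m flock_centroids.txt -o crossSampleOut -M mfi \
#                     -s population_summary.txt -S mfi_pop_stats.txt -a all_markers_stats.txt
#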


def compare_MFIs(input_files, f_names, mfi_file):
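    """Check that the input flow text files are consistent with the FLOCK centroid file.

    The first column of the centroid (MFI) file header is skipped; the remaining
    fields must match each input file's header line exactly. Any mismatch is
    reported on stderr and the script exits with status 2.
    """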
    header_MFIs = ""
    flag_error = False
    with open(mfi_file, "r") as mfi_check:
        mfi_fl = mfi_check.readline().split("\t")
        header_MFIs = "\t".join([mfi_fl[h] for h in range(1, len(mfi_fl))])

    for hh, files in enumerate(input_files):
        with open(files, "r") as inf:
            hdrs = inf.readline()
            if hdrs != header_MFIs:
                sys.stderr.write("Headers in " + f_names[hh] + " are not consistent with the FLOCK centroid file:\n" + hdrs + header_MFIs + "\n")
                flag_error = True
    if flag_error:
        sys.exit(2)


def stats_MFIs(cs_df, ctr, mfi_calc):
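    """Summarise one clustered sample per population.

    Depending on mfi_calc, computes the mean ("mfi"), geometric mean ("gmfi")
    or median (any other value) of every marker for each Population, adds the
    percentage of events per population, and labels all rows with a
    zero-padded sample name built from ctr (e.g. "Sample01").
    """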
    if mfi_calc == "mfi":
        MFIs = cs_df.groupby('Population').mean().round(decimals=2)
    elif mfi_calc == "gmfi":
        MFIs = cs_df.groupby('Population').agg(lambda x: gmean(list(x))).round(decimals=2)
    else:
        MFIs = cs_df.groupby('Population').median().round(decimals=2)
    pop_freq = (cs_df.Population.value_counts(normalize=True) * 100).round(decimals=2)
    sorted_pop_freq = pop_freq.sort_index()
    MFIs['Percentage'] = sorted_pop_freq
    MFIs['Population'] = MFIs.index
    MFIs['SampleName'] = "".join(["Sample", str(ctr).zfill(2)])
    return MFIs


def get_pop_prop(input_files, summary_stat, mfi_stats, marker_names, mfi_calc):
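    """Write the per-sample population summaries.

    input_files maps each clustered file path to its display name. Per-population
    marker statistics for every file are appended to mfi_stats (one block per
    sample), and summary_stat receives a table of population proportions, in
    percent of events, with one row per input file.
    """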
    pop_count = defaultdict(dict)
    mrk = marker_names.strip().split("\t")
    markers = "\t".join([mrk[m] for m in range(1, len(mrk))])

    ctr_mfi = 0
    nb_pop = 0
    tot = {}
    with open(mfi_stats, "a") as mfis:
        mfis.write("\t".join([markers, "Percentage", "Population", "SampleName"]) + "\n")
        for files in input_files:
            cs = pd.read_table(files)
            tot[files] = len(cs.index)
            for pops in cs.Population:
                if pops in pop_count[files]:
                    pop_count[files][pops] += 1
                else:
                    pop_count[files][pops] = 1
            max_nb_pop = max(set(cs.Population))
            if max_nb_pop > nb_pop:
                nb_pop = max_nb_pop
            ctr_mfi += 1
            cs_stats = stats_MFIs(cs, ctr_mfi, mfi_calc)
            cs_stats.to_csv(mfis, sep="\t", header=False, index=False)

    ctr = 0
    with open(summary_stat, "w") as outf:
        itpop = [str(x) for x in range(1, nb_pop + 1)]
        cols = "\t".join(itpop)
        outf.write("FileID\tSampleName\t" + cols + "\n")
        for eachfile in pop_count:
            tmp = []
            for num in range(1, nb_pop + 1):
                if num not in pop_count[eachfile]:
                    pop_count[eachfile][num] = 0
                tmp.append(str((pop_count[eachfile][num] / float(tot[eachfile])) * 100))
            props = "\t".join(tmp)
            ctr += 1
            sample_name = "".join(["Sample", str(ctr).zfill(2)])
            outf.write("\t".join([input_files[eachfile], sample_name, props]) + "\n")


def run_cross_sample(input_files, f_names, mfi_file, output_dir, summary_stat,
                     mfi_stats, mfi_calc):
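    """Assign each event of every input file to a FLOCK population.

    The centroid file minus its header line is written to mfi.txt, then the
    external cent_adjust binary (expected to be on the PATH) is run on each
    input file. cent_adjust is expected to write its assignments to
    population_id.txt in the working directory; these are appended as a
    "Population" column to a copy of the input saved as <name>.flowclr in
    output_dir. The summary tables are then delegated to get_pop_prop().
    """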
    markers = ""
    # Strip off Header Line
    with open(mfi_file, "r") as mfi_in, open("mfi.txt", "w") as mfi_out:
        markers = mfi_in.readline().strip("\n")
        for line in mfi_in:
            mfi_out.write(line)

    # Create output directory
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    outputs = {}
    # Run cent_adjust
    for nm, flow_file in enumerate(input_files):
        run_command = "cent_adjust mfi.txt " + flow_file
        print(run_command)
        os.system(run_command)
        flow_name = os.path.split(flow_file)[1]
        outfile = os.path.join(output_dir, f_names[nm] + ".flowclr")
        outputs[outfile] = f_names[nm]
        with open(flow_file, "r") as flowf, open("population_id.txt", "r") as popf, open(outfile, "w") as outf:
            f_line = flowf.readline()
            f_line = f_line.rstrip()
            f_line = f_line + "\tPopulation\n"
            outf.write(f_line)

            for line in flowf:
                line = line.rstrip()
                pop_line = popf.readline()
                pop_line = pop_line.rstrip()
                line = line + "\t" + pop_line + "\n"
                outf.write(line)
    get_pop_prop(outputs, summary_stat, mfi_stats, markers, mfi_calc)
    return


def generate_CS_stats(mfi_stats, all_stats):
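    """Aggregate the per-sample MFI table across samples.

    For every population, the mean, median and standard deviation of each
    column of mfi_stats (except Population and SampleName) are computed over
    samples and written to all_stats as a tab-separated table with
    <marker>_mean, <marker>_median and <marker>_stdev columns.
    """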
    df = pd.read_table(mfi_stats)
    means = df.groupby('Population').mean().round(decimals=2)
    medians = df.groupby('Population').median().round(decimals=2)
    stdev = df.groupby('Population').std().round(decimals=2)
    all_markers = []
    with open(mfi_stats, "r") as ms:
        ms_fl = ms.readline().strip()
        all_markers = ms_fl.split("\t")[0:-2]

    with open(all_stats, "w") as mstats:
        hdgs = ["\t".join(["_".join([mrs, "mean"]), "_".join([mrs, "median"]), "_".join([mrs, "stdev"])]) for mrs in all_markers]
        mstats.write("Population\t")
        mstats.write("\t".join(hdgs) + "\n")
        for pops in set(df.Population):
            tmp_line = []
            for mar in all_markers:
                tmp_line.append("\t".join([str(means.loc[pops, mar]), str(medians.loc[pops, mar]), str(stdev.loc[pops, mar])]))
            mstats.write(str(pops) + "\t")
            mstats.write("\t".join(tmp_line) + "\n")


if __name__ == "__main__":
    parser = ArgumentParser(
        prog="runCrossSample",
        description="Run CrossSample on Flow file")

    parser.add_argument(
        '-i',
        dest="input_files",
        required=True,
        action='append',
        help="File locations for flow text files.")

    parser.add_argument(
        '-n',
        dest="filenames",
        required=True,
        action='append',
        help="Names of the input files, in the same order as -i.")

    parser.add_argument(
        '-m',
        dest="mfi",
        required=True,
        help="File location for the MFI text file.")

    parser.add_argument(
        '-o',
        dest="out_path",
        required=True,
        help="Path to the directory for the output files.")

    parser.add_argument(
        '-M',
        dest="mfi_calc",
        required=True,
        help="Statistic to compute per population: 'mfi' (mean), 'gmfi' (geometric mean); any other value yields the median.")

    parser.add_argument(
        '-s',
        dest="sstat",
        required=True,
        help="File location for the summary statistics.")

    parser.add_argument(
        '-S',
        dest="mfi_stat",
        required=True,
        help="File location for the MFI summary statistics.")

    parser.add_argument(
        '-a',
        dest="all_stats",
        required=True,
        help="File location for stats on all markers.")

    args = parser.parse_args()

    input_files = list(args.input_files)
    input_names = list(args.filenames)
    compare_MFIs(input_files, input_names, args.mfi)
    run_cross_sample(input_files, input_names, args.mfi, args.out_path, args.sstat, args.mfi_stat, args.mfi_calc)
    generate_CS_stats(args.mfi_stat, args.all_stats)