Mercurial > repos > immport-devteam > merge_ds_flowtext
comparison merge_ds_flowtext/FCStxtMergeDownsample.py @ 0:426650130311 draft
Uploaded
author | immport-devteam |
---|---|
date | Mon, 27 Feb 2017 13:03:02 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:426650130311 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 ###################################################################### | |
4 # Copyright (c) 2016 Northrop Grumman. | |
5 # All rights reserved. | |
6 ###################################################################### | |
7 | |
8 from __future__ import print_function | |
9 from __future__ import division | |
10 import sys | |
11 import os | |
12 import pandas as pd | |
13 from argparse import ArgumentParser | |
14 | |
15 | |
def is_number(s):
    """Return True when *s* can be parsed as a float, False otherwise."""
    try:
        float(s)
    except ValueError:
        return False
    return True
22 | |
23 | |
def is_integer(s):
    """Return True when *s* can be parsed as an int, False otherwise."""
    try:
        int(s)
    except ValueError:
        return False
    return True
30 | |
31 | |
def compare_headers(files):
    """Return the lower-cased headings present in the first line of every file.

    The result keeps the order (and any duplicates) of the first file's
    header. Exits with status 9 when the files share no heading at all.
    """
    first_lines = {}
    for path in files:
        with open(path, "r") as fh:
            first_lines[path] = fh.readline().strip().lower().split("\t")

    reference = first_lines[files[0]]
    others = [first_lines[p] for p in files[1:]]
    common = [hdg for hdg in reference if all(hdg in hdr for hdr in others)]

    if not common:
        sys.exit(9)
    return common
53 | |
54 | |
def get_headers_index(list_headings, headings):
    """Return the position of each wanted heading inside *headings*.

    *headings* is lower-cased before the lookup, so the wanted headings
    (already lower-cased by compare_headers) match case-insensitively.
    """
    lowered = [h.lower() for h in headings]
    return [lowered.index(wanted) for wanted in list_headings]
61 | |
62 | |
def merge_and_DS_txt(in_files, out_file, col_names, factor_ds):
    """Concatenate tab-separated files, keeping only the columns in common.

    The output columns follow the first file's header order. All lines after
    the header must contain only numbers; a non-numeric value aborts the run
    with exit code 2. Column warnings are counted and reported on stderr; any
    warning makes the run exit with code 3 (code 4, with the output file
    removed, when the count reaches 10).
    If a downsampling factor is given, a random fraction of each file's rows
    is kept (factor_ds == 1 keeps every row).
    """
    nb_errors = 0
    max_error = 10

    # Headings shared by every input file (lower-cased, first-file order).
    list_hdgs = compare_headers(in_files)

    with open(out_file, "w") as outf:
        ff_order = []
        # HEADERS: the first file's header defines the reference column order.
        with open(in_files[0], "r") as first_file:
            headings_ff = first_file.readline().strip()
        headings = headings_ff.split("\t")
        # Index of the common headings within the first file's header:
        hdrs_idx = get_headers_index(list_hdgs, headings)

        # Explicit columns to merge on (if provided) override the detected set.
        if col_names:
            for ix in col_names:
                if ix not in hdrs_idx:
                    nb_errors += 1
                    sys.stderr.write(" ".join(["WARNING: column", str(ix), "in", in_files[0],
                                               "does not exist in all files or has a different header.\n"]))
            hdrs_idx = col_names

        # Write the output header, remembering the kept column names in order.
        headings_to_write = []
        for cti in range(0, len(headings)):
            if cti in hdrs_idx:
                headings_to_write.append(headings[cti])
                ff_order.append(headings[cti])
        outf.write("\t".join(headings_to_write) + "\n")

        # DATA
        for infile in in_files:
            with open(infile, "r") as inf:
                hdgs = inf.readline().strip().split("\t")
            # Index, in THIS file, of each kept column (in ff_order order).
            hdgs_idx = [hdgs.index(ctc) for ctc in ff_order]
            if col_names:
                for iy in col_names:
                    if iy not in hdgs_idx:
                        nb_errors += 1
                        sys.stderr.write(" ".join(["WARNING: column", str(iy), "in", infile,
                                                   "does not exist in all files or has a different header.\n"]))
                hdgs_idx = col_names

            # Validate that every data line is fully numeric before using it.
            with open(infile, "r") as checkfile:
                checkfile.readline()  # skip header; data starts at line 2
                count_lines = 1
                for checklines in checkfile:
                    to_check = checklines.strip().split("\t")
                    count_lines += 1
                    for item in to_check:
                        if not is_number(item):
                            sys.stderr.write(" ".join(["WARNING: line", str(count_lines),
                                                       "in", infile, "contains non-numeric results\n"]))
                            sys.exit(2)

            # BUGFIX: select this file's own column indices (hdgs_idx), not the
            # first file's (hdrs_idx) — files may order their columns differently.
            df = pd.read_csv(infile, sep="\t", usecols=hdgs_idx)
            # BUGFIX: len(df.index) already excludes the header line; the old
            # "- 1" silently dropped one row even without downsampling.
            nb_rows = len(df.index)
            df_ds = df.sample(int(nb_rows * factor_ds), replace=False)

            # Warn (but keep going) when a kept column has missing values.
            for cols in df_ds.columns.values:
                if df_ds[cols].count() != len(df_ds[cols]):
                    sys.stderr.write(infile + "contains non-numeric data\n")

            # Reorder the columns positionally to match ff_order: read_csv
            # returns usecols columns in file order, not request order, and
            # the old name-based `.ix` indexer was removed in pandas 1.0.
            column_order = [sorted(hdgs_idx).index(ci) for ci in hdgs_idx]
            df_ds = df_ds.iloc[:, column_order]
            df_ds.to_csv(outf, sep="\t", header=False, index=False)

    if nb_errors > 0:
        exit_code = 3
        if nb_errors == max_error:
            exit_code = 4
            sys.stderr.write("Run aborted - too many errors.")
            os.remove(out_file)
        sys.exit(exit_code)
    return
154 | |
155 | |
if __name__ == "__main__":
    # Command-line front end: collect inputs, optional merge columns and an
    # optional downsampling factor, then delegate to merge_and_DS_txt.
    parser = ArgumentParser(
        prog="FCStxtmerge",
        description="Merge based on headers text-converted FCS files into one text file.")

    parser.add_argument(
        '-i',
        dest="input_files",
        required=True,
        action='append',
        help="File location for the text files.")

    parser.add_argument(
        '-o',
        dest="output_file",
        required=True,
        help="Name of the output file.")

    parser.add_argument(
        '-c',
        dest="columns",
        help="Specify which column to keep in output file")

    parser.add_argument(
        '-d',
        dest="downsampling_factor",
        help="How much of each file to keep")

    args = parser.parse_args()

    # Columns to merge on, if any (1-based on the command line, 0-based here).
    # Placeholder values mean "no explicit selection".
    default_value_col = ["i.e.:1,2,5", "default", "Default"]
    columns = []
    if args.columns and args.columns not in default_value_col:
        tokens = args.columns.split(",")
        if len(tokens) == 1:
            token = tokens[0].strip()
            if token:
                # A single non-integer selector is exit code 7.
                if not is_integer(token):
                    sys.exit(7)
                columns.append(int(token) - 1)
        else:
            # Any non-integer selector in a list is exit code 6.
            for raw in tokens:
                token = raw.strip()
                if not is_integer(token):
                    sys.exit(6)
                columns.append(int(token) - 1)

    # Downsampling factor, if any.
    # Note: change '%' to 'X' because somehow that's what Galaxy passes?
    default_value_ds = ["i.e.:0.1 or 10X", "default", "Default"]
    ds_factor = 1
    if args.downsampling_factor and args.downsampling_factor not in default_value_ds:
        raw_factor = args.downsampling_factor.strip().rstrip("X")
        if not is_number(raw_factor):
            sys.exit(8)
        ds_factor = float(raw_factor)
        if ds_factor > 1:
            # Values above 1 are read as percentages.
            ds_factor = float(raw_factor) / 100
            if ds_factor > 100:
                sys.exit(8)

    input_files = list(args.input_files)
    merge_and_DS_txt(input_files, args.output_file, columns, ds_factor)
    sys.exit(0)