comparison merge_ds_flowtext/FCStxtMergeDownsample.py @ 0:426650130311 draft

Uploaded
author immport-devteam
date Mon, 27 Feb 2017 13:03:02 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:426650130311
1 #!/usr/bin/env python
2
3 ######################################################################
4 # Copyright (c) 2016 Northrop Grumman.
5 # All rights reserved.
6 ######################################################################
7
8 from __future__ import print_function
9 from __future__ import division
10 import sys
11 import os
12 import pandas as pd
13 from argparse import ArgumentParser
14
15
def is_number(s):
    """Tell whether *s* can be converted with ``float()``.

    Returns True when ``float(s)`` succeeds, False otherwise.  Values that
    are not strings or numbers at all (e.g. ``None``) are reported as
    non-numeric instead of raising ``TypeError``.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True
22
23
def is_integer(s):
    """Tell whether *s* can be converted with ``int()``.

    Returns True when ``int(s)`` succeeds, False otherwise.  Values that
    ``int()`` rejects by type (e.g. ``None``) are reported as non-integer
    instead of raising ``TypeError``.
    """
    try:
        int(s)
    except (TypeError, ValueError):
        return False
    return True
30
31
def compare_headers(files):
    """Return the column headings shared by every file in *files*.

    The first line of each file is read, stripped, lower-cased and split on
    tabs.  The result lists the (lower-cased) headings of the first file, in
    their original order, that also occur in all the other files.

    Exits the program with status 9 when no heading is common to all files,
    which includes the case where *files* is empty.
    """
    if not files:
        # Nothing to compare: same outcome as "no heading in common"
        # rather than an IndexError on files[0].
        sys.exit(9)

    headers = {}
    for eachfile in files:
        with open(eachfile, "r") as ef:
            headers[eachfile] = ef.readline().strip().lower().split("\t")

    # Sets give constant-time membership tests for the other files.
    other_headers = [set(headers[other]) for other in files[1:]]
    hdgs_in_common = [
        ref_hdg for ref_hdg in headers[files[0]]
        if all(ref_hdg in hdgs for hdgs in other_headers)
    ]

    if not hdgs_in_common:
        sys.exit(9)
    return hdgs_in_common
53
54
def get_headers_index(list_headings, headings):
    """Return the position in *headings* of each entry of *list_headings*.

    Matching is case-insensitive on both sides.  When a heading occurs more
    than once in *headings*, the position of its first occurrence is used.

    Raises ValueError if an entry of *list_headings* is not in *headings*.
    """
    first_position = {}
    for position, heading in enumerate(headings):
        # setdefault keeps the first occurrence, like list.index() would.
        first_position.setdefault(heading.lower(), position)

    idxs = []
    for element in list_headings:
        key = element.lower()
        if key not in first_position:
            raise ValueError("%r is not in list" % (element,))
        idxs.append(first_position[key])
    return idxs
61
62
def _locate_columns(ff_order, hdgs):
    """Find where the first file's columns sit in another file's header.

    Returns ``(positions, rename_map)``: the positions in *hdgs* of the
    names of *ff_order* that could be found (exact match first, then
    case-insensitive), and a mapping from this file's spelling to the first
    file's spelling for the columns matched case-insensitively.
    """
    lower_hdgs = [hdg.lower() for hdg in hdgs]
    positions = []
    rename_map = {}
    for name in ff_order:
        if name in hdgs:
            pos = hdgs.index(name)
        elif name.lower() in lower_hdgs:
            pos = lower_hdgs.index(name.lower())
            rename_map[hdgs[pos]] = name
        else:
            continue
        positions.append(pos)
    return positions, rename_map


def _has_non_numeric(frame):
    """Return True if any cell of *frame* is missing or not a number."""
    for pos in range(frame.shape[1]):
        as_numbers = pd.to_numeric(frame.iloc[:, pos], errors="coerce")
        if as_numbers.isnull().any():
            return True
    return False


def _report_non_numeric_lines(infile):
    """Write a warning to stderr for each data line holding a non-number."""
    with open(infile, "r") as checkfile:
        checkfile.readline()  # skip the header line
        # The header is line 1, so the first data line is line 2.
        for count_lines, checklines in enumerate(checkfile, 2):
            stripped = checklines.strip()
            if not stripped:
                # pandas ignores blank lines too
                continue
            if not all(is_number(item) for item in stripped.split("\t")):
                sys.stderr.write(" ".join(["WARNING: line", str(count_lines),
                                           "in", infile, "contains non-numeric results\n"]))


def merge_and_DS_txt(in_files, out_file, col_names, factor_ds):
    """Concatenate tab-separated files, optionally down-sampling each one.

    The output has only the columns common to all input files, as
    determined by their (case-insensitive) headers, written in the order
    and spelling of the first file.

    in_files  -- paths of the tab-separated input files.
    out_file  -- path of the merged file to write.
    col_names -- zero-based column positions to keep instead of the common
                 columns; an empty list keeps every common column.  A
                 warning is written to stderr for each requested position
                 that is not a common column of a file.
    factor_ds -- fraction of the rows of each file to keep; the rows are
                 drawn at random without replacement.

    All lines after the header line must contain only numbers; otherwise
    the offending lines are reported on stderr and the program exits with
    status 2.  If warnings were issued the program exits with status 3
    after writing the output.  If the number of warnings reaches 10 the
    run stops, the output file is removed and the exit status is 4.
    """
    nb_errors = 0
    max_error = 10

    # get list of headers in common to all files
    list_hdgs = compare_headers(in_files)

    # HEADERS:
    with open(in_files[0], "r") as first_file:
        headings = first_file.readline().strip().split("\t")
    # Get index of headers in common:
    hdrs_idx = get_headers_index(list_hdgs, headings)

    # If column to merge on were provided:
    if col_names:
        for ix in col_names:
            if ix not in hdrs_idx:
                nb_errors += 1
                sys.stderr.write(" ".join(["WARNING: column", str(ix), "in", in_files[0],
                                           "does not exist in all files or has a different header.\n"]))
        hdrs_idx = col_names

    # Names of the kept columns, in the order of the first file.
    kept = set(hdrs_idx)
    ff_order = [hdg for cti, hdg in enumerate(headings) if cti in kept]

    with open(out_file, "w") as outf:
        outf.write("\t".join(ff_order) + "\n")

        # DATA
        for infile in in_files:
            with open(infile, "r") as inf:
                hdgs = inf.readline().strip().split("\t")
            # Positions of the kept columns in THIS file: the column order
            # (and the letter case of the names) may differ between files.
            hdgs_idx, rename_map = _locate_columns(ff_order, hdgs)

            if col_names:
                for iy in col_names:
                    if iy not in hdgs_idx:
                        nb_errors += 1
                        sys.stderr.write(" ".join(["WARNING: column", str(iy), "in", infile,
                                                   "does not exist in all files or has a different header.\n"]))
                hdgs_idx = col_names

            if nb_errors >= max_error:
                break

            df = pd.read_table(infile, usecols=hdgs_idx)

            # Validate the whole file, not only the random sample, so the
            # outcome does not depend on which rows were drawn.
            if _has_non_numeric(df):
                sys.stderr.write(infile + " contains non-numeric data\n")
                _report_non_numeric_lines(infile)
                sys.exit(2)

            # The header is not part of the frame: every row is a data row.
            nb_rows = int(len(df.index) * factor_ds)
            df_ds = df.sample(nb_rows, replace=False)

            # Put the columns in the first file's order; a column that is
            # absent from this file comes out empty.
            df_ds = df_ds.rename(columns=rename_map).reindex(columns=ff_order)
            df_ds.to_csv(outf, sep="\t", header=False, index=False)

    # The output file is closed at this point, so it can be removed safely.
    if nb_errors > 0:
        exit_code = 3
        if nb_errors >= max_error:
            exit_code = 4
            sys.stderr.write("Run aborted - too many errors.")
            os.remove(out_file)
        sys.exit(exit_code)
    return
154
155
if __name__ == "__main__":
    parser = ArgumentParser(
        prog="FCStxtmerge",
        description="Merge based on headers text-converted FCS files into one text file.")

    parser.add_argument(
        '-i',
        dest="input_files",
        required=True,
        action='append',
        help="File location for the text files.")

    parser.add_argument(
        '-o',
        dest="output_file",
        required=True,
        help="Name of the output file.")

    parser.add_argument(
        '-c',
        dest="columns",
        help="Specify which column to keep in output file")

    parser.add_argument(
        '-d',
        dest="downsampling_factor",
        help="How much of each file to keep")

    args = parser.parse_args()

    # Get columns to merge on if any.
    # Columns are given 1-based on the command line and passed on 0-based.
    default_value_col = ["i.e.:1,2,5", "default", "Default"]
    columns = []
    if args.columns and args.columns not in default_value_col:
        tmp_col = [col.strip() for col in args.columns.split(",")]
        # A single blank entry means "no column selected".
        if tmp_col != [""]:
            # Exit status 7 for a bad single value, 6 for a bad list.
            bad_column_code = 7 if len(tmp_col) == 1 else 6
            for col in tmp_col:
                # Positions below 1 would turn into negative indexes.
                if not is_integer(col) or int(col) < 1:
                    sys.exit(bad_column_code)
                columns.append(int(col) - 1)

    # Get down sampling factor if any:
    # Note: change '%' to 'X' because somehow that's what Galaxy passes?
    default_value_ds = ["i.e.:0.1 or 10X", "default", "Default"]
    ds_factor = 1
    if args.downsampling_factor and args.downsampling_factor not in default_value_ds:
        downsampling_factor = args.downsampling_factor.strip().rstrip("X")
        if not is_number(downsampling_factor):
            sys.exit(8)
        ds_factor = float(downsampling_factor)
        if ds_factor > 1:
            # Values above 1 are percentages.
            ds_factor = ds_factor / 100
        # Rejects negatives, more than 100% and NaN: only a fraction
        # between 0 and 1 can be sampled without replacement.
        if not 0 <= ds_factor <= 1:
            sys.exit(8)

    input_files = list(args.input_files)
    merge_and_DS_txt(input_files, args.output_file, columns, ds_factor)
    sys.exit(0)