Mercurial > repos > peterjc > seq_filter_by_id
comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 6:03e134cae41a draft
v0.2.3, ignore blank lines in ID file (contributed by Gildas Le Corguille)
author | peterjc |
---|---|
date | Tue, 17 May 2016 05:59:24 -0400 |
parents | 832c1fd57852 |
children | fb1313d79396 |
comparison
equal
deleted
inserted
replaced
5:832c1fd57852 | 6:03e134cae41a |
---|---|
30 import os | 30 import os |
31 import sys | 31 import sys |
32 import re | 32 import re |
33 from optparse import OptionParser | 33 from optparse import OptionParser |
34 | 34 |
35 def sys_exit(msg, err=1): | 35 # Parse Command Line |
36 sys.stderr.write(msg.rstrip() + "\n") | |
37 sys.exit(err) | |
38 | |
39 #Parse Command Line | |
40 usage = """Use as follows: | 36 usage = """Use as follows: |
41 | 37 |
42 $ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...] | 38 $ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...] |
43 | 39 |
44 e.g. Positive matches using column one from tabular file: | 40 e.g. Positive matches using column one from tabular file: |
76 help="Show version and quit") | 72 help="Show version and quit") |
77 | 73 |
78 options, args = parser.parse_args() | 74 options, args = parser.parse_args() |
79 | 75 |
80 if options.version: | 76 if options.version: |
81 print "v0.2.1" | 77 print "v0.2.3" |
82 sys.exit(0) | 78 sys.exit(0) |
83 | 79 |
84 in_file = options.input | 80 in_file = options.input |
85 seq_format = options.format | 81 seq_format = options.format |
86 out_positive_file = options.output_positive | 82 out_positive_file = options.output_positive |
87 out_negative_file = options.output_negative | 83 out_negative_file = options.output_negative |
88 logic = options.logic | 84 logic = options.logic |
89 drop_suffices = bool(options.suffix) | 85 drop_suffices = bool(options.suffix) |
90 | 86 |
91 if in_file is None or not os.path.isfile(in_file): | 87 if in_file is None or not os.path.isfile(in_file): |
92 sys_exit("Missing input file: %r" % in_file) | 88 sys.exit("Missing input file: %r" % in_file) |
93 if out_positive_file is None and out_negative_file is None: | 89 if out_positive_file is None and out_negative_file is None: |
94 sys_exit("Neither output file requested") | 90 sys.exit("Neither output file requested") |
95 if seq_format is None: | 91 if seq_format is None: |
96 sys_exit("Missing sequence format") | 92 sys.exit("Missing sequence format") |
97 if logic not in ["UNION", "INTERSECTION"]: | 93 if logic not in ["UNION", "INTERSECTION"]: |
98 sys_exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) | 94 sys.exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) |
99 if options.id_list and args: | 95 if options.id_list and args: |
100 sys_exit("Cannot accepted IDs via both -t and as tabular files") | 96 sys.exit("Cannot accepted IDs via both -t and as tabular files") |
101 elif not options.id_list and not args: | 97 elif not options.id_list and not args: |
102 sys_exit("Expected matched pairs of tabular files and columns (or -t given)") | 98 sys.exit("Expected matched pairs of tabular files and columns (or -t given)") |
103 if len(args) % 2: | 99 if len(args) % 2: |
104 sys_exit("Expected matched pairs of tabular files and columns, not: %r" % args) | 100 sys.exit("Expected matched pairs of tabular files and columns, not: %r" % args) |
105 | 101 |
106 | 102 |
107 #Cope with three widely used suffix naming convensions, | 103 # Cope with three widely used suffix naming convensions, |
108 #Illumina: /1 or /2 | 104 # Illumina: /1 or /2 |
109 #Forward/revered: .f or .r | 105 # Forward/revered: .f or .r |
110 #Sanger, e.g. .p1k and .q1k | 106 # Sanger, e.g. .p1k and .q1k |
111 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html | 107 # See http://staden.sourceforge.net/manual/pregap4_unix_50.html |
112 #re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") | 108 # re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") |
113 #re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") | 109 # re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") |
114 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") | 110 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") |
115 assert re_suffix.search("demo.f") | 111 assert re_suffix.search("demo.f") |
116 assert re_suffix.search("demo.s1") | 112 assert re_suffix.search("demo.s1") |
117 assert re_suffix.search("demo.f1k") | 113 assert re_suffix.search("demo.f1k") |
118 assert re_suffix.search("demo.p1") | 114 assert re_suffix.search("demo.p1") |
123 assert re_suffix.search("demo.q1") | 119 assert re_suffix.search("demo.q1") |
124 assert re_suffix.search("demo.q1lk") | 120 assert re_suffix.search("demo.q1lk") |
125 | 121 |
# Pair up the positional arguments: (tabular file, comma-separated columns).
# args has already been validated above to contain an even number of items.
identifiers = []
for tabular_file, cols_arg in zip(args[0::2], args[1::2]):
    if not os.path.isfile(tabular_file):
        sys.exit("Missing tabular identifier file %r" % tabular_file)
    try:
        # Convert the one-based column numbers given by the user to zero-based
        columns = [int(field) - 1 for field in cols_arg.split(",")]
    except ValueError:
        sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg)
    if min(columns) < 0:
        sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
    identifiers.append((tabular_file, columns))
139 | 135 |
140 name_warn = False | 136 name_warn = False |
137 | |
138 | |
141 def check_white_space(name): | 139 def check_white_space(name): |
142 parts = name.split(None, 1) | 140 parts = name.split(None, 1) |
143 global name_warn | 141 global name_warn |
144 if not name_warn and len(parts) > 1: | 142 if not name_warn and len(parts) > 1: |
145 name_warn = "WARNING: Some of your identifiers had white space in them, " + \ | 143 name_warn = "WARNING: Some of your identifiers had white space in them, " + \ |
167 else: | 165 else: |
168 # Just check the white space | 166 # Just check the white space |
169 clean_name = check_white_space | 167 clean_name = check_white_space |
170 | 168 |
171 | 169 |
# Characters Galaxy escapes when passing text parameters on the command
# line, mapped to their escape tokens (e.g. '>' arrives as '__gt__').
# Used below to reverse that escaping in the -t identifier list.
mapped_chars = {
    '>': '__gt__',
    '<': '__lt__',
    "'": '__sq__',
    '"': '__dq__',
    '[': '__ob__',
    ']': '__cb__',
    '{': '__oc__',
    '}': '__cc__',
    '@': '__at__',
    '\n': '__cn__',
    '\r': '__cr__',
    '\t': '__tc__',
    '#': '__pd__',
}
187 #Read tabular file(s) and record all specified identifiers | 185 |
188 ids = None #Will be a set | 186 # Read tabular file(s) and record all specified identifiers |
187 ids = None # Will be a set | |
189 if options.id_list: | 188 if options.id_list: |
190 assert not identifiers | 189 assert not identifiers |
191 ids = set() | 190 ids = set() |
192 id_list = options.id_list | 191 id_list = options.id_list |
193 #Galaxy turns \r into __cr__ (CR) etc | 192 # Galaxy turns \r into __cr__ (CR) etc |
194 for k in mapped_chars: | 193 for k in mapped_chars: |
195 id_list = id_list.replace(mapped_chars[k], k) | 194 id_list = id_list.replace(mapped_chars[k], k) |
196 for x in options.id_list.split(): | 195 for x in options.id_list.split(): |
197 ids.add(clean_name(x.strip())) | 196 ids.add(clean_name(x.strip())) |
198 print("Have %i unique identifiers from list" % len(ids)) | 197 print("Have %i unique identifiers from list" % len(ids)) |
199 for tabular_file, columns in identifiers: | 198 for tabular_file, columns in identifiers: |
200 file_ids = set() | 199 file_ids = set() |
201 handle = open(tabular_file, "rU") | 200 handle = open(tabular_file, "rU") |
202 if len(columns)>1: | 201 if len(columns) > 1: |
203 #General case of many columns | 202 # General case of many columns |
204 for line in handle: | 203 for line in handle: |
205 if line.startswith("#"): | 204 if line.startswith("#"): |
206 #Ignore comments | 205 # Ignore comments |
207 continue | 206 continue |
208 parts = line.rstrip("\n").split("\t") | 207 parts = line.rstrip("\n").split("\t") |
209 for col in columns: | 208 for col in columns: |
210 file_ids.add(clean_name(parts[col])) | 209 file_ids.add(clean_name(parts[col])) |
211 else: | 210 else: |
212 #Single column, special case speed up | 211 # Single column, special case speed up |
213 col = columns[0] | 212 col = columns[0] |
214 for line in handle: | 213 for line in handle: |
214 if not line.strip(): #skip empty lines | |
215 continue | |
215 if not line.startswith("#"): | 216 if not line.startswith("#"): |
216 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col])) | 217 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col])) |
217 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns)) | 218 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)) |
218 if ids is None: | 219 if ids is None: |
219 ids = file_ids | 220 ids = file_ids |
220 if logic == "UNION": | 221 if logic == "UNION": |
221 ids.update(file_ids) | 222 ids.update(file_ids) |
222 else: | 223 else: |
228 else: | 229 else: |
229 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) | 230 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) |
230 if name_warn: | 231 if name_warn: |
231 sys.stderr.write(name_warn) | 232 sys.stderr.write(name_warn) |
232 | 233 |
234 | |
233 def crude_fasta_iterator(handle): | 235 def crude_fasta_iterator(handle): |
234 """Yields tuples, record ID and the full record as a string.""" | 236 """Yields tuples, record ID and the full record as a string.""" |
235 while True: | 237 while True: |
236 line = handle.readline() | 238 line = handle.readline() |
237 if line == "": | 239 if line == "": |
238 return # Premature end of file, or just empty? | 240 return # Premature end of file, or just empty? |
239 if line[0] == ">": | 241 if line[0] == ">": |
240 break | 242 break |
241 | 243 |
242 no_id_warned = False | 244 no_id_warned = False |
243 while True: | 245 while True: |
260 break | 262 break |
261 lines.append(line) | 263 lines.append(line) |
262 line = handle.readline() | 264 line = handle.readline() |
263 yield id, "".join(lines) | 265 yield id, "".join(lines) |
264 if not line: | 266 if not line: |
265 return # StopIteration | 267 return # StopIteration |
266 | 268 |
267 | 269 |
268 def fasta_filter(in_file, pos_file, neg_file, wanted): | 270 def fasta_filter(in_file, pos_file, neg_file, wanted): |
269 """FASTA filter producing 60 character line wrapped outout.""" | 271 """FASTA filter producing 60 character line wrapped outout.""" |
270 pos_count = neg_count = 0 | 272 pos_count = neg_count = 0 |
271 #Galaxy now requires Python 2.5+ so can use with statements, | 273 # Galaxy now requires Python 2.5+ so can use with statements, |
272 with open(in_file) as in_handle: | 274 with open(in_file) as in_handle: |
273 #Doing the if statement outside the loop for speed | 275 # Doing the if statement outside the loop for speed |
274 #(with the downside of three very similar loops). | 276 # (with the downside of three very similar loops). |
275 if pos_file is not None and neg_file is not None: | 277 if pos_file is not None and neg_file is not None: |
276 print "Generating two FASTA files" | 278 print "Generating two FASTA files" |
277 with open(pos_file, "w") as pos_handle: | 279 with open(pos_file, "w") as pos_handle: |
278 with open(neg_file, "w") as neg_handle: | 280 with open(neg_file, "w") as neg_handle: |
279 for identifier, record in crude_fasta_iterator(in_handle): | 281 for identifier, record in crude_fasta_iterator(in_handle): |
307 | 309 |
308 def fastq_filter(in_file, pos_file, neg_file, wanted): | 310 def fastq_filter(in_file, pos_file, neg_file, wanted): |
309 """FASTQ filter.""" | 311 """FASTQ filter.""" |
310 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 312 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
311 handle = open(in_file, "r") | 313 handle = open(in_file, "r") |
312 if out_positive_file is not None and out_negative_file is not None: | 314 if pos_file is not None and neg_file is not None: |
313 print "Generating two FASTQ files" | 315 print "Generating two FASTQ files" |
314 positive_handle = open(out_positive_file, "w") | 316 positive_handle = open(pos_file, "w") |
315 negative_handle = open(out_negative_file, "w") | 317 negative_handle = open(neg_file, "w") |
316 print in_file | 318 print in_file |
317 for title, seq, qual in FastqGeneralIterator(handle): | 319 for title, seq, qual in FastqGeneralIterator(handle): |
318 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) | 320 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) |
319 if clean_name(title.split(None, 1)[0]) in ids: | 321 if clean_name(title.split(None, 1)[0]) in wanted: |
320 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 322 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
321 else: | 323 else: |
322 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 324 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
323 positive_handle.close() | 325 positive_handle.close() |
324 negative_handle.close() | 326 negative_handle.close() |
325 elif out_positive_file is not None: | 327 elif pos_file is not None: |
326 print "Generating matching FASTQ file" | 328 print "Generating matching FASTQ file" |
327 positive_handle = open(out_positive_file, "w") | 329 positive_handle = open(pos_file, "w") |
328 for title, seq, qual in FastqGeneralIterator(handle): | 330 for title, seq, qual in FastqGeneralIterator(handle): |
329 if clean_name(title.split(None, 1)[0]) in ids: | 331 if clean_name(title.split(None, 1)[0]) in wanted: |
330 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 332 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
331 positive_handle.close() | 333 positive_handle.close() |
332 elif out_negative_file is not None: | 334 elif neg_file is not None: |
333 print "Generating non-matching FASTQ file" | 335 print "Generating non-matching FASTQ file" |
334 negative_handle = open(out_negative_file, "w") | 336 negative_handle = open(neg_file, "w") |
335 for title, seq, qual in FastqGeneralIterator(handle): | 337 for title, seq, qual in FastqGeneralIterator(handle): |
336 if clean_name(title.split(None, 1)[0]) not in ids: | 338 if clean_name(title.split(None, 1)[0]) not in wanted: |
337 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 339 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
338 negative_handle.close() | 340 negative_handle.close() |
339 handle.close() | 341 handle.close() |
340 # This does not currently bother to record record counts (faster) | 342 # This does not currently bother to record record counts (faster) |
341 | 343 |
def sff_filter(in_file, pos_file, neg_file, wanted):
    """SFF filter.

    Writes reads from in_file whose cleaned ID is in the set ``wanted``
    to pos_file, and the remainder to neg_file (either may be None).
    Returns a (positive, negative) tuple of read counts.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys.exit("SFF filtering requires Biopython 1.54 or later")

    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    sff_handle = open(in_file, "rb")  # must be binary mode!
    try:
        xml_manifest = ReadRocheXmlManifest(sff_handle)
    except ValueError:
        # No Roche XML manifest present in this SFF file
        xml_manifest = None

    # Two passes through the SFF file is not the most efficient approach,
    # but it keeps this code simple.
    pos_count = neg_count = 0
    if pos_file is not None:
        pos_handle = open(pos_file, "wb")
        pos_writer = SffWriter(pos_handle, xml=xml_manifest)
        sff_handle.seek(0)  # rewind after reading the manifest
        pos_count = pos_writer.write_file(rec for rec in SffIterator(sff_handle)
                                          if clean_name(rec.id) in wanted)
        pos_handle.close()
    if neg_file is not None:
        neg_handle = open(neg_file, "wb")
        neg_writer = SffWriter(neg_handle, xml=xml_manifest)
        sff_handle.seek(0)  # rewind for the second pass
        neg_count = neg_writer.write_file(rec for rec in SffIterator(sff_handle)
                                          if clean_name(rec.id) not in wanted)
        neg_handle.close()
    # And we're done
    sff_handle.close()
    # At the time of writing, Galaxy doesn't show SFF file read counts,
    # so it is useful to put them in stdout and thus shown in job info.
    return pos_count, neg_count
382 | 384 |
383 | 385 |
384 if seq_format.lower()=="sff": | 386 if seq_format.lower() == "sff": |
385 # Now write filtered SFF file based on IDs wanted | 387 # Now write filtered SFF file based on IDs wanted |
386 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) | 388 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) |
387 # At the time of writing, Galaxy doesn't show SFF file read counts, | 389 # At the time of writing, Galaxy doesn't show SFF file read counts, |
388 # so it is useful to put them in stdout and thus shown in job info. | 390 # so it is useful to put them in stdout and thus shown in job info. |
389 elif seq_format.lower()=="fasta": | 391 elif seq_format.lower() == "fasta": |
390 # Write filtered FASTA file based on IDs from tabular file | 392 # Write filtered FASTA file based on IDs from tabular file |
391 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) | 393 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) |
392 print "%i with and %i without specified IDs" % (pos_count, neg_count) | 394 print "%i with and %i without specified IDs" % (pos_count, neg_count) |
393 elif seq_format.lower().startswith("fastq"): | 395 elif seq_format.lower().startswith("fastq"): |
394 # Write filtered FASTQ file based on IDs from tabular file | 396 # Write filtered FASTQ file based on IDs from tabular file |
395 fastq_filter(in_file, out_positive_file, out_negative_file, ids) | 397 fastq_filter(in_file, out_positive_file, out_negative_file, ids) |
396 # This does not currently track the counts | 398 # This does not currently track the counts |
397 else: | 399 else: |
398 sys_exit("Unsupported file type %r" % seq_format) | 400 sys.exit("Unsupported file type %r" % seq_format) |