comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 6:03e134cae41a draft

v0.2.3, ignore blank lines in ID file (contributed by Gildas Le Corguille)
author peterjc
date Tue, 17 May 2016 05:59:24 -0400
parents 832c1fd57852
children fb1313d79396
comparison
equal deleted inserted replaced
5:832c1fd57852 6:03e134cae41a
30 import os 30 import os
31 import sys 31 import sys
32 import re 32 import re
33 from optparse import OptionParser 33 from optparse import OptionParser
34 34
35 def sys_exit(msg, err=1): 35 # Parse Command Line
36 sys.stderr.write(msg.rstrip() + "\n")
37 sys.exit(err)
38
39 #Parse Command Line
40 usage = """Use as follows: 36 usage = """Use as follows:
41 37
42 $ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...] 38 $ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...]
43 39
44 e.g. Positive matches using column one from tabular file: 40 e.g. Positive matches using column one from tabular file:
76 help="Show version and quit") 72 help="Show version and quit")
77 73
78 options, args = parser.parse_args() 74 options, args = parser.parse_args()
79 75
80 if options.version: 76 if options.version:
81 print "v0.2.1" 77 print "v0.2.3"
82 sys.exit(0) 78 sys.exit(0)
83 79
84 in_file = options.input 80 in_file = options.input
85 seq_format = options.format 81 seq_format = options.format
86 out_positive_file = options.output_positive 82 out_positive_file = options.output_positive
87 out_negative_file = options.output_negative 83 out_negative_file = options.output_negative
88 logic = options.logic 84 logic = options.logic
89 drop_suffices = bool(options.suffix) 85 drop_suffices = bool(options.suffix)
90 86
91 if in_file is None or not os.path.isfile(in_file): 87 if in_file is None or not os.path.isfile(in_file):
92 sys_exit("Missing input file: %r" % in_file) 88 sys.exit("Missing input file: %r" % in_file)
93 if out_positive_file is None and out_negative_file is None: 89 if out_positive_file is None and out_negative_file is None:
94 sys_exit("Neither output file requested") 90 sys.exit("Neither output file requested")
95 if seq_format is None: 91 if seq_format is None:
96 sys_exit("Missing sequence format") 92 sys.exit("Missing sequence format")
97 if logic not in ["UNION", "INTERSECTION"]: 93 if logic not in ["UNION", "INTERSECTION"]:
98 sys_exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) 94 sys.exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic)
99 if options.id_list and args: 95 if options.id_list and args:
100 sys_exit("Cannot accepted IDs via both -t and as tabular files") 96 sys.exit("Cannot accepted IDs via both -t and as tabular files")
101 elif not options.id_list and not args: 97 elif not options.id_list and not args:
102 sys_exit("Expected matched pairs of tabular files and columns (or -t given)") 98 sys.exit("Expected matched pairs of tabular files and columns (or -t given)")
103 if len(args) % 2: 99 if len(args) % 2:
104 sys_exit("Expected matched pairs of tabular files and columns, not: %r" % args) 100 sys.exit("Expected matched pairs of tabular files and columns, not: %r" % args)
105 101
106 102
107 #Cope with three widely used suffix naming convensions, 103 # Cope with three widely used suffix naming convensions,
108 #Illumina: /1 or /2 104 # Illumina: /1 or /2
109 #Forward/revered: .f or .r 105 # Forward/revered: .f or .r
110 #Sanger, e.g. .p1k and .q1k 106 # Sanger, e.g. .p1k and .q1k
111 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html 107 # See http://staden.sourceforge.net/manual/pregap4_unix_50.html
112 #re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") 108 # re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
113 #re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") 109 # re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
114 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") 110 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
115 assert re_suffix.search("demo.f") 111 assert re_suffix.search("demo.f")
116 assert re_suffix.search("demo.s1") 112 assert re_suffix.search("demo.s1")
117 assert re_suffix.search("demo.f1k") 113 assert re_suffix.search("demo.f1k")
118 assert re_suffix.search("demo.p1") 114 assert re_suffix.search("demo.p1")
123 assert re_suffix.search("demo.q1") 119 assert re_suffix.search("demo.q1")
124 assert re_suffix.search("demo.q1lk") 120 assert re_suffix.search("demo.q1lk")
125 121
126 identifiers = [] 122 identifiers = []
127 for i in range(len(args) // 2): 123 for i in range(len(args) // 2):
128 tabular_file = args[2*i] 124 tabular_file = args[2 * i]
129 cols_arg = args[2*i + 1] 125 cols_arg = args[2 * i + 1]
130 if not os.path.isfile(tabular_file): 126 if not os.path.isfile(tabular_file):
131 sys_exit("Missing tabular identifier file %r" % tabular_file) 127 sys.exit("Missing tabular identifier file %r" % tabular_file)
132 try: 128 try:
133 columns = [int(arg)-1 for arg in cols_arg.split(",")] 129 columns = [int(arg) - 1 for arg in cols_arg.split(",")]
134 except ValueError: 130 except ValueError:
135 sys_exit("Expected list of columns (comma separated integers), got %r" % cols_arg) 131 sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg)
136 if min(columns) < 0: 132 if min(columns) < 0:
137 sys_exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) 133 sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
138 identifiers.append((tabular_file, columns)) 134 identifiers.append((tabular_file, columns))
139 135
140 name_warn = False 136 name_warn = False
137
138
141 def check_white_space(name): 139 def check_white_space(name):
142 parts = name.split(None, 1) 140 parts = name.split(None, 1)
143 global name_warn 141 global name_warn
144 if not name_warn and len(parts) > 1: 142 if not name_warn and len(parts) > 1:
145 name_warn = "WARNING: Some of your identifiers had white space in them, " + \ 143 name_warn = "WARNING: Some of your identifiers had white space in them, " + \
167 else: 165 else:
168 # Just check the white space 166 # Just check the white space
169 clean_name = check_white_space 167 clean_name = check_white_space
170 168
171 169
172 mapped_chars = { '>' :'__gt__', 170 mapped_chars = {
173 '<' :'__lt__', 171 '>': '__gt__',
174 "'" :'__sq__', 172 '<': '__lt__',
175 '"' :'__dq__', 173 "'": '__sq__',
176 '[' :'__ob__', 174 '"': '__dq__',
177 ']' :'__cb__', 175 '[': '__ob__',
178 '{' :'__oc__', 176 ']': '__cb__',
179 '}' :'__cc__', 177 '{': '__oc__',
180 '@' : '__at__', 178 '}': '__cc__',
181 '\n' : '__cn__', 179 '@': '__at__',
182 '\r' : '__cr__', 180 '\n': '__cn__',
183 '\t' : '__tc__', 181 '\r': '__cr__',
184 '#' : '__pd__' 182 '\t': '__tc__',
185 } 183 '#': '__pd__',
186 184 }
187 #Read tabular file(s) and record all specified identifiers 185
188 ids = None #Will be a set 186 # Read tabular file(s) and record all specified identifiers
187 ids = None # Will be a set
189 if options.id_list: 188 if options.id_list:
190 assert not identifiers 189 assert not identifiers
191 ids = set() 190 ids = set()
192 id_list = options.id_list 191 id_list = options.id_list
193 #Galaxy turns \r into __cr__ (CR) etc 192 # Galaxy turns \r into __cr__ (CR) etc
194 for k in mapped_chars: 193 for k in mapped_chars:
195 id_list = id_list.replace(mapped_chars[k], k) 194 id_list = id_list.replace(mapped_chars[k], k)
196 for x in options.id_list.split(): 195 for x in options.id_list.split():
197 ids.add(clean_name(x.strip())) 196 ids.add(clean_name(x.strip()))
198 print("Have %i unique identifiers from list" % len(ids)) 197 print("Have %i unique identifiers from list" % len(ids))
199 for tabular_file, columns in identifiers: 198 for tabular_file, columns in identifiers:
200 file_ids = set() 199 file_ids = set()
201 handle = open(tabular_file, "rU") 200 handle = open(tabular_file, "rU")
202 if len(columns)>1: 201 if len(columns) > 1:
203 #General case of many columns 202 # General case of many columns
204 for line in handle: 203 for line in handle:
205 if line.startswith("#"): 204 if line.startswith("#"):
206 #Ignore comments 205 # Ignore comments
207 continue 206 continue
208 parts = line.rstrip("\n").split("\t") 207 parts = line.rstrip("\n").split("\t")
209 for col in columns: 208 for col in columns:
210 file_ids.add(clean_name(parts[col])) 209 file_ids.add(clean_name(parts[col]))
211 else: 210 else:
212 #Single column, special case speed up 211 # Single column, special case speed up
213 col = columns[0] 212 col = columns[0]
214 for line in handle: 213 for line in handle:
214 if not line.strip(): #skip empty lines
215 continue
215 if not line.startswith("#"): 216 if not line.startswith("#"):
216 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col])) 217 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col]))
217 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns)) 218 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns))
218 if ids is None: 219 if ids is None:
219 ids = file_ids 220 ids = file_ids
220 if logic == "UNION": 221 if logic == "UNION":
221 ids.update(file_ids) 222 ids.update(file_ids)
222 else: 223 else:
228 else: 229 else:
229 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) 230 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))
230 if name_warn: 231 if name_warn:
231 sys.stderr.write(name_warn) 232 sys.stderr.write(name_warn)
232 233
234
233 def crude_fasta_iterator(handle): 235 def crude_fasta_iterator(handle):
234 """Yields tuples, record ID and the full record as a string.""" 236 """Yields tuples, record ID and the full record as a string."""
235 while True: 237 while True:
236 line = handle.readline() 238 line = handle.readline()
237 if line == "": 239 if line == "":
238 return # Premature end of file, or just empty? 240 return # Premature end of file, or just empty?
239 if line[0] == ">": 241 if line[0] == ">":
240 break 242 break
241 243
242 no_id_warned = False 244 no_id_warned = False
243 while True: 245 while True:
260 break 262 break
261 lines.append(line) 263 lines.append(line)
262 line = handle.readline() 264 line = handle.readline()
263 yield id, "".join(lines) 265 yield id, "".join(lines)
264 if not line: 266 if not line:
265 return # StopIteration 267 return # StopIteration
266 268
267 269
268 def fasta_filter(in_file, pos_file, neg_file, wanted): 270 def fasta_filter(in_file, pos_file, neg_file, wanted):
269 """FASTA filter producing 60 character line wrapped outout.""" 271 """FASTA filter producing 60 character line wrapped outout."""
270 pos_count = neg_count = 0 272 pos_count = neg_count = 0
271 #Galaxy now requires Python 2.5+ so can use with statements, 273 # Galaxy now requires Python 2.5+ so can use with statements,
272 with open(in_file) as in_handle: 274 with open(in_file) as in_handle:
273 #Doing the if statement outside the loop for speed 275 # Doing the if statement outside the loop for speed
274 #(with the downside of three very similar loops). 276 # (with the downside of three very similar loops).
275 if pos_file is not None and neg_file is not None: 277 if pos_file is not None and neg_file is not None:
276 print "Generating two FASTA files" 278 print "Generating two FASTA files"
277 with open(pos_file, "w") as pos_handle: 279 with open(pos_file, "w") as pos_handle:
278 with open(neg_file, "w") as neg_handle: 280 with open(neg_file, "w") as neg_handle:
279 for identifier, record in crude_fasta_iterator(in_handle): 281 for identifier, record in crude_fasta_iterator(in_handle):
307 309
308 def fastq_filter(in_file, pos_file, neg_file, wanted): 310 def fastq_filter(in_file, pos_file, neg_file, wanted):
309 """FASTQ filter.""" 311 """FASTQ filter."""
310 from Bio.SeqIO.QualityIO import FastqGeneralIterator 312 from Bio.SeqIO.QualityIO import FastqGeneralIterator
311 handle = open(in_file, "r") 313 handle = open(in_file, "r")
312 if out_positive_file is not None and out_negative_file is not None: 314 if pos_file is not None and neg_file is not None:
313 print "Generating two FASTQ files" 315 print "Generating two FASTQ files"
314 positive_handle = open(out_positive_file, "w") 316 positive_handle = open(pos_file, "w")
315 negative_handle = open(out_negative_file, "w") 317 negative_handle = open(neg_file, "w")
316 print in_file 318 print in_file
317 for title, seq, qual in FastqGeneralIterator(handle): 319 for title, seq, qual in FastqGeneralIterator(handle):
318 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) 320 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
319 if clean_name(title.split(None, 1)[0]) in ids: 321 if clean_name(title.split(None, 1)[0]) in wanted:
320 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 322 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
321 else: 323 else:
322 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 324 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
323 positive_handle.close() 325 positive_handle.close()
324 negative_handle.close() 326 negative_handle.close()
325 elif out_positive_file is not None: 327 elif pos_file is not None:
326 print "Generating matching FASTQ file" 328 print "Generating matching FASTQ file"
327 positive_handle = open(out_positive_file, "w") 329 positive_handle = open(pos_file, "w")
328 for title, seq, qual in FastqGeneralIterator(handle): 330 for title, seq, qual in FastqGeneralIterator(handle):
329 if clean_name(title.split(None, 1)[0]) in ids: 331 if clean_name(title.split(None, 1)[0]) in wanted:
330 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 332 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
331 positive_handle.close() 333 positive_handle.close()
332 elif out_negative_file is not None: 334 elif neg_file is not None:
333 print "Generating non-matching FASTQ file" 335 print "Generating non-matching FASTQ file"
334 negative_handle = open(out_negative_file, "w") 336 negative_handle = open(neg_file, "w")
335 for title, seq, qual in FastqGeneralIterator(handle): 337 for title, seq, qual in FastqGeneralIterator(handle):
336 if clean_name(title.split(None, 1)[0]) not in ids: 338 if clean_name(title.split(None, 1)[0]) not in wanted:
337 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 339 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
338 negative_handle.close() 340 negative_handle.close()
339 handle.close() 341 handle.close()
340 # This does not currently bother to record record counts (faster) 342 # This does not currently bother to record record counts (faster)
341 343
343 def sff_filter(in_file, pos_file, neg_file, wanted): 345 def sff_filter(in_file, pos_file, neg_file, wanted):
344 """SFF filter.""" 346 """SFF filter."""
345 try: 347 try:
346 from Bio.SeqIO.SffIO import SffIterator, SffWriter 348 from Bio.SeqIO.SffIO import SffIterator, SffWriter
347 except ImportError: 349 except ImportError:
348 sys_exit("SFF filtering requires Biopython 1.54 or later") 350 sys.exit("SFF filtering requires Biopython 1.54 or later")
349 351
350 try: 352 try:
351 from Bio.SeqIO.SffIO import ReadRocheXmlManifest 353 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
352 except ImportError: 354 except ImportError:
353 #Prior to Biopython 1.56 this was a private function 355 # Prior to Biopython 1.56 this was a private function
354 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest 356 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
355 357
356 in_handle = open(in_file, "rb") #must be binary mode! 358 in_handle = open(in_file, "rb") # must be binary mode!
357 try: 359 try:
358 manifest = ReadRocheXmlManifest(in_handle) 360 manifest = ReadRocheXmlManifest(in_handle)
359 except ValueError: 361 except ValueError:
360 manifest = None 362 manifest = None
361 363
362 #This makes two passes though the SFF file with isn't so efficient, 364 # This makes two passes though the SFF file with isn't so efficient,
363 #but this makes the code simple. 365 # but this makes the code simple.
364 pos_count = neg_count = 0 366 pos_count = neg_count = 0
365 if out_positive_file is not None: 367 if pos_file is not None:
366 out_handle = open(out_positive_file, "wb") 368 out_handle = open(pos_file, "wb")
367 writer = SffWriter(out_handle, xml=manifest) 369 writer = SffWriter(out_handle, xml=manifest)
368 in_handle.seek(0) #start again after getting manifest 370 in_handle.seek(0) # start again after getting manifest
369 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) 371 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
370 out_handle.close() 372 out_handle.close()
371 if out_negative_file is not None: 373 if neg_file is not None:
372 out_handle = open(out_negative_file, "wb") 374 out_handle = open(neg_file, "wb")
373 writer = SffWriter(out_handle, xml=manifest) 375 writer = SffWriter(out_handle, xml=manifest)
374 in_handle.seek(0) #start again 376 in_handle.seek(0) # start again
375 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) 377 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
376 out_handle.close() 378 out_handle.close()
377 #And we're done 379 # And we're done
378 in_handle.close() 380 in_handle.close()
379 #At the time of writing, Galaxy doesn't show SFF file read counts, 381 # At the time of writing, Galaxy doesn't show SFF file read counts,
380 #so it is useful to put them in stdout and thus shown in job info. 382 # so it is useful to put them in stdout and thus shown in job info.
381 return pos_count, neg_count 383 return pos_count, neg_count
382 384
383 385
384 if seq_format.lower()=="sff": 386 if seq_format.lower() == "sff":
385 # Now write filtered SFF file based on IDs wanted 387 # Now write filtered SFF file based on IDs wanted
386 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) 388 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
387 # At the time of writing, Galaxy doesn't show SFF file read counts, 389 # At the time of writing, Galaxy doesn't show SFF file read counts,
388 # so it is useful to put them in stdout and thus shown in job info. 390 # so it is useful to put them in stdout and thus shown in job info.
389 elif seq_format.lower()=="fasta": 391 elif seq_format.lower() == "fasta":
390 # Write filtered FASTA file based on IDs from tabular file 392 # Write filtered FASTA file based on IDs from tabular file
391 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) 393 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
392 print "%i with and %i without specified IDs" % (pos_count, neg_count) 394 print "%i with and %i without specified IDs" % (pos_count, neg_count)
393 elif seq_format.lower().startswith("fastq"): 395 elif seq_format.lower().startswith("fastq"):
394 # Write filtered FASTQ file based on IDs from tabular file 396 # Write filtered FASTQ file based on IDs from tabular file
395 fastq_filter(in_file, out_positive_file, out_negative_file, ids) 397 fastq_filter(in_file, out_positive_file, out_negative_file, ids)
396 # This does not currently track the counts 398 # This does not currently track the counts
397 else: 399 else:
398 sys_exit("Unsupported file type %r" % seq_format) 400 sys.exit("Unsupported file type %r" % seq_format)