comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 9:141612f8c3e3 draft

v0.2.7 Python 3 compatible print etc
author peterjc
date Thu, 11 May 2017 12:18:52 -0400
parents 2d4537dbf0bc
children 4a7d8ad2a983
comparison
equal deleted inserted replaced
8:2d4537dbf0bc 9:141612f8c3e3
19 19
20 Cock et al 2009. Biopython: freely available Python tools for computational 20 Cock et al 2009. Biopython: freely available Python tools for computational
21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. 21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
22 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 22 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
23 23
24 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute 24 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute
25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. 25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
26 See accompanying text file for licence details (MIT license). 26 See accompanying text file for licence details (MIT license).
27 27
28 Use -v or --version to get the version, -h or --help for help. 28 Use -v or --version to get the version, -h or --help for help.
29 """ 29 """
30
31 from __future__ import print_function
30 32
31 import os 33 import os
32 import re 34 import re
33 import sys 35 import sys
34 36
74 help="Show version and quit") 76 help="Show version and quit")
75 77
76 options, args = parser.parse_args() 78 options, args = parser.parse_args()
77 79
78 if options.version: 80 if options.version:
79 print "v0.2.5" 81 print("v0.2.7")
80 sys.exit(0) 82 sys.exit(0)
81 83
82 in_file = options.input 84 in_file = options.input
83 seq_format = options.format 85 seq_format = options.format
84 out_positive_file = options.output_positive 86 out_positive_file = options.output_positive
137 139
138 name_warn = False 140 name_warn = False
139 141
140 142
141 def check_white_space(name): 143 def check_white_space(name):
144 """Check identifier for white space, take first word only."""
142 parts = name.split(None, 1) 145 parts = name.split(None, 1)
143 global name_warn 146 global name_warn
144 if not name_warn and len(parts) > 1: 147 if not name_warn and len(parts) > 1:
145 name_warn = "WARNING: Some of your identifiers had white space in them, " + \ 148 name_warn = "WARNING: Some of your identifiers had white space in them, " + \
146 "using first word only. e.g.:\n%s\n" % name 149 "using first word only. e.g.:\n%s\n" % name
220 continue 223 continue
221 if not line.startswith("#"): 224 if not line.startswith("#"):
222 name = clean_name(line.rstrip("\n").split("\t")[col]) 225 name = clean_name(line.rstrip("\n").split("\t")[col])
223 if name: 226 if name:
224 file_ids.add(name) 227 file_ids.add(name)
225 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)) 228 print("Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)))
226 if ids is None: 229 if ids is None:
227 ids = file_ids 230 ids = file_ids
228 if logic == "UNION": 231 if logic == "UNION":
229 ids.update(file_ids) 232 ids.update(file_ids)
230 else: 233 else:
231 ids.intersection_update(file_ids) 234 ids.intersection_update(file_ids)
232 handle.close() 235 handle.close()
233 if len(identifiers) > 1: 236 if len(identifiers) > 1:
234 if logic == "UNION": 237 if logic == "UNION":
235 print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) 238 print("Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)))
236 else: 239 else:
237 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) 240 print("Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)))
238 if name_warn: 241 if name_warn:
239 sys.stderr.write(name_warn) 242 sys.stderr.write(name_warn)
240 243
241 244
242 def crude_fasta_iterator(handle): 245 def crude_fasta_iterator(handle):
280 # Galaxy now requires Python 2.5+ so can use with statements, 283 # Galaxy now requires Python 2.5+ so can use with statements,
281 with open(in_file) as in_handle: 284 with open(in_file) as in_handle:
282 # Doing the if statement outside the loop for speed 285 # Doing the if statement outside the loop for speed
283 # (with the downside of three very similar loops). 286 # (with the downside of three very similar loops).
284 if pos_file is not None and neg_file is not None: 287 if pos_file is not None and neg_file is not None:
285 print "Generating two FASTA files" 288 print("Generating two FASTA files")
286 with open(pos_file, "w") as pos_handle: 289 with open(pos_file, "w") as pos_handle:
287 with open(neg_file, "w") as neg_handle: 290 with open(neg_file, "w") as neg_handle:
288 for identifier, record in crude_fasta_iterator(in_handle): 291 for identifier, record in crude_fasta_iterator(in_handle):
289 if clean_name(identifier) in wanted: 292 if clean_name(identifier) in wanted:
290 pos_handle.write(record) 293 pos_handle.write(record)
291 pos_count += 1 294 pos_count += 1
292 else: 295 else:
293 neg_handle.write(record) 296 neg_handle.write(record)
294 neg_count += 1 297 neg_count += 1
295 elif pos_file is not None: 298 elif pos_file is not None:
296 print "Generating matching FASTA file" 299 print("Generating matching FASTA file")
297 with open(pos_file, "w") as pos_handle: 300 with open(pos_file, "w") as pos_handle:
298 for identifier, record in crude_fasta_iterator(in_handle): 301 for identifier, record in crude_fasta_iterator(in_handle):
299 if clean_name(identifier) in wanted: 302 if clean_name(identifier) in wanted:
300 pos_handle.write(record) 303 pos_handle.write(record)
301 pos_count += 1 304 pos_count += 1
302 else: 305 else:
303 neg_count += 1 306 neg_count += 1
304 else: 307 else:
305 print "Generating non-matching FASTA file" 308 print("Generating non-matching FASTA file")
306 assert neg_file is not None 309 assert neg_file is not None
307 with open(neg_file, "w") as neg_handle: 310 with open(neg_file, "w") as neg_handle:
308 for identifier, record in crude_fasta_iterator(in_handle): 311 for identifier, record in crude_fasta_iterator(in_handle):
309 if clean_name(identifier) in wanted: 312 if clean_name(identifier) in wanted:
310 pos_count += 1 313 pos_count += 1
317 def fastq_filter(in_file, pos_file, neg_file, wanted): 320 def fastq_filter(in_file, pos_file, neg_file, wanted):
318 """FASTQ filter.""" 321 """FASTQ filter."""
319 from Bio.SeqIO.QualityIO import FastqGeneralIterator 322 from Bio.SeqIO.QualityIO import FastqGeneralIterator
320 handle = open(in_file, "r") 323 handle = open(in_file, "r")
321 if pos_file is not None and neg_file is not None: 324 if pos_file is not None and neg_file is not None:
322 print "Generating two FASTQ files" 325 print("Generating two FASTQ files")
323 positive_handle = open(pos_file, "w") 326 positive_handle = open(pos_file, "w")
324 negative_handle = open(neg_file, "w") 327 negative_handle = open(neg_file, "w")
325 print in_file 328 print(in_file)
326 for title, seq, qual in FastqGeneralIterator(handle): 329 for title, seq, qual in FastqGeneralIterator(handle):
327 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) 330 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
328 if clean_name(title.split(None, 1)[0]) in wanted: 331 if clean_name(title.split(None, 1)[0]) in wanted:
329 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 332 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
330 else: 333 else:
331 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 334 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
332 positive_handle.close() 335 positive_handle.close()
333 negative_handle.close() 336 negative_handle.close()
334 elif pos_file is not None: 337 elif pos_file is not None:
335 print "Generating matching FASTQ file" 338 print("Generating matching FASTQ file")
336 positive_handle = open(pos_file, "w") 339 positive_handle = open(pos_file, "w")
337 for title, seq, qual in FastqGeneralIterator(handle): 340 for title, seq, qual in FastqGeneralIterator(handle):
338 if clean_name(title.split(None, 1)[0]) in wanted: 341 if clean_name(title.split(None, 1)[0]) in wanted:
339 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 342 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
340 positive_handle.close() 343 positive_handle.close()
341 elif neg_file is not None: 344 elif neg_file is not None:
342 print "Generating non-matching FASTQ file" 345 print("Generating non-matching FASTQ file")
343 negative_handle = open(neg_file, "w") 346 negative_handle = open(neg_file, "w")
344 for title, seq, qual in FastqGeneralIterator(handle): 347 for title, seq, qual in FastqGeneralIterator(handle):
345 if clean_name(title.split(None, 1)[0]) not in wanted: 348 if clean_name(title.split(None, 1)[0]) not in wanted:
346 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 349 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
347 negative_handle.close() 350 negative_handle.close()
396 # At the time of writing, Galaxy doesn't show SFF file read counts, 399 # At the time of writing, Galaxy doesn't show SFF file read counts,
397 # so it is useful to put them in stdout and thus shown in job info. 400 # so it is useful to put them in stdout and thus shown in job info.
398 elif seq_format.lower() == "fasta": 401 elif seq_format.lower() == "fasta":
399 # Write filtered FASTA file based on IDs from tabular file 402 # Write filtered FASTA file based on IDs from tabular file
400 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) 403 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
401 print "%i with and %i without specified IDs" % (pos_count, neg_count) 404 print("%i with and %i without specified IDs" % (pos_count, neg_count))
402 elif seq_format.lower().startswith("fastq"): 405 elif seq_format.lower().startswith("fastq"):
403 # Write filtered FASTQ file based on IDs from tabular file 406 # Write filtered FASTQ file based on IDs from tabular file
404 fastq_filter(in_file, out_positive_file, out_negative_file, ids) 407 fastq_filter(in_file, out_positive_file, out_negative_file, ids)
405 # This does not currently track the counts 408 # This does not currently track the counts
406 else: 409 else: