Mercurial > repos > peterjc > seq_filter_by_id
comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 9:141612f8c3e3 draft
v0.2.7 Python 3 compatible print etc
author | peterjc |
---|---|
date | Thu, 11 May 2017 12:18:52 -0400 |
parents | 2d4537dbf0bc |
children | 4a7d8ad2a983 |
comparison
equal
deleted
inserted
replaced
8:2d4537dbf0bc | 9:141612f8c3e3 |
---|---|
19 | 19 |
20 Cock et al 2009. Biopython: freely available Python tools for computational | 20 Cock et al 2009. Biopython: freely available Python tools for computational |
21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | 21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. |
22 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | 22 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
23 | 23 |
24 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute | 24 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute |
25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | 25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
26 See accompanying text file for licence details (MIT license). | 26 See accompanying text file for licence details (MIT license). |
27 | 27 |
28 Use -v or --version to get the version, -h or --help for help. | 28 Use -v or --version to get the version, -h or --help for help. |
29 """ | 29 """ |
30 | |
31 from __future__ import print_function | |
30 | 32 |
31 import os | 33 import os |
32 import re | 34 import re |
33 import sys | 35 import sys |
34 | 36 |
74 help="Show version and quit") | 76 help="Show version and quit") |
75 | 77 |
76 options, args = parser.parse_args() | 78 options, args = parser.parse_args() |
77 | 79 |
78 if options.version: | 80 if options.version: |
79 print "v0.2.5" | 81 print("v0.2.7") |
80 sys.exit(0) | 82 sys.exit(0) |
81 | 83 |
82 in_file = options.input | 84 in_file = options.input |
83 seq_format = options.format | 85 seq_format = options.format |
84 out_positive_file = options.output_positive | 86 out_positive_file = options.output_positive |
137 | 139 |
138 name_warn = False | 140 name_warn = False |
139 | 141 |
140 | 142 |
141 def check_white_space(name): | 143 def check_white_space(name): |
144 """Check identifier for white space, take first word only.""" | |
142 parts = name.split(None, 1) | 145 parts = name.split(None, 1) |
143 global name_warn | 146 global name_warn |
144 if not name_warn and len(parts) > 1: | 147 if not name_warn and len(parts) > 1: |
145 name_warn = "WARNING: Some of your identifiers had white space in them, " + \ | 148 name_warn = "WARNING: Some of your identifiers had white space in them, " + \ |
146 "using first word only. e.g.:\n%s\n" % name | 149 "using first word only. e.g.:\n%s\n" % name |
220 continue | 223 continue |
221 if not line.startswith("#"): | 224 if not line.startswith("#"): |
222 name = clean_name(line.rstrip("\n").split("\t")[col]) | 225 name = clean_name(line.rstrip("\n").split("\t")[col]) |
223 if name: | 226 if name: |
224 file_ids.add(name) | 227 file_ids.add(name) |
225 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)) | 228 print("Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns))) |
226 if ids is None: | 229 if ids is None: |
227 ids = file_ids | 230 ids = file_ids |
228 if logic == "UNION": | 231 if logic == "UNION": |
229 ids.update(file_ids) | 232 ids.update(file_ids) |
230 else: | 233 else: |
231 ids.intersection_update(file_ids) | 234 ids.intersection_update(file_ids) |
232 handle.close() | 235 handle.close() |
233 if len(identifiers) > 1: | 236 if len(identifiers) > 1: |
234 if logic == "UNION": | 237 if logic == "UNION": |
235 print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) | 238 print("Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))) |
236 else: | 239 else: |
237 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) | 240 print("Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))) |
238 if name_warn: | 241 if name_warn: |
239 sys.stderr.write(name_warn) | 242 sys.stderr.write(name_warn) |
240 | 243 |
241 | 244 |
242 def crude_fasta_iterator(handle): | 245 def crude_fasta_iterator(handle): |
280 # Galaxy now requires Python 2.5+, so we can use with statements. | 283 # Galaxy now requires Python 2.5+, so we can use with statements. |
281 with open(in_file) as in_handle: | 284 with open(in_file) as in_handle: |
282 # Doing the if statement outside the loop for speed | 285 # Doing the if statement outside the loop for speed |
283 # (with the downside of three very similar loops). | 286 # (with the downside of three very similar loops). |
284 if pos_file is not None and neg_file is not None: | 287 if pos_file is not None and neg_file is not None: |
285 print "Generating two FASTA files" | 288 print("Generating two FASTA files") |
286 with open(pos_file, "w") as pos_handle: | 289 with open(pos_file, "w") as pos_handle: |
287 with open(neg_file, "w") as neg_handle: | 290 with open(neg_file, "w") as neg_handle: |
288 for identifier, record in crude_fasta_iterator(in_handle): | 291 for identifier, record in crude_fasta_iterator(in_handle): |
289 if clean_name(identifier) in wanted: | 292 if clean_name(identifier) in wanted: |
290 pos_handle.write(record) | 293 pos_handle.write(record) |
291 pos_count += 1 | 294 pos_count += 1 |
292 else: | 295 else: |
293 neg_handle.write(record) | 296 neg_handle.write(record) |
294 neg_count += 1 | 297 neg_count += 1 |
295 elif pos_file is not None: | 298 elif pos_file is not None: |
296 print "Generating matching FASTA file" | 299 print("Generating matching FASTA file") |
297 with open(pos_file, "w") as pos_handle: | 300 with open(pos_file, "w") as pos_handle: |
298 for identifier, record in crude_fasta_iterator(in_handle): | 301 for identifier, record in crude_fasta_iterator(in_handle): |
299 if clean_name(identifier) in wanted: | 302 if clean_name(identifier) in wanted: |
300 pos_handle.write(record) | 303 pos_handle.write(record) |
301 pos_count += 1 | 304 pos_count += 1 |
302 else: | 305 else: |
303 neg_count += 1 | 306 neg_count += 1 |
304 else: | 307 else: |
305 print "Generating non-matching FASTA file" | 308 print("Generating non-matching FASTA file") |
306 assert neg_file is not None | 309 assert neg_file is not None |
307 with open(neg_file, "w") as neg_handle: | 310 with open(neg_file, "w") as neg_handle: |
308 for identifier, record in crude_fasta_iterator(in_handle): | 311 for identifier, record in crude_fasta_iterator(in_handle): |
309 if clean_name(identifier) in wanted: | 312 if clean_name(identifier) in wanted: |
310 pos_count += 1 | 313 pos_count += 1 |
317 def fastq_filter(in_file, pos_file, neg_file, wanted): | 320 def fastq_filter(in_file, pos_file, neg_file, wanted): |
318 """FASTQ filter.""" | 321 """FASTQ filter.""" |
319 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 322 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
320 handle = open(in_file, "r") | 323 handle = open(in_file, "r") |
321 if pos_file is not None and neg_file is not None: | 324 if pos_file is not None and neg_file is not None: |
322 print "Generating two FASTQ files" | 325 print("Generating two FASTQ files") |
323 positive_handle = open(pos_file, "w") | 326 positive_handle = open(pos_file, "w") |
324 negative_handle = open(neg_file, "w") | 327 negative_handle = open(neg_file, "w") |
325 print in_file | 328 print(in_file) |
326 for title, seq, qual in FastqGeneralIterator(handle): | 329 for title, seq, qual in FastqGeneralIterator(handle): |
327 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) | 330 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) |
328 if clean_name(title.split(None, 1)[0]) in wanted: | 331 if clean_name(title.split(None, 1)[0]) in wanted: |
329 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 332 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
330 else: | 333 else: |
331 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 334 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
332 positive_handle.close() | 335 positive_handle.close() |
333 negative_handle.close() | 336 negative_handle.close() |
334 elif pos_file is not None: | 337 elif pos_file is not None: |
335 print "Generating matching FASTQ file" | 338 print("Generating matching FASTQ file") |
336 positive_handle = open(pos_file, "w") | 339 positive_handle = open(pos_file, "w") |
337 for title, seq, qual in FastqGeneralIterator(handle): | 340 for title, seq, qual in FastqGeneralIterator(handle): |
338 if clean_name(title.split(None, 1)[0]) in wanted: | 341 if clean_name(title.split(None, 1)[0]) in wanted: |
339 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 342 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
340 positive_handle.close() | 343 positive_handle.close() |
341 elif neg_file is not None: | 344 elif neg_file is not None: |
342 print "Generating non-matching FASTQ file" | 345 print("Generating non-matching FASTQ file") |
343 negative_handle = open(neg_file, "w") | 346 negative_handle = open(neg_file, "w") |
344 for title, seq, qual in FastqGeneralIterator(handle): | 347 for title, seq, qual in FastqGeneralIterator(handle): |
345 if clean_name(title.split(None, 1)[0]) not in wanted: | 348 if clean_name(title.split(None, 1)[0]) not in wanted: |
346 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 349 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
347 negative_handle.close() | 350 negative_handle.close() |
396 # At the time of writing, Galaxy doesn't show SFF file read counts, | 399 # At the time of writing, Galaxy doesn't show SFF file read counts, |
397 # so it is useful to put them in stdout and thus shown in job info. | 400 # so it is useful to put them in stdout and thus shown in job info. |
398 elif seq_format.lower() == "fasta": | 401 elif seq_format.lower() == "fasta": |
399 # Write filtered FASTA file based on IDs from tabular file | 402 # Write filtered FASTA file based on IDs from tabular file |
400 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) | 403 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) |
401 print "%i with and %i without specified IDs" % (pos_count, neg_count) | 404 print("%i with and %i without specified IDs" % (pos_count, neg_count)) |
402 elif seq_format.lower().startswith("fastq"): | 405 elif seq_format.lower().startswith("fastq"): |
403 # Write filtered FASTQ file based on IDs from tabular file | 406 # Write filtered FASTQ file based on IDs from tabular file |
404 fastq_filter(in_file, out_positive_file, out_negative_file, ids) | 407 fastq_filter(in_file, out_positive_file, out_negative_file, ids) |
405 # This does not currently track the counts | 408 # This does not currently track the counts |
406 else: | 409 else: |