Mercurial > repos > peterjc > fastq_filter_by_id
comparison tools/fastq/fastq_filter_by_id.py @ 3:e0041942a12d draft default tip
v0.0.5 - galaxy_sequence_utils dependency and other cleanups inc using MIT license
author | peterjc |
---|---|
date | Fri, 03 Feb 2017 05:34:18 -0500 |
parents | d570cc324779 |
children |
comparison
equal
deleted
inserted
replaced
2:d570cc324779 | 3:e0041942a12d |
---|---|
1 #!/usr/bin/env python | |
2 """Filter a FASTQ file with IDs from a tabular file, e.g. from BLAST. | |
3 | |
4 NOTE - This script is now OBSOLETE, having been replaced by a new verion | |
5 which handles FASTA, FASTQ and SFF all in one. | |
6 | |
7 Takes five command line options, tabular filename, ID column numbers | |
8 (comma separated list using one based counting), input FASTA filename, and | |
9 two output FASTA filenames (for records with and without the given IDs). | |
10 | |
11 If either output filename is just a minus sign, that file is not created. | |
12 This is intended to allow output for just the matched (or just the non-matched) | |
13 records. | |
14 | |
15 Note in the default NCBI BLAST+ tabular output, the query sequence ID is | |
16 in column one, and the ID of the match from the database is in column two. | |
17 Here sensible values for the column numbers would therefore be "1" or "2". | |
18 | |
19 This script is copyright 2010-2011 by Peter Cock, SCRI, UK. All rights reserved. | |
20 See accompanying text file for licence details (MIT/BSD style). | |
21 | |
22 This is version 0.0.4 of the script. | |
23 """ | |
24 import sys | |
25 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
26 | |
27 def stop_err( msg ): | |
28 sys.stderr.write( msg ) | |
29 sys.exit() | |
30 | |
31 #Parse Command Line | |
32 try: | |
33 tabular_file, cols_arg, in_file, out_positive_file, out_negative_file = sys.argv[1:] | |
34 except ValueError: | |
35 stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | |
36 try: | |
37 columns = [int(arg)-1 for arg in cols_arg.split(",")] | |
38 except ValueError: | |
39 stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg) | |
40 | |
41 #Read tabular file and record all specified identifiers | |
42 ids = set() | |
43 handle = open(tabular_file, "rU") | |
44 if len(columns)>1: | |
45 #General case of many columns | |
46 for line in handle: | |
47 if line.startswith("#"): | |
48 #Ignore comments | |
49 continue | |
50 parts = line.rstrip("\n").split("\t") | |
51 for col in columns: | |
52 ids.add(parts[col]) | |
53 print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns)) | |
54 else: | |
55 #Single column, special case speed up | |
56 col = columns[0] | |
57 for line in handle: | |
58 if not line.startswith("#"): | |
59 ids.add(line.rstrip("\n").split("\t")[col]) | |
60 print "Using %i IDs from tabular file" % (len(ids)) | |
61 handle.close() | |
62 | |
63 #Write filtered FASTQ file based on IDs from tabular file | |
64 reader = fastqReader(open(in_file, "rU")) | |
65 if out_positive_file != "-" and out_negative_file != "-": | |
66 print "Generating two FASTQ files" | |
67 positive_writer = fastqWriter(open(out_positive_file, "w")) | |
68 negative_writer = fastqWriter(open(out_negative_file, "w")) | |
69 for record in reader: | |
70 #The [1:] is because the fastaReader leaves the @ on the identifer. | |
71 if record.identifier and record.identifier.split()[0][1:] in ids: | |
72 positive_writer.write(record) | |
73 else: | |
74 negative_writer.write(record) | |
75 positive_writer.close() | |
76 negative_writer.close() | |
77 elif out_positive_file != "-": | |
78 print "Generating matching FASTQ file" | |
79 positive_writer = fastqWriter(open(out_positive_file, "w")) | |
80 for record in reader: | |
81 #The [1:] is because the fastaReader leaves the @ on the identifer. | |
82 if record.identifier and record.identifier.split()[0][1:] in ids: | |
83 positive_writer.write(record) | |
84 positive_writer.close() | |
85 elif out_negative_file != "-": | |
86 print "Generating non-matching FASTQ file" | |
87 negative_writer = fastqWriter(open(out_negative_file, "w")) | |
88 for record in reader: | |
89 #The [1:] is because the fastaReader leaves the @ on the identifer. | |
90 if not record.identifier or record.identifier.split()[0][1:] not in ids: | |
91 negative_writer.write(record) | |
92 negative_writer.close() | |
93 else: | |
94 stop_err("Neither output file requested") | |
95 reader.close() |