comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 3:44ab4c0f7683 draft

Uploaded v0.0.6, automatic dependency on Biopython 1.62, new README file, citation information, MIT licence
author peterjc
date Fri, 11 Oct 2013 04:37:12 -0400
parents
children 832c1fd57852
comparison
equal deleted inserted replaced
2:abdd608c869b 3:44ab4c0f7683
1 #!/usr/bin/env python
2 """Filter a FASTA, FASTQ or SSF file with IDs from a tabular file.
3
4 Takes six command line options, tabular filename, ID column numbers (comma
5 separated list using one based counting), input filename, input type (e.g.
6 FASTA or SFF) and two output filenames (for records with and without the
7 given IDs, same format as input sequence file).
8
9 If either output filename is just a minus sign, that file is not created.
10 This is intended to allow output for just the matched (or just the non-matched)
11 records.
12
13 When filtering an SFF file, any Roche XML manifest in the input file is
14 preserved in both output files.
15
16 Note in the default NCBI BLAST+ tabular output, the query sequence ID is
17 in column one, and the ID of the match from the database is in column two.
18 Here sensible values for the column numbers would therefore be "1" or "2".
19
20 This tool is a short Python script which requires Biopython 1.54 or later
21 for SFF file support. If you use this tool in scientific work leading to a
22 publication, please cite the Biopython application note:
23
24 Cock et al 2009. Biopython: freely available Python tools for computational
25 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
26 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
27
28 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
29 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
30 See accompanying text file for licence details (MIT license).
31
32 This is version 0.1.0 of the script, use -v or --version to get the version.
33 """
34 import os
35 import sys
36
37 def stop_err(msg, err=1):
38 sys.stderr.write(msg.rstrip() + "\n")
39 sys.exit(err)
40
41 if "-v" in sys.argv or "--version" in sys.argv:
42 print "v0.1.0"
43 sys.exit(0)
44
45 #Parse Command Line
46 if len(sys.argv) - 1 < 7 or len(sys.argv) % 2 == 1:
47 stop_err("Expected 7 or more arguments, 5 required "
48 "(in seq, seq format, out pos, out neg, logic) "
49 "then one or more pairs (tab file, columns), "
50 "got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
51
52 in_file, seq_format, out_positive_file, out_negative_file, logic = sys.argv[1:6]
53
54 if not os.path.isfile(in_file):
55 stop_err("Missing input file %r" % in_file)
56 if out_positive_file == "-" and out_negative_file == "-":
57 stop_err("Neither output file requested")
58 if logic not in ["UNION", "INTERSECTION"]:
59 stop_err("Fifth agrument should be 'UNION' or 'INTERSECTION', not %r" % logic)
60
61 identifiers = []
62 for i in range((len(sys.argv) - 6) // 2):
63 tabular_file = sys.argv[6+2*i]
64 cols_arg = sys.argv[7+2*i]
65 if not os.path.isfile(tabular_file):
66 stop_err("Missing tabular identifier file %r" % tabular_file)
67 try:
68 columns = [int(arg)-1 for arg in cols_arg.split(",")]
69 except ValueError:
70 stop_err("Expected list of columns (comma separated integers), got %r" % cols_arg)
71 if min(columns) < 0:
72 stop_err("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
73 identifiers.append((tabular_file, columns))
74
75 #Read tabular file(s) and record all specified identifiers
76 ids = None #Will be a set
77 for tabular_file, columns in identifiers:
78 file_ids = set()
79 handle = open(tabular_file, "rU")
80 if len(columns)>1:
81 #General case of many columns
82 for line in handle:
83 if line.startswith("#"):
84 #Ignore comments
85 continue
86 parts = line.rstrip("\n").split("\t")
87 for col in columns:
88 file_ids.add(parts[col])
89 else:
90 #Single column, special case speed up
91 col = columns[0]
92 for line in handle:
93 if not line.startswith("#"):
94 file_ids.add(line.rstrip("\n").split("\t")[col])
95 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns))
96 if ids is None:
97 ids = file_ids
98 if logic == "UNION":
99 ids.update(file_ids)
100 else:
101 ids.intersection_update(file_ids)
102 handle.close()
103 if len(identifiers) > 1:
104 if logic == "UNION":
105 print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))
106 else:
107 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))
108
109
110 def crude_fasta_iterator(handle):
111 """Yields tuples, record ID and the full record as a string."""
112 while True:
113 line = handle.readline()
114 if line == "":
115 return # Premature end of file, or just empty?
116 if line[0] == ">":
117 break
118
119 no_id_warned = False
120 while True:
121 if line[0] != ">":
122 raise ValueError(
123 "Records in Fasta files should start with '>' character")
124 try:
125 id = line[1:].split(None, 1)[0]
126 except IndexError:
127 if not no_id_warned:
128 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
129 no_id_warned = True
130 id = None
131 lines = [line]
132 line = handle.readline()
133 while True:
134 if not line:
135 break
136 if line[0] == ">":
137 break
138 lines.append(line)
139 line = handle.readline()
140 yield id, "".join(lines)
141 if not line:
142 return # StopIteration
143
144
145 def fasta_filter(in_file, pos_file, neg_file, wanted):
146 """FASTA filter producing 60 character line wrapped outout."""
147 pos_count = neg_count = 0
148 #Galaxy now requires Python 2.5+ so can use with statements,
149 with open(in_file) as in_handle:
150 #Doing the if statement outside the loop for speed
151 #(with the downside of three very similar loops).
152 if pos_file != "-" and neg_file != "-":
153 print "Generating two FASTA files"
154 with open(pos_file, "w") as pos_handle:
155 with open(neg_file, "w") as neg_handle:
156 for identifier, record in crude_fasta_iterator(in_handle):
157 if identifier in wanted:
158 pos_handle.write(record)
159 pos_count += 1
160 else:
161 neg_handle.write(record)
162 neg_count += 1
163 elif pos_file != "-":
164 print "Generating matching FASTA file"
165 with open(pos_file, "w") as pos_handle:
166 for identifier, record in crude_fasta_iterator(in_handle):
167 if identifier in wanted:
168 pos_handle.write(record)
169 pos_count += 1
170 else:
171 neg_count += 1
172 else:
173 print "Generating non-matching FASTA file"
174 assert neg_file != "-"
175 with open(neg_file, "w") as neg_handle:
176 for identifier, record in crude_fasta_iterator(in_handle):
177 if identifier in wanted:
178 pos_count += 1
179 else:
180 neg_handle.write(record)
181 neg_count += 1
182 return pos_count, neg_count
183
184
185 if seq_format.lower()=="sff":
186 #Now write filtered SFF file based on IDs from BLAST file
187 try:
188 from Bio.SeqIO.SffIO import SffIterator, SffWriter
189 except ImportError:
190 stop_err("SFF filtering requires Biopython 1.54 or later")
191
192 try:
193 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
194 except ImportError:
195 #Prior to Biopython 1.56 this was a private function
196 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
197 in_handle = open(in_file, "rb") #must be binary mode!
198 try:
199 manifest = ReadRocheXmlManifest(in_handle)
200 except ValueError:
201 manifest = None
202 #This makes two passes though the SFF file with isn't so efficient,
203 #but this makes the code simple.
204 pos_count = neg_count = 0
205 if out_positive_file != "-":
206 out_handle = open(out_positive_file, "wb")
207 writer = SffWriter(out_handle, xml=manifest)
208 in_handle.seek(0) #start again after getting manifest
209 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
210 out_handle.close()
211 if out_negative_file != "-":
212 out_handle = open(out_negative_file, "wb")
213 writer = SffWriter(out_handle, xml=manifest)
214 in_handle.seek(0) #start again
215 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids)
216 out_handle.close()
217 #And we're done
218 in_handle.close()
219 #At the time of writing, Galaxy doesn't show SFF file read counts,
220 #so it is useful to put them in stdout and thus shown in job info.
221 print "%i with and %i without specified IDs" % (pos_count, neg_count)
222 elif seq_format.lower()=="fasta":
223 #Write filtered FASTA file based on IDs from tabular file
224 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
225 print "%i with and %i without specified IDs" % (pos_count, neg_count)
226 elif seq_format.lower().startswith("fastq"):
227 #Write filtered FASTQ file based on IDs from tabular file
228 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
229 reader = fastqReader(open(in_file, "rU"))
230 if out_positive_file != "-" and out_negative_file != "-":
231 print "Generating two FASTQ files"
232 positive_writer = fastqWriter(open(out_positive_file, "w"))
233 negative_writer = fastqWriter(open(out_negative_file, "w"))
234 for record in reader:
235 #The [1:] is because the fastaReader leaves the > on the identifier.
236 if record.identifier and record.identifier.split()[0][1:] in ids:
237 positive_writer.write(record)
238 else:
239 negative_writer.write(record)
240 positive_writer.close()
241 negative_writer.close()
242 elif out_positive_file != "-":
243 print "Generating matching FASTQ file"
244 positive_writer = fastqWriter(open(out_positive_file, "w"))
245 for record in reader:
246 #The [1:] is because the fastaReader leaves the > on the identifier.
247 if record.identifier and record.identifier.split()[0][1:] in ids:
248 positive_writer.write(record)
249 positive_writer.close()
250 elif out_negative_file != "-":
251 print "Generating non-matching FASTQ file"
252 negative_writer = fastqWriter(open(out_negative_file, "w"))
253 for record in reader:
254 #The [1:] is because the fastaReader leaves the > on the identifier.
255 if not record.identifier or record.identifier.split()[0][1:] not in ids:
256 negative_writer.write(record)
257 negative_writer.close()
258 reader.close()
259 else:
260 stop_err("Unsupported file type %r" % seq_format)