comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 5:832c1fd57852 draft

v0.2.2; New options for IDs via text parameter, ignore paired read suffix; misc changes
author peterjc
date Wed, 13 May 2015 11:03:57 -0400
parents 44ab4c0f7683
children 03e134cae41a
comparison
equal deleted inserted replaced
4:1c36cf8ef133 5:832c1fd57852
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """Filter a FASTA, FASTQ or SSF file with IDs from a tabular file. 2 """Filter a FASTA, FASTQ or SSF file with IDs from a tabular file.
3 3
4 Takes six command line options, tabular filename, ID column numbers (comma 4 Takes six command line options, tabular filename, ID column numbers (comma
5 separated list using one based counting), input filename, input type (e.g. 5 separated list using one based counting), input filename, input type (e.g.
6 FASTA or SFF) and two output filenames (for records with and without the 6 FASTA or SFF) and up to two output filenames (for records with and without
7 given IDs, same format as input sequence file). 7 the given IDs, same format as input sequence file).
8
9 If either output filename is just a minus sign, that file is not created.
10 This is intended to allow output for just the matched (or just the non-matched)
11 records.
12 8
13 When filtering an SFF file, any Roche XML manifest in the input file is 9 When filtering an SFF file, any Roche XML manifest in the input file is
14 preserved in both output files. 10 preserved in both output files.
15 11
16 Note in the default NCBI BLAST+ tabular output, the query sequence ID is 12 Note in the default NCBI BLAST+ tabular output, the query sequence ID is
17 in column one, and the ID of the match from the database is in column two. 13 in column one, and the ID of the match from the database is in column two.
18 Here sensible values for the column numbers would therefore be "1" or "2". 14 Here sensible values for the column numbers would therefore be "1" or "2".
19 15
20 This tool is a short Python script which requires Biopython 1.54 or later 16 This tool is a short Python script which requires Biopython 1.54 or later.
21 for SFF file support. If you use this tool in scientific work leading to a 17 If you use this tool in scientific work leading to a publication, please
22 publication, please cite the Biopython application note: 18 cite the Biopython application note:
23 19
24 Cock et al 2009. Biopython: freely available Python tools for computational 20 Cock et al 2009. Biopython: freely available Python tools for computational
25 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. 21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
26 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 22 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
27 23
28 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute 24 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
29 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. 25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
30 See accompanying text file for licence details (MIT license). 26 See accompanying text file for licence details (MIT license).
31 27
32 This is version 0.1.0 of the script, use -v or --version to get the version. 28 Use -v or --version to get the version, -h or --help for help.
33 """ 29 """
# Standard-library imports, grouped per convention.
import os
import re
import sys

from optparse import OptionParser
def sys_exit(msg, err=1):
    """Write msg (newline terminated) to stderr and exit with code err."""
    sys.stderr.write("%s\n" % msg.rstrip())
    sys.exit(err)
40 38
#Parse Command Line
usage = """Use as follows:

$ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...]

e.g. Positive matches using column one from tabular file:

$ seq_filter_by_id.py -i my_seqs.fastq -f fastq -p matches.fastq ids.tabular 1

Multiple tabular files and column numbers may be given, or replaced with
the -t or --text option.
"""
parser = OptionParser(usage=usage)
parser.add_option('-i', '--input', dest='input',
                  default=None, help='Input sequences filename',
                  metavar="FILE")
parser.add_option('-f', '--format', dest='format',
                  default=None,
                  help='Input sequence format (e.g. fasta, fastq, sff)')
parser.add_option('-t', '--text', dest='id_list',
                  default=None, help="Lists of white space separated IDs (instead of a tabular file)")
parser.add_option('-p', '--positive', dest='output_positive',
                  default=None,
                  help='Output filename for matches',
                  metavar="FILE")
parser.add_option('-n', '--negative', dest='output_negative',
                  default=None,
                  help='Output filename for non-matches',
                  metavar="FILE")
parser.add_option("-l", "--logic", dest="logic",
                  default="UNION",
                  help="How to combine multiple ID columns (UNION or INTERSECTION)")
parser.add_option("-s", "--suffix", dest="suffix",
                  action="store_true",
                  help="Ignore pair-read suffices for matching names")
parser.add_option("-v", "--version", dest="version",
                  default=False, action="store_true",
                  help="Show version and quit")

options, args = parser.parse_args()

if options.version:
    # Keep in sync with the version noted in the changelog header (v0.2.2);
    # the previous revision mistakenly still printed v0.2.1 here.
    print("v0.2.2")
    sys.exit(0)

in_file = options.input
seq_format = options.format
out_positive_file = options.output_positive
out_negative_file = options.output_negative
logic = options.logic
drop_suffices = bool(options.suffix)

#Basic sanity checks on the supplied options before doing any work:
if in_file is None or not os.path.isfile(in_file):
    sys_exit("Missing input file: %r" % in_file)
if out_positive_file is None and out_negative_file is None:
    sys_exit("Neither output file requested")
if seq_format is None:
    sys_exit("Missing sequence format")
if logic not in ["UNION", "INTERSECTION"]:
    sys_exit("Logic argument should be 'UNION' or 'INTERSECTION', not %r" % logic)
if options.id_list and args:
    sys_exit("Cannot accept IDs via both -t and as tabular files")
elif not options.id_list and not args:
    sys_exit("Expected matched pairs of tabular files and columns (or -t given)")
if len(args) % 2:
    sys_exit("Expected matched pairs of tabular files and columns, not: %r" % args)
105
106
#Cope with three widely used suffix naming conventions:
#  Illumina: /1 or /2
#  Forward/reverse: .f or .r
#  Sanger, e.g. .p1k and .q1k
#See http://staden.sourceforge.net/manual/pregap4_unix_50.html
re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
#Sanity check the pattern against representative examples of each style:
for _example in ("demo.f", "demo.s1", "demo.f1k", "demo.p1", "demo.p1k",
                 "demo.p1lk", "demo/2", "demo.r", "demo.q1", "demo.q1lk"):
    assert re_suffix.search(_example)
60 125
#Collect (tabular filename, zero-based column list) pairs from the
#positional arguments, which arrive as file/columns pairs.
identifiers = []
for pair_index in range(len(args) // 2):
    tab_filename = args[2 * pair_index]
    column_spec = args[2 * pair_index + 1]
    if not os.path.isfile(tab_filename):
        sys_exit("Missing tabular identifier file %r" % tab_filename)
    try:
        # Convert one-based column numbers to zero-based indices.
        columns = [int(value) - 1 for value in column_spec.split(",")]
    except ValueError:
        sys_exit("Expected list of columns (comma separated integers), got %r" % column_spec)
    if min(columns) < 0:
        sys_exit("Expect one-based column numbers (not zero-based counting), got %r" % column_spec)
    identifiers.append((tab_filename, columns))
139
# Deferred warning message (False until the first ID containing white
# space is seen, then the warning text to emit later on stderr).
name_warn = False
def check_white_space(name):
    """Return the first word of name, noting a warning if it had spaces."""
    global name_warn
    parts = name.split(None, 1)
    if len(parts) > 1 and not name_warn:
        # Record the warning once only, quoting the first offending name.
        name_warn = "WARNING: Some of your identifiers had white space in them, " + \
                    "using first word only. e.g.:\n%s\n" % name
    return parts[0]
148
if drop_suffices:
    def clean_name(name):
        """Return name with white space and any pair-read suffix removed."""
        name = check_white_space(name)
        match = re_suffix.search(name)
        # The regex is anchored to the end of the string, so truncating at
        # match.start() strips exactly the suffix (and nothing else).
        return name[:match.start()] if match else name
    # Quick self-test of the suffix stripping:
    for _full, _stem in [("foo/1", "foo"), ("foo/2", "foo"),
                         ("bar.f", "bar"), ("bar.r", "bar"),
                         ("baz.p1", "baz"), ("baz.q2", "baz")]:
        assert clean_name(_full) == _stem
else:
    # Suffix handling not requested; only collapse white space.
    clean_name = check_white_space
170
171
# Characters Galaxy escapes in text parameters, mapped to their escape
# tokens; used below to undo the escaping on the -t/--text ID list.
mapped_chars = {
    '>': '__gt__',
    '<': '__lt__',
    "'": '__sq__',
    '"': '__dq__',
    '[': '__ob__',
    ']': '__cb__',
    '{': '__oc__',
    '}': '__cc__',
    '@': '__at__',
    '\n': '__cn__',
    '\r': '__cr__',
    '\t': '__tc__',
    '#': '__pd__',
}
74 186
#Read tabular file(s) and record all specified identifiers
ids = None  # Will be a set
if options.id_list:
    assert not identifiers
    ids = set()
    id_list = options.id_list
    #Galaxy turns \r into __cr__ (CR) etc; undo that mapping first
    for k in mapped_chars:
        id_list = id_list.replace(mapped_chars[k], k)
    #BUG FIX: split the un-escaped id_list, not the raw options.id_list,
    #otherwise the character un-mapping above was silently discarded.
    for x in id_list.split():
        ids.add(clean_name(x.strip()))
    print("Have %i unique identifiers from list" % len(ids))
77 for tabular_file, columns in identifiers: 199 for tabular_file, columns in identifiers:
78 file_ids = set() 200 file_ids = set()
79 handle = open(tabular_file, "rU") 201 handle = open(tabular_file, "rU")
80 if len(columns)>1: 202 if len(columns)>1:
81 #General case of many columns 203 #General case of many columns
83 if line.startswith("#"): 205 if line.startswith("#"):
84 #Ignore comments 206 #Ignore comments
85 continue 207 continue
86 parts = line.rstrip("\n").split("\t") 208 parts = line.rstrip("\n").split("\t")
87 for col in columns: 209 for col in columns:
88 file_ids.add(parts[col]) 210 file_ids.add(clean_name(parts[col]))
89 else: 211 else:
90 #Single column, special case speed up 212 #Single column, special case speed up
91 col = columns[0] 213 col = columns[0]
92 for line in handle: 214 for line in handle:
93 if not line.startswith("#"): 215 if not line.startswith("#"):
94 file_ids.add(line.rstrip("\n").split("\t")[col]) 216 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col]))
95 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns)) 217 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns))
96 if ids is None: 218 if ids is None:
97 ids = file_ids 219 ids = file_ids
98 if logic == "UNION": 220 if logic == "UNION":
99 ids.update(file_ids) 221 ids.update(file_ids)
103 if len(identifiers) > 1: 225 if len(identifiers) > 1:
104 if logic == "UNION": 226 if logic == "UNION":
105 print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) 227 print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))
106 else: 228 else:
107 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) 229 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))
# Emit the deferred white-space warning (if any) once, on stderr.
if name_warn:
    sys.stderr.write(name_warn)
109 232
110 def crude_fasta_iterator(handle): 233 def crude_fasta_iterator(handle):
111 """Yields tuples, record ID and the full record as a string.""" 234 """Yields tuples, record ID and the full record as a string."""
112 while True: 235 while True:
113 line = handle.readline() 236 line = handle.readline()
147 pos_count = neg_count = 0 270 pos_count = neg_count = 0
148 #Galaxy now requires Python 2.5+ so can use with statements, 271 #Galaxy now requires Python 2.5+ so can use with statements,
149 with open(in_file) as in_handle: 272 with open(in_file) as in_handle:
150 #Doing the if statement outside the loop for speed 273 #Doing the if statement outside the loop for speed
151 #(with the downside of three very similar loops). 274 #(with the downside of three very similar loops).
152 if pos_file != "-" and neg_file != "-": 275 if pos_file is not None and neg_file is not None:
153 print "Generating two FASTA files" 276 print "Generating two FASTA files"
154 with open(pos_file, "w") as pos_handle: 277 with open(pos_file, "w") as pos_handle:
155 with open(neg_file, "w") as neg_handle: 278 with open(neg_file, "w") as neg_handle:
156 for identifier, record in crude_fasta_iterator(in_handle): 279 for identifier, record in crude_fasta_iterator(in_handle):
157 if identifier in wanted: 280 if clean_name(identifier) in wanted:
158 pos_handle.write(record) 281 pos_handle.write(record)
159 pos_count += 1 282 pos_count += 1
160 else: 283 else:
161 neg_handle.write(record) 284 neg_handle.write(record)
162 neg_count += 1 285 neg_count += 1
163 elif pos_file != "-": 286 elif pos_file is not None:
164 print "Generating matching FASTA file" 287 print "Generating matching FASTA file"
165 with open(pos_file, "w") as pos_handle: 288 with open(pos_file, "w") as pos_handle:
166 for identifier, record in crude_fasta_iterator(in_handle): 289 for identifier, record in crude_fasta_iterator(in_handle):
167 if identifier in wanted: 290 if clean_name(identifier) in wanted:
168 pos_handle.write(record) 291 pos_handle.write(record)
169 pos_count += 1 292 pos_count += 1
170 else: 293 else:
171 neg_count += 1 294 neg_count += 1
172 else: 295 else:
173 print "Generating non-matching FASTA file" 296 print "Generating non-matching FASTA file"
174 assert neg_file != "-" 297 assert neg_file is not None
175 with open(neg_file, "w") as neg_handle: 298 with open(neg_file, "w") as neg_handle:
176 for identifier, record in crude_fasta_iterator(in_handle): 299 for identifier, record in crude_fasta_iterator(in_handle):
177 if identifier in wanted: 300 if clean_name(identifier) in wanted:
178 pos_count += 1 301 pos_count += 1
179 else: 302 else:
180 neg_handle.write(record) 303 neg_handle.write(record)
181 neg_count += 1 304 neg_count += 1
182 return pos_count, neg_count 305 return pos_count, neg_count
183 306
184 307
def fastq_filter(in_file, pos_file, neg_file, wanted):
    """FASTQ filter.

    Splits in_file into pos_file (records whose cleaned ID is in the set
    wanted) and neg_file (the rest); either output may be None to skip it.
    This does not currently bother to record record counts (faster).

    Fixes over the previous revision: uses the function's own parameters
    (pos_file, neg_file, wanted) instead of reaching for the module-level
    globals, and removes the leftover per-record debug printing that
    polluted stdout.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    handle = open(in_file, "r")
    if pos_file is not None and neg_file is not None:
        print("Generating two FASTQ files")
        positive_handle = open(pos_file, "w")
        negative_handle = open(neg_file, "w")
        for title, seq, qual in FastqGeneralIterator(handle):
            if clean_name(title.split(None, 1)[0]) in wanted:
                positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
            else:
                negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
        positive_handle.close()
        negative_handle.close()
    elif pos_file is not None:
        print("Generating matching FASTQ file")
        positive_handle = open(pos_file, "w")
        for title, seq, qual in FastqGeneralIterator(handle):
            if clean_name(title.split(None, 1)[0]) in wanted:
                positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
        positive_handle.close()
    elif neg_file is not None:
        print("Generating non-matching FASTQ file")
        negative_handle = open(neg_file, "w")
        for title, seq, qual in FastqGeneralIterator(handle):
            if clean_name(title.split(None, 1)[0]) not in wanted:
                negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
        negative_handle.close()
    handle.close()
341
342
def sff_filter(in_file, pos_file, neg_file, wanted):
    """SFF filter.

    Writes records whose cleaned ID is in the set wanted to pos_file, and
    the rest to neg_file (either may be None to skip that output).  Any
    Roche XML manifest found in the input is preserved in the output(s).
    Returns (positive count, negative count).

    Fix over the previous revision: uses the function's own parameters
    (pos_file, neg_file, wanted) instead of the module-level globals, so
    the function is self-contained.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys_exit("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        #Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    in_handle = open(in_file, "rb")  #must be binary mode!
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        #No XML manifest present
        manifest = None

    #This makes two passes though the SFF file which isn't so efficient,
    #but this makes the code simple.
    pos_count = neg_count = 0
    if pos_file is not None:
        out_handle = open(pos_file, "wb")
        writer = SffWriter(out_handle, xml=manifest)
        in_handle.seek(0)  #start again after getting manifest
        pos_count = writer.write_file(
            rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
        out_handle.close()
    if neg_file is not None:
        out_handle = open(neg_file, "wb")
        writer = SffWriter(out_handle, xml=manifest)
        in_handle.seek(0)  #start again
        neg_count = writer.write_file(
            rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
        out_handle.close()
    #And we're done
    in_handle.close()
    return pos_count, neg_count
382
383
#Dispatch on the requested sequence format (case-insensitive).
format_lower = seq_format.lower()
if format_lower == "sff":
    # Now write filtered SFF file(s) based on IDs wanted
    pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
    # At the time of writing, Galaxy doesn't show SFF file read counts,
    # so it is useful to put them in stdout and thus shown in job info.
    # (The previous revision computed the counts but forgot to print them.)
    print("%i with and %i without specified IDs" % (pos_count, neg_count))
elif format_lower == "fasta":
    # Write filtered FASTA file(s) based on IDs from tabular file
    pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
    print("%i with and %i without specified IDs" % (pos_count, neg_count))
elif format_lower.startswith("fastq"):
    # Write filtered FASTQ file(s) based on IDs from tabular file
    fastq_filter(in_file, out_positive_file, out_negative_file, ids)
    # This does not currently track the counts
else:
    sys_exit("Unsupported file type %r" % seq_format)