Mercurial > repos > peterjc > seq_filter_by_id
comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 5:832c1fd57852 draft
v0.2.2; New options for IDs via text parameter, ignore paired read suffix; misc changes
author | peterjc |
---|---|
date | Wed, 13 May 2015 11:03:57 -0400 |
parents | 44ab4c0f7683 |
children | 03e134cae41a |
comparison
equal
deleted
inserted
replaced
4:1c36cf8ef133 | 5:832c1fd57852 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """Filter a FASTA, FASTQ or SSF file with IDs from a tabular file. | 2 """Filter a FASTA, FASTQ or SSF file with IDs from a tabular file. |
3 | 3 |
4 Takes six command line options, tabular filename, ID column numbers (comma | 4 Takes six command line options, tabular filename, ID column numbers (comma |
5 separated list using one based counting), input filename, input type (e.g. | 5 separated list using one based counting), input filename, input type (e.g. |
6 FASTA or SFF) and two output filenames (for records with and without the | 6 FASTA or SFF) and up to two output filenames (for records with and without |
7 given IDs, same format as input sequence file). | 7 the given IDs, same format as input sequence file). |
8 | |
9 If either output filename is just a minus sign, that file is not created. | |
10 This is intended to allow output for just the matched (or just the non-matched) | |
11 records. | |
12 | 8 |
13 When filtering an SFF file, any Roche XML manifest in the input file is | 9 When filtering an SFF file, any Roche XML manifest in the input file is |
14 preserved in both output files. | 10 preserved in both output files. |
15 | 11 |
16 Note in the default NCBI BLAST+ tabular output, the query sequence ID is | 12 Note in the default NCBI BLAST+ tabular output, the query sequence ID is |
17 in column one, and the ID of the match from the database is in column two. | 13 in column one, and the ID of the match from the database is in column two. |
18 Here sensible values for the column numbers would therefore be "1" or "2". | 14 Here sensible values for the column numbers would therefore be "1" or "2". |
19 | 15 |
20 This tool is a short Python script which requires Biopython 1.54 or later | 16 This tool is a short Python script which requires Biopython 1.54 or later. |
21 for SFF file support. If you use this tool in scientific work leading to a | 17 If you use this tool in scientific work leading to a publication, please |
22 publication, please cite the Biopython application note: | 18 cite the Biopython application note: |
23 | 19 |
24 Cock et al 2009. Biopython: freely available Python tools for computational | 20 Cock et al 2009. Biopython: freely available Python tools for computational |
25 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | 21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. |
26 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | 22 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
27 | 23 |
28 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute | 24 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute |
29 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | 25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
30 See accompanying text file for licence details (MIT license). | 26 See accompanying text file for licence details (MIT license). |
31 | 27 |
32 This is version 0.1.0 of the script, use -v or --version to get the version. | 28 Use -v or --version to get the version, -h or --help for help. |
33 """ | 29 """ |
34 import os | 30 import os |
35 import sys | 31 import sys |
36 | 32 import re |
37 def stop_err(msg, err=1): | 33 from optparse import OptionParser |
34 | |
35 def sys_exit(msg, err=1): | |
38 sys.stderr.write(msg.rstrip() + "\n") | 36 sys.stderr.write(msg.rstrip() + "\n") |
39 sys.exit(err) | 37 sys.exit(err) |
40 | 38 |
41 if "-v" in sys.argv or "--version" in sys.argv: | 39 #Parse Command Line |
42 print "v0.1.0" | 40 usage = """Use as follows: |
41 | |
42 $ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...] | |
43 | |
44 e.g. Positive matches using column one from tabular file: | |
45 | |
46 $ seq_filter_by_id.py -i my_seqs.fastq -f fastq -p matches.fastq ids.tabular 1 | |
47 | |
48 Multiple tabular files and column numbers may be given, or replaced with | |
49 the -t or --text option. | |
50 """ | |
51 parser = OptionParser(usage=usage) | |
52 parser.add_option('-i', '--input', dest='input', | |
53 default=None, help='Input sequences filename', | |
54 metavar="FILE") | |
55 parser.add_option('-f', '--format', dest='format', | |
56 default=None, | |
57 help='Input sequence format (e.g. fasta, fastq, sff)') | |
58 parser.add_option('-t', '--text', dest='id_list', | |
59 default=None, help="Lists of white space separated IDs (instead of a tabular file)") | |
60 parser.add_option('-p', '--positive', dest='output_positive', | |
61 default=None, | |
62 help='Output filename for matches', | |
63 metavar="FILE") | |
64 parser.add_option('-n', '--negative', dest='output_negative', | |
65 default=None, | |
66 help='Output filename for non-matches', | |
67 metavar="FILE") | |
68 parser.add_option("-l", "--logic", dest="logic", | |
69 default="UNION", | |
70 help="How to combined multiple ID columns (UNION or INTERSECTION)") | |
71 parser.add_option("-s", "--suffix", dest="suffix", | |
72 action="store_true", | |
73 help="Ignore pair-read suffices for matching names") | |
74 parser.add_option("-v", "--version", dest="version", | |
75 default=False, action="store_true", | |
76 help="Show version and quit") | |
77 | |
78 options, args = parser.parse_args() | |
79 | |
80 if options.version: | |
81 print "v0.2.1" | |
43 sys.exit(0) | 82 sys.exit(0) |
44 | 83 |
45 #Parse Command Line | 84 in_file = options.input |
46 if len(sys.argv) - 1 < 7 or len(sys.argv) % 2 == 1: | 85 seq_format = options.format |
47 stop_err("Expected 7 or more arguments, 5 required " | 86 out_positive_file = options.output_positive |
48 "(in seq, seq format, out pos, out neg, logic) " | 87 out_negative_file = options.output_negative |
49 "then one or more pairs (tab file, columns), " | 88 logic = options.logic |
50 "got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | 89 drop_suffices = bool(options.suffix) |
51 | 90 |
52 in_file, seq_format, out_positive_file, out_negative_file, logic = sys.argv[1:6] | 91 if in_file is None or not os.path.isfile(in_file): |
53 | 92 sys_exit("Missing input file: %r" % in_file) |
54 if not os.path.isfile(in_file): | 93 if out_positive_file is None and out_negative_file is None: |
55 stop_err("Missing input file %r" % in_file) | 94 sys_exit("Neither output file requested") |
56 if out_positive_file == "-" and out_negative_file == "-": | 95 if seq_format is None: |
57 stop_err("Neither output file requested") | 96 sys_exit("Missing sequence format") |
58 if logic not in ["UNION", "INTERSECTION"]: | 97 if logic not in ["UNION", "INTERSECTION"]: |
59 stop_err("Fifth agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) | 98 sys_exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) |
99 if options.id_list and args: | |
100 sys_exit("Cannot accepted IDs via both -t and as tabular files") | |
101 elif not options.id_list and not args: | |
102 sys_exit("Expected matched pairs of tabular files and columns (or -t given)") | |
103 if len(args) % 2: | |
104 sys_exit("Expected matched pairs of tabular files and columns, not: %r" % args) | |
105 | |
106 | |
107 #Cope with three widely used suffix naming conventions, | 
108 #Illumina: /1 or /2 | |
109 #Forward/reverse: .f or .r | 
110 #Sanger, e.g. .p1k and .q1k | |
111 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html | |
112 #re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") | |
113 #re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") | |
114 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") | |
115 assert re_suffix.search("demo.f") | |
116 assert re_suffix.search("demo.s1") | |
117 assert re_suffix.search("demo.f1k") | |
118 assert re_suffix.search("demo.p1") | |
119 assert re_suffix.search("demo.p1k") | |
120 assert re_suffix.search("demo.p1lk") | |
121 assert re_suffix.search("demo/2") | |
122 assert re_suffix.search("demo.r") | |
123 assert re_suffix.search("demo.q1") | |
124 assert re_suffix.search("demo.q1lk") | |
60 | 125 |
61 identifiers = [] | 126 identifiers = [] |
62 for i in range((len(sys.argv) - 6) // 2): | 127 for i in range(len(args) // 2): |
63 tabular_file = sys.argv[6+2*i] | 128 tabular_file = args[2*i] |
64 cols_arg = sys.argv[7+2*i] | 129 cols_arg = args[2*i + 1] |
65 if not os.path.isfile(tabular_file): | 130 if not os.path.isfile(tabular_file): |
66 stop_err("Missing tabular identifier file %r" % tabular_file) | 131 sys_exit("Missing tabular identifier file %r" % tabular_file) |
67 try: | 132 try: |
68 columns = [int(arg)-1 for arg in cols_arg.split(",")] | 133 columns = [int(arg)-1 for arg in cols_arg.split(",")] |
69 except ValueError: | 134 except ValueError: |
70 stop_err("Expected list of columns (comma separated integers), got %r" % cols_arg) | 135 sys_exit("Expected list of columns (comma separated integers), got %r" % cols_arg) |
71 if min(columns) < 0: | 136 if min(columns) < 0: |
72 stop_err("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) | 137 sys_exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) |
73 identifiers.append((tabular_file, columns)) | 138 identifiers.append((tabular_file, columns)) |
139 | |
140 name_warn = False | |
141 def check_white_space(name): | |
142 parts = name.split(None, 1) | |
143 global name_warn | |
144 if not name_warn and len(parts) > 1: | |
145 name_warn = "WARNING: Some of your identifiers had white space in them, " + \ | |
146 "using first word only. e.g.:\n%s\n" % name | |
147 return parts[0] | |
148 | |
149 if drop_suffices: | |
150 def clean_name(name): | |
151 """Remove suffix.""" | |
152 name = check_white_space(name) | |
153 match = re_suffix.search(name) | |
154 if match: | |
155 # Use the fact this is a suffix, and regular expression will be | |
156 # anchored to the end of the name: | |
157 return name[:match.start()] | |
158 else: | |
159 # Nothing to do | |
160 return name | |
161 assert clean_name("foo/1") == "foo" | |
162 assert clean_name("foo/2") == "foo" | |
163 assert clean_name("bar.f") == "bar" | |
164 assert clean_name("bar.r") == "bar" | |
165 assert clean_name("baz.p1") == "baz" | |
166 assert clean_name("baz.q2") == "baz" | |
167 else: | |
168 # Just check the white space | |
169 clean_name = check_white_space | |
170 | |
171 | |
172 mapped_chars = { '>' :'__gt__', | |
173 '<' :'__lt__', | |
174 "'" :'__sq__', | |
175 '"' :'__dq__', | |
176 '[' :'__ob__', | |
177 ']' :'__cb__', | |
178 '{' :'__oc__', | |
179 '}' :'__cc__', | |
180 '@' : '__at__', | |
181 '\n' : '__cn__', | |
182 '\r' : '__cr__', | |
183 '\t' : '__tc__', | |
184 '#' : '__pd__' | |
185 } | |
74 | 186 |
75 #Read tabular file(s) and record all specified identifiers | 187 #Read tabular file(s) and record all specified identifiers |
76 ids = None #Will be a set | 188 ids = None #Will be a set |
189 if options.id_list: | |
190 assert not identifiers | |
191 ids = set() | |
192 id_list = options.id_list | |
193 #Galaxy turns \r into __cr__ (CR) etc | |
194 for k in mapped_chars: | |
195 id_list = id_list.replace(mapped_chars[k], k) | |
196 for x in options.id_list.split(): | |
197 ids.add(clean_name(x.strip())) | |
198 print("Have %i unique identifiers from list" % len(ids)) | |
77 for tabular_file, columns in identifiers: | 199 for tabular_file, columns in identifiers: |
78 file_ids = set() | 200 file_ids = set() |
79 handle = open(tabular_file, "rU") | 201 handle = open(tabular_file, "rU") |
80 if len(columns)>1: | 202 if len(columns)>1: |
81 #General case of many columns | 203 #General case of many columns |
83 if line.startswith("#"): | 205 if line.startswith("#"): |
84 #Ignore comments | 206 #Ignore comments |
85 continue | 207 continue |
86 parts = line.rstrip("\n").split("\t") | 208 parts = line.rstrip("\n").split("\t") |
87 for col in columns: | 209 for col in columns: |
88 file_ids.add(parts[col]) | 210 file_ids.add(clean_name(parts[col])) |
89 else: | 211 else: |
90 #Single column, special case speed up | 212 #Single column, special case speed up |
91 col = columns[0] | 213 col = columns[0] |
92 for line in handle: | 214 for line in handle: |
93 if not line.startswith("#"): | 215 if not line.startswith("#"): |
94 file_ids.add(line.rstrip("\n").split("\t")[col]) | 216 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col])) |
95 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns)) | 217 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns)) |
96 if ids is None: | 218 if ids is None: |
97 ids = file_ids | 219 ids = file_ids |
98 if logic == "UNION": | 220 if logic == "UNION": |
99 ids.update(file_ids) | 221 ids.update(file_ids) |
103 if len(identifiers) > 1: | 225 if len(identifiers) > 1: |
104 if logic == "UNION": | 226 if logic == "UNION": |
105 print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) | 227 print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) |
106 else: | 228 else: |
107 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) | 229 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) |
108 | 230 if name_warn: |
231 sys.stderr.write(name_warn) | |
109 | 232 |
110 def crude_fasta_iterator(handle): | 233 def crude_fasta_iterator(handle): |
111 """Yields tuples, record ID and the full record as a string.""" | 234 """Yields tuples, record ID and the full record as a string.""" |
112 while True: | 235 while True: |
113 line = handle.readline() | 236 line = handle.readline() |
147 pos_count = neg_count = 0 | 270 pos_count = neg_count = 0 |
148 #Galaxy now requires Python 2.5+ so can use with statements, | 271 #Galaxy now requires Python 2.5+ so can use with statements, |
149 with open(in_file) as in_handle: | 272 with open(in_file) as in_handle: |
150 #Doing the if statement outside the loop for speed | 273 #Doing the if statement outside the loop for speed |
151 #(with the downside of three very similar loops). | 274 #(with the downside of three very similar loops). |
152 if pos_file != "-" and neg_file != "-": | 275 if pos_file is not None and neg_file is not None: |
153 print "Generating two FASTA files" | 276 print "Generating two FASTA files" |
154 with open(pos_file, "w") as pos_handle: | 277 with open(pos_file, "w") as pos_handle: |
155 with open(neg_file, "w") as neg_handle: | 278 with open(neg_file, "w") as neg_handle: |
156 for identifier, record in crude_fasta_iterator(in_handle): | 279 for identifier, record in crude_fasta_iterator(in_handle): |
157 if identifier in wanted: | 280 if clean_name(identifier) in wanted: |
158 pos_handle.write(record) | 281 pos_handle.write(record) |
159 pos_count += 1 | 282 pos_count += 1 |
160 else: | 283 else: |
161 neg_handle.write(record) | 284 neg_handle.write(record) |
162 neg_count += 1 | 285 neg_count += 1 |
163 elif pos_file != "-": | 286 elif pos_file is not None: |
164 print "Generating matching FASTA file" | 287 print "Generating matching FASTA file" |
165 with open(pos_file, "w") as pos_handle: | 288 with open(pos_file, "w") as pos_handle: |
166 for identifier, record in crude_fasta_iterator(in_handle): | 289 for identifier, record in crude_fasta_iterator(in_handle): |
167 if identifier in wanted: | 290 if clean_name(identifier) in wanted: |
168 pos_handle.write(record) | 291 pos_handle.write(record) |
169 pos_count += 1 | 292 pos_count += 1 |
170 else: | 293 else: |
171 neg_count += 1 | 294 neg_count += 1 |
172 else: | 295 else: |
173 print "Generating non-matching FASTA file" | 296 print "Generating non-matching FASTA file" |
174 assert neg_file != "-" | 297 assert neg_file is not None |
175 with open(neg_file, "w") as neg_handle: | 298 with open(neg_file, "w") as neg_handle: |
176 for identifier, record in crude_fasta_iterator(in_handle): | 299 for identifier, record in crude_fasta_iterator(in_handle): |
177 if identifier in wanted: | 300 if clean_name(identifier) in wanted: |
178 pos_count += 1 | 301 pos_count += 1 |
179 else: | 302 else: |
180 neg_handle.write(record) | 303 neg_handle.write(record) |
181 neg_count += 1 | 304 neg_count += 1 |
182 return pos_count, neg_count | 305 return pos_count, neg_count |
183 | 306 |
184 | 307 |
185 if seq_format.lower()=="sff": | 308 def fastq_filter(in_file, pos_file, neg_file, wanted): |
186 #Now write filtered SFF file based on IDs from BLAST file | 309 """FASTQ filter.""" |
310 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
311 handle = open(in_file, "r") | |
312 if out_positive_file is not None and out_negative_file is not None: | |
313 print "Generating two FASTQ files" | |
314 positive_handle = open(out_positive_file, "w") | |
315 negative_handle = open(out_negative_file, "w") | |
316 print in_file | |
317 for title, seq, qual in FastqGeneralIterator(handle): | |
318 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) | |
319 if clean_name(title.split(None, 1)[0]) in ids: | |
320 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
321 else: | |
322 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
323 positive_handle.close() | |
324 negative_handle.close() | |
325 elif out_positive_file is not None: | |
326 print "Generating matching FASTQ file" | |
327 positive_handle = open(out_positive_file, "w") | |
328 for title, seq, qual in FastqGeneralIterator(handle): | |
329 if clean_name(title.split(None, 1)[0]) in ids: | |
330 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
331 positive_handle.close() | |
332 elif out_negative_file is not None: | |
333 print "Generating non-matching FASTQ file" | |
334 negative_handle = open(out_negative_file, "w") | |
335 for title, seq, qual in FastqGeneralIterator(handle): | |
336 if clean_name(title.split(None, 1)[0]) not in ids: | |
337 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
338 negative_handle.close() | |
339 handle.close() | |
340 # This does not currently bother to record record counts (faster) | |
341 | |
342 | |
343 def sff_filter(in_file, pos_file, neg_file, wanted): | |
344 """SFF filter.""" | |
187 try: | 345 try: |
188 from Bio.SeqIO.SffIO import SffIterator, SffWriter | 346 from Bio.SeqIO.SffIO import SffIterator, SffWriter |
189 except ImportError: | 347 except ImportError: |
190 stop_err("SFF filtering requires Biopython 1.54 or later") | 348 sys_exit("SFF filtering requires Biopython 1.54 or later") |
191 | 349 |
192 try: | 350 try: |
193 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | 351 from Bio.SeqIO.SffIO import ReadRocheXmlManifest |
194 except ImportError: | 352 except ImportError: |
195 #Prior to Biopython 1.56 this was a private function | 353 #Prior to Biopython 1.56 this was a private function |
196 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | 354 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
355 | |
197 in_handle = open(in_file, "rb") #must be binary mode! | 356 in_handle = open(in_file, "rb") #must be binary mode! |
198 try: | 357 try: |
199 manifest = ReadRocheXmlManifest(in_handle) | 358 manifest = ReadRocheXmlManifest(in_handle) |
200 except ValueError: | 359 except ValueError: |
201 manifest = None | 360 manifest = None |
361 | |
202 #This makes two passes through the SFF file which isn't so efficient, | 362 #This makes two passes through the SFF file which isn't so efficient, |
203 #but this makes the code simple. | 363 #but this makes the code simple. |
204 pos_count = neg_count = 0 | 364 pos_count = neg_count = 0 |
205 if out_positive_file != "-": | 365 if out_positive_file is not None: |
206 out_handle = open(out_positive_file, "wb") | 366 out_handle = open(out_positive_file, "wb") |
207 writer = SffWriter(out_handle, xml=manifest) | 367 writer = SffWriter(out_handle, xml=manifest) |
208 in_handle.seek(0) #start again after getting manifest | 368 in_handle.seek(0) #start again after getting manifest |
209 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids) | 369 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) |
210 out_handle.close() | 370 out_handle.close() |
211 if out_negative_file != "-": | 371 if out_negative_file is not None: |
212 out_handle = open(out_negative_file, "wb") | 372 out_handle = open(out_negative_file, "wb") |
213 writer = SffWriter(out_handle, xml=manifest) | 373 writer = SffWriter(out_handle, xml=manifest) |
214 in_handle.seek(0) #start again | 374 in_handle.seek(0) #start again |
215 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids) | 375 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) |
216 out_handle.close() | 376 out_handle.close() |
217 #And we're done | 377 #And we're done |
218 in_handle.close() | 378 in_handle.close() |
219 #At the time of writing, Galaxy doesn't show SFF file read counts, | 379 #At the time of writing, Galaxy doesn't show SFF file read counts, |
220 #so it is useful to put them in stdout and thus shown in job info. | 380 #so it is useful to put them in stdout and thus shown in job info. |
221 print "%i with and %i without specified IDs" % (pos_count, neg_count) | 381 return pos_count, neg_count |
382 | |
383 | |
384 if seq_format.lower()=="sff": | |
385 # Now write filtered SFF file based on IDs wanted | |
386 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) | |
387 # At the time of writing, Galaxy doesn't show SFF file read counts, | |
388 # so it is useful to put them in stdout and thus shown in job info. | |
222 elif seq_format.lower()=="fasta": | 389 elif seq_format.lower()=="fasta": |
223 #Write filtered FASTA file based on IDs from tabular file | 390 # Write filtered FASTA file based on IDs from tabular file |
224 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) | 391 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) |
225 print "%i with and %i without specified IDs" % (pos_count, neg_count) | 392 print "%i with and %i without specified IDs" % (pos_count, neg_count) |
226 elif seq_format.lower().startswith("fastq"): | 393 elif seq_format.lower().startswith("fastq"): |
227 #Write filtered FASTQ file based on IDs from tabular file | 394 # Write filtered FASTQ file based on IDs from tabular file |
228 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | 395 fastq_filter(in_file, out_positive_file, out_negative_file, ids) |
229 reader = fastqReader(open(in_file, "rU")) | 396 # This does not currently track the counts |
230 if out_positive_file != "-" and out_negative_file != "-": | |
231 print "Generating two FASTQ files" | |
232 positive_writer = fastqWriter(open(out_positive_file, "w")) | |
233 negative_writer = fastqWriter(open(out_negative_file, "w")) | |
234 for record in reader: | |
235 #The [1:] is because the fastaReader leaves the > on the identifier. | |
236 if record.identifier and record.identifier.split()[0][1:] in ids: | |
237 positive_writer.write(record) | |
238 else: | |
239 negative_writer.write(record) | |
240 positive_writer.close() | |
241 negative_writer.close() | |
242 elif out_positive_file != "-": | |
243 print "Generating matching FASTQ file" | |
244 positive_writer = fastqWriter(open(out_positive_file, "w")) | |
245 for record in reader: | |
246 #The [1:] is because the fastaReader leaves the > on the identifier. | |
247 if record.identifier and record.identifier.split()[0][1:] in ids: | |
248 positive_writer.write(record) | |
249 positive_writer.close() | |
250 elif out_negative_file != "-": | |
251 print "Generating non-matching FASTQ file" | |
252 negative_writer = fastqWriter(open(out_negative_file, "w")) | |
253 for record in reader: | |
254 #The [1:] is because the fastaReader leaves the > on the identifier. | |
255 if not record.identifier or record.identifier.split()[0][1:] not in ids: | |
256 negative_writer.write(record) | |
257 negative_writer.close() | |
258 reader.close() | |
259 else: | 397 else: |
260 stop_err("Unsupported file type %r" % seq_format) | 398 sys_exit("Unsupported file type %r" % seq_format) |