0
|
1 #!/usr/bin/env python
|
|
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file.
|
|
3
|
|
4 This tool is a short Python script which requires Biopython 1.62 or later
|
2
|
5 for sequence parsing. If you use this tool in scientific work leading to a
|
0
|
6 publication, please cite the Biopython application note:
|
|
7
|
|
8 Cock et al 2009. Biopython: freely available Python tools for computational
|
|
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
|
|
10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
|
|
11
|
2
|
12 This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute
|
0
|
13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
|
|
14 See accompanying text file for licence details (MIT license).
|
|
15
|
2
|
16 Use -v or --version to get the version, -h or --help for help.
|
0
|
17 """
|
|
18 import os
|
|
19 import sys
|
2
|
20 from optparse import OptionParser
|
0
|
21
|
2
|
22
|
|
23 def sys_exit(msg, err=1):
|
0
|
24 sys.stderr.write(msg.rstrip() + "\n")
|
|
25 sys.exit(err)
|
|
26
|
2
|
27 #Parse Command Line
|
|
28 usage = """Use as follows:
|
|
29
|
|
30 $ python sample_seqs.py [options]
|
|
31
|
|
32 e.g. Sample 20% of the reads:
|
|
33
|
|
34 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq
|
|
35
|
|
36 This samples uniformly though the file, rather than at random, and therefore
|
|
37 should be reproducible.
|
|
38 """
|
|
39 parser = OptionParser(usage=usage)
|
|
40 parser.add_option('-i', '--input', dest='input',
|
|
41 default=None, help='Input sequences filename',
|
|
42 metavar="FILE")
|
|
43 parser.add_option('-f', '--format', dest='format',
|
|
44 default=None,
|
|
45 help='Input sequence format (e.g. fasta, fastq, sff)')
|
|
46 parser.add_option('-o', '--output', dest='output',
|
|
47 default=None, help='Output sampled sequenced filename',
|
|
48 metavar="FILE")
|
|
49 parser.add_option('-p', '--percent', dest='percent',
|
|
50 default=None,
|
|
51 help='Take this percent of the reads')
|
|
52 parser.add_option('-n', '--everyn', dest='everyn',
|
|
53 default=None,
|
|
54 help='Take every N-th read')
|
|
55 parser.add_option('-c', '--count', dest='count',
|
|
56 default=None,
|
|
57 help='Take exactly N reads')
|
|
58 parser.add_option("--interleaved", dest="interleaved",
|
|
59 default=False, action="store_true",
|
|
60 help="Input is interleaved reads, preserve the pairings")
|
|
61 parser.add_option("-v", "--version", dest="version",
|
|
62 default=False, action="store_true",
|
|
63 help="Show version and quit")
|
|
64 options, args = parser.parse_args()
|
|
65
|
|
66 if options.version:
|
|
67 print("v0.2.0")
|
0
|
68 sys.exit(0)
|
|
69
|
2
|
70 in_file = options.input
|
|
71 out_file = options.output
|
|
72 interleaved = options.interleaved
|
|
73
|
|
74 if not in_file:
|
|
75 sys_exit("Require an input filename")
|
0
|
76 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
|
2
|
77 sys_exit("Missing input file %r" % in_file)
|
|
78 if not out_file:
|
|
79 sys_exit("Require an output filename")
|
|
80 if not options.format:
|
|
81 sys_exit("Require the sequence format")
|
|
82 seq_format = options.format.lower()
|
|
83
|
|
84
|
|
85 def count_fasta(filename):
|
|
86 from Bio.SeqIO.FastaIO import SimpleFastaParser
|
|
87 count = 0
|
|
88 with open(filename) as handle:
|
|
89 for title, seq in SimpleFastaParser(handle):
|
|
90 count += 1
|
|
91 return count
|
|
92
|
|
93
|
|
94 def count_fastq(filename):
|
|
95 from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
|
96 count = 0
|
|
97 with open(filename) as handle:
|
|
98 for title, seq, qual in FastqGeneralIterator(handle):
|
|
99 count += 1
|
|
100 return count
|
|
101
|
0
|
102
|
2
|
103 def count_sff(filename):
|
|
104 from Bio import SeqIO
|
|
105 # If the SFF file has a built in index (which is normal),
|
|
106 # this will be parsed and is the quicker than scanning
|
|
107 # the whole file.
|
|
108 return len(SeqIO.index(filename, "sff"))
|
|
109
|
|
110
|
|
111 def count_sequences(filename, format):
|
|
112 if seq_format == "sff":
|
|
113 return count_sff(filename)
|
|
114 elif seq_format == "fasta":
|
|
115 return count_fasta(filename)
|
|
116 elif seq_format.startswith("fastq"):
|
|
117 return count_fastq(filename)
|
|
118 else:
|
|
119 sys_exit("Unsupported file type %r" % seq_format)
|
|
120
|
|
121
|
|
122 if options.percent and options.everyn:
|
|
123 sys_exit("Cannot combine -p and -n options")
|
|
124 elif options.everyn and options.count:
|
|
125 sys_exit("Cannot combine -p and -c options")
|
|
126 elif options.percent and options.count:
|
|
127 sys_exit("Cannot combine -n and -c options")
|
|
128 elif options.everyn:
|
0
|
129 try:
|
2
|
130 N = int(options.everyn)
|
0
|
131 except:
|
2
|
132 sys_exit("Bad -n argument %r" % options.everyn)
|
0
|
133 if N < 2:
|
2
|
134 sys_exit("Bad -n argument %r" % options.everyn)
|
0
|
135 if (N % 10) == 1:
|
|
136 sys.stderr.write("Sampling every %ist sequence\n" % N)
|
|
137 elif (N % 10) == 2:
|
|
138 sys.stderr.write("Sampling every %ind sequence\n" % N)
|
|
139 elif (N % 10) == 3:
|
|
140 sys.stderr.write("Sampling every %ird sequence\n" % N)
|
|
141 else:
|
|
142 sys.stderr.write("Sampling every %ith sequence\n" % N)
|
|
143 def sampler(iterator):
|
|
144 global N
|
|
145 count = 0
|
|
146 for record in iterator:
|
|
147 count += 1
|
|
148 if count % N == 1:
|
|
149 yield record
|
2
|
150 elif options.percent:
|
0
|
151 try:
|
2
|
152 percent = float(options.percent) / 100.0
|
0
|
153 except:
|
2
|
154 sys_exit("Bad -p percent argument %r" % options.percent)
|
0
|
155 if percent <= 0.0 or 1.0 <= percent:
|
2
|
156 sys_exit("Bad -p percent argument %r" % options.percent)
|
0
|
157 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
|
|
158 def sampler(iterator):
|
|
159 global percent
|
|
160 count = 0
|
|
161 taken = 0
|
|
162 for record in iterator:
|
|
163 count += 1
|
|
164 if percent * count > taken:
|
|
165 taken += 1
|
|
166 yield record
|
2
|
167 elif options.count:
|
|
168 try:
|
|
169 N = int(options.count)
|
|
170 except:
|
|
171 sys_exit("Bad -c count argument %r" % options.count)
|
|
172 if N < 1:
|
|
173 sys_exit("Bad -c count argument %r" % options.count)
|
|
174 total = count_sequences(in_file, seq_format)
|
|
175 print("Input file has %i sequences" % total)
|
|
176 if interleaved:
|
|
177 # Paired
|
|
178 if total % 2:
|
|
179 sys_exit("Paired mode, but input file has an odd number of sequences: %i"
|
|
180 % total)
|
|
181 elif N > total // 2:
|
|
182 sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
|
|
183 % (N, total // 2, total))
|
|
184 total = total // 2
|
|
185 if N == 1:
|
|
186 sys.stderr.write("Sampling just first sequence pair!\n")
|
|
187 elif N == total:
|
|
188 sys.stderr.write("Taking all the sequence pairs\n")
|
|
189 else:
|
|
190 sys.stderr.write("Sampling %i sequence pairs\n" % N)
|
|
191 else:
|
|
192 # Not paired
|
|
193 if total < N:
|
|
194 sys_exit("Requested %i sequences, but file only has %i." % (N, total))
|
|
195 if N == 1:
|
|
196 sys.stderr.write("Sampling just first sequence!\n")
|
|
197 elif N == total:
|
|
198 sys.stderr.write("Taking all the sequences\n")
|
|
199 else:
|
|
200 sys.stderr.write("Sampling %i sequences\n" % N)
|
|
201 if N == total:
|
|
202 def sampler(iterator):
|
|
203 """Dummy filter to filter nothing, taking everything."""
|
|
204 global N
|
|
205 taken = 0
|
|
206 for record in iterator:
|
|
207 taken += 1
|
|
208 yield record
|
|
209 assert taken == N, "Picked %i, wanted %i" % (taken, N)
|
|
210 else:
|
|
211 def sampler(iterator):
|
|
212 # Mimic the percentage sampler, with double check on final count
|
|
213 global N, total
|
|
214 # Do we need a floating point fudge factor epsilon?
|
|
215 # i.e. What if percentage comes out slighty too low, and
|
|
216 # we could end up missing last few desired sequences?
|
|
217 percentage = float(N) / float(total)
|
|
218 #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f"
|
|
219 # % (N, total, percentage * 100.0))
|
|
220 count = 0
|
|
221 taken = 0
|
|
222 for record in iterator:
|
|
223 count += 1
|
|
224 # Do we need the extra upper bound?
|
|
225 if percentage * count > taken and taken < N:
|
|
226 taken += 1
|
|
227 yield record
|
|
228 elif total - count + 1 <= N - taken:
|
|
229 # remaining records (incuding this one) <= what we still need.
|
|
230 # This is a safey check for floating point edge cases where
|
|
231 # we need to take all remaining sequences to meet target
|
|
232 taken += 1
|
|
233 yield record
|
|
234 assert taken == N, "Picked %i, wanted %i" % (taken, N)
|
0
|
235 else:
|
2
|
236 sys_exit("Must use either -n, -p or -c")
|
|
237
|
|
238
|
|
239 def pair(iterator):
|
|
240 """Quick and dirty pair batched iterator."""
|
|
241 while True:
|
|
242 a = next(iterator)
|
|
243 b = next(iterator)
|
|
244 if not b:
|
|
245 assert not a, "Odd number of records?"
|
|
246 break
|
|
247 yield (a, b)
|
|
248
|
0
|
249
|
|
250 def raw_fasta_iterator(handle):
|
|
251 """Yields raw FASTA records as multi-line strings."""
|
|
252 while True:
|
|
253 line = handle.readline()
|
|
254 if line == "":
|
|
255 return # Premature end of file, or just empty?
|
|
256 if line[0] == ">":
|
|
257 break
|
|
258
|
|
259 no_id_warned = False
|
|
260 while True:
|
|
261 if line[0] != ">":
|
|
262 raise ValueError(
|
|
263 "Records in Fasta files should start with '>' character")
|
|
264 try:
|
|
265 id = line[1:].split(None, 1)[0]
|
|
266 except IndexError:
|
|
267 if not no_id_warned:
|
|
268 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
|
|
269 no_id_warned = True
|
|
270 id = None
|
|
271 lines = [line]
|
|
272 line = handle.readline()
|
|
273 while True:
|
|
274 if not line:
|
|
275 break
|
|
276 if line[0] == ">":
|
|
277 break
|
|
278 lines.append(line)
|
|
279 line = handle.readline()
|
|
280 yield "".join(lines)
|
|
281 if not line:
|
|
282 return # StopIteration
|
|
283
|
2
|
284 def fasta_filter(in_file, out_file, iterator_filter, inter):
|
0
|
285 count = 0
|
|
286 #Galaxy now requires Python 2.5+ so can use with statements,
|
|
287 with open(in_file) as in_handle:
|
|
288 with open(out_file, "w") as pos_handle:
|
2
|
289 if inter:
|
|
290 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))):
|
|
291 count += 1
|
|
292 pos_handle.write(r1)
|
|
293 pos_handle.write(r2)
|
|
294 else:
|
|
295 for record in iterator_filter(raw_fasta_iterator(in_handle)):
|
|
296 count += 1
|
|
297 pos_handle.write(record)
|
0
|
298 return count
|
|
299
|
2
|
300
|
|
301 from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
|
302 def fastq_filter(in_file, out_file, iterator_filter, inter):
|
|
303 count = 0
|
|
304 with open(in_file) as in_handle:
|
|
305 with open(out_file, "w") as pos_handle:
|
|
306 if inter:
|
|
307 for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))):
|
|
308 count += 1
|
|
309 pos_handle.write("@%s\n%s\n+\n%s\n" % r1)
|
|
310 pos_handle.write("@%s\n%s\n+\n%s\n" % r2)
|
|
311 else:
|
0
|
312 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
|
|
313 count += 1
|
|
314 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
|
2
|
315 return count
|
0
|
316
|
2
|
317
|
|
318 def sff_filter(in_file, out_file, iterator_filter, inter):
|
0
|
319 count = 0
|
|
320 try:
|
|
321 from Bio.SeqIO.SffIO import SffIterator, SffWriter
|
|
322 except ImportError:
|
2
|
323 sys_exit("SFF filtering requires Biopython 1.54 or later")
|
0
|
324 try:
|
|
325 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
|
|
326 except ImportError:
|
|
327 #Prior to Biopython 1.56 this was a private function
|
|
328 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
|
|
329 with open(in_file, "rb") as in_handle:
|
|
330 try:
|
|
331 manifest = ReadRocheXmlManifest(in_handle)
|
|
332 except ValueError:
|
|
333 manifest = None
|
|
334 in_handle.seek(0)
|
|
335 with open(out_file, "wb") as out_handle:
|
|
336 writer = SffWriter(out_handle, xml=manifest)
|
|
337 in_handle.seek(0) #start again after getting manifest
|
2
|
338 if inter:
|
|
339 from itertools import chain
|
|
340 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
|
|
341 assert count % 2 == 0, "Odd number of records? %i" % count
|
|
342 count /= 2
|
|
343 else:
|
|
344 count = writer.write_file(iterator_filter(SffIterator(in_handle)))
|
|
345 #count = writer.write_file(SffIterator(in_handle))
|
0
|
346 return count
|
|
347
|
2
|
348 if seq_format == "sff":
|
|
349 count = sff_filter(in_file, out_file, sampler, interleaved)
|
|
350 elif seq_format == "fasta":
|
|
351 count = fasta_filter(in_file, out_file, sampler, interleaved)
|
|
352 elif seq_format.startswith("fastq"):
|
|
353 count = fastq_filter(in_file, out_file, sampler, interleaved)
|
0
|
354 else:
|
2
|
355 sys_exit("Unsupported file type %r" % seq_format)
|
0
|
356
|
2
|
357 if interleaved:
|
|
358 sys.stderr.write("Selected %i pairs\n" % count)
|
|
359 else:
|
|
360 sys.stderr.write("Selected %i records\n" % count)
|