Mercurial > repos > peterjc > sample_seqs
annotate tools/sample_seqs/sample_seqs.py @ 3:02c13ef1a669 draft
Uploaded v0.2.1, fixed missing test file, more tests.
author | peterjc |
---|---|
date | Fri, 27 Mar 2015 09:34:27 -0400 |
parents | da64f6a9e32b |
children | 6b71ad5d43fb |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file. | |
3 | |
4 This tool is a short Python script which requires Biopython 1.62 or later | |
2 | 5 for sequence parsing. If you use this tool in scientific work leading to a |
0 | 6 publication, please cite the Biopython application note: |
7 | |
8 Cock et al 2009. Biopython: freely available Python tools for computational | |
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
11 | |
2 | 12 This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute |
0 | 13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
14 See accompanying text file for licence details (MIT license). | |
15 | |
2 | 16 Use -v or --version to get the version, -h or --help for help. |
0 | 17 """ |
18 import os | |
19 import sys | |
2 | 20 from optparse import OptionParser |
0 | 21 |
2 | 22 |
23 def sys_exit(msg, err=1): | |
0 | 24 sys.stderr.write(msg.rstrip() + "\n") |
25 sys.exit(err) | |
26 | |
2 | 27 #Parse Command Line |
28 usage = """Use as follows: | |
29 | |
30 $ python sample_seqs.py [options] | |
31 | |
32 e.g. Sample 20% of the reads: | |
33 | |
34 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq | |
35 | |
36 This samples uniformly though the file, rather than at random, and therefore | |
37 should be reproducible. | |
38 """ | |
39 parser = OptionParser(usage=usage) | |
40 parser.add_option('-i', '--input', dest='input', | |
41 default=None, help='Input sequences filename', | |
42 metavar="FILE") | |
43 parser.add_option('-f', '--format', dest='format', | |
44 default=None, | |
45 help='Input sequence format (e.g. fasta, fastq, sff)') | |
46 parser.add_option('-o', '--output', dest='output', | |
47 default=None, help='Output sampled sequenced filename', | |
48 metavar="FILE") | |
49 parser.add_option('-p', '--percent', dest='percent', | |
50 default=None, | |
51 help='Take this percent of the reads') | |
52 parser.add_option('-n', '--everyn', dest='everyn', | |
53 default=None, | |
54 help='Take every N-th read') | |
55 parser.add_option('-c', '--count', dest='count', | |
56 default=None, | |
57 help='Take exactly N reads') | |
58 parser.add_option("--interleaved", dest="interleaved", | |
59 default=False, action="store_true", | |
60 help="Input is interleaved reads, preserve the pairings") | |
61 parser.add_option("-v", "--version", dest="version", | |
62 default=False, action="store_true", | |
63 help="Show version and quit") | |
64 options, args = parser.parse_args() | |
65 | |
66 if options.version: | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
67 print("v0.2.1") |
0 | 68 sys.exit(0) |
69 | |
2 | 70 in_file = options.input |
71 out_file = options.output | |
72 interleaved = options.interleaved | |
73 | |
74 if not in_file: | |
75 sys_exit("Require an input filename") | |
0 | 76 if in_file != "/dev/stdin" and not os.path.isfile(in_file): |
2 | 77 sys_exit("Missing input file %r" % in_file) |
78 if not out_file: | |
79 sys_exit("Require an output filename") | |
80 if not options.format: | |
81 sys_exit("Require the sequence format") | |
82 seq_format = options.format.lower() | |
83 | |
84 | |
85 def count_fasta(filename): | |
86 from Bio.SeqIO.FastaIO import SimpleFastaParser | |
87 count = 0 | |
88 with open(filename) as handle: | |
89 for title, seq in SimpleFastaParser(handle): | |
90 count += 1 | |
91 return count | |
92 | |
93 | |
94 def count_fastq(filename): | |
95 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
96 count = 0 | |
97 with open(filename) as handle: | |
98 for title, seq, qual in FastqGeneralIterator(handle): | |
99 count += 1 | |
100 return count | |
101 | |
0 | 102 |
2 | 103 def count_sff(filename): |
104 from Bio import SeqIO | |
105 # If the SFF file has a built in index (which is normal), | |
106 # this will be parsed and is the quicker than scanning | |
107 # the whole file. | |
108 return len(SeqIO.index(filename, "sff")) | |
109 | |
110 | |
111 def count_sequences(filename, format): | |
112 if seq_format == "sff": | |
113 return count_sff(filename) | |
114 elif seq_format == "fasta": | |
115 return count_fasta(filename) | |
116 elif seq_format.startswith("fastq"): | |
117 return count_fastq(filename) | |
118 else: | |
119 sys_exit("Unsupported file type %r" % seq_format) | |
120 | |
121 | |
122 if options.percent and options.everyn: | |
123 sys_exit("Cannot combine -p and -n options") | |
124 elif options.everyn and options.count: | |
125 sys_exit("Cannot combine -p and -c options") | |
126 elif options.percent and options.count: | |
127 sys_exit("Cannot combine -n and -c options") | |
128 elif options.everyn: | |
0 | 129 try: |
2 | 130 N = int(options.everyn) |
0 | 131 except: |
2 | 132 sys_exit("Bad -n argument %r" % options.everyn) |
0 | 133 if N < 2: |
2 | 134 sys_exit("Bad -n argument %r" % options.everyn) |
0 | 135 if (N % 10) == 1: |
136 sys.stderr.write("Sampling every %ist sequence\n" % N) | |
137 elif (N % 10) == 2: | |
138 sys.stderr.write("Sampling every %ind sequence\n" % N) | |
139 elif (N % 10) == 3: | |
140 sys.stderr.write("Sampling every %ird sequence\n" % N) | |
141 else: | |
142 sys.stderr.write("Sampling every %ith sequence\n" % N) | |
143 def sampler(iterator): | |
144 global N | |
145 count = 0 | |
146 for record in iterator: | |
147 count += 1 | |
148 if count % N == 1: | |
149 yield record | |
2 | 150 elif options.percent: |
0 | 151 try: |
2 | 152 percent = float(options.percent) / 100.0 |
0 | 153 except: |
2 | 154 sys_exit("Bad -p percent argument %r" % options.percent) |
0 | 155 if percent <= 0.0 or 1.0 <= percent: |
2 | 156 sys_exit("Bad -p percent argument %r" % options.percent) |
0 | 157 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) |
158 def sampler(iterator): | |
159 global percent | |
160 count = 0 | |
161 taken = 0 | |
162 for record in iterator: | |
163 count += 1 | |
164 if percent * count > taken: | |
165 taken += 1 | |
166 yield record | |
2 | 167 elif options.count: |
168 try: | |
169 N = int(options.count) | |
170 except: | |
171 sys_exit("Bad -c count argument %r" % options.count) | |
172 if N < 1: | |
173 sys_exit("Bad -c count argument %r" % options.count) | |
174 total = count_sequences(in_file, seq_format) | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
175 sys.stderr.write("Input file has %i sequences\n" % total) |
2 | 176 if interleaved: |
177 # Paired | |
178 if total % 2: | |
179 sys_exit("Paired mode, but input file has an odd number of sequences: %i" | |
180 % total) | |
181 elif N > total // 2: | |
182 sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." | |
183 % (N, total // 2, total)) | |
184 total = total // 2 | |
185 if N == 1: | |
186 sys.stderr.write("Sampling just first sequence pair!\n") | |
187 elif N == total: | |
188 sys.stderr.write("Taking all the sequence pairs\n") | |
189 else: | |
190 sys.stderr.write("Sampling %i sequence pairs\n" % N) | |
191 else: | |
192 # Not paired | |
193 if total < N: | |
194 sys_exit("Requested %i sequences, but file only has %i." % (N, total)) | |
195 if N == 1: | |
196 sys.stderr.write("Sampling just first sequence!\n") | |
197 elif N == total: | |
198 sys.stderr.write("Taking all the sequences\n") | |
199 else: | |
200 sys.stderr.write("Sampling %i sequences\n" % N) | |
201 if N == total: | |
202 def sampler(iterator): | |
203 """Dummy filter to filter nothing, taking everything.""" | |
204 global N | |
205 taken = 0 | |
206 for record in iterator: | |
207 taken += 1 | |
208 yield record | |
209 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
210 else: | |
211 def sampler(iterator): | |
212 # Mimic the percentage sampler, with double check on final count | |
213 global N, total | |
214 # Do we need a floating point fudge factor epsilon? | |
215 # i.e. What if percentage comes out slighty too low, and | |
216 # we could end up missing last few desired sequences? | |
217 percentage = float(N) / float(total) | |
218 #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" | |
219 # % (N, total, percentage * 100.0)) | |
220 count = 0 | |
221 taken = 0 | |
222 for record in iterator: | |
223 count += 1 | |
224 # Do we need the extra upper bound? | |
225 if percentage * count > taken and taken < N: | |
226 taken += 1 | |
227 yield record | |
228 elif total - count + 1 <= N - taken: | |
229 # remaining records (incuding this one) <= what we still need. | |
230 # This is a safey check for floating point edge cases where | |
231 # we need to take all remaining sequences to meet target | |
232 taken += 1 | |
233 yield record | |
234 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
0 | 235 else: |
2 | 236 sys_exit("Must use either -n, -p or -c") |
237 | |
238 | |
239 def pair(iterator): | |
240 """Quick and dirty pair batched iterator.""" | |
241 while True: | |
242 a = next(iterator) | |
243 b = next(iterator) | |
244 if not b: | |
245 assert not a, "Odd number of records?" | |
246 break | |
247 yield (a, b) | |
248 | |
0 | 249 |
250 def raw_fasta_iterator(handle): | |
251 """Yields raw FASTA records as multi-line strings.""" | |
252 while True: | |
253 line = handle.readline() | |
254 if line == "": | |
255 return # Premature end of file, or just empty? | |
256 if line[0] == ">": | |
257 break | |
258 | |
259 no_id_warned = False | |
260 while True: | |
261 if line[0] != ">": | |
262 raise ValueError( | |
263 "Records in Fasta files should start with '>' character") | |
264 try: | |
265 id = line[1:].split(None, 1)[0] | |
266 except IndexError: | |
267 if not no_id_warned: | |
268 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") | |
269 no_id_warned = True | |
270 id = None | |
271 lines = [line] | |
272 line = handle.readline() | |
273 while True: | |
274 if not line: | |
275 break | |
276 if line[0] == ">": | |
277 break | |
278 lines.append(line) | |
279 line = handle.readline() | |
280 yield "".join(lines) | |
281 if not line: | |
282 return # StopIteration | |
283 | |
2 | 284 def fasta_filter(in_file, out_file, iterator_filter, inter): |
0 | 285 count = 0 |
286 #Galaxy now requires Python 2.5+ so can use with statements, | |
287 with open(in_file) as in_handle: | |
288 with open(out_file, "w") as pos_handle: | |
2 | 289 if inter: |
290 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): | |
291 count += 1 | |
292 pos_handle.write(r1) | |
293 pos_handle.write(r2) | |
294 else: | |
295 for record in iterator_filter(raw_fasta_iterator(in_handle)): | |
296 count += 1 | |
297 pos_handle.write(record) | |
0 | 298 return count |
299 | |
2 | 300 |
301 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
302 def fastq_filter(in_file, out_file, iterator_filter, inter): | |
303 count = 0 | |
304 with open(in_file) as in_handle: | |
305 with open(out_file, "w") as pos_handle: | |
306 if inter: | |
307 for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))): | |
308 count += 1 | |
309 pos_handle.write("@%s\n%s\n+\n%s\n" % r1) | |
310 pos_handle.write("@%s\n%s\n+\n%s\n" % r2) | |
311 else: | |
0 | 312 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): |
313 count += 1 | |
314 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
2 | 315 return count |
0 | 316 |
2 | 317 |
318 def sff_filter(in_file, out_file, iterator_filter, inter): | |
0 | 319 count = 0 |
320 try: | |
321 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
322 except ImportError: | |
2 | 323 sys_exit("SFF filtering requires Biopython 1.54 or later") |
0 | 324 try: |
325 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
326 except ImportError: | |
327 #Prior to Biopython 1.56 this was a private function | |
328 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | |
329 with open(in_file, "rb") as in_handle: | |
330 try: | |
331 manifest = ReadRocheXmlManifest(in_handle) | |
332 except ValueError: | |
333 manifest = None | |
334 in_handle.seek(0) | |
335 with open(out_file, "wb") as out_handle: | |
336 writer = SffWriter(out_handle, xml=manifest) | |
337 in_handle.seek(0) #start again after getting manifest | |
2 | 338 if inter: |
339 from itertools import chain | |
340 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) | |
341 assert count % 2 == 0, "Odd number of records? %i" % count | |
342 count /= 2 | |
343 else: | |
344 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | |
345 #count = writer.write_file(SffIterator(in_handle)) | |
0 | 346 return count |
347 | |
2 | 348 if seq_format == "sff": |
349 count = sff_filter(in_file, out_file, sampler, interleaved) | |
350 elif seq_format == "fasta": | |
351 count = fasta_filter(in_file, out_file, sampler, interleaved) | |
352 elif seq_format.startswith("fastq"): | |
353 count = fastq_filter(in_file, out_file, sampler, interleaved) | |
0 | 354 else: |
2 | 355 sys_exit("Unsupported file type %r" % seq_format) |
0 | 356 |
2 | 357 if interleaved: |
358 sys.stderr.write("Selected %i pairs\n" % count) | |
359 else: | |
360 sys.stderr.write("Selected %i records\n" % count) |