Mercurial > repos > peterjc > sample_seqs
annotate tools/sample_seqs/sample_seqs.py @ 6:31f5701cd2e9 draft
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
author | peterjc |
---|---|
date | Thu, 11 May 2017 07:24:38 -0400 |
parents | 6b71ad5d43fb |
children | 5f505ed46e16 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file. | |
3 | |
4 This tool is a short Python script which requires Biopython 1.62 or later | |
2 | 5 for sequence parsing. If you use this tool in scientific work leading to a |
0 | 6 publication, please cite the Biopython application note: |
7 | |
8 Cock et al 2009. Biopython: freely available Python tools for computational | |
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
11 | |
2 | 12 This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute |
0 | 13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
14 See accompanying text file for licence details (MIT license). | |
15 | |
2 | 16 Use -v or --version to get the version, -h or --help for help. |
0 | 17 """ |
18 import os | |
19 import sys | |
2 | 20 from optparse import OptionParser |
0 | 21 |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
22 # Parse Command Line |
2 | 23 usage = """Use as follows: |
24 | |
25 $ python sample_seqs.py [options] | |
26 | |
27 e.g. Sample 20% of the reads: | |
28 | |
29 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq | |
30 | |
31 This samples uniformly though the file, rather than at random, and therefore | |
32 should be reproducible. | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
33 |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
34 If you have interleaved paired reads, use the --interleaved switch. If |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
35 instead you have two matched files (one for each pair), run the two |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
36 twice with the same sampling options to make to matched smaller files. |
2 | 37 """ |
38 parser = OptionParser(usage=usage) | |
39 parser.add_option('-i', '--input', dest='input', | |
40 default=None, help='Input sequences filename', | |
41 metavar="FILE") | |
42 parser.add_option('-f', '--format', dest='format', | |
43 default=None, | |
44 help='Input sequence format (e.g. fasta, fastq, sff)') | |
45 parser.add_option('-o', '--output', dest='output', | |
46 default=None, help='Output sampled sequenced filename', | |
47 metavar="FILE") | |
48 parser.add_option('-p', '--percent', dest='percent', | |
49 default=None, | |
50 help='Take this percent of the reads') | |
51 parser.add_option('-n', '--everyn', dest='everyn', | |
52 default=None, | |
53 help='Take every N-th read') | |
54 parser.add_option('-c', '--count', dest='count', | |
55 default=None, | |
56 help='Take exactly N reads') | |
57 parser.add_option("--interleaved", dest="interleaved", | |
58 default=False, action="store_true", | |
59 help="Input is interleaved reads, preserve the pairings") | |
60 parser.add_option("-v", "--version", dest="version", | |
61 default=False, action="store_true", | |
62 help="Show version and quit") | |
63 options, args = parser.parse_args() | |
64 | |
65 if options.version: | |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
66 print("v0.2.4") |
0 | 67 sys.exit(0) |
68 | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
69 try: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
70 from Bio import SeqIO |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
71 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
72 from Bio.SeqIO.FastaIO import SimpleFastaParser |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
73 from Bio.SeqIO.SffIO import SffIterator, SffWriter |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
74 except ImportError: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
75 sys.exit("This script requires Biopython.") |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
76 |
2 | 77 in_file = options.input |
78 out_file = options.output | |
79 interleaved = options.interleaved | |
80 | |
81 if not in_file: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
82 sys.exit("Require an input filename") |
0 | 83 if in_file != "/dev/stdin" and not os.path.isfile(in_file): |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
84 sys.exit("Missing input file %r" % in_file) |
2 | 85 if not out_file: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
86 sys.exit("Require an output filename") |
2 | 87 if not options.format: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
88 sys.exit("Require the sequence format") |
2 | 89 seq_format = options.format.lower() |
90 | |
91 | |
92 def count_fasta(filename): | |
93 count = 0 | |
94 with open(filename) as handle: | |
95 for title, seq in SimpleFastaParser(handle): | |
96 count += 1 | |
97 return count | |
98 | |
99 | |
100 def count_fastq(filename): | |
101 count = 0 | |
102 with open(filename) as handle: | |
103 for title, seq, qual in FastqGeneralIterator(handle): | |
104 count += 1 | |
105 return count | |
106 | |
0 | 107 |
2 | 108 def count_sff(filename): |
109 # If the SFF file has a built in index (which is normal), | |
110 # this will be parsed and is the quicker than scanning | |
111 # the whole file. | |
112 return len(SeqIO.index(filename, "sff")) | |
113 | |
114 | |
115 def count_sequences(filename, format): | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
116 if format == "sff": |
2 | 117 return count_sff(filename) |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
118 elif format == "fasta": |
2 | 119 return count_fasta(filename) |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
120 elif format.startswith("fastq"): |
2 | 121 return count_fastq(filename) |
122 else: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
123 sys.exit("Unsupported file type %r" % format) |
2 | 124 |
125 | |
126 if options.percent and options.everyn: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
127 sys.exit("Cannot combine -p and -n options") |
2 | 128 elif options.everyn and options.count: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
129 sys.exit("Cannot combine -p and -c options") |
2 | 130 elif options.percent and options.count: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
131 sys.exit("Cannot combine -n and -c options") |
2 | 132 elif options.everyn: |
0 | 133 try: |
2 | 134 N = int(options.everyn) |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
135 except ValueError: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
136 sys.exit("Bad -n argument %r" % options.everyn) |
0 | 137 if N < 2: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
138 sys.exit("Bad -n argument %r" % options.everyn) |
0 | 139 if (N % 10) == 1: |
140 sys.stderr.write("Sampling every %ist sequence\n" % N) | |
141 elif (N % 10) == 2: | |
142 sys.stderr.write("Sampling every %ind sequence\n" % N) | |
143 elif (N % 10) == 3: | |
144 sys.stderr.write("Sampling every %ird sequence\n" % N) | |
145 else: | |
146 sys.stderr.write("Sampling every %ith sequence\n" % N) | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
147 |
0 | 148 def sampler(iterator): |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
149 """Sample every Nth sequence.""" |
0 | 150 global N |
151 count = 0 | |
152 for record in iterator: | |
153 count += 1 | |
154 if count % N == 1: | |
155 yield record | |
2 | 156 elif options.percent: |
0 | 157 try: |
2 | 158 percent = float(options.percent) / 100.0 |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
159 except ValueError: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
160 sys.exit("Bad -p percent argument %r" % options.percent) |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
161 if not(0.0 <= percent <= 1.0): |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
162 sys.exit("Bad -p percent argument %r" % options.percent) |
0 | 163 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
164 |
0 | 165 def sampler(iterator): |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
166 """Sample given percentage of sequences.""" |
0 | 167 global percent |
168 count = 0 | |
169 taken = 0 | |
170 for record in iterator: | |
171 count += 1 | |
172 if percent * count > taken: | |
173 taken += 1 | |
174 yield record | |
2 | 175 elif options.count: |
176 try: | |
177 N = int(options.count) | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
178 except ValueError: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
179 sys.exit("Bad -c count argument %r" % options.count) |
2 | 180 if N < 1: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
181 sys.exit("Bad -c count argument %r" % options.count) |
2 | 182 total = count_sequences(in_file, seq_format) |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
183 sys.stderr.write("Input file has %i sequences\n" % total) |
2 | 184 if interleaved: |
185 # Paired | |
186 if total % 2: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
187 sys.exit("Paired mode, but input file has an odd number of sequences: %i" |
2 | 188 % total) |
189 elif N > total // 2: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
190 sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." |
2 | 191 % (N, total // 2, total)) |
192 total = total // 2 | |
193 if N == 1: | |
194 sys.stderr.write("Sampling just first sequence pair!\n") | |
195 elif N == total: | |
196 sys.stderr.write("Taking all the sequence pairs\n") | |
197 else: | |
198 sys.stderr.write("Sampling %i sequence pairs\n" % N) | |
199 else: | |
200 # Not paired | |
201 if total < N: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
202 sys.exit("Requested %i sequences, but file only has %i." % (N, total)) |
2 | 203 if N == 1: |
204 sys.stderr.write("Sampling just first sequence!\n") | |
205 elif N == total: | |
206 sys.stderr.write("Taking all the sequences\n") | |
207 else: | |
208 sys.stderr.write("Sampling %i sequences\n" % N) | |
209 if N == total: | |
210 def sampler(iterator): | |
211 """Dummy filter to filter nothing, taking everything.""" | |
212 global N | |
213 taken = 0 | |
214 for record in iterator: | |
215 taken += 1 | |
216 yield record | |
217 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
218 else: | |
219 def sampler(iterator): | |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
220 """Sample given number of sequences.""" |
2 | 221 # Mimic the percentage sampler, with double check on final count |
222 global N, total | |
223 # Do we need a floating point fudge factor epsilon? | |
224 # i.e. What if percentage comes out slighty too low, and | |
225 # we could end up missing last few desired sequences? | |
226 percentage = float(N) / float(total) | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
227 # print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" |
2 | 228 # % (N, total, percentage * 100.0)) |
229 count = 0 | |
230 taken = 0 | |
231 for record in iterator: | |
232 count += 1 | |
233 # Do we need the extra upper bound? | |
234 if percentage * count > taken and taken < N: | |
235 taken += 1 | |
236 yield record | |
237 elif total - count + 1 <= N - taken: | |
238 # remaining records (incuding this one) <= what we still need. | |
239 # This is a safey check for floating point edge cases where | |
240 # we need to take all remaining sequences to meet target | |
241 taken += 1 | |
242 yield record | |
243 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
0 | 244 else: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
245 sys.exit("Must use either -n, -p or -c") |
2 | 246 |
247 | |
248 def pair(iterator): | |
249 """Quick and dirty pair batched iterator.""" | |
250 while True: | |
251 a = next(iterator) | |
252 b = next(iterator) | |
253 if not b: | |
254 assert not a, "Odd number of records?" | |
255 break | |
256 yield (a, b) | |
257 | |
0 | 258 |
259 def raw_fasta_iterator(handle): | |
260 """Yields raw FASTA records as multi-line strings.""" | |
261 while True: | |
262 line = handle.readline() | |
263 if line == "": | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
264 return # Premature end of file, or just empty? |
0 | 265 if line[0] == ">": |
266 break | |
267 | |
268 no_id_warned = False | |
269 while True: | |
270 if line[0] != ">": | |
271 raise ValueError( | |
272 "Records in Fasta files should start with '>' character") | |
273 try: | |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
274 line[1:].split(None, 1)[0] |
0 | 275 except IndexError: |
276 if not no_id_warned: | |
277 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") | |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
278 no_id_warned = True |
0 | 279 lines = [line] |
280 line = handle.readline() | |
281 while True: | |
282 if not line: | |
283 break | |
284 if line[0] == ">": | |
285 break | |
286 lines.append(line) | |
287 line = handle.readline() | |
288 yield "".join(lines) | |
289 if not line: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
290 return # StopIteration |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
291 |
0 | 292 |
2 | 293 def fasta_filter(in_file, out_file, iterator_filter, inter): |
0 | 294 count = 0 |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
295 # Galaxy now requires Python 2.5+ so can use with statements, |
0 | 296 with open(in_file) as in_handle: |
297 with open(out_file, "w") as pos_handle: | |
2 | 298 if inter: |
299 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): | |
300 count += 1 | |
301 pos_handle.write(r1) | |
302 pos_handle.write(r2) | |
303 else: | |
304 for record in iterator_filter(raw_fasta_iterator(in_handle)): | |
305 count += 1 | |
306 pos_handle.write(record) | |
0 | 307 return count |
308 | |
2 | 309 |
310 def fastq_filter(in_file, out_file, iterator_filter, inter): | |
311 count = 0 | |
312 with open(in_file) as in_handle: | |
313 with open(out_file, "w") as pos_handle: | |
314 if inter: | |
315 for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))): | |
316 count += 1 | |
317 pos_handle.write("@%s\n%s\n+\n%s\n" % r1) | |
318 pos_handle.write("@%s\n%s\n+\n%s\n" % r2) | |
319 else: | |
0 | 320 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): |
321 count += 1 | |
322 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
2 | 323 return count |
0 | 324 |
2 | 325 |
326 def sff_filter(in_file, out_file, iterator_filter, inter): | |
0 | 327 count = 0 |
328 try: | |
329 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
330 except ImportError: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
331 # Prior to Biopython 1.56 this was a private function |
0 | 332 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
333 with open(in_file, "rb") as in_handle: | |
334 try: | |
335 manifest = ReadRocheXmlManifest(in_handle) | |
336 except ValueError: | |
337 manifest = None | |
338 in_handle.seek(0) | |
339 with open(out_file, "wb") as out_handle: | |
340 writer = SffWriter(out_handle, xml=manifest) | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
341 in_handle.seek(0) # start again after getting manifest |
2 | 342 if inter: |
343 from itertools import chain | |
344 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) | |
345 assert count % 2 == 0, "Odd number of records? %i" % count | |
346 count /= 2 | |
347 else: | |
348 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | |
0 | 349 return count |
350 | |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
351 |
2 | 352 if seq_format == "sff": |
353 count = sff_filter(in_file, out_file, sampler, interleaved) | |
354 elif seq_format == "fasta": | |
355 count = fasta_filter(in_file, out_file, sampler, interleaved) | |
356 elif seq_format.startswith("fastq"): | |
357 count = fastq_filter(in_file, out_file, sampler, interleaved) | |
0 | 358 else: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
359 sys.exit("Unsupported file type %r" % seq_format) |
0 | 360 |
2 | 361 if interleaved: |
362 sys.stderr.write("Selected %i pairs\n" % count) | |
363 else: | |
364 sys.stderr.write("Selected %i records\n" % count) |