Mercurial > repos > peterjc > sample_seqs
annotate tools/sample_seqs/sample_seqs.py @ 9:5f505ed46e16 draft
Bump Biopython dependency
author | peterjc |
---|---|
date | Thu, 30 Nov 2023 09:50:46 +0000 |
parents | 31f5701cd2e9 |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file. | |
3 | |
4 This tool is a short Python script which requires Biopython 1.62 or later | |
2 | 5 for sequence parsing. If you use this tool in scientific work leading to a |
0 | 6 publication, please cite the Biopython application note: |
7 | |
8 Cock et al 2009. Biopython: freely available Python tools for computational | |
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
9 | 10 https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
0 | 11 |
9 | 12 This script is copyright 2014-2021 by Peter Cock, The James Hutton Institute |
0 | 13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
14 See accompanying text file for licence details (MIT license). | |
15 | |
2 | 16 Use -v or --version to get the version, -h or --help for help. |
0 | 17 """ |
18 import os | |
19 import sys | |
2 | 20 from optparse import OptionParser |
0 | 21 |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
22 # Parse Command Line |
2 | 23 usage = """Use as follows: |
24 | |
25 $ python sample_seqs.py [options] | |
26 | |
27 e.g. Sample 20% of the reads: | |
28 | |
29 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq | |
30 | |
31 This samples uniformly though the file, rather than at random, and therefore | |
32 should be reproducible. | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
33 |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
34 If you have interleaved paired reads, use the --interleaved switch. If |
9 | 35 instead you have two matched files (one for each pair), run this tool |
36 on each with the same sampling options to make two matched smaller files. | |
2 | 37 """ |
38 parser = OptionParser(usage=usage) | |
9 | 39 parser.add_option( |
40 "-i", | |
41 "--input", | |
42 dest="input", | |
43 default=None, | |
44 help="Input sequences filename", | |
45 metavar="FILE", | |
46 ) | |
47 parser.add_option( | |
48 "-f", | |
49 "--format", | |
50 dest="format", | |
51 default=None, | |
52 help="Input sequence format (e.g. fasta, fastq, sff)", | |
53 ) | |
54 parser.add_option( | |
55 "-o", | |
56 "--output", | |
57 dest="output", | |
58 default=None, | |
59 help="Output sampled sequenced filename", | |
60 metavar="FILE", | |
61 ) | |
62 parser.add_option( | |
63 "-p", | |
64 "--percent", | |
65 dest="percent", | |
66 default=None, | |
67 help="Take this percent of the reads", | |
68 ) | |
69 parser.add_option( | |
70 "-n", "--everyn", dest="everyn", default=None, help="Take every N-th read" | |
71 ) | |
72 parser.add_option( | |
73 "-c", "--count", dest="count", default=None, help="Take exactly N reads" | |
74 ) | |
75 parser.add_option( | |
76 "--interleaved", | |
77 dest="interleaved", | |
78 default=False, | |
79 action="store_true", | |
80 help="Input is interleaved reads, preserve the pairings", | |
81 ) | |
82 parser.add_option( | |
83 "-v", | |
84 "--version", | |
85 dest="version", | |
86 default=False, | |
87 action="store_true", | |
88 help="Show version and quit", | |
89 ) | |
2 | 90 options, args = parser.parse_args() |
91 | |
92 if options.version: | |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
93 print("v0.2.4") |
0 | 94 sys.exit(0) |
95 | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
96 try: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
97 from Bio import SeqIO |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
98 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
99 from Bio.SeqIO.FastaIO import SimpleFastaParser |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
100 from Bio.SeqIO.SffIO import SffIterator, SffWriter |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
101 except ImportError: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
102 sys.exit("This script requires Biopython.") |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
103 |
2 | 104 in_file = options.input |
105 out_file = options.output | |
106 interleaved = options.interleaved | |
107 | |
108 if not in_file: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
109 sys.exit("Require an input filename") |
0 | 110 if in_file != "/dev/stdin" and not os.path.isfile(in_file): |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
111 sys.exit("Missing input file %r" % in_file) |
2 | 112 if not out_file: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
113 sys.exit("Require an output filename") |
2 | 114 if not options.format: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
115 sys.exit("Require the sequence format") |
2 | 116 seq_format = options.format.lower() |
117 | |
118 | |
119 def count_fasta(filename): | |
120 count = 0 | |
121 with open(filename) as handle: | |
122 for title, seq in SimpleFastaParser(handle): | |
123 count += 1 | |
124 return count | |
125 | |
126 | |
127 def count_fastq(filename): | |
128 count = 0 | |
129 with open(filename) as handle: | |
130 for title, seq, qual in FastqGeneralIterator(handle): | |
131 count += 1 | |
132 return count | |
133 | |
0 | 134 |
2 | 135 def count_sff(filename): |
136 # If the SFF file has a built in index (which is normal), | |
137 # this will be parsed and is the quicker than scanning | |
138 # the whole file. | |
139 return len(SeqIO.index(filename, "sff")) | |
140 | |
141 | |
142 def count_sequences(filename, format): | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
143 if format == "sff": |
2 | 144 return count_sff(filename) |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
145 elif format == "fasta": |
2 | 146 return count_fasta(filename) |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
147 elif format.startswith("fastq"): |
2 | 148 return count_fastq(filename) |
149 else: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
150 sys.exit("Unsupported file type %r" % format) |
2 | 151 |
152 | |
153 if options.percent and options.everyn: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
154 sys.exit("Cannot combine -p and -n options") |
2 | 155 elif options.everyn and options.count: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
156 sys.exit("Cannot combine -p and -c options") |
2 | 157 elif options.percent and options.count: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
158 sys.exit("Cannot combine -n and -c options") |
2 | 159 elif options.everyn: |
0 | 160 try: |
2 | 161 N = int(options.everyn) |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
162 except ValueError: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
163 sys.exit("Bad -n argument %r" % options.everyn) |
0 | 164 if N < 2: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
165 sys.exit("Bad -n argument %r" % options.everyn) |
0 | 166 if (N % 10) == 1: |
167 sys.stderr.write("Sampling every %ist sequence\n" % N) | |
168 elif (N % 10) == 2: | |
169 sys.stderr.write("Sampling every %ind sequence\n" % N) | |
170 elif (N % 10) == 3: | |
171 sys.stderr.write("Sampling every %ird sequence\n" % N) | |
172 else: | |
173 sys.stderr.write("Sampling every %ith sequence\n" % N) | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
174 |
0 | 175 def sampler(iterator): |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
176 """Sample every Nth sequence.""" |
0 | 177 global N |
178 count = 0 | |
179 for record in iterator: | |
180 count += 1 | |
181 if count % N == 1: | |
182 yield record | |
9 | 183 |
2 | 184 elif options.percent: |
0 | 185 try: |
2 | 186 percent = float(options.percent) / 100.0 |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
187 except ValueError: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
188 sys.exit("Bad -p percent argument %r" % options.percent) |
9 | 189 if not (0.0 <= percent <= 1.0): |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
190 sys.exit("Bad -p percent argument %r" % options.percent) |
0 | 191 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
192 |
0 | 193 def sampler(iterator): |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
194 """Sample given percentage of sequences.""" |
0 | 195 global percent |
196 count = 0 | |
197 taken = 0 | |
198 for record in iterator: | |
199 count += 1 | |
200 if percent * count > taken: | |
201 taken += 1 | |
202 yield record | |
9 | 203 |
2 | 204 elif options.count: |
205 try: | |
206 N = int(options.count) | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
207 except ValueError: |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
208 sys.exit("Bad -c count argument %r" % options.count) |
2 | 209 if N < 1: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
210 sys.exit("Bad -c count argument %r" % options.count) |
2 | 211 total = count_sequences(in_file, seq_format) |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
212 sys.stderr.write("Input file has %i sequences\n" % total) |
2 | 213 if interleaved: |
214 # Paired | |
215 if total % 2: | |
9 | 216 sys.exit( |
217 "Paired mode, but input file has an odd number of sequences: %i" % total | |
218 ) | |
2 | 219 elif N > total // 2: |
9 | 220 sys.exit( |
221 "Requested %i sequence pairs, " | |
222 "but file only has %i pairs (%i sequences)." % (N, total // 2, total) | |
223 ) | |
2 | 224 total = total // 2 |
225 if N == 1: | |
226 sys.stderr.write("Sampling just first sequence pair!\n") | |
227 elif N == total: | |
228 sys.stderr.write("Taking all the sequence pairs\n") | |
229 else: | |
230 sys.stderr.write("Sampling %i sequence pairs\n" % N) | |
231 else: | |
232 # Not paired | |
233 if total < N: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
234 sys.exit("Requested %i sequences, but file only has %i." % (N, total)) |
2 | 235 if N == 1: |
236 sys.stderr.write("Sampling just first sequence!\n") | |
237 elif N == total: | |
238 sys.stderr.write("Taking all the sequences\n") | |
239 else: | |
240 sys.stderr.write("Sampling %i sequences\n" % N) | |
241 if N == total: | |
9 | 242 |
2 | 243 def sampler(iterator): |
9 | 244 """No-operation dummy filter, taking everything.""" |
2 | 245 global N |
246 taken = 0 | |
247 for record in iterator: | |
248 taken += 1 | |
249 yield record | |
250 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
9 | 251 |
2 | 252 else: |
9 | 253 |
2 | 254 def sampler(iterator): |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
255 """Sample given number of sequences.""" |
2 | 256 # Mimic the percentage sampler, with double check on final count |
257 global N, total | |
258 # Do we need a floating point fudge factor epsilon? | |
259 # i.e. What if percentage comes out slighty too low, and | |
260 # we could end up missing last few desired sequences? | |
261 percentage = float(N) / float(total) | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
262 # print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" |
2 | 263 # % (N, total, percentage * 100.0)) |
264 count = 0 | |
265 taken = 0 | |
266 for record in iterator: | |
267 count += 1 | |
268 # Do we need the extra upper bound? | |
269 if percentage * count > taken and taken < N: | |
270 taken += 1 | |
271 yield record | |
272 elif total - count + 1 <= N - taken: | |
273 # remaining records (incuding this one) <= what we still need. | |
274 # This is a safey check for floating point edge cases where | |
275 # we need to take all remaining sequences to meet target | |
276 taken += 1 | |
277 yield record | |
278 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
9 | 279 |
0 | 280 else: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
281 sys.exit("Must use either -n, -p or -c") |
2 | 282 |
283 | |
284 def pair(iterator): | |
285 """Quick and dirty pair batched iterator.""" | |
286 while True: | |
287 a = next(iterator) | |
288 b = next(iterator) | |
289 if not b: | |
290 assert not a, "Odd number of records?" | |
291 break | |
292 yield (a, b) | |
293 | |
0 | 294 |
295 def raw_fasta_iterator(handle): | |
9 | 296 """Yield raw FASTA records as multi-line strings.""" |
0 | 297 while True: |
298 line = handle.readline() | |
299 if line == "": | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
300 return # Premature end of file, or just empty? |
0 | 301 if line[0] == ">": |
302 break | |
303 | |
304 no_id_warned = False | |
305 while True: | |
306 if line[0] != ">": | |
9 | 307 raise ValueError("Records in Fasta files should start with '>' character") |
0 | 308 try: |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
309 line[1:].split(None, 1)[0] |
0 | 310 except IndexError: |
311 if not no_id_warned: | |
312 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") | |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
313 no_id_warned = True |
0 | 314 lines = [line] |
315 line = handle.readline() | |
316 while True: | |
317 if not line: | |
318 break | |
319 if line[0] == ">": | |
320 break | |
321 lines.append(line) | |
322 line = handle.readline() | |
323 yield "".join(lines) | |
324 if not line: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
325 return # StopIteration |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
326 |
0 | 327 |
2 | 328 def fasta_filter(in_file, out_file, iterator_filter, inter): |
0 | 329 count = 0 |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
330 # Galaxy now requires Python 2.5+ so can use with statements, |
0 | 331 with open(in_file) as in_handle: |
332 with open(out_file, "w") as pos_handle: | |
2 | 333 if inter: |
334 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): | |
335 count += 1 | |
336 pos_handle.write(r1) | |
337 pos_handle.write(r2) | |
338 else: | |
339 for record in iterator_filter(raw_fasta_iterator(in_handle)): | |
340 count += 1 | |
341 pos_handle.write(record) | |
0 | 342 return count |
343 | |
2 | 344 |
345 def fastq_filter(in_file, out_file, iterator_filter, inter): | |
346 count = 0 | |
347 with open(in_file) as in_handle: | |
348 with open(out_file, "w") as pos_handle: | |
349 if inter: | |
350 for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))): | |
351 count += 1 | |
352 pos_handle.write("@%s\n%s\n+\n%s\n" % r1) | |
353 pos_handle.write("@%s\n%s\n+\n%s\n" % r2) | |
354 else: | |
9 | 355 for title, seq, qual in iterator_filter( |
356 FastqGeneralIterator(in_handle) | |
357 ): | |
0 | 358 count += 1 |
359 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
2 | 360 return count |
0 | 361 |
2 | 362 |
363 def sff_filter(in_file, out_file, iterator_filter, inter): | |
0 | 364 count = 0 |
365 try: | |
366 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
367 except ImportError: | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
368 # Prior to Biopython 1.56 this was a private function |
0 | 369 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
370 with open(in_file, "rb") as in_handle: | |
371 try: | |
372 manifest = ReadRocheXmlManifest(in_handle) | |
373 except ValueError: | |
374 manifest = None | |
375 in_handle.seek(0) | |
376 with open(out_file, "wb") as out_handle: | |
377 writer = SffWriter(out_handle, xml=manifest) | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
378 in_handle.seek(0) # start again after getting manifest |
2 | 379 if inter: |
380 from itertools import chain | |
9 | 381 |
382 count = writer.write_file( | |
383 chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))) | |
384 ) | |
2 | 385 assert count % 2 == 0, "Odd number of records? %i" % count |
386 count /= 2 | |
387 else: | |
388 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | |
0 | 389 return count |
390 | |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
391 |
2 | 392 if seq_format == "sff": |
393 count = sff_filter(in_file, out_file, sampler, interleaved) | |
394 elif seq_format == "fasta": | |
395 count = fasta_filter(in_file, out_file, sampler, interleaved) | |
396 elif seq_format.startswith("fastq"): | |
397 count = fastq_filter(in_file, out_file, sampler, interleaved) | |
0 | 398 else: |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
399 sys.exit("Unsupported file type %r" % seq_format) |
0 | 400 |
2 | 401 if interleaved: |
402 sys.stderr.write("Selected %i pairs\n" % count) | |
403 else: | |
404 sys.stderr.write("Selected %i records\n" % count) |