Mercurial > repos > peterjc > sample_seqs
annotate tools/sample_seqs/sample_seqs.py @ 5:6b71ad5d43fb draft
v0.2.3 clarified help, internal cleanup of Python script
| author | peterjc |
|---|---|
| date | Wed, 01 Feb 2017 09:39:36 -0500 |
| parents | 02c13ef1a669 |
| children | 31f5701cd2e9 |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env python |
| 2 """Sub-sample sequence from a FASTA, FASTQ or SFF file. | |
| 3 | |
| 4 This tool is a short Python script which requires Biopython 1.62 or later | |
| 2 | 5 for sequence parsing. If you use this tool in scientific work leading to a |
| 0 | 6 publication, please cite the Biopython application note: |
| 7 | |
| 8 Cock et al 2009. Biopython: freely available Python tools for computational | |
| 9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
| 10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
| 11 | |
| 2 | 12 This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute |
| 0 | 13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
| 14 See accompanying text file for licence details (MIT license). | |
| 15 | |
| 2 | 16 Use -v or --version to get the version, -h or --help for help. |
| 0 | 17 """ |
| 18 import os | |
| 19 import sys | |
| 2 | 20 from optparse import OptionParser |
| 0 | 21 |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
22 # Parse Command Line |
| 2 | 23 usage = """Use as follows: |
| 24 | |
| 25 $ python sample_seqs.py [options] | |
| 26 | |
| 27 e.g. Sample 20% of the reads: | |
| 28 | |
| 29 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq | |
| 30 | |
| 31 This samples uniformly though the file, rather than at random, and therefore | |
| 32 should be reproducible. | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
33 |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
34 If you have interleaved paired reads, use the --interleaved switch. If |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
35 instead you have two matched files (one for each pair), run the two |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
36 twice with the same sampling options to make to matched smaller files. |
| 2 | 37 """ |
| 38 parser = OptionParser(usage=usage) | |
| 39 parser.add_option('-i', '--input', dest='input', | |
| 40 default=None, help='Input sequences filename', | |
| 41 metavar="FILE") | |
| 42 parser.add_option('-f', '--format', dest='format', | |
| 43 default=None, | |
| 44 help='Input sequence format (e.g. fasta, fastq, sff)') | |
| 45 parser.add_option('-o', '--output', dest='output', | |
| 46 default=None, help='Output sampled sequenced filename', | |
| 47 metavar="FILE") | |
| 48 parser.add_option('-p', '--percent', dest='percent', | |
| 49 default=None, | |
| 50 help='Take this percent of the reads') | |
| 51 parser.add_option('-n', '--everyn', dest='everyn', | |
| 52 default=None, | |
| 53 help='Take every N-th read') | |
| 54 parser.add_option('-c', '--count', dest='count', | |
| 55 default=None, | |
| 56 help='Take exactly N reads') | |
| 57 parser.add_option("--interleaved", dest="interleaved", | |
| 58 default=False, action="store_true", | |
| 59 help="Input is interleaved reads, preserve the pairings") | |
| 60 parser.add_option("-v", "--version", dest="version", | |
| 61 default=False, action="store_true", | |
| 62 help="Show version and quit") | |
| 63 options, args = parser.parse_args() | |
| 64 | |
| 65 if options.version: | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
66 print("v0.2.3") |
| 0 | 67 sys.exit(0) |
| 68 | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
69 try: |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
70 from Bio import SeqIO |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
71 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
72 from Bio.SeqIO.FastaIO import SimpleFastaParser |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
73 from Bio.SeqIO.SffIO import SffIterator, SffWriter |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
74 except ImportError: |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
75 sys.exit("This script requires Biopython.") |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
76 |
| 2 | 77 in_file = options.input |
| 78 out_file = options.output | |
| 79 interleaved = options.interleaved | |
| 80 | |
| 81 if not in_file: | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
82 sys.exit("Require an input filename") |
| 0 | 83 if in_file != "/dev/stdin" and not os.path.isfile(in_file): |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
84 sys.exit("Missing input file %r" % in_file) |
| 2 | 85 if not out_file: |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
86 sys.exit("Require an output filename") |
| 2 | 87 if not options.format: |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
88 sys.exit("Require the sequence format") |
| 2 | 89 seq_format = options.format.lower() |
| 90 | |
| 91 | |
| 92 def count_fasta(filename): | |
| 93 count = 0 | |
| 94 with open(filename) as handle: | |
| 95 for title, seq in SimpleFastaParser(handle): | |
| 96 count += 1 | |
| 97 return count | |
| 98 | |
| 99 | |
| 100 def count_fastq(filename): | |
| 101 count = 0 | |
| 102 with open(filename) as handle: | |
| 103 for title, seq, qual in FastqGeneralIterator(handle): | |
| 104 count += 1 | |
| 105 return count | |
| 106 | |
| 0 | 107 |
| 2 | 108 def count_sff(filename): |
| 109 # If the SFF file has a built in index (which is normal), | |
| 110 # this will be parsed and is the quicker than scanning | |
| 111 # the whole file. | |
| 112 return len(SeqIO.index(filename, "sff")) | |
| 113 | |
| 114 | |
| 115 def count_sequences(filename, format): | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
116 if format == "sff": |
| 2 | 117 return count_sff(filename) |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
118 elif format == "fasta": |
| 2 | 119 return count_fasta(filename) |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
120 elif format.startswith("fastq"): |
| 2 | 121 return count_fastq(filename) |
| 122 else: | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
123 sys.exit("Unsupported file type %r" % format) |
| 2 | 124 |
| 125 | |
| 126 if options.percent and options.everyn: | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
127 sys.exit("Cannot combine -p and -n options") |
| 2 | 128 elif options.everyn and options.count: |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
129 sys.exit("Cannot combine -p and -c options") |
| 2 | 130 elif options.percent and options.count: |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
131 sys.exit("Cannot combine -n and -c options") |
| 2 | 132 elif options.everyn: |
| 0 | 133 try: |
| 2 | 134 N = int(options.everyn) |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
135 except ValueError: |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
136 sys.exit("Bad -n argument %r" % options.everyn) |
| 0 | 137 if N < 2: |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
138 sys.exit("Bad -n argument %r" % options.everyn) |
| 0 | 139 if (N % 10) == 1: |
| 140 sys.stderr.write("Sampling every %ist sequence\n" % N) | |
| 141 elif (N % 10) == 2: | |
| 142 sys.stderr.write("Sampling every %ind sequence\n" % N) | |
| 143 elif (N % 10) == 3: | |
| 144 sys.stderr.write("Sampling every %ird sequence\n" % N) | |
| 145 else: | |
| 146 sys.stderr.write("Sampling every %ith sequence\n" % N) | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
147 |
| 0 | 148 def sampler(iterator): |
| 149 global N | |
| 150 count = 0 | |
| 151 for record in iterator: | |
| 152 count += 1 | |
| 153 if count % N == 1: | |
| 154 yield record | |
| 2 | 155 elif options.percent: |
| 0 | 156 try: |
| 2 | 157 percent = float(options.percent) / 100.0 |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
158 except ValueError: |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
159 sys.exit("Bad -p percent argument %r" % options.percent) |
| 0 | 160 if percent <= 0.0 or 1.0 <= percent: |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
161 sys.exit("Bad -p percent argument %r" % options.percent) |
| 0 | 162 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
163 |
| 0 | 164 def sampler(iterator): |
| 165 global percent | |
| 166 count = 0 | |
| 167 taken = 0 | |
| 168 for record in iterator: | |
| 169 count += 1 | |
| 170 if percent * count > taken: | |
| 171 taken += 1 | |
| 172 yield record | |
| 2 | 173 elif options.count: |
| 174 try: | |
| 175 N = int(options.count) | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
176 except ValueError: |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
177 sys.exit("Bad -c count argument %r" % options.count) |
| 2 | 178 if N < 1: |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
179 sys.exit("Bad -c count argument %r" % options.count) |
| 2 | 180 total = count_sequences(in_file, seq_format) |
|
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
181 sys.stderr.write("Input file has %i sequences\n" % total) |
| 2 | 182 if interleaved: |
| 183 # Paired | |
| 184 if total % 2: | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
185 sys.exit("Paired mode, but input file has an odd number of sequences: %i" |
| 2 | 186 % total) |
| 187 elif N > total // 2: | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
188 sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." |
| 2 | 189 % (N, total // 2, total)) |
| 190 total = total // 2 | |
| 191 if N == 1: | |
| 192 sys.stderr.write("Sampling just first sequence pair!\n") | |
| 193 elif N == total: | |
| 194 sys.stderr.write("Taking all the sequence pairs\n") | |
| 195 else: | |
| 196 sys.stderr.write("Sampling %i sequence pairs\n" % N) | |
| 197 else: | |
| 198 # Not paired | |
| 199 if total < N: | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
200 sys.exit("Requested %i sequences, but file only has %i." % (N, total)) |
| 2 | 201 if N == 1: |
| 202 sys.stderr.write("Sampling just first sequence!\n") | |
| 203 elif N == total: | |
| 204 sys.stderr.write("Taking all the sequences\n") | |
| 205 else: | |
| 206 sys.stderr.write("Sampling %i sequences\n" % N) | |
| 207 if N == total: | |
| 208 def sampler(iterator): | |
| 209 """Dummy filter to filter nothing, taking everything.""" | |
| 210 global N | |
| 211 taken = 0 | |
| 212 for record in iterator: | |
| 213 taken += 1 | |
| 214 yield record | |
| 215 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
| 216 else: | |
| 217 def sampler(iterator): | |
| 218 # Mimic the percentage sampler, with double check on final count | |
| 219 global N, total | |
| 220 # Do we need a floating point fudge factor epsilon? | |
| 221 # i.e. What if percentage comes out slighty too low, and | |
| 222 # we could end up missing last few desired sequences? | |
| 223 percentage = float(N) / float(total) | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
224 # print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" |
| 2 | 225 # % (N, total, percentage * 100.0)) |
| 226 count = 0 | |
| 227 taken = 0 | |
| 228 for record in iterator: | |
| 229 count += 1 | |
| 230 # Do we need the extra upper bound? | |
| 231 if percentage * count > taken and taken < N: | |
| 232 taken += 1 | |
| 233 yield record | |
| 234 elif total - count + 1 <= N - taken: | |
| 235 # remaining records (incuding this one) <= what we still need. | |
| 236 # This is a safey check for floating point edge cases where | |
| 237 # we need to take all remaining sequences to meet target | |
| 238 taken += 1 | |
| 239 yield record | |
| 240 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
| 0 | 241 else: |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
242 sys.exit("Must use either -n, -p or -c") |
| 2 | 243 |
| 244 | |
| 245 def pair(iterator): | |
| 246 """Quick and dirty pair batched iterator.""" | |
| 247 while True: | |
| 248 a = next(iterator) | |
| 249 b = next(iterator) | |
| 250 if not b: | |
| 251 assert not a, "Odd number of records?" | |
| 252 break | |
| 253 yield (a, b) | |
| 254 | |
| 0 | 255 |
| 256 def raw_fasta_iterator(handle): | |
| 257 """Yields raw FASTA records as multi-line strings.""" | |
| 258 while True: | |
| 259 line = handle.readline() | |
| 260 if line == "": | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
261 return # Premature end of file, or just empty? |
| 0 | 262 if line[0] == ">": |
| 263 break | |
| 264 | |
| 265 no_id_warned = False | |
| 266 while True: | |
| 267 if line[0] != ">": | |
| 268 raise ValueError( | |
| 269 "Records in Fasta files should start with '>' character") | |
| 270 try: | |
| 271 id = line[1:].split(None, 1)[0] | |
| 272 except IndexError: | |
| 273 if not no_id_warned: | |
| 274 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") | |
| 275 no_id_warned = True | |
| 276 id = None | |
| 277 lines = [line] | |
| 278 line = handle.readline() | |
| 279 while True: | |
| 280 if not line: | |
| 281 break | |
| 282 if line[0] == ">": | |
| 283 break | |
| 284 lines.append(line) | |
| 285 line = handle.readline() | |
| 286 yield "".join(lines) | |
| 287 if not line: | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
288 return # StopIteration |
|
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
289 |
| 0 | 290 |
| 2 | 291 def fasta_filter(in_file, out_file, iterator_filter, inter): |
| 0 | 292 count = 0 |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
293 # Galaxy now requires Python 2.5+ so can use with statements, |
| 0 | 294 with open(in_file) as in_handle: |
| 295 with open(out_file, "w") as pos_handle: | |
| 2 | 296 if inter: |
| 297 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): | |
| 298 count += 1 | |
| 299 pos_handle.write(r1) | |
| 300 pos_handle.write(r2) | |
| 301 else: | |
| 302 for record in iterator_filter(raw_fasta_iterator(in_handle)): | |
| 303 count += 1 | |
| 304 pos_handle.write(record) | |
| 0 | 305 return count |
| 306 | |
| 2 | 307 |
| 308 def fastq_filter(in_file, out_file, iterator_filter, inter): | |
| 309 count = 0 | |
| 310 with open(in_file) as in_handle: | |
| 311 with open(out_file, "w") as pos_handle: | |
| 312 if inter: | |
| 313 for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))): | |
| 314 count += 1 | |
| 315 pos_handle.write("@%s\n%s\n+\n%s\n" % r1) | |
| 316 pos_handle.write("@%s\n%s\n+\n%s\n" % r2) | |
| 317 else: | |
| 0 | 318 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): |
| 319 count += 1 | |
| 320 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
| 2 | 321 return count |
| 0 | 322 |
| 2 | 323 |
| 324 def sff_filter(in_file, out_file, iterator_filter, inter): | |
| 0 | 325 count = 0 |
| 326 try: | |
| 327 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
| 328 except ImportError: | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
329 # Prior to Biopython 1.56 this was a private function |
| 0 | 330 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
| 331 with open(in_file, "rb") as in_handle: | |
| 332 try: | |
| 333 manifest = ReadRocheXmlManifest(in_handle) | |
| 334 except ValueError: | |
| 335 manifest = None | |
| 336 in_handle.seek(0) | |
| 337 with open(out_file, "wb") as out_handle: | |
| 338 writer = SffWriter(out_handle, xml=manifest) | |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
339 in_handle.seek(0) # start again after getting manifest |
| 2 | 340 if inter: |
| 341 from itertools import chain | |
| 342 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) | |
| 343 assert count % 2 == 0, "Odd number of records? %i" % count | |
| 344 count /= 2 | |
| 345 else: | |
| 346 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | |
| 0 | 347 return count |
| 348 | |
| 2 | 349 if seq_format == "sff": |
| 350 count = sff_filter(in_file, out_file, sampler, interleaved) | |
| 351 elif seq_format == "fasta": | |
| 352 count = fasta_filter(in_file, out_file, sampler, interleaved) | |
| 353 elif seq_format.startswith("fastq"): | |
| 354 count = fastq_filter(in_file, out_file, sampler, interleaved) | |
| 0 | 355 else: |
|
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
3
diff
changeset
|
356 sys.exit("Unsupported file type %r" % seq_format) |
| 0 | 357 |
| 2 | 358 if interleaved: |
| 359 sys.stderr.write("Selected %i pairs\n" % count) | |
| 360 else: | |
| 361 sys.stderr.write("Selected %i records\n" % count) |
