Mercurial > repos > peterjc > sample_seqs
comparison tools/sample_seqs/sample_seqs.py @ 5:6b71ad5d43fb draft
v0.2.3 clarified help, internal cleanup of Python script
| author | peterjc |
|---|---|
| date | Wed, 01 Feb 2017 09:39:36 -0500 |
| parents | 02c13ef1a669 |
| children | 31f5701cd2e9 |
comparison
equal
deleted
inserted
replaced
| 4:d3aa9f25c24c | 5:6b71ad5d43fb |
|---|---|
| 17 """ | 17 """ |
| 18 import os | 18 import os |
| 19 import sys | 19 import sys |
| 20 from optparse import OptionParser | 20 from optparse import OptionParser |
| 21 | 21 |
| 22 | 22 # Parse Command Line |
| 23 def sys_exit(msg, err=1): | |
| 24 sys.stderr.write(msg.rstrip() + "\n") | |
| 25 sys.exit(err) | |
| 26 | |
| 27 #Parse Command Line | |
| 28 usage = """Use as follows: | 23 usage = """Use as follows: |
| 29 | 24 |
| 30 $ python sample_seqs.py [options] | 25 $ python sample_seqs.py [options] |
| 31 | 26 |
| 32 e.g. Sample 20% of the reads: | 27 e.g. Sample 20% of the reads: |
| 33 | 28 |
| 34 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq | 29 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq |
| 35 | 30 |
| 36 This samples uniformly though the file, rather than at random, and therefore | 31 This samples uniformly though the file, rather than at random, and therefore |
| 37 should be reproducible. | 32 should be reproducible. |
| 33 | |
| 34 If you have interleaved paired reads, use the --interleaved switch. If | |
| 35 instead you have two matched files (one for each pair), run the two | |
| 36 twice with the same sampling options to make to matched smaller files. | |
| 38 """ | 37 """ |
| 39 parser = OptionParser(usage=usage) | 38 parser = OptionParser(usage=usage) |
| 40 parser.add_option('-i', '--input', dest='input', | 39 parser.add_option('-i', '--input', dest='input', |
| 41 default=None, help='Input sequences filename', | 40 default=None, help='Input sequences filename', |
| 42 metavar="FILE") | 41 metavar="FILE") |
| 62 default=False, action="store_true", | 61 default=False, action="store_true", |
| 63 help="Show version and quit") | 62 help="Show version and quit") |
| 64 options, args = parser.parse_args() | 63 options, args = parser.parse_args() |
| 65 | 64 |
| 66 if options.version: | 65 if options.version: |
| 67 print("v0.2.1") | 66 print("v0.2.3") |
| 68 sys.exit(0) | 67 sys.exit(0) |
| 68 | |
| 69 try: | |
| 70 from Bio import SeqIO | |
| 71 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
| 72 from Bio.SeqIO.FastaIO import SimpleFastaParser | |
| 73 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
| 74 except ImportError: | |
| 75 sys.exit("This script requires Biopython.") | |
| 69 | 76 |
| 70 in_file = options.input | 77 in_file = options.input |
| 71 out_file = options.output | 78 out_file = options.output |
| 72 interleaved = options.interleaved | 79 interleaved = options.interleaved |
| 73 | 80 |
| 74 if not in_file: | 81 if not in_file: |
| 75 sys_exit("Require an input filename") | 82 sys.exit("Require an input filename") |
| 76 if in_file != "/dev/stdin" and not os.path.isfile(in_file): | 83 if in_file != "/dev/stdin" and not os.path.isfile(in_file): |
| 77 sys_exit("Missing input file %r" % in_file) | 84 sys.exit("Missing input file %r" % in_file) |
| 78 if not out_file: | 85 if not out_file: |
| 79 sys_exit("Require an output filename") | 86 sys.exit("Require an output filename") |
| 80 if not options.format: | 87 if not options.format: |
| 81 sys_exit("Require the sequence format") | 88 sys.exit("Require the sequence format") |
| 82 seq_format = options.format.lower() | 89 seq_format = options.format.lower() |
| 83 | 90 |
| 84 | 91 |
| 85 def count_fasta(filename): | 92 def count_fasta(filename): |
| 86 from Bio.SeqIO.FastaIO import SimpleFastaParser | |
| 87 count = 0 | 93 count = 0 |
| 88 with open(filename) as handle: | 94 with open(filename) as handle: |
| 89 for title, seq in SimpleFastaParser(handle): | 95 for title, seq in SimpleFastaParser(handle): |
| 90 count += 1 | 96 count += 1 |
| 91 return count | 97 return count |
| 92 | 98 |
| 93 | 99 |
| 94 def count_fastq(filename): | 100 def count_fastq(filename): |
| 95 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
| 96 count = 0 | 101 count = 0 |
| 97 with open(filename) as handle: | 102 with open(filename) as handle: |
| 98 for title, seq, qual in FastqGeneralIterator(handle): | 103 for title, seq, qual in FastqGeneralIterator(handle): |
| 99 count += 1 | 104 count += 1 |
| 100 return count | 105 return count |
| 101 | 106 |
| 102 | 107 |
| 103 def count_sff(filename): | 108 def count_sff(filename): |
| 104 from Bio import SeqIO | |
| 105 # If the SFF file has a built in index (which is normal), | 109 # If the SFF file has a built in index (which is normal), |
| 106 # this will be parsed and is the quicker than scanning | 110 # this will be parsed and is the quicker than scanning |
| 107 # the whole file. | 111 # the whole file. |
| 108 return len(SeqIO.index(filename, "sff")) | 112 return len(SeqIO.index(filename, "sff")) |
| 109 | 113 |
| 110 | 114 |
| 111 def count_sequences(filename, format): | 115 def count_sequences(filename, format): |
| 112 if seq_format == "sff": | 116 if format == "sff": |
| 113 return count_sff(filename) | 117 return count_sff(filename) |
| 114 elif seq_format == "fasta": | 118 elif format == "fasta": |
| 115 return count_fasta(filename) | 119 return count_fasta(filename) |
| 116 elif seq_format.startswith("fastq"): | 120 elif format.startswith("fastq"): |
| 117 return count_fastq(filename) | 121 return count_fastq(filename) |
| 118 else: | 122 else: |
| 119 sys_exit("Unsupported file type %r" % seq_format) | 123 sys.exit("Unsupported file type %r" % format) |
| 120 | 124 |
| 121 | 125 |
| 122 if options.percent and options.everyn: | 126 if options.percent and options.everyn: |
| 123 sys_exit("Cannot combine -p and -n options") | 127 sys.exit("Cannot combine -p and -n options") |
| 124 elif options.everyn and options.count: | 128 elif options.everyn and options.count: |
| 125 sys_exit("Cannot combine -p and -c options") | 129 sys.exit("Cannot combine -p and -c options") |
| 126 elif options.percent and options.count: | 130 elif options.percent and options.count: |
| 127 sys_exit("Cannot combine -n and -c options") | 131 sys.exit("Cannot combine -n and -c options") |
| 128 elif options.everyn: | 132 elif options.everyn: |
| 129 try: | 133 try: |
| 130 N = int(options.everyn) | 134 N = int(options.everyn) |
| 131 except: | 135 except ValueError: |
| 132 sys_exit("Bad -n argument %r" % options.everyn) | 136 sys.exit("Bad -n argument %r" % options.everyn) |
| 133 if N < 2: | 137 if N < 2: |
| 134 sys_exit("Bad -n argument %r" % options.everyn) | 138 sys.exit("Bad -n argument %r" % options.everyn) |
| 135 if (N % 10) == 1: | 139 if (N % 10) == 1: |
| 136 sys.stderr.write("Sampling every %ist sequence\n" % N) | 140 sys.stderr.write("Sampling every %ist sequence\n" % N) |
| 137 elif (N % 10) == 2: | 141 elif (N % 10) == 2: |
| 138 sys.stderr.write("Sampling every %ind sequence\n" % N) | 142 sys.stderr.write("Sampling every %ind sequence\n" % N) |
| 139 elif (N % 10) == 3: | 143 elif (N % 10) == 3: |
| 140 sys.stderr.write("Sampling every %ird sequence\n" % N) | 144 sys.stderr.write("Sampling every %ird sequence\n" % N) |
| 141 else: | 145 else: |
| 142 sys.stderr.write("Sampling every %ith sequence\n" % N) | 146 sys.stderr.write("Sampling every %ith sequence\n" % N) |
| 147 | |
| 143 def sampler(iterator): | 148 def sampler(iterator): |
| 144 global N | 149 global N |
| 145 count = 0 | 150 count = 0 |
| 146 for record in iterator: | 151 for record in iterator: |
| 147 count += 1 | 152 count += 1 |
| 148 if count % N == 1: | 153 if count % N == 1: |
| 149 yield record | 154 yield record |
| 150 elif options.percent: | 155 elif options.percent: |
| 151 try: | 156 try: |
| 152 percent = float(options.percent) / 100.0 | 157 percent = float(options.percent) / 100.0 |
| 153 except: | 158 except ValueError: |
| 154 sys_exit("Bad -p percent argument %r" % options.percent) | 159 sys.exit("Bad -p percent argument %r" % options.percent) |
| 155 if percent <= 0.0 or 1.0 <= percent: | 160 if percent <= 0.0 or 1.0 <= percent: |
| 156 sys_exit("Bad -p percent argument %r" % options.percent) | 161 sys.exit("Bad -p percent argument %r" % options.percent) |
| 157 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) | 162 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) |
| 163 | |
| 158 def sampler(iterator): | 164 def sampler(iterator): |
| 159 global percent | 165 global percent |
| 160 count = 0 | 166 count = 0 |
| 161 taken = 0 | 167 taken = 0 |
| 162 for record in iterator: | 168 for record in iterator: |
| 165 taken += 1 | 171 taken += 1 |
| 166 yield record | 172 yield record |
| 167 elif options.count: | 173 elif options.count: |
| 168 try: | 174 try: |
| 169 N = int(options.count) | 175 N = int(options.count) |
| 170 except: | 176 except ValueError: |
| 171 sys_exit("Bad -c count argument %r" % options.count) | 177 sys.exit("Bad -c count argument %r" % options.count) |
| 172 if N < 1: | 178 if N < 1: |
| 173 sys_exit("Bad -c count argument %r" % options.count) | 179 sys.exit("Bad -c count argument %r" % options.count) |
| 174 total = count_sequences(in_file, seq_format) | 180 total = count_sequences(in_file, seq_format) |
| 175 sys.stderr.write("Input file has %i sequences\n" % total) | 181 sys.stderr.write("Input file has %i sequences\n" % total) |
| 176 if interleaved: | 182 if interleaved: |
| 177 # Paired | 183 # Paired |
| 178 if total % 2: | 184 if total % 2: |
| 179 sys_exit("Paired mode, but input file has an odd number of sequences: %i" | 185 sys.exit("Paired mode, but input file has an odd number of sequences: %i" |
| 180 % total) | 186 % total) |
| 181 elif N > total // 2: | 187 elif N > total // 2: |
| 182 sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." | 188 sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." |
| 183 % (N, total // 2, total)) | 189 % (N, total // 2, total)) |
| 184 total = total // 2 | 190 total = total // 2 |
| 185 if N == 1: | 191 if N == 1: |
| 186 sys.stderr.write("Sampling just first sequence pair!\n") | 192 sys.stderr.write("Sampling just first sequence pair!\n") |
| 187 elif N == total: | 193 elif N == total: |
| 189 else: | 195 else: |
| 190 sys.stderr.write("Sampling %i sequence pairs\n" % N) | 196 sys.stderr.write("Sampling %i sequence pairs\n" % N) |
| 191 else: | 197 else: |
| 192 # Not paired | 198 # Not paired |
| 193 if total < N: | 199 if total < N: |
| 194 sys_exit("Requested %i sequences, but file only has %i." % (N, total)) | 200 sys.exit("Requested %i sequences, but file only has %i." % (N, total)) |
| 195 if N == 1: | 201 if N == 1: |
| 196 sys.stderr.write("Sampling just first sequence!\n") | 202 sys.stderr.write("Sampling just first sequence!\n") |
| 197 elif N == total: | 203 elif N == total: |
| 198 sys.stderr.write("Taking all the sequences\n") | 204 sys.stderr.write("Taking all the sequences\n") |
| 199 else: | 205 else: |
| 213 global N, total | 219 global N, total |
| 214 # Do we need a floating point fudge factor epsilon? | 220 # Do we need a floating point fudge factor epsilon? |
| 215 # i.e. What if percentage comes out slighty too low, and | 221 # i.e. What if percentage comes out slighty too low, and |
| 216 # we could end up missing last few desired sequences? | 222 # we could end up missing last few desired sequences? |
| 217 percentage = float(N) / float(total) | 223 percentage = float(N) / float(total) |
| 218 #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" | 224 # print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" |
| 219 # % (N, total, percentage * 100.0)) | 225 # % (N, total, percentage * 100.0)) |
| 220 count = 0 | 226 count = 0 |
| 221 taken = 0 | 227 taken = 0 |
| 222 for record in iterator: | 228 for record in iterator: |
| 223 count += 1 | 229 count += 1 |
| 231 # we need to take all remaining sequences to meet target | 237 # we need to take all remaining sequences to meet target |
| 232 taken += 1 | 238 taken += 1 |
| 233 yield record | 239 yield record |
| 234 assert taken == N, "Picked %i, wanted %i" % (taken, N) | 240 assert taken == N, "Picked %i, wanted %i" % (taken, N) |
| 235 else: | 241 else: |
| 236 sys_exit("Must use either -n, -p or -c") | 242 sys.exit("Must use either -n, -p or -c") |
| 237 | 243 |
| 238 | 244 |
| 239 def pair(iterator): | 245 def pair(iterator): |
| 240 """Quick and dirty pair batched iterator.""" | 246 """Quick and dirty pair batched iterator.""" |
| 241 while True: | 247 while True: |
| 250 def raw_fasta_iterator(handle): | 256 def raw_fasta_iterator(handle): |
| 251 """Yields raw FASTA records as multi-line strings.""" | 257 """Yields raw FASTA records as multi-line strings.""" |
| 252 while True: | 258 while True: |
| 253 line = handle.readline() | 259 line = handle.readline() |
| 254 if line == "": | 260 if line == "": |
| 255 return # Premature end of file, or just empty? | 261 return # Premature end of file, or just empty? |
| 256 if line[0] == ">": | 262 if line[0] == ">": |
| 257 break | 263 break |
| 258 | 264 |
| 259 no_id_warned = False | 265 no_id_warned = False |
| 260 while True: | 266 while True: |
| 277 break | 283 break |
| 278 lines.append(line) | 284 lines.append(line) |
| 279 line = handle.readline() | 285 line = handle.readline() |
| 280 yield "".join(lines) | 286 yield "".join(lines) |
| 281 if not line: | 287 if not line: |
| 282 return # StopIteration | 288 return # StopIteration |
| 289 | |
| 283 | 290 |
| 284 def fasta_filter(in_file, out_file, iterator_filter, inter): | 291 def fasta_filter(in_file, out_file, iterator_filter, inter): |
| 285 count = 0 | 292 count = 0 |
| 286 #Galaxy now requires Python 2.5+ so can use with statements, | 293 # Galaxy now requires Python 2.5+ so can use with statements, |
| 287 with open(in_file) as in_handle: | 294 with open(in_file) as in_handle: |
| 288 with open(out_file, "w") as pos_handle: | 295 with open(out_file, "w") as pos_handle: |
| 289 if inter: | 296 if inter: |
| 290 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): | 297 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): |
| 291 count += 1 | 298 count += 1 |
| 296 count += 1 | 303 count += 1 |
| 297 pos_handle.write(record) | 304 pos_handle.write(record) |
| 298 return count | 305 return count |
| 299 | 306 |
| 300 | 307 |
| 301 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
| 302 def fastq_filter(in_file, out_file, iterator_filter, inter): | 308 def fastq_filter(in_file, out_file, iterator_filter, inter): |
| 303 count = 0 | 309 count = 0 |
| 304 with open(in_file) as in_handle: | 310 with open(in_file) as in_handle: |
| 305 with open(out_file, "w") as pos_handle: | 311 with open(out_file, "w") as pos_handle: |
| 306 if inter: | 312 if inter: |
| 316 | 322 |
| 317 | 323 |
| 318 def sff_filter(in_file, out_file, iterator_filter, inter): | 324 def sff_filter(in_file, out_file, iterator_filter, inter): |
| 319 count = 0 | 325 count = 0 |
| 320 try: | 326 try: |
| 321 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
| 322 except ImportError: | |
| 323 sys_exit("SFF filtering requires Biopython 1.54 or later") | |
| 324 try: | |
| 325 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | 327 from Bio.SeqIO.SffIO import ReadRocheXmlManifest |
| 326 except ImportError: | 328 except ImportError: |
| 327 #Prior to Biopython 1.56 this was a private function | 329 # Prior to Biopython 1.56 this was a private function |
| 328 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | 330 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
| 329 with open(in_file, "rb") as in_handle: | 331 with open(in_file, "rb") as in_handle: |
| 330 try: | 332 try: |
| 331 manifest = ReadRocheXmlManifest(in_handle) | 333 manifest = ReadRocheXmlManifest(in_handle) |
| 332 except ValueError: | 334 except ValueError: |
| 333 manifest = None | 335 manifest = None |
| 334 in_handle.seek(0) | 336 in_handle.seek(0) |
| 335 with open(out_file, "wb") as out_handle: | 337 with open(out_file, "wb") as out_handle: |
| 336 writer = SffWriter(out_handle, xml=manifest) | 338 writer = SffWriter(out_handle, xml=manifest) |
| 337 in_handle.seek(0) #start again after getting manifest | 339 in_handle.seek(0) # start again after getting manifest |
| 338 if inter: | 340 if inter: |
| 339 from itertools import chain | 341 from itertools import chain |
| 340 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) | 342 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) |
| 341 assert count % 2 == 0, "Odd number of records? %i" % count | 343 assert count % 2 == 0, "Odd number of records? %i" % count |
| 342 count /= 2 | 344 count /= 2 |
| 343 else: | 345 else: |
| 344 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | 346 count = writer.write_file(iterator_filter(SffIterator(in_handle))) |
| 345 #count = writer.write_file(SffIterator(in_handle)) | |
| 346 return count | 347 return count |
| 347 | 348 |
| 348 if seq_format == "sff": | 349 if seq_format == "sff": |
| 349 count = sff_filter(in_file, out_file, sampler, interleaved) | 350 count = sff_filter(in_file, out_file, sampler, interleaved) |
| 350 elif seq_format == "fasta": | 351 elif seq_format == "fasta": |
| 351 count = fasta_filter(in_file, out_file, sampler, interleaved) | 352 count = fasta_filter(in_file, out_file, sampler, interleaved) |
| 352 elif seq_format.startswith("fastq"): | 353 elif seq_format.startswith("fastq"): |
| 353 count = fastq_filter(in_file, out_file, sampler, interleaved) | 354 count = fastq_filter(in_file, out_file, sampler, interleaved) |
| 354 else: | 355 else: |
| 355 sys_exit("Unsupported file type %r" % seq_format) | 356 sys.exit("Unsupported file type %r" % seq_format) |
| 356 | 357 |
| 357 if interleaved: | 358 if interleaved: |
| 358 sys.stderr.write("Selected %i pairs\n" % count) | 359 sys.stderr.write("Selected %i pairs\n" % count) |
| 359 else: | 360 else: |
| 360 sys.stderr.write("Selected %i records\n" % count) | 361 sys.stderr.write("Selected %i records\n" % count) |
