Mercurial > repos > peterjc > sample_seqs
comparison tools/sample_seqs/sample_seqs.py @ 6:31f5701cd2e9 draft
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
author | peterjc |
---|---|
date | Thu, 11 May 2017 07:24:38 -0400 |
parents | 6b71ad5d43fb |
children | 5f505ed46e16 |
comparison
equal
deleted
inserted
replaced
5:6b71ad5d43fb | 6:31f5701cd2e9 |
---|---|
61 default=False, action="store_true", | 61 default=False, action="store_true", |
62 help="Show version and quit") | 62 help="Show version and quit") |
63 options, args = parser.parse_args() | 63 options, args = parser.parse_args() |
64 | 64 |
65 if options.version: | 65 if options.version: |
66 print("v0.2.3") | 66 print("v0.2.4") |
67 sys.exit(0) | 67 sys.exit(0) |
68 | 68 |
69 try: | 69 try: |
70 from Bio import SeqIO | 70 from Bio import SeqIO |
71 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 71 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
144 sys.stderr.write("Sampling every %ird sequence\n" % N) | 144 sys.stderr.write("Sampling every %ird sequence\n" % N) |
145 else: | 145 else: |
146 sys.stderr.write("Sampling every %ith sequence\n" % N) | 146 sys.stderr.write("Sampling every %ith sequence\n" % N) |
147 | 147 |
148 def sampler(iterator): | 148 def sampler(iterator): |
149 """Sample every Nth sequence.""" | |
149 global N | 150 global N |
150 count = 0 | 151 count = 0 |
151 for record in iterator: | 152 for record in iterator: |
152 count += 1 | 153 count += 1 |
153 if count % N == 1: | 154 if count % N == 1: |
155 elif options.percent: | 156 elif options.percent: |
156 try: | 157 try: |
157 percent = float(options.percent) / 100.0 | 158 percent = float(options.percent) / 100.0 |
158 except ValueError: | 159 except ValueError: |
159 sys.exit("Bad -p percent argument %r" % options.percent) | 160 sys.exit("Bad -p percent argument %r" % options.percent) |
160 if percent <= 0.0 or 1.0 <= percent: | 161 if not(0.0 <= percent <= 1.0): |
161 sys.exit("Bad -p percent argument %r" % options.percent) | 162 sys.exit("Bad -p percent argument %r" % options.percent) |
162 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) | 163 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) |
163 | 164 |
164 def sampler(iterator): | 165 def sampler(iterator): |
166 """Sample given percentage of sequences.""" | |
165 global percent | 167 global percent |
166 count = 0 | 168 count = 0 |
167 taken = 0 | 169 taken = 0 |
168 for record in iterator: | 170 for record in iterator: |
169 count += 1 | 171 count += 1 |
213 taken += 1 | 215 taken += 1 |
214 yield record | 216 yield record |
215 assert taken == N, "Picked %i, wanted %i" % (taken, N) | 217 assert taken == N, "Picked %i, wanted %i" % (taken, N) |
216 else: | 218 else: |
217 def sampler(iterator): | 219 def sampler(iterator): |
220 """Sample given number of sequences.""" | |
218 # Mimic the percentage sampler, with double check on final count | 221 # Mimic the percentage sampler, with double check on final count |
219 global N, total | 222 global N, total |
220 # Do we need a floating point fudge factor epsilon? | 223 # Do we need a floating point fudge factor epsilon? |
221 # i.e. What if percentage comes out slightly too low, and | 224 # i.e. What if percentage comes out slightly too low, and |
222 # we could end up missing last few desired sequences? | 225 # we could end up missing last few desired sequences? |
266 while True: | 269 while True: |
267 if line[0] != ">": | 270 if line[0] != ">": |
268 raise ValueError( | 271 raise ValueError( |
269 "Records in Fasta files should start with '>' character") | 272 "Records in Fasta files should start with '>' character") |
270 try: | 273 try: |
271 id = line[1:].split(None, 1)[0] | 274 line[1:].split(None, 1)[0] |
272 except IndexError: | 275 except IndexError: |
273 if not no_id_warned: | 276 if not no_id_warned: |
274 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") | 277 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") |
275 no_id_warned = True | 278 no_id_warned = True |
276 id = None | |
277 lines = [line] | 279 lines = [line] |
278 line = handle.readline() | 280 line = handle.readline() |
279 while True: | 281 while True: |
280 if not line: | 282 if not line: |
281 break | 283 break |
344 count /= 2 | 346 count /= 2 |
345 else: | 347 else: |
346 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | 348 count = writer.write_file(iterator_filter(SffIterator(in_handle))) |
347 return count | 349 return count |
348 | 350 |
351 | |
349 if seq_format == "sff": | 352 if seq_format == "sff": |
350 count = sff_filter(in_file, out_file, sampler, interleaved) | 353 count = sff_filter(in_file, out_file, sampler, interleaved) |
351 elif seq_format == "fasta": | 354 elif seq_format == "fasta": |
352 count = fasta_filter(in_file, out_file, sampler, interleaved) | 355 count = fasta_filter(in_file, out_file, sampler, interleaved) |
353 elif seq_format.startswith("fastq"): | 356 elif seq_format.startswith("fastq"): |