comparison tools/sample_seqs/sample_seqs.py @ 6:31f5701cd2e9 draft

v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
author peterjc
date Thu, 11 May 2017 07:24:38 -0400
parents 6b71ad5d43fb
children 5f505ed46e16
comparison
equal deleted inserted replaced
5:6b71ad5d43fb 6:31f5701cd2e9
61 default=False, action="store_true", 61 default=False, action="store_true",
62 help="Show version and quit") 62 help="Show version and quit")
63 options, args = parser.parse_args() 63 options, args = parser.parse_args()
64 64
65 if options.version: 65 if options.version:
66 print("v0.2.3") 66 print("v0.2.4")
67 sys.exit(0) 67 sys.exit(0)
68 68
69 try: 69 try:
70 from Bio import SeqIO 70 from Bio import SeqIO
71 from Bio.SeqIO.QualityIO import FastqGeneralIterator 71 from Bio.SeqIO.QualityIO import FastqGeneralIterator
144 sys.stderr.write("Sampling every %ird sequence\n" % N) 144 sys.stderr.write("Sampling every %ird sequence\n" % N)
145 else: 145 else:
146 sys.stderr.write("Sampling every %ith sequence\n" % N) 146 sys.stderr.write("Sampling every %ith sequence\n" % N)
147 147
148 def sampler(iterator): 148 def sampler(iterator):
149 """Sample every Nth sequence."""
149 global N 150 global N
150 count = 0 151 count = 0
151 for record in iterator: 152 for record in iterator:
152 count += 1 153 count += 1
153 if count % N == 1: 154 if count % N == 1:
155 elif options.percent: 156 elif options.percent:
156 try: 157 try:
157 percent = float(options.percent) / 100.0 158 percent = float(options.percent) / 100.0
158 except ValueError: 159 except ValueError:
159 sys.exit("Bad -p percent argument %r" % options.percent) 160 sys.exit("Bad -p percent argument %r" % options.percent)
160 if percent <= 0.0 or 1.0 <= percent: 161 if not(0.0 <= percent <= 1.0):
161 sys.exit("Bad -p percent argument %r" % options.percent) 162 sys.exit("Bad -p percent argument %r" % options.percent)
162 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) 163 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
163 164
164 def sampler(iterator): 165 def sampler(iterator):
166 """Sample given percentage of sequences."""
165 global percent 167 global percent
166 count = 0 168 count = 0
167 taken = 0 169 taken = 0
168 for record in iterator: 170 for record in iterator:
169 count += 1 171 count += 1
213 taken += 1 215 taken += 1
214 yield record 216 yield record
215 assert taken == N, "Picked %i, wanted %i" % (taken, N) 217 assert taken == N, "Picked %i, wanted %i" % (taken, N)
216 else: 218 else:
217 def sampler(iterator): 219 def sampler(iterator):
220 """Sample given number of sequences."""
218 # Mimic the percentage sampler, with double check on final count 221 # Mimic the percentage sampler, with double check on final count
219 global N, total 222 global N, total
220 # Do we need a floating point fudge factor epsilon? 223 # Do we need a floating point fudge factor epsilon?
221 # i.e. What if percentage comes out slightly too low, and 224 # i.e. What if percentage comes out slightly too low, and
222 # we could end up missing last few desired sequences? 225 # we could end up missing last few desired sequences?
266 while True: 269 while True:
267 if line[0] != ">": 270 if line[0] != ">":
268 raise ValueError( 271 raise ValueError(
269 "Records in Fasta files should start with '>' character") 272 "Records in Fasta files should start with '>' character")
270 try: 273 try:
271 id = line[1:].split(None, 1)[0] 274 line[1:].split(None, 1)[0]
272 except IndexError: 275 except IndexError:
273 if not no_id_warned: 276 if not no_id_warned:
274 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") 277 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
275 no_id_warned = True 278 no_id_warned = True
276 id = None
277 lines = [line] 279 lines = [line]
278 line = handle.readline() 280 line = handle.readline()
279 while True: 281 while True:
280 if not line: 282 if not line:
281 break 283 break
344 count /= 2 346 count /= 2
345 else: 347 else:
346 count = writer.write_file(iterator_filter(SffIterator(in_handle))) 348 count = writer.write_file(iterator_filter(SffIterator(in_handle)))
347 return count 349 return count
348 350
351
349 if seq_format == "sff": 352 if seq_format == "sff":
350 count = sff_filter(in_file, out_file, sampler, interleaved) 353 count = sff_filter(in_file, out_file, sampler, interleaved)
351 elif seq_format == "fasta": 354 elif seq_format == "fasta":
352 count = fasta_filter(in_file, out_file, sampler, interleaved) 355 count = fasta_filter(in_file, out_file, sampler, interleaved)
353 elif seq_format.startswith("fastq"): 356 elif seq_format.startswith("fastq"):