comparison tools/sample_seqs/sample_seqs.py @ 5:6b71ad5d43fb draft

v0.2.3 clarified help, internal cleanup of Python script
author peterjc
date Wed, 01 Feb 2017 09:39:36 -0500
parents 02c13ef1a669
children 31f5701cd2e9
comparison
equal deleted inserted replaced
4:d3aa9f25c24c 5:6b71ad5d43fb
17 """ 17 """
18 import os 18 import os
19 import sys 19 import sys
20 from optparse import OptionParser 20 from optparse import OptionParser
21 21
22 22 # Parse Command Line
23 def sys_exit(msg, err=1):
24 sys.stderr.write(msg.rstrip() + "\n")
25 sys.exit(err)
26
27 #Parse Command Line
28 usage = """Use as follows: 23 usage = """Use as follows:
29 24
30 $ python sample_seqs.py [options] 25 $ python sample_seqs.py [options]
31 26
32 e.g. Sample 20% of the reads: 27 e.g. Sample 20% of the reads:
33 28
34 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq 29 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq
35 30
36 This samples uniformly though the file, rather than at random, and therefore 31 This samples uniformly though the file, rather than at random, and therefore
37 should be reproducible. 32 should be reproducible.
33
34 If you have interleaved paired reads, use the --interleaved switch. If
35 instead you have two matched files (one for each pair), run the two
36 twice with the same sampling options to make to matched smaller files.
38 """ 37 """
39 parser = OptionParser(usage=usage) 38 parser = OptionParser(usage=usage)
40 parser.add_option('-i', '--input', dest='input', 39 parser.add_option('-i', '--input', dest='input',
41 default=None, help='Input sequences filename', 40 default=None, help='Input sequences filename',
42 metavar="FILE") 41 metavar="FILE")
62 default=False, action="store_true", 61 default=False, action="store_true",
63 help="Show version and quit") 62 help="Show version and quit")
64 options, args = parser.parse_args() 63 options, args = parser.parse_args()
65 64
66 if options.version: 65 if options.version:
67 print("v0.2.1") 66 print("v0.2.3")
68 sys.exit(0) 67 sys.exit(0)
68
69 try:
70 from Bio import SeqIO
71 from Bio.SeqIO.QualityIO import FastqGeneralIterator
72 from Bio.SeqIO.FastaIO import SimpleFastaParser
73 from Bio.SeqIO.SffIO import SffIterator, SffWriter
74 except ImportError:
75 sys.exit("This script requires Biopython.")
69 76
70 in_file = options.input 77 in_file = options.input
71 out_file = options.output 78 out_file = options.output
72 interleaved = options.interleaved 79 interleaved = options.interleaved
73 80
74 if not in_file: 81 if not in_file:
75 sys_exit("Require an input filename") 82 sys.exit("Require an input filename")
76 if in_file != "/dev/stdin" and not os.path.isfile(in_file): 83 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
77 sys_exit("Missing input file %r" % in_file) 84 sys.exit("Missing input file %r" % in_file)
78 if not out_file: 85 if not out_file:
79 sys_exit("Require an output filename") 86 sys.exit("Require an output filename")
80 if not options.format: 87 if not options.format:
81 sys_exit("Require the sequence format") 88 sys.exit("Require the sequence format")
82 seq_format = options.format.lower() 89 seq_format = options.format.lower()
83 90
84 91
85 def count_fasta(filename): 92 def count_fasta(filename):
86 from Bio.SeqIO.FastaIO import SimpleFastaParser
87 count = 0 93 count = 0
88 with open(filename) as handle: 94 with open(filename) as handle:
89 for title, seq in SimpleFastaParser(handle): 95 for title, seq in SimpleFastaParser(handle):
90 count += 1 96 count += 1
91 return count 97 return count
92 98
93 99
94 def count_fastq(filename): 100 def count_fastq(filename):
95 from Bio.SeqIO.QualityIO import FastqGeneralIterator
96 count = 0 101 count = 0
97 with open(filename) as handle: 102 with open(filename) as handle:
98 for title, seq, qual in FastqGeneralIterator(handle): 103 for title, seq, qual in FastqGeneralIterator(handle):
99 count += 1 104 count += 1
100 return count 105 return count
101 106
102 107
103 def count_sff(filename): 108 def count_sff(filename):
104 from Bio import SeqIO
105 # If the SFF file has a built in index (which is normal), 109 # If the SFF file has a built in index (which is normal),
106 # this will be parsed and is the quicker than scanning 110 # this will be parsed and is the quicker than scanning
107 # the whole file. 111 # the whole file.
108 return len(SeqIO.index(filename, "sff")) 112 return len(SeqIO.index(filename, "sff"))
109 113
110 114
111 def count_sequences(filename, format): 115 def count_sequences(filename, format):
112 if seq_format == "sff": 116 if format == "sff":
113 return count_sff(filename) 117 return count_sff(filename)
114 elif seq_format == "fasta": 118 elif format == "fasta":
115 return count_fasta(filename) 119 return count_fasta(filename)
116 elif seq_format.startswith("fastq"): 120 elif format.startswith("fastq"):
117 return count_fastq(filename) 121 return count_fastq(filename)
118 else: 122 else:
119 sys_exit("Unsupported file type %r" % seq_format) 123 sys.exit("Unsupported file type %r" % format)
120 124
121 125
122 if options.percent and options.everyn: 126 if options.percent and options.everyn:
123 sys_exit("Cannot combine -p and -n options") 127 sys.exit("Cannot combine -p and -n options")
124 elif options.everyn and options.count: 128 elif options.everyn and options.count:
125 sys_exit("Cannot combine -p and -c options") 129 sys.exit("Cannot combine -p and -c options")
126 elif options.percent and options.count: 130 elif options.percent and options.count:
127 sys_exit("Cannot combine -n and -c options") 131 sys.exit("Cannot combine -n and -c options")
128 elif options.everyn: 132 elif options.everyn:
129 try: 133 try:
130 N = int(options.everyn) 134 N = int(options.everyn)
131 except: 135 except ValueError:
132 sys_exit("Bad -n argument %r" % options.everyn) 136 sys.exit("Bad -n argument %r" % options.everyn)
133 if N < 2: 137 if N < 2:
134 sys_exit("Bad -n argument %r" % options.everyn) 138 sys.exit("Bad -n argument %r" % options.everyn)
135 if (N % 10) == 1: 139 if (N % 10) == 1:
136 sys.stderr.write("Sampling every %ist sequence\n" % N) 140 sys.stderr.write("Sampling every %ist sequence\n" % N)
137 elif (N % 10) == 2: 141 elif (N % 10) == 2:
138 sys.stderr.write("Sampling every %ind sequence\n" % N) 142 sys.stderr.write("Sampling every %ind sequence\n" % N)
139 elif (N % 10) == 3: 143 elif (N % 10) == 3:
140 sys.stderr.write("Sampling every %ird sequence\n" % N) 144 sys.stderr.write("Sampling every %ird sequence\n" % N)
141 else: 145 else:
142 sys.stderr.write("Sampling every %ith sequence\n" % N) 146 sys.stderr.write("Sampling every %ith sequence\n" % N)
147
143 def sampler(iterator): 148 def sampler(iterator):
144 global N 149 global N
145 count = 0 150 count = 0
146 for record in iterator: 151 for record in iterator:
147 count += 1 152 count += 1
148 if count % N == 1: 153 if count % N == 1:
149 yield record 154 yield record
150 elif options.percent: 155 elif options.percent:
151 try: 156 try:
152 percent = float(options.percent) / 100.0 157 percent = float(options.percent) / 100.0
153 except: 158 except ValueError:
154 sys_exit("Bad -p percent argument %r" % options.percent) 159 sys.exit("Bad -p percent argument %r" % options.percent)
155 if percent <= 0.0 or 1.0 <= percent: 160 if percent <= 0.0 or 1.0 <= percent:
156 sys_exit("Bad -p percent argument %r" % options.percent) 161 sys.exit("Bad -p percent argument %r" % options.percent)
157 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) 162 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
163
158 def sampler(iterator): 164 def sampler(iterator):
159 global percent 165 global percent
160 count = 0 166 count = 0
161 taken = 0 167 taken = 0
162 for record in iterator: 168 for record in iterator:
165 taken += 1 171 taken += 1
166 yield record 172 yield record
167 elif options.count: 173 elif options.count:
168 try: 174 try:
169 N = int(options.count) 175 N = int(options.count)
170 except: 176 except ValueError:
171 sys_exit("Bad -c count argument %r" % options.count) 177 sys.exit("Bad -c count argument %r" % options.count)
172 if N < 1: 178 if N < 1:
173 sys_exit("Bad -c count argument %r" % options.count) 179 sys.exit("Bad -c count argument %r" % options.count)
174 total = count_sequences(in_file, seq_format) 180 total = count_sequences(in_file, seq_format)
175 sys.stderr.write("Input file has %i sequences\n" % total) 181 sys.stderr.write("Input file has %i sequences\n" % total)
176 if interleaved: 182 if interleaved:
177 # Paired 183 # Paired
178 if total % 2: 184 if total % 2:
179 sys_exit("Paired mode, but input file has an odd number of sequences: %i" 185 sys.exit("Paired mode, but input file has an odd number of sequences: %i"
180 % total) 186 % total)
181 elif N > total // 2: 187 elif N > total // 2:
182 sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." 188 sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
183 % (N, total // 2, total)) 189 % (N, total // 2, total))
184 total = total // 2 190 total = total // 2
185 if N == 1: 191 if N == 1:
186 sys.stderr.write("Sampling just first sequence pair!\n") 192 sys.stderr.write("Sampling just first sequence pair!\n")
187 elif N == total: 193 elif N == total:
189 else: 195 else:
190 sys.stderr.write("Sampling %i sequence pairs\n" % N) 196 sys.stderr.write("Sampling %i sequence pairs\n" % N)
191 else: 197 else:
192 # Not paired 198 # Not paired
193 if total < N: 199 if total < N:
194 sys_exit("Requested %i sequences, but file only has %i." % (N, total)) 200 sys.exit("Requested %i sequences, but file only has %i." % (N, total))
195 if N == 1: 201 if N == 1:
196 sys.stderr.write("Sampling just first sequence!\n") 202 sys.stderr.write("Sampling just first sequence!\n")
197 elif N == total: 203 elif N == total:
198 sys.stderr.write("Taking all the sequences\n") 204 sys.stderr.write("Taking all the sequences\n")
199 else: 205 else:
213 global N, total 219 global N, total
214 # Do we need a floating point fudge factor epsilon? 220 # Do we need a floating point fudge factor epsilon?
215 # i.e. What if percentage comes out slighty too low, and 221 # i.e. What if percentage comes out slighty too low, and
216 # we could end up missing last few desired sequences? 222 # we could end up missing last few desired sequences?
217 percentage = float(N) / float(total) 223 percentage = float(N) / float(total)
218 #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" 224 # print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f"
219 # % (N, total, percentage * 100.0)) 225 # % (N, total, percentage * 100.0))
220 count = 0 226 count = 0
221 taken = 0 227 taken = 0
222 for record in iterator: 228 for record in iterator:
223 count += 1 229 count += 1
231 # we need to take all remaining sequences to meet target 237 # we need to take all remaining sequences to meet target
232 taken += 1 238 taken += 1
233 yield record 239 yield record
234 assert taken == N, "Picked %i, wanted %i" % (taken, N) 240 assert taken == N, "Picked %i, wanted %i" % (taken, N)
235 else: 241 else:
236 sys_exit("Must use either -n, -p or -c") 242 sys.exit("Must use either -n, -p or -c")
237 243
238 244
239 def pair(iterator): 245 def pair(iterator):
240 """Quick and dirty pair batched iterator.""" 246 """Quick and dirty pair batched iterator."""
241 while True: 247 while True:
250 def raw_fasta_iterator(handle): 256 def raw_fasta_iterator(handle):
251 """Yields raw FASTA records as multi-line strings.""" 257 """Yields raw FASTA records as multi-line strings."""
252 while True: 258 while True:
253 line = handle.readline() 259 line = handle.readline()
254 if line == "": 260 if line == "":
255 return # Premature end of file, or just empty? 261 return # Premature end of file, or just empty?
256 if line[0] == ">": 262 if line[0] == ">":
257 break 263 break
258 264
259 no_id_warned = False 265 no_id_warned = False
260 while True: 266 while True:
277 break 283 break
278 lines.append(line) 284 lines.append(line)
279 line = handle.readline() 285 line = handle.readline()
280 yield "".join(lines) 286 yield "".join(lines)
281 if not line: 287 if not line:
282 return # StopIteration 288 return # StopIteration
289
283 290
284 def fasta_filter(in_file, out_file, iterator_filter, inter): 291 def fasta_filter(in_file, out_file, iterator_filter, inter):
285 count = 0 292 count = 0
286 #Galaxy now requires Python 2.5+ so can use with statements, 293 # Galaxy now requires Python 2.5+ so can use with statements,
287 with open(in_file) as in_handle: 294 with open(in_file) as in_handle:
288 with open(out_file, "w") as pos_handle: 295 with open(out_file, "w") as pos_handle:
289 if inter: 296 if inter:
290 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): 297 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))):
291 count += 1 298 count += 1
296 count += 1 303 count += 1
297 pos_handle.write(record) 304 pos_handle.write(record)
298 return count 305 return count
299 306
300 307
301 from Bio.SeqIO.QualityIO import FastqGeneralIterator
302 def fastq_filter(in_file, out_file, iterator_filter, inter): 308 def fastq_filter(in_file, out_file, iterator_filter, inter):
303 count = 0 309 count = 0
304 with open(in_file) as in_handle: 310 with open(in_file) as in_handle:
305 with open(out_file, "w") as pos_handle: 311 with open(out_file, "w") as pos_handle:
306 if inter: 312 if inter:
316 322
317 323
318 def sff_filter(in_file, out_file, iterator_filter, inter): 324 def sff_filter(in_file, out_file, iterator_filter, inter):
319 count = 0 325 count = 0
320 try: 326 try:
321 from Bio.SeqIO.SffIO import SffIterator, SffWriter
322 except ImportError:
323 sys_exit("SFF filtering requires Biopython 1.54 or later")
324 try:
325 from Bio.SeqIO.SffIO import ReadRocheXmlManifest 327 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
326 except ImportError: 328 except ImportError:
327 #Prior to Biopython 1.56 this was a private function 329 # Prior to Biopython 1.56 this was a private function
328 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest 330 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
329 with open(in_file, "rb") as in_handle: 331 with open(in_file, "rb") as in_handle:
330 try: 332 try:
331 manifest = ReadRocheXmlManifest(in_handle) 333 manifest = ReadRocheXmlManifest(in_handle)
332 except ValueError: 334 except ValueError:
333 manifest = None 335 manifest = None
334 in_handle.seek(0) 336 in_handle.seek(0)
335 with open(out_file, "wb") as out_handle: 337 with open(out_file, "wb") as out_handle:
336 writer = SffWriter(out_handle, xml=manifest) 338 writer = SffWriter(out_handle, xml=manifest)
337 in_handle.seek(0) #start again after getting manifest 339 in_handle.seek(0) # start again after getting manifest
338 if inter: 340 if inter:
339 from itertools import chain 341 from itertools import chain
340 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) 342 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
341 assert count % 2 == 0, "Odd number of records? %i" % count 343 assert count % 2 == 0, "Odd number of records? %i" % count
342 count /= 2 344 count /= 2
343 else: 345 else:
344 count = writer.write_file(iterator_filter(SffIterator(in_handle))) 346 count = writer.write_file(iterator_filter(SffIterator(in_handle)))
345 #count = writer.write_file(SffIterator(in_handle))
346 return count 347 return count
347 348
348 if seq_format == "sff": 349 if seq_format == "sff":
349 count = sff_filter(in_file, out_file, sampler, interleaved) 350 count = sff_filter(in_file, out_file, sampler, interleaved)
350 elif seq_format == "fasta": 351 elif seq_format == "fasta":
351 count = fasta_filter(in_file, out_file, sampler, interleaved) 352 count = fasta_filter(in_file, out_file, sampler, interleaved)
352 elif seq_format.startswith("fastq"): 353 elif seq_format.startswith("fastq"):
353 count = fastq_filter(in_file, out_file, sampler, interleaved) 354 count = fastq_filter(in_file, out_file, sampler, interleaved)
354 else: 355 else:
355 sys_exit("Unsupported file type %r" % seq_format) 356 sys.exit("Unsupported file type %r" % seq_format)
356 357
357 if interleaved: 358 if interleaved:
358 sys.stderr.write("Selected %i pairs\n" % count) 359 sys.stderr.write("Selected %i pairs\n" % count)
359 else: 360 else:
360 sys.stderr.write("Selected %i records\n" % count) 361 sys.stderr.write("Selected %i records\n" % count)