Mercurial > repos > peterjc > sample_seqs
comparison tools/sample_seqs/sample_seqs.py @ 5:6b71ad5d43fb draft
v0.2.3 clarified help, internal cleanup of Python script
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 09:39:36 -0500 |
parents | 02c13ef1a669 |
children | 31f5701cd2e9 |
comparison
equal
deleted
inserted
replaced
4:d3aa9f25c24c | 5:6b71ad5d43fb |
---|---|
17 """ | 17 """ |
18 import os | 18 import os |
19 import sys | 19 import sys |
20 from optparse import OptionParser | 20 from optparse import OptionParser |
21 | 21 |
22 | 22 # Parse Command Line |
23 def sys_exit(msg, err=1): | |
24 sys.stderr.write(msg.rstrip() + "\n") | |
25 sys.exit(err) | |
26 | |
27 #Parse Command Line | |
28 usage = """Use as follows: | 23 usage = """Use as follows: |
29 | 24 |
30 $ python sample_seqs.py [options] | 25 $ python sample_seqs.py [options] |
31 | 26 |
32 e.g. Sample 20% of the reads: | 27 e.g. Sample 20% of the reads: |
33 | 28 |
34 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq | 29 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq |
35 | 30 |
36 This samples uniformly though the file, rather than at random, and therefore | 31 This samples uniformly though the file, rather than at random, and therefore |
37 should be reproducible. | 32 should be reproducible. |
33 | |
34 If you have interleaved paired reads, use the --interleaved switch. If | |
35 instead you have two matched files (one for each pair), run the two | |
36 twice with the same sampling options to make to matched smaller files. | |
38 """ | 37 """ |
39 parser = OptionParser(usage=usage) | 38 parser = OptionParser(usage=usage) |
40 parser.add_option('-i', '--input', dest='input', | 39 parser.add_option('-i', '--input', dest='input', |
41 default=None, help='Input sequences filename', | 40 default=None, help='Input sequences filename', |
42 metavar="FILE") | 41 metavar="FILE") |
62 default=False, action="store_true", | 61 default=False, action="store_true", |
63 help="Show version and quit") | 62 help="Show version and quit") |
64 options, args = parser.parse_args() | 63 options, args = parser.parse_args() |
65 | 64 |
66 if options.version: | 65 if options.version: |
67 print("v0.2.1") | 66 print("v0.2.3") |
68 sys.exit(0) | 67 sys.exit(0) |
68 | |
69 try: | |
70 from Bio import SeqIO | |
71 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
72 from Bio.SeqIO.FastaIO import SimpleFastaParser | |
73 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
74 except ImportError: | |
75 sys.exit("This script requires Biopython.") | |
69 | 76 |
70 in_file = options.input | 77 in_file = options.input |
71 out_file = options.output | 78 out_file = options.output |
72 interleaved = options.interleaved | 79 interleaved = options.interleaved |
73 | 80 |
74 if not in_file: | 81 if not in_file: |
75 sys_exit("Require an input filename") | 82 sys.exit("Require an input filename") |
76 if in_file != "/dev/stdin" and not os.path.isfile(in_file): | 83 if in_file != "/dev/stdin" and not os.path.isfile(in_file): |
77 sys_exit("Missing input file %r" % in_file) | 84 sys.exit("Missing input file %r" % in_file) |
78 if not out_file: | 85 if not out_file: |
79 sys_exit("Require an output filename") | 86 sys.exit("Require an output filename") |
80 if not options.format: | 87 if not options.format: |
81 sys_exit("Require the sequence format") | 88 sys.exit("Require the sequence format") |
82 seq_format = options.format.lower() | 89 seq_format = options.format.lower() |
83 | 90 |
84 | 91 |
85 def count_fasta(filename): | 92 def count_fasta(filename): |
86 from Bio.SeqIO.FastaIO import SimpleFastaParser | |
87 count = 0 | 93 count = 0 |
88 with open(filename) as handle: | 94 with open(filename) as handle: |
89 for title, seq in SimpleFastaParser(handle): | 95 for title, seq in SimpleFastaParser(handle): |
90 count += 1 | 96 count += 1 |
91 return count | 97 return count |
92 | 98 |
93 | 99 |
94 def count_fastq(filename): | 100 def count_fastq(filename): |
95 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
96 count = 0 | 101 count = 0 |
97 with open(filename) as handle: | 102 with open(filename) as handle: |
98 for title, seq, qual in FastqGeneralIterator(handle): | 103 for title, seq, qual in FastqGeneralIterator(handle): |
99 count += 1 | 104 count += 1 |
100 return count | 105 return count |
101 | 106 |
102 | 107 |
103 def count_sff(filename): | 108 def count_sff(filename): |
104 from Bio import SeqIO | |
105 # If the SFF file has a built in index (which is normal), | 109 # If the SFF file has a built in index (which is normal), |
106 # this will be parsed and is the quicker than scanning | 110 # this will be parsed and is the quicker than scanning |
107 # the whole file. | 111 # the whole file. |
108 return len(SeqIO.index(filename, "sff")) | 112 return len(SeqIO.index(filename, "sff")) |
109 | 113 |
110 | 114 |
111 def count_sequences(filename, format): | 115 def count_sequences(filename, format): |
112 if seq_format == "sff": | 116 if format == "sff": |
113 return count_sff(filename) | 117 return count_sff(filename) |
114 elif seq_format == "fasta": | 118 elif format == "fasta": |
115 return count_fasta(filename) | 119 return count_fasta(filename) |
116 elif seq_format.startswith("fastq"): | 120 elif format.startswith("fastq"): |
117 return count_fastq(filename) | 121 return count_fastq(filename) |
118 else: | 122 else: |
119 sys_exit("Unsupported file type %r" % seq_format) | 123 sys.exit("Unsupported file type %r" % format) |
120 | 124 |
121 | 125 |
122 if options.percent and options.everyn: | 126 if options.percent and options.everyn: |
123 sys_exit("Cannot combine -p and -n options") | 127 sys.exit("Cannot combine -p and -n options") |
124 elif options.everyn and options.count: | 128 elif options.everyn and options.count: |
125 sys_exit("Cannot combine -p and -c options") | 129 sys.exit("Cannot combine -p and -c options") |
126 elif options.percent and options.count: | 130 elif options.percent and options.count: |
127 sys_exit("Cannot combine -n and -c options") | 131 sys.exit("Cannot combine -n and -c options") |
128 elif options.everyn: | 132 elif options.everyn: |
129 try: | 133 try: |
130 N = int(options.everyn) | 134 N = int(options.everyn) |
131 except: | 135 except ValueError: |
132 sys_exit("Bad -n argument %r" % options.everyn) | 136 sys.exit("Bad -n argument %r" % options.everyn) |
133 if N < 2: | 137 if N < 2: |
134 sys_exit("Bad -n argument %r" % options.everyn) | 138 sys.exit("Bad -n argument %r" % options.everyn) |
135 if (N % 10) == 1: | 139 if (N % 10) == 1: |
136 sys.stderr.write("Sampling every %ist sequence\n" % N) | 140 sys.stderr.write("Sampling every %ist sequence\n" % N) |
137 elif (N % 10) == 2: | 141 elif (N % 10) == 2: |
138 sys.stderr.write("Sampling every %ind sequence\n" % N) | 142 sys.stderr.write("Sampling every %ind sequence\n" % N) |
139 elif (N % 10) == 3: | 143 elif (N % 10) == 3: |
140 sys.stderr.write("Sampling every %ird sequence\n" % N) | 144 sys.stderr.write("Sampling every %ird sequence\n" % N) |
141 else: | 145 else: |
142 sys.stderr.write("Sampling every %ith sequence\n" % N) | 146 sys.stderr.write("Sampling every %ith sequence\n" % N) |
147 | |
143 def sampler(iterator): | 148 def sampler(iterator): |
144 global N | 149 global N |
145 count = 0 | 150 count = 0 |
146 for record in iterator: | 151 for record in iterator: |
147 count += 1 | 152 count += 1 |
148 if count % N == 1: | 153 if count % N == 1: |
149 yield record | 154 yield record |
150 elif options.percent: | 155 elif options.percent: |
151 try: | 156 try: |
152 percent = float(options.percent) / 100.0 | 157 percent = float(options.percent) / 100.0 |
153 except: | 158 except ValueError: |
154 sys_exit("Bad -p percent argument %r" % options.percent) | 159 sys.exit("Bad -p percent argument %r" % options.percent) |
155 if percent <= 0.0 or 1.0 <= percent: | 160 if percent <= 0.0 or 1.0 <= percent: |
156 sys_exit("Bad -p percent argument %r" % options.percent) | 161 sys.exit("Bad -p percent argument %r" % options.percent) |
157 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) | 162 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) |
163 | |
158 def sampler(iterator): | 164 def sampler(iterator): |
159 global percent | 165 global percent |
160 count = 0 | 166 count = 0 |
161 taken = 0 | 167 taken = 0 |
162 for record in iterator: | 168 for record in iterator: |
165 taken += 1 | 171 taken += 1 |
166 yield record | 172 yield record |
167 elif options.count: | 173 elif options.count: |
168 try: | 174 try: |
169 N = int(options.count) | 175 N = int(options.count) |
170 except: | 176 except ValueError: |
171 sys_exit("Bad -c count argument %r" % options.count) | 177 sys.exit("Bad -c count argument %r" % options.count) |
172 if N < 1: | 178 if N < 1: |
173 sys_exit("Bad -c count argument %r" % options.count) | 179 sys.exit("Bad -c count argument %r" % options.count) |
174 total = count_sequences(in_file, seq_format) | 180 total = count_sequences(in_file, seq_format) |
175 sys.stderr.write("Input file has %i sequences\n" % total) | 181 sys.stderr.write("Input file has %i sequences\n" % total) |
176 if interleaved: | 182 if interleaved: |
177 # Paired | 183 # Paired |
178 if total % 2: | 184 if total % 2: |
179 sys_exit("Paired mode, but input file has an odd number of sequences: %i" | 185 sys.exit("Paired mode, but input file has an odd number of sequences: %i" |
180 % total) | 186 % total) |
181 elif N > total // 2: | 187 elif N > total // 2: |
182 sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." | 188 sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." |
183 % (N, total // 2, total)) | 189 % (N, total // 2, total)) |
184 total = total // 2 | 190 total = total // 2 |
185 if N == 1: | 191 if N == 1: |
186 sys.stderr.write("Sampling just first sequence pair!\n") | 192 sys.stderr.write("Sampling just first sequence pair!\n") |
187 elif N == total: | 193 elif N == total: |
189 else: | 195 else: |
190 sys.stderr.write("Sampling %i sequence pairs\n" % N) | 196 sys.stderr.write("Sampling %i sequence pairs\n" % N) |
191 else: | 197 else: |
192 # Not paired | 198 # Not paired |
193 if total < N: | 199 if total < N: |
194 sys_exit("Requested %i sequences, but file only has %i." % (N, total)) | 200 sys.exit("Requested %i sequences, but file only has %i." % (N, total)) |
195 if N == 1: | 201 if N == 1: |
196 sys.stderr.write("Sampling just first sequence!\n") | 202 sys.stderr.write("Sampling just first sequence!\n") |
197 elif N == total: | 203 elif N == total: |
198 sys.stderr.write("Taking all the sequences\n") | 204 sys.stderr.write("Taking all the sequences\n") |
199 else: | 205 else: |
213 global N, total | 219 global N, total |
214 # Do we need a floating point fudge factor epsilon? | 220 # Do we need a floating point fudge factor epsilon? |
215 # i.e. What if percentage comes out slighty too low, and | 221 # i.e. What if percentage comes out slighty too low, and |
216 # we could end up missing last few desired sequences? | 222 # we could end up missing last few desired sequences? |
217 percentage = float(N) / float(total) | 223 percentage = float(N) / float(total) |
218 #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" | 224 # print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" |
219 # % (N, total, percentage * 100.0)) | 225 # % (N, total, percentage * 100.0)) |
220 count = 0 | 226 count = 0 |
221 taken = 0 | 227 taken = 0 |
222 for record in iterator: | 228 for record in iterator: |
223 count += 1 | 229 count += 1 |
231 # we need to take all remaining sequences to meet target | 237 # we need to take all remaining sequences to meet target |
232 taken += 1 | 238 taken += 1 |
233 yield record | 239 yield record |
234 assert taken == N, "Picked %i, wanted %i" % (taken, N) | 240 assert taken == N, "Picked %i, wanted %i" % (taken, N) |
235 else: | 241 else: |
236 sys_exit("Must use either -n, -p or -c") | 242 sys.exit("Must use either -n, -p or -c") |
237 | 243 |
238 | 244 |
239 def pair(iterator): | 245 def pair(iterator): |
240 """Quick and dirty pair batched iterator.""" | 246 """Quick and dirty pair batched iterator.""" |
241 while True: | 247 while True: |
250 def raw_fasta_iterator(handle): | 256 def raw_fasta_iterator(handle): |
251 """Yields raw FASTA records as multi-line strings.""" | 257 """Yields raw FASTA records as multi-line strings.""" |
252 while True: | 258 while True: |
253 line = handle.readline() | 259 line = handle.readline() |
254 if line == "": | 260 if line == "": |
255 return # Premature end of file, or just empty? | 261 return # Premature end of file, or just empty? |
256 if line[0] == ">": | 262 if line[0] == ">": |
257 break | 263 break |
258 | 264 |
259 no_id_warned = False | 265 no_id_warned = False |
260 while True: | 266 while True: |
277 break | 283 break |
278 lines.append(line) | 284 lines.append(line) |
279 line = handle.readline() | 285 line = handle.readline() |
280 yield "".join(lines) | 286 yield "".join(lines) |
281 if not line: | 287 if not line: |
282 return # StopIteration | 288 return # StopIteration |
289 | |
283 | 290 |
284 def fasta_filter(in_file, out_file, iterator_filter, inter): | 291 def fasta_filter(in_file, out_file, iterator_filter, inter): |
285 count = 0 | 292 count = 0 |
286 #Galaxy now requires Python 2.5+ so can use with statements, | 293 # Galaxy now requires Python 2.5+ so can use with statements, |
287 with open(in_file) as in_handle: | 294 with open(in_file) as in_handle: |
288 with open(out_file, "w") as pos_handle: | 295 with open(out_file, "w") as pos_handle: |
289 if inter: | 296 if inter: |
290 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): | 297 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): |
291 count += 1 | 298 count += 1 |
296 count += 1 | 303 count += 1 |
297 pos_handle.write(record) | 304 pos_handle.write(record) |
298 return count | 305 return count |
299 | 306 |
300 | 307 |
301 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
302 def fastq_filter(in_file, out_file, iterator_filter, inter): | 308 def fastq_filter(in_file, out_file, iterator_filter, inter): |
303 count = 0 | 309 count = 0 |
304 with open(in_file) as in_handle: | 310 with open(in_file) as in_handle: |
305 with open(out_file, "w") as pos_handle: | 311 with open(out_file, "w") as pos_handle: |
306 if inter: | 312 if inter: |
316 | 322 |
317 | 323 |
318 def sff_filter(in_file, out_file, iterator_filter, inter): | 324 def sff_filter(in_file, out_file, iterator_filter, inter): |
319 count = 0 | 325 count = 0 |
320 try: | 326 try: |
321 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
322 except ImportError: | |
323 sys_exit("SFF filtering requires Biopython 1.54 or later") | |
324 try: | |
325 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | 327 from Bio.SeqIO.SffIO import ReadRocheXmlManifest |
326 except ImportError: | 328 except ImportError: |
327 #Prior to Biopython 1.56 this was a private function | 329 # Prior to Biopython 1.56 this was a private function |
328 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | 330 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
329 with open(in_file, "rb") as in_handle: | 331 with open(in_file, "rb") as in_handle: |
330 try: | 332 try: |
331 manifest = ReadRocheXmlManifest(in_handle) | 333 manifest = ReadRocheXmlManifest(in_handle) |
332 except ValueError: | 334 except ValueError: |
333 manifest = None | 335 manifest = None |
334 in_handle.seek(0) | 336 in_handle.seek(0) |
335 with open(out_file, "wb") as out_handle: | 337 with open(out_file, "wb") as out_handle: |
336 writer = SffWriter(out_handle, xml=manifest) | 338 writer = SffWriter(out_handle, xml=manifest) |
337 in_handle.seek(0) #start again after getting manifest | 339 in_handle.seek(0) # start again after getting manifest |
338 if inter: | 340 if inter: |
339 from itertools import chain | 341 from itertools import chain |
340 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) | 342 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) |
341 assert count % 2 == 0, "Odd number of records? %i" % count | 343 assert count % 2 == 0, "Odd number of records? %i" % count |
342 count /= 2 | 344 count /= 2 |
343 else: | 345 else: |
344 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | 346 count = writer.write_file(iterator_filter(SffIterator(in_handle))) |
345 #count = writer.write_file(SffIterator(in_handle)) | |
346 return count | 347 return count |
347 | 348 |
348 if seq_format == "sff": | 349 if seq_format == "sff": |
349 count = sff_filter(in_file, out_file, sampler, interleaved) | 350 count = sff_filter(in_file, out_file, sampler, interleaved) |
350 elif seq_format == "fasta": | 351 elif seq_format == "fasta": |
351 count = fasta_filter(in_file, out_file, sampler, interleaved) | 352 count = fasta_filter(in_file, out_file, sampler, interleaved) |
352 elif seq_format.startswith("fastq"): | 353 elif seq_format.startswith("fastq"): |
353 count = fastq_filter(in_file, out_file, sampler, interleaved) | 354 count = fastq_filter(in_file, out_file, sampler, interleaved) |
354 else: | 355 else: |
355 sys_exit("Unsupported file type %r" % seq_format) | 356 sys.exit("Unsupported file type %r" % seq_format) |
356 | 357 |
357 if interleaved: | 358 if interleaved: |
358 sys.stderr.write("Selected %i pairs\n" % count) | 359 sys.stderr.write("Selected %i pairs\n" % count) |
359 else: | 360 else: |
360 sys.stderr.write("Selected %i records\n" % count) | 361 sys.stderr.write("Selected %i records\n" % count) |