Mercurial > repos > peterjc > sample_seqs
diff tools/sample_seqs/sample_seqs.py @ 5:6b71ad5d43fb draft
v0.2.3 clarified help, internal cleanup of Python script
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 09:39:36 -0500 |
parents | 02c13ef1a669 |
children | 31f5701cd2e9 |
line wrap: on
line diff
--- a/tools/sample_seqs/sample_seqs.py Wed Aug 05 12:30:18 2015 -0400 +++ b/tools/sample_seqs/sample_seqs.py Wed Feb 01 09:39:36 2017 -0500 @@ -19,12 +19,7 @@ import sys from optparse import OptionParser - -def sys_exit(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - -#Parse Command Line +# Parse Command Line usage = """Use as follows: $ python sample_seqs.py [options] @@ -35,6 +30,10 @@ This samples uniformly though the file, rather than at random, and therefore should be reproducible. + +If you have interleaved paired reads, use the --interleaved switch. If +instead you have two matched files (one for each pair), run the two +twice with the same sampling options to make to matched smaller files. """ parser = OptionParser(usage=usage) parser.add_option('-i', '--input', dest='input', @@ -64,26 +63,33 @@ options, args = parser.parse_args() if options.version: - print("v0.2.1") + print("v0.2.3") sys.exit(0) +try: + from Bio import SeqIO + from Bio.SeqIO.QualityIO import FastqGeneralIterator + from Bio.SeqIO.FastaIO import SimpleFastaParser + from Bio.SeqIO.SffIO import SffIterator, SffWriter +except ImportError: + sys.exit("This script requires Biopython.") + in_file = options.input out_file = options.output interleaved = options.interleaved if not in_file: - sys_exit("Require an input filename") + sys.exit("Require an input filename") if in_file != "/dev/stdin" and not os.path.isfile(in_file): - sys_exit("Missing input file %r" % in_file) + sys.exit("Missing input file %r" % in_file) if not out_file: - sys_exit("Require an output filename") + sys.exit("Require an output filename") if not options.format: - sys_exit("Require the sequence format") + sys.exit("Require the sequence format") seq_format = options.format.lower() def count_fasta(filename): - from Bio.SeqIO.FastaIO import SimpleFastaParser count = 0 with open(filename) as handle: for title, seq in SimpleFastaParser(handle): @@ -92,7 +98,6 @@ def count_fastq(filename): - from Bio.SeqIO.QualityIO import FastqGeneralIterator count = 0 with open(filename) as handle: for title, seq, qual in FastqGeneralIterator(handle): @@ -101,7 +106,6 @@ def count_sff(filename): - from Bio import SeqIO # If the SFF file has a built in index (which is normal), # this will be parsed and is the quicker than scanning # the whole file. @@ -109,29 +113,29 @@ def count_sequences(filename, format): - if seq_format == "sff": + if format == "sff": return count_sff(filename) - elif seq_format == "fasta": + elif format == "fasta": return count_fasta(filename) - elif seq_format.startswith("fastq"): + elif format.startswith("fastq"): return count_fastq(filename) else: - sys_exit("Unsupported file type %r" % seq_format) + sys.exit("Unsupported file type %r" % format) if options.percent and options.everyn: - sys_exit("Cannot combine -p and -n options") + sys.exit("Cannot combine -p and -n options") elif options.everyn and options.count: - sys_exit("Cannot combine -p and -c options") + sys.exit("Cannot combine -p and -c options") elif options.percent and options.count: - sys_exit("Cannot combine -n and -c options") + sys.exit("Cannot combine -n and -c options") elif options.everyn: try: N = int(options.everyn) - except: - sys_exit("Bad -n argument %r" % options.everyn) + except ValueError: + sys.exit("Bad -n argument %r" % options.everyn) if N < 2: - sys_exit("Bad -n argument %r" % options.everyn) + sys.exit("Bad -n argument %r" % options.everyn) if (N % 10) == 1: sys.stderr.write("Sampling every %ist sequence\n" % N) elif (N % 10) == 2: @@ -140,6 +144,7 @@ sys.stderr.write("Sampling every %ird sequence\n" % N) else: sys.stderr.write("Sampling every %ith sequence\n" % N) + def sampler(iterator): global N count = 0 @@ -150,11 +155,12 @@ elif options.percent: try: percent = float(options.percent) / 100.0 - except: - sys_exit("Bad -p percent argument %r" % options.percent) + except ValueError: + sys.exit("Bad -p percent argument %r" % options.percent) if percent <= 0.0 or 1.0 <= percent: - sys_exit("Bad -p percent argument %r" % options.percent) + sys.exit("Bad -p percent argument %r" % options.percent) sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) + def sampler(iterator): global percent count = 0 @@ -167,19 +173,19 @@ elif options.count: try: N = int(options.count) - except: - sys_exit("Bad -c count argument %r" % options.count) + except ValueError: + sys.exit("Bad -c count argument %r" % options.count) if N < 1: - sys_exit("Bad -c count argument %r" % options.count) + sys.exit("Bad -c count argument %r" % options.count) total = count_sequences(in_file, seq_format) sys.stderr.write("Input file has %i sequences\n" % total) if interleaved: # Paired if total % 2: - sys_exit("Paired mode, but input file has an odd number of sequences: %i" + sys.exit("Paired mode, but input file has an odd number of sequences: %i" % total) elif N > total // 2: - sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." + sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." % (N, total // 2, total)) total = total // 2 if N == 1: @@ -191,7 +197,7 @@ else: # Not paired if total < N: - sys_exit("Requested %i sequences, but file only has %i." % (N, total)) + sys.exit("Requested %i sequences, but file only has %i." % (N, total)) if N == 1: sys.stderr.write("Sampling just first sequence!\n") elif N == total: @@ -215,7 +221,7 @@ # i.e. What if percentage comes out slighty too low, and # we could end up missing last few desired sequences? percentage = float(N) / float(total) - #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" + # print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" # % (N, total, percentage * 100.0)) count = 0 taken = 0 @@ -233,7 +239,7 @@ yield record assert taken == N, "Picked %i, wanted %i" % (taken, N) else: - sys_exit("Must use either -n, -p or -c") + sys.exit("Must use either -n, -p or -c") def pair(iterator): @@ -252,7 +258,7 @@ while True: line = handle.readline() if line == "": - return # Premature end of file, or just empty? + return # Premature end of file, or just empty? if line[0] == ">": break @@ -279,11 +285,12 @@ line = handle.readline() yield "".join(lines) if not line: - return # StopIteration + return # StopIteration + def fasta_filter(in_file, out_file, iterator_filter, inter): count = 0 - #Galaxy now requires Python 2.5+ so can use with statements, + # Galaxy now requires Python 2.5+ so can use with statements, with open(in_file) as in_handle: with open(out_file, "w") as pos_handle: if inter: @@ -298,7 +305,6 @@ return count -from Bio.SeqIO.QualityIO import FastqGeneralIterator def fastq_filter(in_file, out_file, iterator_filter, inter): count = 0 with open(in_file) as in_handle: @@ -318,13 +324,9 @@ def sff_filter(in_file, out_file, iterator_filter, inter): count = 0 try: - from Bio.SeqIO.SffIO import SffIterator, SffWriter - except ImportError: - sys_exit("SFF filtering requires Biopython 1.54 or later") - try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: - #Prior to Biopython 1.56 this was a private function + # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest with open(in_file, "rb") as in_handle: try: @@ -334,7 +336,7 @@ in_handle.seek(0) with open(out_file, "wb") as out_handle: writer = SffWriter(out_handle, xml=manifest) - in_handle.seek(0) #start again after getting manifest + in_handle.seek(0) # start again after getting manifest if inter: from itertools import chain count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) @@ -342,7 +344,6 @@ count /= 2 else: count = writer.write_file(iterator_filter(SffIterator(in_handle))) - #count = writer.write_file(SffIterator(in_handle)) return count if seq_format == "sff": @@ -352,7 +353,7 @@ elif seq_format.startswith("fastq"): count = fastq_filter(in_file, out_file, sampler, interleaved) else: - sys_exit("Unsupported file type %r" % seq_format) + sys.exit("Unsupported file type %r" % seq_format) if interleaved: sys.stderr.write("Selected %i pairs\n" % count)