sample_seqs: tools/sample_seqs/sample

comparison tools/sample_seqs/sample_seqs.py @ 5:6b71ad5d43fb draft

v0.2.3 clarified help, internal cleanup of Python script

author	peterjc
date	Wed, 01 Feb 2017 09:39:36 -0500
parents	02c13ef1a669
children	31f5701cd2e9

comparison

equal deleted inserted replaced

-:d3aa9f25c24c
+:6b71ad5d43fb
 """
 import os
 import sys
 from optparse import OptionParser
+# Parse Command Line
-def sys_exit(msg, err=1):
-sys.stderr.write(msg.rstrip() + "\n")
-sys.exit(err)
-#Parse Command Line
 usage = """Use as follows:
 $ python sample_seqs.py [options]
 e.g. Sample 20% of the reads:
 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq
 This samples uniformly though the file, rather than at random, and therefore
 should be reproducible.
+If you have interleaved paired reads, use the --interleaved switch. If
+instead you have two matched files (one for each pair), run the tool
+twice with the same sampling options to make two matched smaller files.
 """
 parser = OptionParser(usage=usage)
 parser.add_option('-i', '--input', dest='input',
 default=None, help='Input sequences filename',
 metavar="FILE")
 default=False, action="store_true",
 help="Show version and quit")
 options, args = parser.parse_args()
 if options.version:
-print("v0.2.1")
+print("v0.2.3")
 sys.exit(0)
+try:
+from Bio import SeqIO
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+from Bio.SeqIO.FastaIO import SimpleFastaParser
+from Bio.SeqIO.SffIO import SffIterator, SffWriter
+except ImportError:
+sys.exit("This script requires Biopython.")
 in_file = options.input
 out_file = options.output
 interleaved = options.interleaved
 if not in_file:
-sys_exit("Require an input filename")
+sys.exit("Require an input filename")
 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
-sys_exit("Missing input file %r" % in_file)
+sys.exit("Missing input file %r" % in_file)
 if not out_file:
-sys_exit("Require an output filename")
+sys.exit("Require an output filename")
 if not options.format:
-sys_exit("Require the sequence format")
+sys.exit("Require the sequence format")
 seq_format = options.format.lower()
 def count_fasta(filename):
-from Bio.SeqIO.FastaIO import SimpleFastaParser
 count = 0
 with open(filename) as handle:
 for title, seq in SimpleFastaParser(handle):
 count += 1
 return count
 def count_fastq(filename):
-from Bio.SeqIO.QualityIO import FastqGeneralIterator
 count = 0
 with open(filename) as handle:
 for title, seq, qual in FastqGeneralIterator(handle):
 count += 1
 return count
 def count_sff(filename):
-from Bio import SeqIO
 # If the SFF file has a built in index (which is normal),
 # this will be parsed, which is quicker than scanning
 # the whole file.
 return len(SeqIO.index(filename, "sff"))
 def count_sequences(filename, format):
-if seq_format == "sff":
+if format == "sff":
 return count_sff(filename)
-elif seq_format == "fasta":
+elif format == "fasta":
 return count_fasta(filename)
-elif seq_format.startswith("fastq"):
+elif format.startswith("fastq"):
 return count_fastq(filename)
 else:
-sys_exit("Unsupported file type %r" % seq_format)
+sys.exit("Unsupported file type %r" % format)
 if options.percent and options.everyn:
-sys_exit("Cannot combine -p and -n options")
+sys.exit("Cannot combine -p and -n options")
 elif options.everyn and options.count:
-sys_exit("Cannot combine -p and -c options")
+sys.exit("Cannot combine -p and -c options")
 elif options.percent and options.count:
-sys_exit("Cannot combine -n and -c options")
+sys.exit("Cannot combine -n and -c options")
 elif options.everyn:
 try:
 N = int(options.everyn)
-except:
+except ValueError:
-sys_exit("Bad -n argument %r" % options.everyn)
+sys.exit("Bad -n argument %r" % options.everyn)
 if N < 2:
-sys_exit("Bad -n argument %r" % options.everyn)
+sys.exit("Bad -n argument %r" % options.everyn)
 if (N % 10) == 1:
 sys.stderr.write("Sampling every %ist sequence\n" % N)
 elif (N % 10) == 2:
 sys.stderr.write("Sampling every %ind sequence\n" % N)
 elif (N % 10) == 3:
 sys.stderr.write("Sampling every %ird sequence\n" % N)
 else:
 sys.stderr.write("Sampling every %ith sequence\n" % N)
 def sampler(iterator):
 global N
 count = 0
 for record in iterator:
 count += 1
 if count % N == 1:
 yield record
 elif options.percent:
 try:
 percent = float(options.percent) / 100.0
-except:
+except ValueError:
-sys_exit("Bad -p percent argument %r" % options.percent)
+sys.exit("Bad -p percent argument %r" % options.percent)
 if percent <= 0.0 or 1.0 <= percent:
-sys_exit("Bad -p percent argument %r" % options.percent)
+sys.exit("Bad -p percent argument %r" % options.percent)
 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
 def sampler(iterator):
 global percent
 count = 0
 taken = 0
 for record in iterator:
 taken += 1
 yield record
 elif options.count:
 try:
 N = int(options.count)
-except:
+except ValueError:
-sys_exit("Bad -c count argument %r" % options.count)
+sys.exit("Bad -c count argument %r" % options.count)
 if N < 1:
-sys_exit("Bad -c count argument %r" % options.count)
+sys.exit("Bad -c count argument %r" % options.count)
 total = count_sequences(in_file, seq_format)
 sys.stderr.write("Input file has %i sequences\n" % total)
 if interleaved:
 # Paired
 if total % 2:
-sys_exit("Paired mode, but input file has an odd number of sequences: %i"
+sys.exit("Paired mode, but input file has an odd number of sequences: %i"
 % total)
 elif N > total // 2:
-sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
+sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
 % (N, total // 2, total))
 total = total // 2
 if N == 1:
 sys.stderr.write("Sampling just first sequence pair!\n")
 elif N == total:
 else:
 sys.stderr.write("Sampling %i sequence pairs\n" % N)
 else:
 # Not paired
 if total < N:
-sys_exit("Requested %i sequences, but file only has %i." % (N, total))
+sys.exit("Requested %i sequences, but file only has %i." % (N, total))
 if N == 1:
 sys.stderr.write("Sampling just first sequence!\n")
 elif N == total:
 sys.stderr.write("Taking all the sequences\n")
 else:
 global N, total
 # Do we need a floating point fudge factor epsilon?
 # i.e. What if percentage comes out slightly too low, and
 # we could end up missing last few desired sequences?
 percentage = float(N) / float(total)
-#print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f"
+# print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f"
 #      % (N, total, percentage * 100.0))
 count = 0
 taken = 0
 for record in iterator:
 count += 1
 # we need to take all remaining sequences to meet target
 taken += 1
 yield record
 assert taken == N, "Picked %i, wanted %i" % (taken, N)
 else:
-sys_exit("Must use either -n, -p or -c")
+sys.exit("Must use either -n, -p or -c")
 def pair(iterator):
 """Quick and dirty pair batched iterator."""
 while True:
 def raw_fasta_iterator(handle):
 """Yields raw FASTA records as multi-line strings."""
 while True:
 line = handle.readline()
 if line == "":
-return # Premature end of file, or just empty?
+return  # Premature end of file, or just empty?
 if line[0] == ">":
 break
 no_id_warned = False
 while True:
 break
 lines.append(line)
 line = handle.readline()
 yield "".join(lines)
 if not line:
-return # StopIteration
+return  # StopIteration
 def fasta_filter(in_file, out_file, iterator_filter, inter):
 count = 0
-#Galaxy now requires Python 2.5+ so can use with statements,
+# Galaxy now requires Python 2.5+ so can use with statements,
 with open(in_file) as in_handle:
 with open(out_file, "w") as pos_handle:
 if inter:
 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))):
 count += 1
 count += 1
 pos_handle.write(record)
 return count
-from Bio.SeqIO.QualityIO import FastqGeneralIterator
 def fastq_filter(in_file, out_file, iterator_filter, inter):
 count = 0
 with open(in_file) as in_handle:
 with open(out_file, "w") as pos_handle:
 if inter:
 def sff_filter(in_file, out_file, iterator_filter, inter):
 count = 0
 try:
-from Bio.SeqIO.SffIO import SffIterator, SffWriter
-except ImportError:
-sys_exit("SFF filtering requires Biopython 1.54 or later")
-try:
 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
 except ImportError:
-#Prior to Biopython 1.56 this was a private function
+# Prior to Biopython 1.56 this was a private function
 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
 with open(in_file, "rb") as in_handle:
 try:
 manifest = ReadRocheXmlManifest(in_handle)
 except ValueError:
 manifest = None
 in_handle.seek(0)
 with open(out_file, "wb") as out_handle:
 writer = SffWriter(out_handle, xml=manifest)
-in_handle.seek(0) #start again after getting manifest
+in_handle.seek(0)  # start again after getting manifest
 if inter:
 from itertools import chain
 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
 assert count % 2 == 0, "Odd number of records? %i" % count
 count /= 2
 else:
 count = writer.write_file(iterator_filter(SffIterator(in_handle)))
-#count = writer.write_file(SffIterator(in_handle))
 return count
 if seq_format == "sff":
 count = sff_filter(in_file, out_file, sampler, interleaved)
 elif seq_format == "fasta":
 count = fasta_filter(in_file, out_file, sampler, interleaved)
 elif seq_format.startswith("fastq"):
 count = fastq_filter(in_file, out_file, sampler, interleaved)
 else:
-sys_exit("Unsupported file type %r" % seq_format)
+sys.exit("Unsupported file type %r" % seq_format)
 if interleaved:
 sys.stderr.write("Selected %i pairs\n" % count)
 else:
 sys.stderr.write("Selected %i records\n" % count)

Mercurial > repos > peterjc > sample_seqs

comparison tools/sample_seqs/sample_seqs.py @ 5:6b71ad5d43fb draft