diff tools/sample_seqs/sample_seqs.py @ 5:6b71ad5d43fb draft

v0.2.3 clarified help, internal cleanup of Python script
author peterjc
date Wed, 01 Feb 2017 09:39:36 -0500
parents 02c13ef1a669
children 31f5701cd2e9
line wrap: on
line diff
--- a/tools/sample_seqs/sample_seqs.py	Wed Aug 05 12:30:18 2015 -0400
+++ b/tools/sample_seqs/sample_seqs.py	Wed Feb 01 09:39:36 2017 -0500
@@ -19,12 +19,7 @@
 import sys
 from optparse import OptionParser
 
-
-def sys_exit(msg, err=1):
-    sys.stderr.write(msg.rstrip() + "\n")
-    sys.exit(err)
-
-#Parse Command Line
+# Parse Command Line
 usage = """Use as follows:
 
 $ python sample_seqs.py [options]
@@ -35,6 +30,10 @@
 
 This samples uniformly though the file, rather than at random, and therefore
 should be reproducible.
+
+If you have interleaved paired reads, use the --interleaved switch. If
+instead you have two matched files (one for each pair), run the two
+twice with the same sampling options to make to matched smaller files.
 """
 parser = OptionParser(usage=usage)
 parser.add_option('-i', '--input', dest='input',
@@ -64,26 +63,33 @@
 options, args = parser.parse_args()
 
 if options.version:
-    print("v0.2.1")
+    print("v0.2.3")
     sys.exit(0)
 
+try:
+    from Bio import SeqIO
+    from Bio.SeqIO.QualityIO import FastqGeneralIterator
+    from Bio.SeqIO.FastaIO import SimpleFastaParser
+    from Bio.SeqIO.SffIO import SffIterator, SffWriter
+except ImportError:
+    sys.exit("This script requires Biopython.")
+
 in_file = options.input
 out_file = options.output
 interleaved = options.interleaved
 
 if not in_file:
-    sys_exit("Require an input filename")
+    sys.exit("Require an input filename")
 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
-    sys_exit("Missing input file %r" % in_file)
+    sys.exit("Missing input file %r" % in_file)
 if not out_file:
-    sys_exit("Require an output filename")
+    sys.exit("Require an output filename")
 if not options.format:
-    sys_exit("Require the sequence format")
+    sys.exit("Require the sequence format")
 seq_format = options.format.lower()
 
 
 def count_fasta(filename):
-    from Bio.SeqIO.FastaIO import SimpleFastaParser
     count = 0
     with open(filename) as handle:
         for title, seq in SimpleFastaParser(handle):
@@ -92,7 +98,6 @@
 
 
 def count_fastq(filename):
-    from Bio.SeqIO.QualityIO import FastqGeneralIterator
     count = 0
     with open(filename) as handle:
         for title, seq, qual in FastqGeneralIterator(handle):
@@ -101,7 +106,6 @@
 
 
 def count_sff(filename):
-    from Bio import SeqIO
     # If the SFF file has a built in index (which is normal),
     # this will be parsed and is the quicker than scanning
     # the whole file.
@@ -109,29 +113,29 @@
 
 
 def count_sequences(filename, format):
-    if seq_format == "sff":
+    if format == "sff":
         return count_sff(filename)
-    elif seq_format == "fasta":
+    elif format == "fasta":
         return count_fasta(filename)
-    elif seq_format.startswith("fastq"):
+    elif format.startswith("fastq"):
         return count_fastq(filename)
     else:
-        sys_exit("Unsupported file type %r" % seq_format)
+        sys.exit("Unsupported file type %r" % format)
 
 
 if options.percent and options.everyn:
-    sys_exit("Cannot combine -p and -n options")
+    sys.exit("Cannot combine -p and -n options")
 elif options.everyn and options.count:
-    sys_exit("Cannot combine -p and -c options")
+    sys.exit("Cannot combine -p and -c options")
 elif options.percent and options.count:
-    sys_exit("Cannot combine -n and -c options")
+    sys.exit("Cannot combine -n and -c options")
 elif options.everyn:
     try:
         N = int(options.everyn)
-    except:
-        sys_exit("Bad -n argument %r" % options.everyn)
+    except ValueError:
+        sys.exit("Bad -n argument %r" % options.everyn)
     if N < 2:
-        sys_exit("Bad -n argument %r" % options.everyn)
+        sys.exit("Bad -n argument %r" % options.everyn)
     if (N % 10) == 1:
         sys.stderr.write("Sampling every %ist sequence\n" % N)
     elif (N % 10) == 2:
@@ -140,6 +144,7 @@
         sys.stderr.write("Sampling every %ird sequence\n" % N)
     else:
         sys.stderr.write("Sampling every %ith sequence\n" % N)
+
     def sampler(iterator):
         global N
         count = 0
@@ -150,11 +155,12 @@
 elif options.percent:
     try:
         percent = float(options.percent) / 100.0
-    except:
-        sys_exit("Bad -p percent argument %r" % options.percent)
+    except ValueError:
+        sys.exit("Bad -p percent argument %r" % options.percent)
     if percent <= 0.0 or 1.0 <= percent:
-        sys_exit("Bad -p percent argument %r" % options.percent)
+        sys.exit("Bad -p percent argument %r" % options.percent)
     sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
+
     def sampler(iterator):
         global percent
         count = 0
@@ -167,19 +173,19 @@
 elif options.count:
     try:
         N = int(options.count)
-    except:
-        sys_exit("Bad -c count argument %r" % options.count)
+    except ValueError:
+        sys.exit("Bad -c count argument %r" % options.count)
     if N < 1:
-        sys_exit("Bad -c count argument %r" % options.count)
+        sys.exit("Bad -c count argument %r" % options.count)
     total = count_sequences(in_file, seq_format)
     sys.stderr.write("Input file has %i sequences\n" % total)
     if interleaved:
         # Paired
         if total % 2:
-            sys_exit("Paired mode, but input file has an odd number of sequences: %i"
+            sys.exit("Paired mode, but input file has an odd number of sequences: %i"
                      % total)
         elif N > total // 2:
-            sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
+            sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
                      % (N, total // 2, total))
         total = total // 2
         if N == 1:
@@ -191,7 +197,7 @@
     else:
         # Not paired
         if total < N:
-            sys_exit("Requested %i sequences, but file only has %i." % (N, total))
+            sys.exit("Requested %i sequences, but file only has %i." % (N, total))
         if N == 1:
             sys.stderr.write("Sampling just first sequence!\n")
         elif N == total:
@@ -215,7 +221,7 @@
             # i.e. What if percentage comes out slighty too low, and
             # we could end up missing last few desired sequences?
             percentage = float(N) / float(total)
-            #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f"
+            # print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f"
             #      % (N, total, percentage * 100.0))
             count = 0
             taken = 0
@@ -233,7 +239,7 @@
                     yield record
             assert taken == N, "Picked %i, wanted %i" % (taken, N)
 else:
-    sys_exit("Must use either -n, -p or -c")
+    sys.exit("Must use either -n, -p or -c")
 
 
 def pair(iterator):
@@ -252,7 +258,7 @@
     while True:
         line = handle.readline()
         if line == "":
-            return # Premature end of file, or just empty?
+            return  # Premature end of file, or just empty?
         if line[0] == ">":
             break
 
@@ -279,11 +285,12 @@
             line = handle.readline()
         yield "".join(lines)
         if not line:
-            return # StopIteration 
+            return  # StopIteration
+
 
 def fasta_filter(in_file, out_file, iterator_filter, inter):
     count = 0
-    #Galaxy now requires Python 2.5+ so can use with statements,
+    # Galaxy now requires Python 2.5+ so can use with statements,
     with open(in_file) as in_handle:
         with open(out_file, "w") as pos_handle:
             if inter:
@@ -298,7 +305,6 @@
     return count
 
 
-from Bio.SeqIO.QualityIO import FastqGeneralIterator
 def fastq_filter(in_file, out_file, iterator_filter, inter):
     count = 0
     with open(in_file) as in_handle:
@@ -318,13 +324,9 @@
 def sff_filter(in_file, out_file, iterator_filter, inter):
     count = 0
     try:
-        from Bio.SeqIO.SffIO import SffIterator, SffWriter
-    except ImportError:
-        sys_exit("SFF filtering requires Biopython 1.54 or later")
-    try:
         from Bio.SeqIO.SffIO import ReadRocheXmlManifest
     except ImportError:
-        #Prior to Biopython 1.56 this was a private function
+        # Prior to Biopython 1.56 this was a private function
         from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
     with open(in_file, "rb") as in_handle:
         try:
@@ -334,7 +336,7 @@
         in_handle.seek(0)
         with open(out_file, "wb") as out_handle:
             writer = SffWriter(out_handle, xml=manifest)
-            in_handle.seek(0) #start again after getting manifest
+            in_handle.seek(0)  # start again after getting manifest
             if inter:
                 from itertools import chain
                 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
@@ -342,7 +344,6 @@
                 count /= 2
             else:
                 count = writer.write_file(iterator_filter(SffIterator(in_handle)))
-                #count = writer.write_file(SffIterator(in_handle))
     return count
 
 if seq_format == "sff":
@@ -352,7 +353,7 @@
 elif seq_format.startswith("fastq"):
     count = fastq_filter(in_file, out_file, sampler, interleaved)
 else:
-    sys_exit("Unsupported file type %r" % seq_format)
+    sys.exit("Unsupported file type %r" % seq_format)
 
 if interleaved:
     sys.stderr.write("Selected %i pairs\n" % count)