diff tools/sample_seqs/sample_seqs.py @ 2:da64f6a9e32b draft

Uploaded v0.2.0, adds desired count mode
author peterjc
date Fri, 06 Mar 2015 11:48:09 -0500
parents 3a807e5ea6c8
children 02c13ef1a669
line wrap: on
line diff
--- a/tools/sample_seqs/sample_seqs.py	Thu Mar 27 12:13:22 2014 -0400
+++ b/tools/sample_seqs/sample_seqs.py	Fri Mar 06 11:48:09 2015 -0500
@@ -2,46 +2,136 @@
 """Sub-sample sequence from a FASTA, FASTQ or SFF file.
 
 This tool is a short Python script which requires Biopython 1.62 or later
-for SFF file support. If you use this tool in scientific work leading to a
+for sequence parsing. If you use this tool in scientific work leading to a
 publication, please cite the Biopython application note:
 
 Cock et al 2009. Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
 
-This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
+This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
 See accompanying text file for licence details (MIT license).
 
-This is version 0.1.0 of the script, use -v or --version to get the version.
+Use -v or --version to get the version, -h or --help for help.
 """
 import os
 import sys
+from optparse import OptionParser
 
-def stop_err(msg, err=1):
+
+def sys_exit(msg, err=1):
     sys.stderr.write(msg.rstrip() + "\n")
     sys.exit(err)
 
-if "-v" in sys.argv or "--version" in sys.argv:
-    print("v0.1.0")
+#Parse Command Line
+usage = """Use as follows:
+
+$ python sample_seqs.py [options]
+
+e.g. Sample 20% of the reads:
+
+$ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq
+
+This samples uniformly though the file, rather than at random, and therefore
+should be reproducible.
+"""
+parser = OptionParser(usage=usage)
+parser.add_option('-i', '--input', dest='input',
+                  default=None, help='Input sequences filename',
+                  metavar="FILE")
+parser.add_option('-f', '--format', dest='format',
+                  default=None,
+                  help='Input sequence format (e.g. fasta, fastq, sff)')
+parser.add_option('-o', '--output', dest='output',
+                  default=None, help='Output sampled sequenced filename',
+                  metavar="FILE")
+parser.add_option('-p', '--percent', dest='percent',
+                  default=None,
+                  help='Take this percent of the reads')
+parser.add_option('-n', '--everyn', dest='everyn',
+                  default=None,
+                  help='Take every N-th read')
+parser.add_option('-c', '--count', dest='count',
+                  default=None,
+                  help='Take exactly N reads')
+parser.add_option("--interleaved", dest="interleaved",
+                  default=False, action="store_true",
+                  help="Input is interleaved reads, preserve the pairings")
+parser.add_option("-v", "--version", dest="version",
+                  default=False, action="store_true",
+                  help="Show version and quit")
+options, args = parser.parse_args()
+
+if options.version:
+    print("v0.2.0")
     sys.exit(0)
 
-#Parse Command Line
-if len(sys.argv) < 5:
-    stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...")
-seq_format, in_file, out_file, mode = sys.argv[1:5]
+in_file = options.input
+out_file = options.output
+interleaved = options.interleaved
+
+if not in_file:
+    sys_exit("Require an input filename")
 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
-    stop_err("Missing input file %r" % in_file)
+    sys_exit("Missing input file %r" % in_file)
+if not out_file:
+    sys_exit("Require an output filename")
+if not options.format:
+    sys_exit("Require the sequence format")
+seq_format = options.format.lower()
+
+
+def count_fasta(filename):
+    from Bio.SeqIO.FastaIO import SimpleFastaParser
+    count = 0
+    with open(filename) as handle:
+        for title, seq in SimpleFastaParser(handle):
+            count += 1
+    return count
+
+
+def count_fastq(filename):
+    from Bio.SeqIO.QualityIO import FastqGeneralIterator
+    count = 0
+    with open(filename) as handle:
+        for title, seq, qual in FastqGeneralIterator(handle):
+            count += 1
+    return count
+
 
-if mode == "everyNth":
-    if len(sys.argv) != 6:
-        stop_err("If using everyNth, just need argument N (integer, at least 2)")
+def count_sff(filename):
+    from Bio import SeqIO
+    # If the SFF file has a built in index (which is normal),
+    # this will be parsed and is the quicker than scanning
+    # the whole file.
+    return len(SeqIO.index(filename, "sff"))
+
+
+def count_sequences(filename, format):
+    if seq_format == "sff":
+        return count_sff(filename)
+    elif seq_format == "fasta":
+        return count_fasta(filename)
+    elif seq_format.startswith("fastq"):
+        return count_fastq(filename)
+    else:
+        sys_exit("Unsupported file type %r" % seq_format)
+
+
+if options.percent and options.everyn:
+    sys_exit("Cannot combine -p and -n options")
+elif options.everyn and options.count:
+    sys_exit("Cannot combine -p and -c options")
+elif options.percent and options.count:
+    sys_exit("Cannot combine -n and -c options")
+elif options.everyn:
     try:
-        N = int(sys.argv[5])
+        N = int(options.everyn)
     except:
-        stop_err("Bad N argument %r" % sys.argv[5])
+        sys_exit("Bad -n argument %r" % options.everyn)
     if N < 2:
-        stop_err("Bad N argument %r" % sys.argv[5])
+        sys_exit("Bad -n argument %r" % options.everyn)
     if (N % 10) == 1:
         sys.stderr.write("Sampling every %ist sequence\n" % N)
     elif (N % 10) == 2:
@@ -57,15 +147,13 @@
             count += 1
             if count % N == 1:
                 yield record
-elif mode == "percentage":
-    if len(sys.argv) != 6:
-        stop_err("If using percentage, just need percentage argument (float, range 0 to 100)")
+elif options.percent:
     try:
-        percent = float(sys.argv[5]) / 100.0
+        percent = float(options.percent) / 100.0
     except:
-        stop_err("Bad percent argument %r" % sys.argv[5])
+        sys_exit("Bad -p percent argument %r" % options.percent)
     if percent <= 0.0 or 1.0 <= percent:
-        stop_err("Bad percent argument %r" % sys.argv[5])
+        sys_exit("Bad -p percent argument %r" % options.percent)
     sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
     def sampler(iterator):
         global percent
@@ -76,8 +164,88 @@
             if percent * count > taken:
                 taken += 1
                 yield record
+elif options.count:
+    try:
+        N = int(options.count)
+    except:
+        sys_exit("Bad -c count argument %r" % options.count)
+    if N < 1:
+        sys_exit("Bad -c count argument %r" % options.count)
+    total = count_sequences(in_file, seq_format)
+    print("Input file has %i sequences" % total)
+    if interleaved:
+        # Paired
+        if total % 2:
+            sys_exit("Paired mode, but input file has an odd number of sequences: %i"
+                     % total)
+        elif N > total // 2:
+            sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
+                     % (N, total // 2, total))
+        total = total // 2
+        if N == 1:
+            sys.stderr.write("Sampling just first sequence pair!\n")
+        elif N == total:
+            sys.stderr.write("Taking all the sequence pairs\n")
+        else:
+            sys.stderr.write("Sampling %i sequence pairs\n" % N)
+    else:
+        # Not paired
+        if total < N:
+            sys_exit("Requested %i sequences, but file only has %i." % (N, total))
+        if N == 1:
+            sys.stderr.write("Sampling just first sequence!\n")
+        elif N == total:
+            sys.stderr.write("Taking all the sequences\n")
+        else:
+            sys.stderr.write("Sampling %i sequences\n" % N)
+    if N == total:
+        def sampler(iterator):
+            """Dummy filter to filter nothing, taking everything."""
+            global N
+            taken = 0
+            for record in iterator:
+                taken += 1
+                yield record
+            assert taken == N, "Picked %i, wanted %i" % (taken, N)
+    else:
+        def sampler(iterator):
+            # Mimic the percentage sampler, with double check on final count
+            global N, total
+            # Do we need a floating point fudge factor epsilon?
+            # i.e. What if percentage comes out slighty too low, and
+            # we could end up missing last few desired sequences?
+            percentage = float(N) / float(total)
+            #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f"
+            #      % (N, total, percentage * 100.0))
+            count = 0
+            taken = 0
+            for record in iterator:
+                count += 1
+                # Do we need the extra upper bound?
+                if percentage * count > taken and taken < N:
+                    taken += 1
+                    yield record
+                elif total - count + 1 <= N - taken:
+                    # remaining records (incuding this one) <= what we still need.
+                    # This is a safey check for floating point edge cases where
+                    # we need to take all remaining sequences to meet target
+                    taken += 1
+                    yield record
+            assert taken == N, "Picked %i, wanted %i" % (taken, N)
 else:
-    stop_err("Unsupported mode %r" % mode)
+    sys_exit("Must use either -n, -p or -c")
+
+
+def pair(iterator):
+    """Quick and dirty pair batched iterator."""
+    while True:
+        a = next(iterator)
+        b = next(iterator)
+        if not b:
+            assert not a, "Odd number of records?"
+            break
+        yield (a, b)
+
 
 def raw_fasta_iterator(handle):
     """Yields raw FASTA records as multi-line strings."""
@@ -113,46 +281,46 @@
         if not line:
             return # StopIteration 
 
-def fasta_filter(in_file, out_file, iterator_filter):
+def fasta_filter(in_file, out_file, iterator_filter, inter):
     count = 0
     #Galaxy now requires Python 2.5+ so can use with statements,
     with open(in_file) as in_handle:
         with open(out_file, "w") as pos_handle:
-            for record in iterator_filter(raw_fasta_iterator(in_handle)):
-                count += 1
-                pos_handle.write(record)
+            if inter:
+                for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))):
+                    count += 1
+                    pos_handle.write(r1)
+                    pos_handle.write(r2)
+            else:
+                for record in iterator_filter(raw_fasta_iterator(in_handle)):
+                    count += 1
+                    pos_handle.write(record)
     return count
 
-try:
-    from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
-    def fastq_filter(in_file, out_file, iterator_filter):
-        count = 0
-        #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
-        reader = fastqReader(open(in_file, "rU"))
-        writer = fastqWriter(open(out_file, "w"))
-        for record in iterator_filter(reader):
-            count += 1
-            writer.write(record)
-        writer.close()
-        reader.close()
-        return count
-except ImportError:
-    from Bio.SeqIO.QualityIO import FastqGeneralIterator
-    def fastq_filter(in_file, out_file, iterator_filter):
-        count = 0
-        with open(in_file) as in_handle:
-            with open(out_file, "w") as pos_handle:
+
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+def fastq_filter(in_file, out_file, iterator_filter, inter):
+    count = 0
+    with open(in_file) as in_handle:
+        with open(out_file, "w") as pos_handle:
+            if inter:
+                for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))):
+                    count += 1
+                    pos_handle.write("@%s\n%s\n+\n%s\n" % r1)
+                    pos_handle.write("@%s\n%s\n+\n%s\n" % r2)
+            else:
                 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
                     count += 1
                     pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
-        return count
+    return count
 
-def sff_filter(in_file, out_file, iterator_filter):
+
+def sff_filter(in_file, out_file, iterator_filter, inter):
     count = 0
     try:
         from Bio.SeqIO.SffIO import SffIterator, SffWriter
     except ImportError:
-        stop_err("SFF filtering requires Biopython 1.54 or later")
+        sys_exit("SFF filtering requires Biopython 1.54 or later")
     try:
         from Bio.SeqIO.SffIO import ReadRocheXmlManifest
     except ImportError:
@@ -167,17 +335,26 @@
         with open(out_file, "wb") as out_handle:
             writer = SffWriter(out_handle, xml=manifest)
             in_handle.seek(0) #start again after getting manifest
-            count = writer.write_file(iterator_filter(SffIterator(in_handle)))
-            #count = writer.write_file(SffIterator(in_handle))
+            if inter:
+                from itertools import chain
+                count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
+                assert count % 2 == 0, "Odd number of records? %i" % count
+                count /= 2
+            else:
+                count = writer.write_file(iterator_filter(SffIterator(in_handle)))
+                #count = writer.write_file(SffIterator(in_handle))
     return count
 
-if seq_format.lower()=="sff":
-    count = sff_filter(in_file, out_file, sampler)
-elif seq_format.lower()=="fasta":
-    count = fasta_filter(in_file, out_file, sampler)
-elif seq_format.lower().startswith("fastq"):
-    count = fastq_filter(in_file, out_file, sampler)
+if seq_format == "sff":
+    count = sff_filter(in_file, out_file, sampler, interleaved)
+elif seq_format == "fasta":
+    count = fasta_filter(in_file, out_file, sampler, interleaved)
+elif seq_format.startswith("fastq"):
+    count = fastq_filter(in_file, out_file, sampler, interleaved)
 else:
-    stop_err("Unsupported file type %r" % seq_format)
+    sys_exit("Unsupported file type %r" % seq_format)
 
-sys.stderr.write("Sampled %i records\n" % count)
+if interleaved:
+    sys.stderr.write("Selected %i pairs\n" % count)
+else:
+    sys.stderr.write("Selected %i records\n" % count)