changeset 3:fcdf11fb34de draft

v0.0.4 More statistics including optional N50 and median
author peterjc
date Thu, 17 May 2018 08:13:25 -0400
parents 6f29bb9960ac
children 17caf7a7c2c5
files tools/seq_length/README.rst tools/seq_length/seq_length.py tools/seq_length/seq_length.xml
diffstat 3 files changed, 133 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/tools/seq_length/README.rst	Mon May 14 12:09:50 2018 -0400
+++ b/tools/seq_length/README.rst	Thu May 17 08:13:25 2018 -0400
@@ -65,6 +65,8 @@
 v0.0.3  - Improved command line usage (outside of Galaxy).
         - More tests (now covers SFF as well).
         - Fix requesting SFF format.
+v0.0.4  - Report mean, minimum and maximum sequence lengths (via stdout).
+        - Option to compute median and N50 as well.
 ======= ======================================================================
 
 
@@ -92,6 +94,8 @@
     $ planemo shed_upload --tar_only tools/seq_length/
     ...
     $ tar -tzf shed_upload.tar.gz
+    test-data/MID4_GLZRM4E04_rnd30.length.tabular
+    test-data/MID4_GLZRM4E04_rnd30.sff
     test-data/SRR639755_sample_strict.fastq
     test-data/SRR639755_sample_strict.length.tabular
     test-data/four_human_proteins.fasta
--- a/tools/seq_length/seq_length.py	Mon May 14 12:09:50 2018 -0400
+++ b/tools/seq_length/seq_length.py	Thu May 17 08:13:25 2018 -0400
@@ -20,6 +20,7 @@
 from __future__ import print_function
 
 import sys
+from collections import defaultdict
 from optparse import OptionParser
 
 usage = r"""Use as follows to compute all the lengths in a sequence file:
@@ -36,13 +37,16 @@
 parser.add_option('-o', '--output', dest='output',
                   default=None, help='Output filename (tabular)',
                   metavar="FILE")
+parser.add_option("-s", "--stats", dest="stats",
+                  default=False, action="store_true",
+                  help="Compute statistics (median, N50 - will require much more RAM).")
 parser.add_option("-v", "--version", dest="version",
                   default=False, action="store_true",
                   help="Show version and quit")
 options, args = parser.parse_args()
 
 if options.version:
-    print("v0.0.3")
+    print("v0.0.4")
     sys.exit(0)
 if not options.input:
     sys.exit("Require an input filename")
@@ -87,8 +91,84 @@
     sys.exit("Unexpected format argument: %r" % options.format)
 
 
+def median_from_counts_dict(counts_dict, count=None):
+    sorted_lengths = sorted(counts_dict)  # i.e. sort the keys
+    if count is None:
+        count = sum(counts_dict.values())
+    index = count / 2
+    if count % 2:
+        # Odd, easy case - will be an exact value
+        # within one of the tally entries
+        for value in sorted_lengths:
+            index -= counts_dict[value]
+            if index < 0:
+                return value
+    else:
+        # Even, hard case - may have to take mean
+        special = None
+        for value in sorted_lengths:
+            if special is not None:
+                # We were right at boundary
+                return (special + value) / 2.0
+            index -= counts_dict[value]
+            if index == 0:
+                # Special case, want mean of this value
+                # (final entry of this tally) and the next
+                # value (first entry of the next tally).
+                special = value
+            elif index < 0:
+                # Typical case, the two middle values
+                # are equal and fall into same dict entry
+                return value
+    return None
+
+
+if sys.version_info[0] >= 3:
+    from statistics import median
+    for test in [{1: 4, 2: 3},
+                 {1: 4, 3: 6},
+                 {1: 4, 5: 4},
+                 {0: 5, 1: 1, 2: 1, 3: 5},
+                 {0: 5, 1: 1, 2: 1, 3: 1, 4: 5}]:
+        test_list = []
+        for v, c in test.items():
+            test_list.extend([v] * c)
+        # print(test)
+        # print(test_list)
+        assert median_from_counts_dict(test) == median(test_list)
+
+
+def n50_from_counts_dict(counts_dict):
+    """Calculate N50.
+
+    N50 is a statistical measure of average length of a set of sequences.
+    It is used widely in genomics, especially in reference to contig or
+    supercontig lengths within a draft assembly.
+
+    Given a set of sequences of varying lengths, the N50 length is defined
+    as the length N for which 50% of all bases in the sequences are in a
+    sequence of length L < N. This can be found mathematically as follows:
+    Take a list L of positive integers. Create another list L' , which is
+    identical to L, except that every element n in L has been replaced with
+    n copies of itself. Then the median of L' is the N50 of L. For example:
+    If L = {2, 2, 2, 3, 3, 4, 8, 8}, then L' consists of six 2's, six 3's,
+    four 4's, and sixteen 8's; the N50 of L is the median of L' , which is 6.
+
+    https://web.archive.org/web/20160726124802/http://www.broadinstitute.org/crd/wiki/index.php/N50
+    """
+    # Continuing the above example, input L would be {2:3, 3:2, 4:1, 8:2}
+    # and L' becomes {2:6, 3:6, 4:4, 8:16}} as tally tables.
+    l_prime = dict((v, v * c) for v, c in counts_dict.items())
+    return median_from_counts_dict(l_prime)
+
+
 count = 0
 total = 0
+stats = bool(options.stats)
+length_counts = defaultdict(int)  # used if stats requested
+length_min = sys.maxsize  # used if stats not requested
+length_max = 0
+
 with open(out_file, "w") as out_handle:
     out_handle.write("#Identifier\tLength\n")
     if format == "fastq":
@@ -99,6 +179,11 @@
                 total += length
                 identifier = title.split(None, 1)[0]
                 out_handle.write("%s\t%i\n" % (identifier, length))
+                if stats:
+                    length_counts[length] += 1
+                else:
+                    length_min = min(length_min, length)
+                    length_max = max(length_max, length)
     elif format == "fasta":
         with open(in_file) as in_handle:
             for title, seq in SimpleFastaParser(in_handle):
@@ -107,10 +192,29 @@
                 total += length
                 identifier = title.split(None, 1)[0]
                 out_handle.write("%s\t%i\n" % (identifier, length))
+                if stats:
+                    length_counts[length] += 1
+                else:
+                    length_min = min(length_min, length)
+                    length_max = max(length_max, length)
     else:
         for record in SeqIO.parse(in_file, format):
             count += 1
             length = len(record)
             total += length
             out_handle.write("%s\t%i\n" % (record.id, length))
-print("%i sequences, total length %i" % (count, total))
+            if stats:
+                length_counts[length] += 1
+            else:
+                length_min = min(length_min, length)
+                length_max = max(length_max, length)
+print("%i sequences, total length %i, mean %0.1f" % (count, total, float(total) / count))
+if not count:
+    pass
+elif not stats:
+    print("Shortest %i, longest %i" % (length_min, length_max))
+elif count and stats:
+    print("Shortest %i, longest %i" % (min(length_counts), max(length_counts)))
+    median = median_from_counts_dict(length_counts, count)
+    n50 = n50_from_counts_dict(length_counts)
+    print("Median length %0.1f, N50 %i" % (median, n50))
--- a/tools/seq_length/seq_length.xml	Mon May 14 12:09:50 2018 -0400
+++ b/tools/seq_length/seq_length.xml	Thu May 17 08:13:25 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="seq_length" name="Sequence lengths" version="0.0.3">
+<tool id="seq_length" name="Sequence lengths" version="0.0.4">
     <description>from FASTA, QUAL, FASTQ, or SFF file</description>
     <requirements>
         <!-- This is the currently the last release of Biopython which is available via Galaxy's legacy XML packaging system -->
@@ -9,9 +9,13 @@
 </version_command>
     <command detect_errors="aggressive">
 python $__tool_directory__/seq_length.py -i '$input_file' -f '$input_file.ext' -o '$output_file'
+#if $stats
+  -s
+#end if
     </command>
     <inputs>
         <param name="input_file" type="data" format="fasta,qual,fastq,sff" label="Sequence file" help="FASTA, QUAL, FASTQ, or SFF format." />
+        <param name="stats" type="boolean" label="Compute additional statistics (median, N50)" />
     </inputs>
     <outputs>
         <data name="output_file" format="tabular" label="${on_string} length"/>
@@ -21,21 +25,26 @@
             <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
             <output name="output_file" file="four_human_proteins.length.tabular" ftype="tabular" />
             <assert_stdout>
-                <has_line line="4 sequences, total length 3297" />
+                <has_line line="4 sequences, total length 3297, mean 824.2" />
+                <has_line line="Shortest 348, longest 1382" />
             </assert_stdout>
         </test>
         <test>
             <param name="input_file" value="SRR639755_sample_strict.fastq" ftype="fastq" />
             <output name="output_file" file="SRR639755_sample_strict.length.tabular" ftype="tabular" />
             <assert_stdout>
-                <has_line line="2 sequences, total length 202" />
+                <has_line line="2 sequences, total length 202, mean 101.0" />
+                <has_line line="Shortest 101, longest 101" />
             </assert_stdout>
         </test>
         <test>
             <param name="input_file" value="MID4_GLZRM4E04_rnd30.sff" ftype="sff" />
+            <param name="stats" value="true" />
             <output name="output_file" file="MID4_GLZRM4E04_rnd30.length.tabular" ftype="tabular" />
             <assert_stdout>
-                <has_line line="30 sequences, total length 7504" />
+                <has_line line="30 sequences, total length 7504, mean 250.1" />
+                <has_line line="Shortest 42, longest 473" />
+                <has_line line="Median length 269.5, N50 345" />
             </assert_stdout>
         </test>
     </tests>
@@ -46,9 +55,19 @@
 two-column tabular file containing one line per sequence giving the sequence
 identifier and the associated sequence's length.
 
+Additionally, the tool will report some basic statistics about the sequences
+(visible via the output file's meta data, or the stdout log for the job),
+namely the number of sequences, total length, mean length, minimum length and
+maximum length.
+
+You can optionally request additional statistics be computed which will use
+more RAM and take fractionally longer, namely the median and N50.
+
 WARNING: If there are any duplicate sequence identifiers, these will all appear
 in the tabular output.
 
+If using SFF files, this will use the trimmed lengths of the reads.
+
 **References**
 
 This tool uses Biopython's ``SeqIO`` library to read sequences, so please cite