Mercurial > repos > peterjc > seq_length
changeset 3:fcdf11fb34de draft
v0.0.4 More statistics including optional N50 and median
author | peterjc |
---|---|
date | Thu, 17 May 2018 08:13:25 -0400 |
parents | 6f29bb9960ac |
children | 17caf7a7c2c5 |
files | tools/seq_length/README.rst tools/seq_length/seq_length.py tools/seq_length/seq_length.xml |
diffstat | 3 files changed, 133 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/seq_length/README.rst Mon May 14 12:09:50 2018 -0400 +++ b/tools/seq_length/README.rst Thu May 17 08:13:25 2018 -0400 @@ -65,6 +65,8 @@ v0.0.3 - Improved command line usage (outside of Galaxy). - More tests (now covers SFF as well). - Fix requesting SFF format. +v0.0.4 - Report mean, minimum and maximum sequence lengths (via stdout). + - Option to compute median and N50 as well. ======= ====================================================================== @@ -92,6 +94,8 @@ $ planemo shed_upload --tar_only tools/seq_length/ ... $ tar -tzf shed_upload.tar.gz + test-data/MID4_GLZRM4E04_rnd30.length.tabular + test-data/MID4_GLZRM4E04_rnd30.sff test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_strict.length.tabular test-data/four_human_proteins.fasta
--- a/tools/seq_length/seq_length.py Mon May 14 12:09:50 2018 -0400 +++ b/tools/seq_length/seq_length.py Thu May 17 08:13:25 2018 -0400 @@ -20,6 +20,7 @@ from __future__ import print_function import sys +from collections import defaultdict from optparse import OptionParser usage = r"""Use as follows to compute all the lengths in a sequence file: @@ -36,13 +37,16 @@ parser.add_option('-o', '--output', dest='output', default=None, help='Output filename (tabular)', metavar="FILE") +parser.add_option("-s", "--stats", dest="stats", + default=False, action="store_true", + help="Compute statistics (median, N50 - will require much more RAM).") parser.add_option("-v", "--version", dest="version", default=False, action="store_true", help="Show version and quit") options, args = parser.parse_args() if options.version: - print("v0.0.3") + print("v0.0.4") sys.exit(0) if not options.input: sys.exit("Require an input filename") @@ -87,8 +91,84 @@ sys.exit("Unexpected format argument: %r" % options.format) +def median_from_counts_dict(counts_dict, count=None): + sorted_lengths = sorted(counts_dict) # i.e. sort the keys + if count is None: + count = sum(counts_dict.values()) + index = count / 2 + if count % 2: + # Odd, easy case - will be an exact value + # within one of the tally entries + for value in sorted_lengths: + index -= counts_dict[value] + if index < 0: + return value + else: + # Even, hard case - may have to take mean + special = None + for value in sorted_lengths: + if special is not None: + # We were right at boundary + return (special + value) / 2.0 + index -= counts_dict[value] + if index == 0: + # Special case, want mean of this value + # (final entry of this tally) and the next + # value (first entry of the next tally). + special = value + elif index < 0: + # Typical case, the two middle values + # are equal and fall into same dict entry + return value + return None + + +if sys.version_info[0] >= 3: + from statistics import median + for test in [{1: 4, 2: 3}, + {1: 4, 3: 6}, + {1: 4, 5: 4}, + {0: 5, 1: 1, 2: 1, 3: 5}, + {0: 5, 1: 1, 2: 1, 3: 1, 4: 5}]: + test_list = [] + for v, c in test.items(): + test_list.extend([v] * c) + # print(test) + # print(test_list) + assert median_from_counts_dict(test) == median(test_list) + + +def n50_from_counts_dict(counts_dict): + """Calculate N50. + + N50 is a statistical measure of average length of a set of sequences. + It is used widely in genomics, especially in reference to contig or + supercontig lengths within a draft assembly. + + Given a set of sequences of varying lengths, the N50 length is defined + as the length N for which 50% of all bases in the sequences are in a + sequence of length L < N. This can be found mathematically as follows: + Take a list L of positive integers. Create another list L' , which is + identical to L, except that every element n in L has been replaced with + n copies of itself. Then the median of L' is the N50 of L. For example: + If L = {2, 2, 2, 3, 3, 4, 8, 8}, then L' consists of six 2's, six 3's, + four 4's, and sixteen 8's; the N50 of L is the median of L' , which is 6. + + https://web.archive.org/web/20160726124802/http://www.broadinstitute.org/crd/wiki/index.php/N50 + """ + # Continuing the above example, input L would be {2:3, 3:2, 4:1, 8:2} + # and L' becomes {2:6, 3:6, 4:4, 8:16}} as tally tables. + l_prime = dict((v, v * c) for v, c in counts_dict.items()) + return median_from_counts_dict(l_prime) + + count = 0 total = 0 +stats = bool(options.stats) +length_counts = defaultdict(int) # used if stats requested +length_min = sys.maxsize # used if stats not requested +length_max = 0 + with open(out_file, "w") as out_handle: out_handle.write("#Identifier\tLength\n") if format == "fastq": @@ -99,6 +179,11 @@ total += length identifier = title.split(None, 1)[0] out_handle.write("%s\t%i\n" % (identifier, length)) + if stats: + length_counts[length] += 1 + else: + length_min = min(length_min, length) + length_max = max(length_max, length) elif format == "fasta": with open(in_file) as in_handle: for title, seq in SimpleFastaParser(in_handle): @@ -107,10 +192,29 @@ total += length identifier = title.split(None, 1)[0] out_handle.write("%s\t%i\n" % (identifier, length)) + if stats: + length_counts[length] += 1 + else: + length_min = min(length_min, length) + length_max = max(length_max, length) else: for record in SeqIO.parse(in_file, format): count += 1 length = len(record) total += length out_handle.write("%s\t%i\n" % (record.id, length)) -print("%i sequences, total length %i" % (count, total)) + if stats: + length_counts[length] += 1 + else: + length_min = min(length_min, length) + length_max = max(length_max, length) +print("%i sequences, total length %i, mean %0.1f" % (count, total, float(total) / count)) +if not count: + pass +elif not stats: + print("Shortest %i, longest %i" % (length_min, length_max)) +elif count and stats: + print("Shortest %i, longest %i" % (min(length_counts), max(length_counts))) + median = median_from_counts_dict(length_counts, count) + n50 = n50_from_counts_dict(length_counts) + print("Median length %0.1f, N50 %i" % (median, n50))
--- a/tools/seq_length/seq_length.xml Mon May 14 12:09:50 2018 -0400 +++ b/tools/seq_length/seq_length.xml Thu May 17 08:13:25 2018 -0400 @@ -1,4 +1,4 @@ -<tool id="seq_length" name="Sequence lengths" version="0.0.3"> +<tool id="seq_length" name="Sequence lengths" version="0.0.4"> <description>from FASTA, QUAL, FASTQ, or SFF file</description> <requirements> <!-- This is the currently the last release of Biopython which is available via Galaxy's legacy XML packaging system --> @@ -9,9 +9,13 @@ </version_command> <command detect_errors="aggressive"> python $__tool_directory__/seq_length.py -i '$input_file' -f '$input_file.ext' -o '$output_file' +#if $stats + -s +#end if </command> <inputs> <param name="input_file" type="data" format="fasta,qual,fastq,sff" label="Sequence file" help="FASTA, QUAL, FASTQ, or SFF format." /> + <param name="stats" type="boolean" label="Compute additional statistics (median, N50)" /> </inputs> <outputs> <data name="output_file" format="tabular" label="${on_string} length"/> @@ -21,21 +25,26 @@ <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" /> <output name="output_file" file="four_human_proteins.length.tabular" ftype="tabular" /> <assert_stdout> - <has_line line="4 sequences, total length 3297" /> + <has_line line="4 sequences, total length 3297, mean 824.2" /> + <has_line line="Shortest 348, longest 1382" /> </assert_stdout> </test> <test> <param name="input_file" value="SRR639755_sample_strict.fastq" ftype="fastq" /> <output name="output_file" file="SRR639755_sample_strict.length.tabular" ftype="tabular" /> <assert_stdout> - <has_line line="2 sequences, total length 202" /> + <has_line line="2 sequences, total length 202, mean 101.0" /> + <has_line line="Shortest 101, longest 101" /> </assert_stdout> </test> <test> <param name="input_file" value="MID4_GLZRM4E04_rnd30.sff" ftype="sff" /> + <param name="stats" value="true" /> <output name="output_file" file="MID4_GLZRM4E04_rnd30.length.tabular" ftype="tabular" /> <assert_stdout> - <has_line line="30 sequences, total length 7504" /> + <has_line line="30 sequences, total length 7504, mean 250.1" /> + <has_line line="Shortest 42, longest 473" /> + <has_line line="Median length 269.5, N50 345" /> </assert_stdout> </test> </tests> @@ -46,9 +55,19 @@ two-column tabular file containing one line per sequence giving the sequence identifier and the associated sequence's length. +Additionally, the tool will report some basic statistics about the sequences +(visible via the output file's meta data, or the stdout log for the job), +namely the number of sequences, total length, mean length, minimum length and +maximum length. + +You can optionally request additional statistics be computed which will use +more RAM and take fractionally longer, namely the median and N50. + WARNING: If there are any duplicate sequence identifiers, these will all appear in the tabular output. +If using SFF files, this will use the trimmed lengths of the reads. + **References** This tool uses Biopython's ``SeqIO`` library to read sequences, so please cite