Mercurial > repos > peterjc > seq_length
changeset 1:458f987918a6 draft
Faster FASTA and FASTQ, v0.0.2
author | peterjc |
---|---|
date | Tue, 08 May 2018 11:16:50 -0400 |
parents | c323e29a8248 |
children | 6f29bb9960ac |
files | tools/seq_length/README.rst tools/seq_length/seq_length.py tools/seq_length/seq_length.xml |
diffstat | 3 files changed, 37 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/tools/seq_length/README.rst Tue May 08 09:35:45 2018 -0400
+++ b/tools/seq_length/README.rst Tue May 08 11:16:50 2018 -0400
@@ -60,6 +60,8 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.0.1  - Initial version.
+v0.0.2  - Faster for FASTA and FASTQ.
+        - Fixed typo.
 ======= ======================================================================
 
 
```
```diff
--- a/tools/seq_length/seq_length.py Tue May 08 09:35:45 2018 -0400
+++ b/tools/seq_length/seq_length.py Tue May 08 11:16:50 2018 -0400
@@ -22,7 +22,7 @@
 import sys
 
 if "-v" in sys.argv or "--version" in sys.argv:
-    print("v0.0.1")
+    print("v0.0.2")
     sys.exit(0)
 
 try:
@@ -30,6 +30,16 @@
 except ImportError:
     sys.exit("Missing required Python library Biopython.")
 
+try:
+    from Bio.SeqIO.QualityIO import FastqGeneralIterator
+except ImportError:
+    sys.exit("Biopython tool old?, missing Bio.SeqIO.QualityIO.FastqGeneralIterator")
+
+try:
+    from Bio.SeqIO.FastaIO import SimpleFastaParser
+except ImportError:
+    sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser")
+
 
 # Parse Command Line
 try:
@@ -61,9 +71,26 @@
 total = 0
 with open(out_file, "w") as out_handle:
     out_handle.write("#Identifier\tLength\n")
-    for record in SeqIO.parse(in_file, format):
-        count += 1
-        length = len(record)
-        total += length
-        out_handle.write("%s\t%i\n" % (record.id, length))
+    if format == "fastq":
+        with open(in_file) as in_handle:
+            for title, seq, qual in FastqGeneralIterator(in_handle):
+                count += 1
+                length = len(seq)
+                total += length
+                identifier = title.split(None, 1)[0]
+                out_handle.write("%s\t%i\n" % (identifier, length))
+    elif format == "fasta":
+        with open(in_file) as in_handle:
+            for title, seq in SimpleFastaParser(in_handle):
+                count += 1
+                length = len(seq)
+                total += length
+                identifier = title.split(None, 1)[0]
+                out_handle.write("%s\t%i\n" % (identifier, length))
+    else:
+        for record in SeqIO.parse(in_file, format):
+            count += 1
+            length = len(record)
+            total += length
+            out_handle.write("%s\t%i\n" % (record.id, length))
 print("%i sequences, total length %i" % (count, total))
```
```diff
--- a/tools/seq_length/seq_length.xml Tue May 08 09:35:45 2018 -0400
+++ b/tools/seq_length/seq_length.xml Tue May 08 11:16:50 2018 -0400
@@ -1,5 +1,5 @@
-<tool id="seq_length" name="Sequence lengths" version="0.0.1">
-    <description>with ID mapping from a tabular file</description>
+<tool id="seq_length" name="Sequence lengths" version="0.0.2">
+    <description>from FASTA, QUAL, FASTQ, or SFF file</description>
     <requirements>
         <!-- This is the currently the last release of Biopython which is available via Galaxy's legacy XML packaging system -->
         <requirement type="package" version="1.67">biopython</requirement>
```