# HG changeset patch # User peterjc # Date 1525792610 14400 # Node ID 458f987918a67281d349ef9d5ec15f3024ee7434 # Parent c323e29a8248660821d3eff51537e6a6dfe2350b Faster FASTA and FASTQ, v0.0.2 diff -r c323e29a8248 -r 458f987918a6 tools/seq_length/README.rst --- a/tools/seq_length/README.rst Tue May 08 09:35:45 2018 -0400 +++ b/tools/seq_length/README.rst Tue May 08 11:16:50 2018 -0400 @@ -60,6 +60,8 @@ Version Changes ------- ---------------------------------------------------------------------- v0.0.1 - Initial version. +v0.0.2 - Faster for FASTA and FASTQ. + - Fixed typo. ======= ====================================================================== diff -r c323e29a8248 -r 458f987918a6 tools/seq_length/seq_length.py --- a/tools/seq_length/seq_length.py Tue May 08 09:35:45 2018 -0400 +++ b/tools/seq_length/seq_length.py Tue May 08 11:16:50 2018 -0400 @@ -22,7 +22,7 @@ import sys if "-v" in sys.argv or "--version" in sys.argv: - print("v0.0.1") + print("v0.0.2") sys.exit(0) try: @@ -30,6 +30,16 @@ except ImportError: sys.exit("Missing required Python library Biopython.") +try: + from Bio.SeqIO.QualityIO import FastqGeneralIterator +except ImportError: + sys.exit("Biopython tool old?, missing Bio.SeqIO.QualityIO.FastqGeneralIterator") + +try: + from Bio.SeqIO.FastaIO import SimpleFastaParser +except ImportError: + sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser") + # Parse Command Line try: @@ -61,9 +71,26 @@ total = 0 with open(out_file, "w") as out_handle: out_handle.write("#Identifier\tLength\n") - for record in SeqIO.parse(in_file, format): - count += 1 - length = len(record) - total += length - out_handle.write("%s\t%i\n" % (record.id, length)) + if format == "fastq": + with open(in_file) as in_handle: + for title, seq, qual in FastqGeneralIterator(in_handle): + count += 1 + length = len(seq) + total += length + identifier = title.split(None, 1)[0] + out_handle.write("%s\t%i\n" % (identifier, length)) + elif format == "fasta": + with open(in_file) as in_handle: + for title, seq in SimpleFastaParser(in_handle): + count += 1 + length = len(seq) + total += length + identifier = title.split(None, 1)[0] + out_handle.write("%s\t%i\n" % (identifier, length)) + else: + for record in SeqIO.parse(in_file, format): + count += 1 + length = len(record) + total += length + out_handle.write("%s\t%i\n" % (record.id, length)) print("%i sequences, total length %i" % (count, total)) diff -r c323e29a8248 -r 458f987918a6 tools/seq_length/seq_length.xml --- a/tools/seq_length/seq_length.xml Tue May 08 09:35:45 2018 -0400 +++ b/tools/seq_length/seq_length.xml Tue May 08 11:16:50 2018 -0400 @@ -1,5 +1,5 @@ - - with ID mapping from a tabular file + + from FASTA, QUAL, FASTQ, or SFF file biopython