changeset 1:458f987918a6 draft

Faster FASTA and FASTQ, v0.0.2
author peterjc
date Tue, 08 May 2018 11:16:50 -0400
parents c323e29a8248
children 6f29bb9960ac
files tools/seq_length/README.rst tools/seq_length/seq_length.py tools/seq_length/seq_length.xml
diffstat 3 files changed, 37 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/tools/seq_length/README.rst	Tue May 08 09:35:45 2018 -0400
+++ b/tools/seq_length/README.rst	Tue May 08 11:16:50 2018 -0400
@@ -60,6 +60,8 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.0.1  - Initial version.
+v0.0.2  - Faster for FASTA and FASTQ.
+        - Fixed typo.
 ======= ======================================================================
 
 
--- a/tools/seq_length/seq_length.py	Tue May 08 09:35:45 2018 -0400
+++ b/tools/seq_length/seq_length.py	Tue May 08 11:16:50 2018 -0400
@@ -22,7 +22,7 @@
 import sys
 
 if "-v" in sys.argv or "--version" in sys.argv:
-    print("v0.0.1")
+    print("v0.0.2")
     sys.exit(0)
 
 try:
@@ -30,6 +30,16 @@
 except ImportError:
     sys.exit("Missing required Python library Biopython.")
 
+try:
+    from Bio.SeqIO.QualityIO import FastqGeneralIterator
+except ImportError:
+    sys.exit("Biopython tool old?, missing Bio.SeqIO.QualityIO.FastqGeneralIterator")
+
+try:
+    from Bio.SeqIO.FastaIO import SimpleFastaParser
+except ImportError:
+    sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser")
+
 
 # Parse Command Line
 try:
@@ -61,9 +71,26 @@
 total = 0
 with open(out_file, "w") as out_handle:
     out_handle.write("#Identifier\tLength\n")
-    for record in SeqIO.parse(in_file, format):
-        count += 1
-        length = len(record)
-        total += length
-        out_handle.write("%s\t%i\n" % (record.id, length))
+    if format == "fastq":
+        with open(in_file) as in_handle:
+            for title, seq, qual in FastqGeneralIterator(in_handle):
+                count += 1
+                length = len(seq)
+                total += length
+                identifier = title.split(None, 1)[0]
+                out_handle.write("%s\t%i\n" % (identifier, length))
+    elif format == "fasta":
+        with open(in_file) as in_handle:
+            for title, seq in SimpleFastaParser(in_handle):
+                count += 1
+                length = len(seq)
+                total += length
+                identifier = title.split(None, 1)[0]
+                out_handle.write("%s\t%i\n" % (identifier, length))
+    else:
+        for record in SeqIO.parse(in_file, format):
+            count += 1
+            length = len(record)
+            total += length
+            out_handle.write("%s\t%i\n" % (record.id, length))
 print("%i sequences, total length %i" % (count, total))
--- a/tools/seq_length/seq_length.xml	Tue May 08 09:35:45 2018 -0400
+++ b/tools/seq_length/seq_length.xml	Tue May 08 11:16:50 2018 -0400
@@ -1,5 +1,5 @@
-<tool id="seq_length" name="Sequence lengths" version="0.0.1">
-    <description>with ID mapping from a tabular file</description>
+<tool id="seq_length" name="Sequence lengths" version="0.0.2">
+    <description>from FASTA, QUAL, FASTQ, or SFF file</description>
     <requirements>
         <!-- This is currently the last release of Biopython which is available via Galaxy's legacy XML packaging system -->
         <requirement type="package" version="1.67">biopython</requirement>