view tools/seq_length/seq_length.py @ 2:6f29bb9960ac draft

v0.0.3 - Fixed SFF; more tests
author peterjc
date Mon, 14 May 2018 12:09:50 -0400
parents 458f987918a6
children fcdf11fb34de
line wrap: on
line source

#!/usr/bin/env python
"""Compute length of FASTA, QUAL, FASTQ or SSF sequences.

Takes three command line options: input sequence filename, input type
(e.g. FASTA or SFF) and the output filename (tabular).

This tool is a short Python script which requires Biopython 1.54 or later
for SFF file support. If you use this tool in scientific work leading to a
publication, please cite the Biopython application note:

Cock et al 2009. Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
All rights reserved. See accompanying text file for licence details (MIT
license).
"""

from __future__ import print_function

import sys
from optparse import OptionParser

usage = r"""Use as follows to compute all the lengths in a sequence file:

$ python seq_length.py -i example.fasta -f fasta -o lengths.tsv
"""

parser = OptionParser(usage=usage)
parser.add_option('-i', '--input', dest='input',
                  default=None, help='Input sequence filename (FASTA, FASTQ, etc)',
                  metavar="FILE")
parser.add_option('-f', '--format', dest='format',
                  default=None, help='Input sequence format (FASTA, QUAL, FASTQ, SFF)')
parser.add_option('-o', '--output', dest='output',
                  default=None, help='Output filename (tabular)',
                  metavar="FILE")
parser.add_option("-v", "--version", dest="version",
                  default=False, action="store_true",
                  help="Show version and quit")
options, args = parser.parse_args()

if options.version:
    print("v0.0.3")
    sys.exit(0)
if not options.input:
    sys.exit("Require an input filename")
if not options.format:
    sys.exit("Require the input format")
if not options.output:
    sys.exit("Require an output filename")

try:
    from Bio import SeqIO
except ImportError:
    sys.exit("Missing required Python library Biopython.")

try:
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
except ImportError:
    sys.exit("Biopython tool old?, missing Bio.SeqIO.QualityIO.FastqGeneralIterator")

try:
    from Bio.SeqIO.FastaIO import SimpleFastaParser
except ImportError:
    sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser")

in_file = options.input
out_file = options.output

if options.format.startswith("fastq"):
    # We don't care about the quality score encoding, just
    # need to translate Galaxy format name into something
    # Biopython will accept:
    format = "fastq"
elif options.format.lower() == "csfasta":
    # I have not tested with colour space FASTA
    format = "fasta"
elif options.format.lower() == "sff":
    # The masked/trimmed numbers are more interesting
    format = "sff-trim"
elif options.format.lower() in ["fasta", "qual"]:
    format = options.format.lower()
else:
    # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
    sys.exit("Unexpected format argument: %r" % options.format)


count = 0
total = 0
with open(out_file, "w") as out_handle:
    out_handle.write("#Identifier\tLength\n")
    if format == "fastq":
        with open(in_file) as in_handle:
            for title, seq, qual in FastqGeneralIterator(in_handle):
                count += 1
                length = len(seq)
                total += length
                identifier = title.split(None, 1)[0]
                out_handle.write("%s\t%i\n" % (identifier, length))
    elif format == "fasta":
        with open(in_file) as in_handle:
            for title, seq in SimpleFastaParser(in_handle):
                count += 1
                length = len(seq)
                total += length
                identifier = title.split(None, 1)[0]
                out_handle.write("%s\t%i\n" % (identifier, length))
    else:
        for record in SeqIO.parse(in_file, format):
            count += 1
            length = len(record)
            total += length
            out_handle.write("%s\t%i\n" % (record.id, length))
print("%i sequences, total length %i" % (count, total))