view tools/seq_length/seq_length.py @ 0:c323e29a8248 draft

Initial release v0.0.1
author peterjc
date Tue, 08 May 2018 09:35:45 -0400
parents
children 458f987918a6
line wrap: on
line source

#!/usr/bin/env python
"""Compute length of FASTA, QUAL, FASTQ or SSF sequences.

Takes three command line options: input sequence filename, input type
(e.g. FASTA or SFF) and the output filename (tabular).

This tool is a short Python script which requires Biopython 1.54 or later
for SFF file support. If you use this tool in scientific work leading to a
publication, please cite the Biopython application note:

Cock et al 2009. Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
All rights reserved. See accompanying text file for licence details (MIT
license).
"""

from __future__ import print_function

import sys

if "-v" in sys.argv or "--version" in sys.argv:
    print("v0.0.1")
    sys.exit(0)

try:
    from Bio import SeqIO
except ImportError:
    sys.exit("Missing required Python library Biopython.")


# Parse Command Line
try:
    in_file, seq_format, out_file = sys.argv[1:]
except ValueError:
    sys.exit("Expected three arguments (input file, format, output file), "
             "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))


if seq_format.startswith("fastq"):
    # We don't care about the quality score encoding, just
    # need to translate Galaxy format name into something
    # Biopython will accept:
    format = "fastq"
elif seq_format.lower() == "csfasta":
    # I have not tested with colour space FASTA
    format = "fasta"
elif seq_format.lower == "sff":
    # The masked/trimmed numbers are more interesting
    format = "sff-trim"
elif seq_format.lower() in ["fasta", "qual"]:
    format = seq_format.lower()
else:
    # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
    sys.exit("Unexpected format argument: %r" % seq_format)


count = 0
total = 0
with open(out_file, "w") as out_handle:
    out_handle.write("#Identifier\tLength\n")
    for record in SeqIO.parse(in_file, format):
        count += 1
        length = len(record)
        total += length
        out_handle.write("%s\t%i\n" % (record.id, length))
print("%i sequences, total length %i" % (count, total))