0
|
1 #!/usr/bin/env python
|
|
2 """Compute length of FASTA, QUAL, FASTQ or SSF sequences.
|
|
3
|
|
4 Takes three command line options: input sequence filename, input type
|
|
5 (e.g. FASTA or SFF) and the output filename (tabular).
|
|
6
|
|
7 This tool is a short Python script which requires Biopython 1.54 or later
|
|
8 for SFF file support. If you use this tool in scientific work leading to a
|
|
9 publication, please cite the Biopython application note:
|
|
10
|
|
11 Cock et al 2009. Biopython: freely available Python tools for computational
|
|
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
|
|
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
|
|
14
|
|
15 This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
|
|
16 All rights reserved. See accompanying text file for licence details (MIT
|
|
17 license).
|
|
18 """
|
|
19
|
|
20 from __future__ import print_function
|
|
21
|
|
22 import sys
|
|
23
|
|
24 if "-v" in sys.argv or "--version" in sys.argv:
|
|
25 print("v0.0.1")
|
|
26 sys.exit(0)
|
|
27
|
|
28 try:
|
|
29 from Bio import SeqIO
|
|
30 except ImportError:
|
|
31 sys.exit("Missing required Python library Biopython.")
|
|
32
|
|
33
|
|
34 # Parse Command Line
|
|
35 try:
|
|
36 in_file, seq_format, out_file = sys.argv[1:]
|
|
37 except ValueError:
|
|
38 sys.exit("Expected three arguments (input file, format, output file), "
|
|
39 "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
|
|
40
|
|
41
|
|
42 if seq_format.startswith("fastq"):
|
|
43 # We don't care about the quality score encoding, just
|
|
44 # need to translate Galaxy format name into something
|
|
45 # Biopython will accept:
|
|
46 format = "fastq"
|
|
47 elif seq_format.lower() == "csfasta":
|
|
48 # I have not tested with colour space FASTA
|
|
49 format = "fasta"
|
|
50 elif seq_format.lower == "sff":
|
|
51 # The masked/trimmed numbers are more interesting
|
|
52 format = "sff-trim"
|
|
53 elif seq_format.lower() in ["fasta", "qual"]:
|
|
54 format = seq_format.lower()
|
|
55 else:
|
|
56 # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
|
|
57 sys.exit("Unexpected format argument: %r" % seq_format)
|
|
58
|
|
59
|
|
60 count = 0
|
|
61 total = 0
|
|
62 with open(out_file, "w") as out_handle:
|
|
63 out_handle.write("#Identifier\tLength\n")
|
|
64 for record in SeqIO.parse(in_file, format):
|
|
65 count += 1
|
|
66 length = len(record)
|
|
67 total += length
|
|
68 out_handle.write("%s\t%i\n" % (record.id, length))
|
|
69 print("%i sequences, total length %i" % (count, total))
|