0
|
1 #!/usr/bin/env python
|
|
2 """Compute length of FASTA, QUAL, FASTQ or SSF sequences.
|
|
3
|
|
4 Takes three command line options: input sequence filename, input type
|
|
5 (e.g. FASTA or SFF) and the output filename (tabular).
|
|
6
|
|
7 This tool is a short Python script which requires Biopython 1.54 or later
|
|
8 for SFF file support. If you use this tool in scientific work leading to a
|
|
9 publication, please cite the Biopython application note:
|
|
10
|
|
11 Cock et al 2009. Biopython: freely available Python tools for computational
|
|
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
|
|
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
|
|
14
|
|
15 This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
|
|
16 All rights reserved. See accompanying text file for licence details (MIT
|
|
17 license).
|
|
18 """
|
|
19
|
|
20 from __future__ import print_function
|
|
21
|
|
22 import sys
|
2
|
23 from optparse import OptionParser
|
0
|
24
|
2
|
25 usage = r"""Use as follows to compute all the lengths in a sequence file:
|
|
26
|
|
27 $ python seq_length.py -i example.fasta -f fasta -o lengths.tsv
|
|
28 """
|
|
29
|
|
30 parser = OptionParser(usage=usage)
|
|
31 parser.add_option('-i', '--input', dest='input',
|
|
32 default=None, help='Input sequence filename (FASTA, FASTQ, etc)',
|
|
33 metavar="FILE")
|
|
34 parser.add_option('-f', '--format', dest='format',
|
|
35 default=None, help='Input sequence format (FASTA, QUAL, FASTQ, SFF)')
|
|
36 parser.add_option('-o', '--output', dest='output',
|
|
37 default=None, help='Output filename (tabular)',
|
|
38 metavar="FILE")
|
|
39 parser.add_option("-v", "--version", dest="version",
|
|
40 default=False, action="store_true",
|
|
41 help="Show version and quit")
|
|
42 options, args = parser.parse_args()
|
|
43
|
|
44 if options.version:
|
|
45 print("v0.0.3")
|
0
|
46 sys.exit(0)
|
2
|
47 if not options.input:
|
|
48 sys.exit("Require an input filename")
|
|
49 if not options.format:
|
|
50 sys.exit("Require the input format")
|
|
51 if not options.output:
|
|
52 sys.exit("Require an output filename")
|
0
|
53
|
|
54 try:
|
|
55 from Bio import SeqIO
|
|
56 except ImportError:
|
|
57 sys.exit("Missing required Python library Biopython.")
|
|
58
|
1
|
59 try:
|
|
60 from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
|
61 except ImportError:
|
|
62 sys.exit("Biopython tool old?, missing Bio.SeqIO.QualityIO.FastqGeneralIterator")
|
|
63
|
|
64 try:
|
|
65 from Bio.SeqIO.FastaIO import SimpleFastaParser
|
|
66 except ImportError:
|
|
67 sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser")
|
|
68
|
2
|
69 in_file = options.input
|
|
70 out_file = options.output
|
0
|
71
|
2
|
72 if options.format.startswith("fastq"):
|
0
|
73 # We don't care about the quality score encoding, just
|
|
74 # need to translate Galaxy format name into something
|
|
75 # Biopython will accept:
|
|
76 format = "fastq"
|
2
|
77 elif options.format.lower() == "csfasta":
|
0
|
78 # I have not tested with colour space FASTA
|
|
79 format = "fasta"
|
2
|
80 elif options.format.lower() == "sff":
|
0
|
81 # The masked/trimmed numbers are more interesting
|
|
82 format = "sff-trim"
|
2
|
83 elif options.format.lower() in ["fasta", "qual"]:
|
|
84 format = options.format.lower()
|
0
|
85 else:
|
|
86 # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
|
2
|
87 sys.exit("Unexpected format argument: %r" % options.format)
|
0
|
88
|
|
89
|
|
90 count = 0
|
|
91 total = 0
|
|
92 with open(out_file, "w") as out_handle:
|
|
93 out_handle.write("#Identifier\tLength\n")
|
1
|
94 if format == "fastq":
|
|
95 with open(in_file) as in_handle:
|
|
96 for title, seq, qual in FastqGeneralIterator(in_handle):
|
|
97 count += 1
|
|
98 length = len(seq)
|
|
99 total += length
|
|
100 identifier = title.split(None, 1)[0]
|
|
101 out_handle.write("%s\t%i\n" % (identifier, length))
|
|
102 elif format == "fasta":
|
|
103 with open(in_file) as in_handle:
|
|
104 for title, seq in SimpleFastaParser(in_handle):
|
|
105 count += 1
|
|
106 length = len(seq)
|
|
107 total += length
|
|
108 identifier = title.split(None, 1)[0]
|
|
109 out_handle.write("%s\t%i\n" % (identifier, length))
|
|
110 else:
|
|
111 for record in SeqIO.parse(in_file, format):
|
|
112 count += 1
|
|
113 length = len(record)
|
|
114 total += length
|
|
115 out_handle.write("%s\t%i\n" % (record.id, length))
|
0
|
116 print("%i sequences, total length %i" % (count, total))
|