annotate tools/seq_length/seq_length.py @ 2:6f29bb9960ac draft

v0.0.3 - Fixed SFF; more tests
author peterjc
date Mon, 14 May 2018 12:09:50 -0400
parents 458f987918a6
children fcdf11fb34de
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
2 """Compute length of FASTA, QUAL, FASTQ or SSF sequences.
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
3
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
4 Takes three command line options: input sequence filename, input type
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
5 (e.g. FASTA or SFF) and the output filename (tabular).
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
6
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
7 This tool is a short Python script which requires Biopython 1.54 or later
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
8 for SFF file support. If you use this tool in scientific work leading to a
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
9 publication, please cite the Biopython application note:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
10
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
11 Cock et al 2009. Biopython: freely available Python tools for computational
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
14
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
15 This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
16 All rights reserved. See accompanying text file for licence details (MIT
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
17 license).
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
18 """
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
19
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
20 from __future__ import print_function
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
21
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
22 import sys
2
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
23 from optparse import OptionParser
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
24
2
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
25 usage = r"""Use as follows to compute all the lengths in a sequence file:
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
26
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
27 $ python seq_length.py -i example.fasta -f fasta -o lengths.tsv
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
28 """
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
29
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
30 parser = OptionParser(usage=usage)
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
31 parser.add_option('-i', '--input', dest='input',
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
32 default=None, help='Input sequence filename (FASTA, FASTQ, etc)',
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
33 metavar="FILE")
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
34 parser.add_option('-f', '--format', dest='format',
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
35 default=None, help='Input sequence format (FASTA, QUAL, FASTQ, SFF)')
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
36 parser.add_option('-o', '--output', dest='output',
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
37 default=None, help='Output filename (tabular)',
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
38 metavar="FILE")
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
39 parser.add_option("-v", "--version", dest="version",
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
40 default=False, action="store_true",
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
41 help="Show version and quit")
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
42 options, args = parser.parse_args()
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
43
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
44 if options.version:
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
45 print("v0.0.3")
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
46 sys.exit(0)
2
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
47 if not options.input:
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
48 sys.exit("Require an input filename")
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
49 if not options.format:
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
50 sys.exit("Require the input format")
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
51 if not options.output:
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
52 sys.exit("Require an output filename")
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
53
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
54 try:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
55 from Bio import SeqIO
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
56 except ImportError:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
57 sys.exit("Missing required Python library Biopython.")
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
58
1
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
59 try:
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
60 from Bio.SeqIO.QualityIO import FastqGeneralIterator
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
61 except ImportError:
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
62 sys.exit("Biopython tool old?, missing Bio.SeqIO.QualityIO.FastqGeneralIterator")
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
63
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
64 try:
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
65 from Bio.SeqIO.FastaIO import SimpleFastaParser
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
66 except ImportError:
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
67 sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser")
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
68
2
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
69 in_file = options.input
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
70 out_file = options.output
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
71
2
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
72 if options.format.startswith("fastq"):
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
73 # We don't care about the quality score encoding, just
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
74 # need to translate Galaxy format name into something
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
75 # Biopython will accept:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
76 format = "fastq"
2
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
77 elif options.format.lower() == "csfasta":
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
78 # I have not tested with colour space FASTA
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
79 format = "fasta"
2
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
80 elif options.format.lower() == "sff":
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
81 # The masked/trimmed numbers are more interesting
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
82 format = "sff-trim"
2
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
83 elif options.format.lower() in ["fasta", "qual"]:
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
84 format = options.format.lower()
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
85 else:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
86 # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
2
6f29bb9960ac v0.0.3 - Fixed SFF; more tests
peterjc
parents: 1
diff changeset
87 sys.exit("Unexpected format argument: %r" % options.format)
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
88
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
89
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
90 count = 0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
91 total = 0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
92 with open(out_file, "w") as out_handle:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
93 out_handle.write("#Identifier\tLength\n")
1
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
94 if format == "fastq":
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
95 with open(in_file) as in_handle:
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
96 for title, seq, qual in FastqGeneralIterator(in_handle):
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
97 count += 1
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
98 length = len(seq)
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
99 total += length
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
100 identifier = title.split(None, 1)[0]
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
101 out_handle.write("%s\t%i\n" % (identifier, length))
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
102 elif format == "fasta":
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
103 with open(in_file) as in_handle:
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
104 for title, seq in SimpleFastaParser(in_handle):
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
105 count += 1
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
106 length = len(seq)
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
107 total += length
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
108 identifier = title.split(None, 1)[0]
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
109 out_handle.write("%s\t%i\n" % (identifier, length))
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
110 else:
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
111 for record in SeqIO.parse(in_file, format):
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
112 count += 1
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
113 length = len(record)
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
114 total += length
458f987918a6 Faster FASTA and FASTQ, v0.0.2
peterjc
parents: 0
diff changeset
115 out_handle.write("%s\t%i\n" % (record.id, length))
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
116 print("%i sequences, total length %i" % (count, total))