0
|
1 #!/usr/bin/env python
|
|
2 """Compute length of FASTA, QUAL, FASTQ or SSF sequences.
|
|
3
|
|
4 Takes three command line options: input sequence filename, input type
|
|
5 (e.g. FASTA or SFF) and the output filename (tabular).
|
|
6
|
|
7 This tool is a short Python script which requires Biopython 1.54 or later
|
|
8 for SFF file support. If you use this tool in scientific work leading to a
|
|
9 publication, please cite the Biopython application note:
|
|
10
|
|
11 Cock et al 2009. Biopython: freely available Python tools for computational
|
|
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
|
|
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
|
|
14
|
|
15 This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
|
|
16 All rights reserved. See accompanying text file for licence details (MIT
|
|
17 license).
|
|
18 """
|
|
19
|
|
20 from __future__ import print_function
|
|
21
|
|
22 import sys
|
|
23
|
|
24 if "-v" in sys.argv or "--version" in sys.argv:
|
1
|
25 print("v0.0.2")
|
0
|
26 sys.exit(0)
|
|
27
|
|
28 try:
|
|
29 from Bio import SeqIO
|
|
30 except ImportError:
|
|
31 sys.exit("Missing required Python library Biopython.")
|
|
32
|
1
|
33 try:
|
|
34 from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
|
35 except ImportError:
|
|
36 sys.exit("Biopython tool old?, missing Bio.SeqIO.QualityIO.FastqGeneralIterator")
|
|
37
|
|
38 try:
|
|
39 from Bio.SeqIO.FastaIO import SimpleFastaParser
|
|
40 except ImportError:
|
|
41 sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser")
|
|
42
|
0
|
43
|
|
44 # Parse Command Line
|
|
45 try:
|
|
46 in_file, seq_format, out_file = sys.argv[1:]
|
|
47 except ValueError:
|
|
48 sys.exit("Expected three arguments (input file, format, output file), "
|
|
49 "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
|
|
50
|
|
51
|
|
52 if seq_format.startswith("fastq"):
|
|
53 # We don't care about the quality score encoding, just
|
|
54 # need to translate Galaxy format name into something
|
|
55 # Biopython will accept:
|
|
56 format = "fastq"
|
|
57 elif seq_format.lower() == "csfasta":
|
|
58 # I have not tested with colour space FASTA
|
|
59 format = "fasta"
|
|
60 elif seq_format.lower == "sff":
|
|
61 # The masked/trimmed numbers are more interesting
|
|
62 format = "sff-trim"
|
|
63 elif seq_format.lower() in ["fasta", "qual"]:
|
|
64 format = seq_format.lower()
|
|
65 else:
|
|
66 # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
|
|
67 sys.exit("Unexpected format argument: %r" % seq_format)
|
|
68
|
|
69
|
|
70 count = 0
|
|
71 total = 0
|
|
72 with open(out_file, "w") as out_handle:
|
|
73 out_handle.write("#Identifier\tLength\n")
|
1
|
74 if format == "fastq":
|
|
75 with open(in_file) as in_handle:
|
|
76 for title, seq, qual in FastqGeneralIterator(in_handle):
|
|
77 count += 1
|
|
78 length = len(seq)
|
|
79 total += length
|
|
80 identifier = title.split(None, 1)[0]
|
|
81 out_handle.write("%s\t%i\n" % (identifier, length))
|
|
82 elif format == "fasta":
|
|
83 with open(in_file) as in_handle:
|
|
84 for title, seq in SimpleFastaParser(in_handle):
|
|
85 count += 1
|
|
86 length = len(seq)
|
|
87 total += length
|
|
88 identifier = title.split(None, 1)[0]
|
|
89 out_handle.write("%s\t%i\n" % (identifier, length))
|
|
90 else:
|
|
91 for record in SeqIO.parse(in_file, format):
|
|
92 count += 1
|
|
93 length = len(record)
|
|
94 total += length
|
|
95 out_handle.write("%s\t%i\n" % (record.id, length))
|
0
|
96 print("%i sequences, total length %i" % (count, total))
|