comparison tools/seq_length/seq_length.py @ 2:6f29bb9960ac draft

v0.0.3 - Fixed SFF; more tests
author peterjc
date Mon, 14 May 2018 12:09:50 -0400
parents 458f987918a6
children fcdf11fb34de
comparison
Legend: equal, deleted, inserted, replaced
Left column: 1:458f987918a6 (parent); right column: 2:6f29bb9960ac (this changeset)
18 """ 18 """
19 19
20 from __future__ import print_function 20 from __future__ import print_function
21 21
22 import sys 22 import sys
23 from optparse import OptionParser
23 24
24 if "-v" in sys.argv or "--version" in sys.argv: 25 usage = r"""Use as follows to compute all the lengths in a sequence file:
25 print("v0.0.2") 26
27 $ python seq_length.py -i example.fasta -f fasta -o lengths.tsv
28 """
29
30 parser = OptionParser(usage=usage)
31 parser.add_option('-i', '--input', dest='input',
32 default=None, help='Input sequence filename (FASTA, FASTQ, etc)',
33 metavar="FILE")
34 parser.add_option('-f', '--format', dest='format',
35 default=None, help='Input sequence format (FASTA, QUAL, FASTQ, SFF)')
36 parser.add_option('-o', '--output', dest='output',
37 default=None, help='Output filename (tabular)',
38 metavar="FILE")
39 parser.add_option("-v", "--version", dest="version",
40 default=False, action="store_true",
41 help="Show version and quit")
42 options, args = parser.parse_args()
43
44 if options.version:
45 print("v0.0.3")
26 sys.exit(0) 46 sys.exit(0)
47 if not options.input:
48 sys.exit("Require an input filename")
49 if not options.format:
50 sys.exit("Require the input format")
51 if not options.output:
52 sys.exit("Require an output filename")
27 53
28 try: 54 try:
29 from Bio import SeqIO 55 from Bio import SeqIO
30 except ImportError: 56 except ImportError:
31 sys.exit("Missing required Python library Biopython.") 57 sys.exit("Missing required Python library Biopython.")
38 try: 64 try:
39 from Bio.SeqIO.FastaIO import SimpleFastaParser 65 from Bio.SeqIO.FastaIO import SimpleFastaParser
40 except ImportError: 66 except ImportError:
41 sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser") 67 sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser")
42 68
69 in_file = options.input
70 out_file = options.output
43 71
44 # Parse Command Line 72 if options.format.startswith("fastq"):
45 try:
46 in_file, seq_format, out_file = sys.argv[1:]
47 except ValueError:
48 sys.exit("Expected three arguments (input file, format, output file), "
49 "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
50
51
52 if seq_format.startswith("fastq"):
53 # We don't care about the quality score encoding, just 73 # We don't care about the quality score encoding, just
54 # need to translate Galaxy format name into something 74 # need to translate Galaxy format name into something
55 # Biopython will accept: 75 # Biopython will accept:
56 format = "fastq" 76 format = "fastq"
57 elif seq_format.lower() == "csfasta": 77 elif options.format.lower() == "csfasta":
58 # I have not tested with colour space FASTA 78 # I have not tested with colour space FASTA
59 format = "fasta" 79 format = "fasta"
60 elif seq_format.lower == "sff": 80 elif options.format.lower() == "sff":
61 # The masked/trimmed numbers are more interesting 81 # The masked/trimmed numbers are more interesting
62 format = "sff-trim" 82 format = "sff-trim"
63 elif seq_format.lower() in ["fasta", "qual"]: 83 elif options.format.lower() in ["fasta", "qual"]:
64 format = seq_format.lower() 84 format = options.format.lower()
65 else: 85 else:
66 # TODO: Does Galaxy understand GenBank, EMBL, etc yet? 86 # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
67 sys.exit("Unexpected format argument: %r" % seq_format) 87 sys.exit("Unexpected format argument: %r" % options.format)
68 88
69 89
70 count = 0 90 count = 0
71 total = 0 91 total = 0
72 with open(out_file, "w") as out_handle: 92 with open(out_file, "w") as out_handle: