annotate tools/seq_length/seq_length.py @ 0:c323e29a8248 draft

Initial release v0.0.1
author peterjc
date Tue, 08 May 2018 09:35:45 -0400
parents
children 458f987918a6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
2 """Compute length of FASTA, QUAL, FASTQ or SSF sequences.
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
3
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
4 Takes three command line options: input sequence filename, input type
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
5 (e.g. FASTA or SFF) and the output filename (tabular).
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
6
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
7 This tool is a short Python script which requires Biopython 1.54 or later
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
8 for SFF file support. If you use this tool in scientific work leading to a
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
9 publication, please cite the Biopython application note:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
10
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
11 Cock et al 2009. Biopython: freely available Python tools for computational
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
14
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
15 This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
16 All rights reserved. See accompanying text file for licence details (MIT
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
17 license).
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
18 """
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
19
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
20 from __future__ import print_function
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
21
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
22 import sys
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
23
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
24 if "-v" in sys.argv or "--version" in sys.argv:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
25 print("v0.0.1")
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
26 sys.exit(0)
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
27
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
28 try:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
29 from Bio import SeqIO
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
30 except ImportError:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
31 sys.exit("Missing required Python library Biopython.")
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
32
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
33
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
34 # Parse Command Line
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
35 try:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
36 in_file, seq_format, out_file = sys.argv[1:]
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
37 except ValueError:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
38 sys.exit("Expected three arguments (input file, format, output file), "
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
39 "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
40
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
41
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
42 if seq_format.startswith("fastq"):
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
43 # We don't care about the quality score encoding, just
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
44 # need to translate Galaxy format name into something
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
45 # Biopython will accept:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
46 format = "fastq"
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
47 elif seq_format.lower() == "csfasta":
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
48 # I have not tested with colour space FASTA
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
49 format = "fasta"
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
50 elif seq_format.lower == "sff":
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
51 # The masked/trimmed numbers are more interesting
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
52 format = "sff-trim"
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
53 elif seq_format.lower() in ["fasta", "qual"]:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
54 format = seq_format.lower()
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
55 else:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
56 # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
57 sys.exit("Unexpected format argument: %r" % seq_format)
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
58
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
59
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
60 count = 0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
61 total = 0
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
62 with open(out_file, "w") as out_handle:
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
63 out_handle.write("#Identifier\tLength\n")
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
64 for record in SeqIO.parse(in_file, format):
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
65 count += 1
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
66 length = len(record)
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
67 total += length
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
68 out_handle.write("%s\t%i\n" % (record.id, length))
c323e29a8248 Initial release v0.0.1
peterjc
parents:
diff changeset
69 print("%i sequences, total length %i" % (count, total))