annotate tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 5:5208c15805ec draft

Uploaded v0.0.5 dependant on Biopython 1.62
author peterjc
date Mon, 28 Oct 2013 05:19:38 -0400
parents
children 705a2e2df7fb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
2 """Find ORFs in a nucleotide sequence file.
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
3
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
4 get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
5
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
6 Takes ten command line options, input sequence filename, format, genetic
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
7 code, CDS vs ORF, end type (open, closed), selection mode (all, top, one),
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
8 minimum length (in amino acids), strand (both, forward, reverse), output
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
9 nucleotide filename, and output protein filename.
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
10
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
11 This tool is a short Python script which requires Biopython. If you use
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
12 this tool in scientific work leading to a publication, please cite the
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
13 Biopython application note:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
14
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
15 Cock et al 2009. Biopython: freely available Python tools for computational
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
18
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
20 (formerly SCRI), Dundee, UK. All rights reserved.
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
21
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
22 See accompanying text file for licence details (MIT/BSD style).
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
23
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
24 This is version 0.0.3 of the script.
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
25 """
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
26 import sys
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
27 import re
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
28
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
29 if "-v" in sys.argv or "--version" in sys.argv:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
30 print "v0.0.3"
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
31 sys.exit(0)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
32
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
33 def stop_err(msg, err=1):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
34 sys.stderr.write(msg.rstrip() + "\n")
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
35 sys.exit(err)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
36
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
37 try:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
38 from Bio.Seq import Seq, reverse_complement, translate
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
39 from Bio.SeqRecord import SeqRecord
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
40 from Bio import SeqIO
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
41 from Bio.Data import CodonTable
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
42 except ImportError:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
43 stop_err("Missing Biopython library")
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
44
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
45 #Parse Command Line
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
46 try:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
47 input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:]
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
48 except ValueError:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
49 stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
50
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
51 try:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
52 table = int(table)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
53 except ValueError:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
54 stop_err("Expected integer for genetic code table, got %s" % table)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
55
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
56 try:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
57 table_obj = CodonTable.ambiguous_generic_by_id[table]
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
58 except KeyError:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
59 stop_err("Unknown codon table %i" % table)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
60
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
61 if ftype not in ["CDS", "ORF"]:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
62 stop_err("Expected CDS or ORF, got %s" % ftype)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
63
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
64 if ends not in ["open", "closed"]:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
65 stop_err("Expected open or closed for end treatment, got %s" % ends)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
66
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
67 try:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
68 min_len = int(min_len)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
69 except ValueError:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
70 stop_err("Expected integer for min_len, got %s" % min_len)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
71
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
72 if seq_format.lower()=="sff":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
73 seq_format = "sff-trim"
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
74 elif seq_format.lower()=="fasta":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
75 seq_format = "fasta"
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
76 elif seq_format.lower().startswith("fastq"):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
77 seq_format = "fastq"
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
78 else:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
79 stop_err("Unsupported file type %r" % seq_format)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
80
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
81 print "Genetic code table %i" % table
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
82 print "Minimum length %i aa" % min_len
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
83 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
84
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
85 starts = sorted(table_obj.start_codons)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
86 assert "NNN" not in starts
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
87 re_starts = re.compile("|".join(starts))
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
88
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
89 stops = sorted(table_obj.stop_codons)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
90 assert "NNN" not in stops
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
91 re_stops = re.compile("|".join(stops))
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
92
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
93 def start_chop_and_trans(s, strict=True):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
94 """Returns offset, trimmed nuc, protein."""
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
95 if strict:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
96 assert s[-3:] in stops, s
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
97 assert len(s) % 3 == 0
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
98 for match in re_starts.finditer(s):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
99 #Must check the start is in frame
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
100 start = match.start()
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
101 if start % 3 == 0:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
102 n = s[start:]
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
103 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
104 if strict:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
105 t = translate(n, table, cds=True)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
106 else:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
107 #Use when missing stop codon,
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
108 t = "M" + translate(n[3:], table, to_stop=True)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
109 return start, n, t
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
110 return None, None, None
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
111
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
112 def break_up_frame(s):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
113 """Returns offset, nuc, protein."""
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
114 start = 0
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
115 for match in re_stops.finditer(s):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
116 index = match.start() + 3
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
117 if index % 3 != 0:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
118 continue
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
119 n = s[start:index]
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
120 if ftype=="CDS":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
121 offset, n, t = start_chop_and_trans(n)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
122 else:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
123 offset = 0
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
124 t = translate(n, table, to_stop=True)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
125 if n and len(t) >= min_len:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
126 yield start + offset, n, t
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
127 start = index
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
128 if ends == "open":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
129 #No stop codon, Biopython's strict CDS translate will fail
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
130 n = s[start:]
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
131 #Ensure we have whole codons
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
132 #TODO - Try appending N instead?
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
133 #TODO - Do the next four lines more elegantly
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
134 if len(n) % 3:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
135 n = n[:-1]
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
136 if len(n) % 3:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
137 n = n[:-1]
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
138 if ftype=="CDS":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
139 offset, n, t = start_chop_and_trans(n, strict=False)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
140 else:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
141 offset = 0
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
142 t = translate(n, table, to_stop=True)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
143 if n and len(t) >= min_len:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
144 yield start + offset, n, t
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
145
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
146
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
147 def get_all_peptides(nuc_seq):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
148 """Returns start, end, strand, nucleotides, protein.
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
149
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
150 Co-ordinates are Python style zero-based.
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
151 """
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
152 #TODO - Refactor to use a generator function (in start order)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
153 #rather than making a list and sorting?
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
154 answer = []
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
155 full_len = len(nuc_seq)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
156 if strand != "reverse":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
157 for frame in range(0,3):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
158 for offset, n, t in break_up_frame(nuc_seq[frame:]):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
159 start = frame + offset #zero based
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
160 answer.append((start, start + len(n), +1, n, t))
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
161 if strand != "forward":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
162 rc = reverse_complement(nuc_seq)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
163 for frame in range(0,3) :
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
164 for offset, n, t in break_up_frame(rc[frame:]):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
165 start = full_len - frame - offset #zero based
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
166 answer.append((start - len(n), start, -1, n ,t))
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
167 answer.sort()
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
168 return answer
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
169
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
170 def get_top_peptides(nuc_seq):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
171 """Returns all peptides of max length."""
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
172 values = list(get_all_peptides(nuc_seq))
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
173 if not values:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
174 raise StopIteration
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
175 max_len = max(len(x[-1]) for x in values)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
176 for x in values:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
177 if len(x[-1]) == max_len:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
178 yield x
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
179
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
180 def get_one_peptide(nuc_seq):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
181 """Returns first (left most) peptide with max length."""
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
182 values = list(get_top_peptides(nuc_seq))
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
183 if not values:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
184 raise StopIteration
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
185 yield values[0]
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
186
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
187 if mode == "all":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
188 get_peptides = get_all_peptides
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
189 elif mode == "top":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
190 get_peptides = get_top_peptides
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
191 elif mode == "one":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
192 get_peptides = get_one_peptide
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
193
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
194 in_count = 0
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
195 out_count = 0
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
196 if out_nuc_file == "-":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
197 out_nuc = sys.stdout
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
198 else:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
199 out_nuc = open(out_nuc_file, "w")
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
200 if out_prot_file == "-":
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
201 out_prot = sys.stdout
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
202 else:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
203 out_prot = open(out_prot_file, "w")
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
204 for record in SeqIO.parse(input_file, seq_format):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
205 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
206 out_count += 1
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
207 if f_strand == +1:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
208 loc = "%i..%i" % (f_start+1, f_end)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
209 else:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
210 loc = "complement(%i..%i)" % (f_start+1, f_end)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
211 descr = "length %i aa, %i bp, from %s of %s" \
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
212 % (len(t), len(n), loc, record.description)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
213 r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
214 t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
215 SeqIO.write(r, out_nuc, "fasta")
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
216 SeqIO.write(t, out_prot, "fasta")
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
217 in_count += 1
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
218 if out_nuc is not sys.stdout:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
219 out_nuc.close()
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
220 if out_prot is not sys.stdout:
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
221 out_prot.close()
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
222
5208c15805ec Uploaded v0.0.5 dependant on Biopython 1.62
peterjc
parents:
diff changeset
223 print "Found %i %ss in %i sequences" % (out_count, ftype, in_count)