0
|
1 #!/usr/bin/env python
|
|
2 """Find ORFs in a nucleotide sequence file.
|
|
3
|
|
4 get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file
|
|
5
|
|
6 Takes ten command line options, input sequence filename, format, genetic
|
|
7 code, CDS vs ORF, end type (open, closed), selection mode (all, top, one),
|
|
8 minimum length (in amino acids), strand (both, forward, reverse), output
|
|
9 nucleotide filename, and output protein filename.
|
|
10
|
|
11 This tool is a short Python script which requires Biopython. If you use
|
|
12 this tool in scientific work leading to a publication, please cite the
|
|
13 Biopython application note:
|
|
14
|
|
15 Cock et al 2009. Biopython: freely available Python tools for computational
|
|
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
|
|
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
|
|
18
|
|
19 This script is copyright 2011 by Peter Cock, The James Hutton Institute
|
|
20 (formerly SCRI), Dundee, UK. All rights reserved.
|
|
21
|
|
22 See accompanying text file for licence details (MIT/BSD style).
|
|
23
|
|
24 This is version 0.0.1 of the script.
|
|
25 """
|
|
26 import sys
|
|
27 import re
|
|
28
|
|
29 def stop_err(msg, err=1):
|
|
30 sys.stderr.write(msg.rstrip() + "\n")
|
|
31 sys.exit(err)
|
|
32
|
|
33 try:
|
|
34 from Bio.Seq import Seq, reverse_complement, translate
|
|
35 from Bio.SeqRecord import SeqRecord
|
|
36 from Bio import SeqIO
|
|
37 from Bio.Data import CodonTable
|
|
38 except ImportError:
|
|
39 stop_err("Missing Biopython library")
|
|
40
|
|
41 #Parse Command Line
|
|
42 try:
|
|
43 input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:]
|
|
44 except ValueError:
|
|
45 stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
|
|
46
|
|
47 try:
|
|
48 table = int(table)
|
|
49 except ValueError:
|
|
50 stop_err("Expected integer for genetic code table, got %s" % table)
|
|
51
|
|
52 try:
|
|
53 table_obj = CodonTable.ambiguous_generic_by_id[table]
|
|
54 except KeyError:
|
|
55 stop_err("Unknown codon table %i" % table)
|
|
56
|
|
57 if ftype not in ["CDS", "ORF"]:
|
|
58 stop_err("Expected CDS or ORF, got %s" % ftype)
|
|
59
|
|
60 if ends not in ["open", "closed"]:
|
|
61 stop_err("Expected open or closed for end treatment, got %s" % ends)
|
|
62
|
|
63 try:
|
|
64 min_len = int(min_len)
|
|
65 except ValueError:
|
|
66 stop_err("Expected integer for min_len, got %s" % min_len)
|
|
67
|
|
68 if seq_format.lower()=="sff":
|
|
69 seq_format = "sff-trim"
|
|
70 elif seq_format.lower()=="fasta":
|
|
71 seq_format = "fasta"
|
|
72 elif seq_format.lower().startswith("fastq"):
|
|
73 seq_format = "fastq"
|
|
74 else:
|
|
75 stop_err("Unsupported file type %r" % seq_format)
|
|
76
|
|
77 print "Genetic code table %i" % table
|
|
78 print "Minimum length %i aa" % min_len
|
|
79 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
|
|
80
|
|
81 starts = sorted(table_obj.start_codons)
|
|
82 assert "NNN" not in starts
|
|
83 re_starts = re.compile("|".join(starts))
|
|
84
|
|
85 stops = sorted(table_obj.stop_codons)
|
|
86 assert "NNN" not in stops
|
|
87 re_stops = re.compile("|".join(stops))
|
|
88
|
|
89 def start_chop_and_trans(s, strict=True):
|
|
90 """Returns offset, trimmed nuc, protein."""
|
|
91 if strict:
|
|
92 assert s[-3:] in stops, s
|
|
93 assert len(s) % 3 == 0
|
|
94 for match in re_starts.finditer(s):
|
|
95 #Must check the start is in frame
|
|
96 start = match.start()
|
|
97 if start % 3 == 0:
|
|
98 n = s[start:]
|
|
99 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
|
|
100 if strict:
|
|
101 t = translate(n, table, cds=True)
|
|
102 else:
|
|
103 #Use when missing stop codon,
|
|
104 t = "M" + translate(n[3:], table, to_stop=True)
|
|
105 return start, n, t
|
|
106 return None, None, None
|
|
107
|
|
108 def break_up_frame(s):
|
|
109 """Returns offset, nuc, protein."""
|
|
110 start = 0
|
|
111 for match in re_stops.finditer(s):
|
|
112 index = match.start() + 3
|
|
113 if index % 3 != 0:
|
|
114 continue
|
|
115 n = s[start:index]
|
|
116 if ftype=="CDS":
|
|
117 offset, n, t = start_chop_and_trans(n)
|
|
118 else:
|
|
119 offset = 0
|
|
120 t = translate(n, table, to_stop=True)
|
|
121 if n and len(t) >= min_len:
|
|
122 yield start + offset, n, t
|
|
123 start = index
|
|
124 if ends == "open":
|
|
125 #No stop codon, Biopython's strict CDS translate will fail
|
|
126 n = s[start:]
|
|
127 #Ensure we have whole codons
|
|
128 #TODO - Try appending N instead?
|
|
129 #TODO - Do the next four lines more elegantly
|
|
130 if len(n) % 3:
|
|
131 n = n[:-1]
|
|
132 if len(n) % 3:
|
|
133 n = n[:-1]
|
|
134 if ftype=="CDS":
|
|
135 offset, n, t = start_chop_and_trans(n, strict=False)
|
|
136 else:
|
|
137 offset = 0
|
|
138 t = translate(n, table, to_stop=True)
|
|
139 if n and len(t) >= min_len:
|
|
140 yield start + offset, n, t
|
|
141
|
|
142
|
|
143 def get_all_peptides(nuc_seq):
|
|
144 """Returns start, end, strand, nucleotides, protein.
|
|
145
|
|
146 Co-ordinates are Python style zero-based.
|
|
147 """
|
|
148 #TODO - Refactor to use a generator function (in start order)
|
|
149 #rather than making a list and sorting?
|
|
150 answer = []
|
|
151 full_len = len(nuc_seq)
|
|
152 if strand != "reverse":
|
|
153 for frame in range(0,3):
|
|
154 for offset, n, t in break_up_frame(nuc_seq[frame:]):
|
|
155 start = frame + offset #zero based
|
|
156 answer.append((start, start + len(n), +1, n, t))
|
|
157 if strand != "forward":
|
|
158 rc = reverse_complement(nuc_seq)
|
|
159 for frame in range(0,3) :
|
|
160 for offset, n, t in break_up_frame(rc[frame:]):
|
|
161 start = full_len - frame - offset #zero based
|
|
162 answer.append((start, start + len(n), -1, n ,t))
|
|
163 answer.sort()
|
|
164 return answer
|
|
165
|
|
166 def get_top_peptides(nuc_seq):
|
|
167 """Returns all peptides of max length."""
|
|
168 values = list(get_all_peptides(nuc_seq))
|
|
169 if not values:
|
|
170 raise StopIteration
|
|
171 max_len = max(len(x[-1]) for x in values)
|
|
172 for x in values:
|
|
173 if len(x[-1]) == max_len:
|
|
174 yield x
|
|
175
|
|
176 def get_one_peptide(nuc_seq):
|
|
177 """Returns first (left most) peptide with max length."""
|
|
178 values = list(get_top_peptides(nuc_seq))
|
|
179 if not values:
|
|
180 raise StopIteration
|
|
181 yield values[0]
|
|
182
|
|
183 if mode == "all":
|
|
184 get_peptides = get_all_peptides
|
|
185 elif mode == "top":
|
|
186 get_peptides = get_top_peptides
|
|
187 elif mode == "one":
|
|
188 get_peptides = get_one_peptide
|
|
189
|
|
190 in_count = 0
|
|
191 out_count = 0
|
|
192 if out_nuc_file == "-":
|
|
193 out_nuc = sys.stdout
|
|
194 else:
|
|
195 out_nuc = open(out_nuc_file, "w")
|
|
196 if out_prot_file == "-":
|
|
197 out_prot = sys.stdout
|
|
198 else:
|
|
199 out_prot = open(out_prot_file, "w")
|
|
200 for record in SeqIO.parse(input_file, seq_format):
|
|
201 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
|
|
202 out_count += 1
|
|
203 if f_strand == +1:
|
|
204 loc = "%i..%i" % (f_start+1, f_end)
|
|
205 else:
|
|
206 loc = "complement(%i..%i)" % (f_start+1, f_end)
|
|
207 descr = "length %i aa, %i bp, from %s of %s" \
|
|
208 % (len(t), len(n), loc, record.description)
|
|
209 r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
|
|
210 t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
|
|
211 SeqIO.write(r, out_nuc, "fasta")
|
|
212 SeqIO.write(t, out_prot, "fasta")
|
|
213 in_count += 1
|
|
214 if out_nuc is not sys.stdout:
|
|
215 out_nuc.close()
|
|
216 if out_prot is not sys.stdout:
|
|
217 out_prot.close()
|
|
218
|
|
219 print "Found %i %ss in %i sequences" % (out_count, ftype, in_count)
|