Mercurial > repos > peterjc > get_orfs_or_cdss
comparison tools/filters/get_orfs_or_cdss.py @ 0:9cff9a1176ea
Uploaded v0.0.1
author | peterjc |
---|---|
date | Thu, 19 Jan 2012 10:17:10 -0500 |
parents | |
children | 922d69bd5258 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9cff9a1176ea |
---|---|
1 #!/usr/bin/env python | |
2 """Find ORFs in a nucleotide sequence file. | |
3 | |
4 get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file | |
5 | |
6 Takes ten command line options, input sequence filename, format, genetic | |
7 code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), | |
8 minimum length (in amino acids), strand (both, forward, reverse), output | |
9 nucleotide filename, and output protein filename. | |
10 | |
11 This tool is a short Python script which requires Biopython. If you use | |
12 this tool in scientific work leading to a publication, please cite the | |
13 Biopython application note: | |
14 | |
15 Cock et al 2009. Biopython: freely available Python tools for computational | |
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
18 | |
19 This script is copyright 2011 by Peter Cock, The James Hutton Institute | |
20 (formerly SCRI), Dundee, UK. All rights reserved. | |
21 | |
22 See accompanying text file for licence details (MIT/BSD style). | |
23 | |
24 This is version 0.0.1 of the script. | |
25 """ | |
26 import sys | |
27 import re | |
28 | |
29 def stop_err(msg, err=1): | |
30 sys.stderr.write(msg.rstrip() + "\n") | |
31 sys.exit(err) | |
32 | |
33 try: | |
34 from Bio.Seq import Seq, reverse_complement, translate | |
35 from Bio.SeqRecord import SeqRecord | |
36 from Bio import SeqIO | |
37 from Bio.Data import CodonTable | |
38 except ImportError: | |
39 stop_err("Missing Biopython library") | |
40 | |
41 #Parse Command Line | |
42 try: | |
43 input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] | |
44 except ValueError: | |
45 stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | |
46 | |
47 try: | |
48 table = int(table) | |
49 except ValueError: | |
50 stop_err("Expected integer for genetic code table, got %s" % table) | |
51 | |
52 try: | |
53 table_obj = CodonTable.ambiguous_generic_by_id[table] | |
54 except KeyError: | |
55 stop_err("Unknown codon table %i" % table) | |
56 | |
57 if ftype not in ["CDS", "ORF"]: | |
58 stop_err("Expected CDS or ORF, got %s" % ftype) | |
59 | |
60 if ends not in ["open", "closed"]: | |
61 stop_err("Expected open or closed for end treatment, got %s" % ends) | |
62 | |
63 try: | |
64 min_len = int(min_len) | |
65 except ValueError: | |
66 stop_err("Expected integer for min_len, got %s" % min_len) | |
67 | |
68 if seq_format.lower()=="sff": | |
69 seq_format = "sff-trim" | |
70 elif seq_format.lower()=="fasta": | |
71 seq_format = "fasta" | |
72 elif seq_format.lower().startswith("fastq"): | |
73 seq_format = "fastq" | |
74 else: | |
75 stop_err("Unsupported file type %r" % seq_format) | |
76 | |
77 print "Genetic code table %i" % table | |
78 print "Minimum length %i aa" % min_len | |
79 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) | |
80 | |
81 starts = sorted(table_obj.start_codons) | |
82 assert "NNN" not in starts | |
83 re_starts = re.compile("|".join(starts)) | |
84 | |
85 stops = sorted(table_obj.stop_codons) | |
86 assert "NNN" not in stops | |
87 re_stops = re.compile("|".join(stops)) | |
88 | |
89 def start_chop_and_trans(s, strict=True): | |
90 """Returns offset, trimmed nuc, protein.""" | |
91 if strict: | |
92 assert s[-3:] in stops, s | |
93 assert len(s) % 3 == 0 | |
94 for match in re_starts.finditer(s): | |
95 #Must check the start is in frame | |
96 start = match.start() | |
97 if start % 3 == 0: | |
98 n = s[start:] | |
99 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | |
100 if strict: | |
101 t = translate(n, table, cds=True) | |
102 else: | |
103 #Use when missing stop codon, | |
104 t = "M" + translate(n[3:], table, to_stop=True) | |
105 return start, n, t | |
106 return None, None, None | |
107 | |
108 def break_up_frame(s): | |
109 """Returns offset, nuc, protein.""" | |
110 start = 0 | |
111 for match in re_stops.finditer(s): | |
112 index = match.start() + 3 | |
113 if index % 3 != 0: | |
114 continue | |
115 n = s[start:index] | |
116 if ftype=="CDS": | |
117 offset, n, t = start_chop_and_trans(n) | |
118 else: | |
119 offset = 0 | |
120 t = translate(n, table, to_stop=True) | |
121 if n and len(t) >= min_len: | |
122 yield start + offset, n, t | |
123 start = index | |
124 if ends == "open": | |
125 #No stop codon, Biopython's strict CDS translate will fail | |
126 n = s[start:] | |
127 #Ensure we have whole codons | |
128 #TODO - Try appending N instead? | |
129 #TODO - Do the next four lines more elegantly | |
130 if len(n) % 3: | |
131 n = n[:-1] | |
132 if len(n) % 3: | |
133 n = n[:-1] | |
134 if ftype=="CDS": | |
135 offset, n, t = start_chop_and_trans(n, strict=False) | |
136 else: | |
137 offset = 0 | |
138 t = translate(n, table, to_stop=True) | |
139 if n and len(t) >= min_len: | |
140 yield start + offset, n, t | |
141 | |
142 | |
143 def get_all_peptides(nuc_seq): | |
144 """Returns start, end, strand, nucleotides, protein. | |
145 | |
146 Co-ordinates are Python style zero-based. | |
147 """ | |
148 #TODO - Refactor to use a generator function (in start order) | |
149 #rather than making a list and sorting? | |
150 answer = [] | |
151 full_len = len(nuc_seq) | |
152 if strand != "reverse": | |
153 for frame in range(0,3): | |
154 for offset, n, t in break_up_frame(nuc_seq[frame:]): | |
155 start = frame + offset #zero based | |
156 answer.append((start, start + len(n), +1, n, t)) | |
157 if strand != "forward": | |
158 rc = reverse_complement(nuc_seq) | |
159 for frame in range(0,3) : | |
160 for offset, n, t in break_up_frame(rc[frame:]): | |
161 start = full_len - frame - offset #zero based | |
162 answer.append((start, start + len(n), -1, n ,t)) | |
163 answer.sort() | |
164 return answer | |
165 | |
166 def get_top_peptides(nuc_seq): | |
167 """Returns all peptides of max length.""" | |
168 values = list(get_all_peptides(nuc_seq)) | |
169 if not values: | |
170 raise StopIteration | |
171 max_len = max(len(x[-1]) for x in values) | |
172 for x in values: | |
173 if len(x[-1]) == max_len: | |
174 yield x | |
175 | |
176 def get_one_peptide(nuc_seq): | |
177 """Returns first (left most) peptide with max length.""" | |
178 values = list(get_top_peptides(nuc_seq)) | |
179 if not values: | |
180 raise StopIteration | |
181 yield values[0] | |
182 | |
183 if mode == "all": | |
184 get_peptides = get_all_peptides | |
185 elif mode == "top": | |
186 get_peptides = get_top_peptides | |
187 elif mode == "one": | |
188 get_peptides = get_one_peptide | |
189 | |
190 in_count = 0 | |
191 out_count = 0 | |
192 if out_nuc_file == "-": | |
193 out_nuc = sys.stdout | |
194 else: | |
195 out_nuc = open(out_nuc_file, "w") | |
196 if out_prot_file == "-": | |
197 out_prot = sys.stdout | |
198 else: | |
199 out_prot = open(out_prot_file, "w") | |
200 for record in SeqIO.parse(input_file, seq_format): | |
201 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): | |
202 out_count += 1 | |
203 if f_strand == +1: | |
204 loc = "%i..%i" % (f_start+1, f_end) | |
205 else: | |
206 loc = "complement(%i..%i)" % (f_start+1, f_end) | |
207 descr = "length %i aa, %i bp, from %s of %s" \ | |
208 % (len(t), len(n), loc, record.description) | |
209 r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) | |
210 t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) | |
211 SeqIO.write(r, out_nuc, "fasta") | |
212 SeqIO.write(t, out_prot, "fasta") | |
213 in_count += 1 | |
214 if out_nuc is not sys.stdout: | |
215 out_nuc.close() | |
216 if out_prot is not sys.stdout: | |
217 out_prot.close() | |
218 | |
219 print "Found %i %ss in %i sequences" % (out_count, ftype, in_count) |