Mercurial > repos > peterjc > get_orfs_or_cdss
comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 5:5208c15805ec draft
Uploaded v0.0.5 dependant on Biopython 1.62
author | peterjc |
---|---|
date | Mon, 28 Oct 2013 05:19:38 -0400 |
parents | |
children | 705a2e2df7fb |
comparison
equal
deleted
inserted
replaced
4:d51819d2d7e2 | 5:5208c15805ec |
---|---|
1 #!/usr/bin/env python | |
2 """Find ORFs in a nucleotide sequence file. | |
3 | |
4 get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file | |
5 | |
6 Takes ten command line options, input sequence filename, format, genetic | |
7 code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), | |
8 minimum length (in amino acids), strand (both, forward, reverse), output | |
9 nucleotide filename, and output protein filename. | |
10 | |
11 This tool is a short Python script which requires Biopython. If you use | |
12 this tool in scientific work leading to a publication, please cite the | |
13 Biopython application note: | |
14 | |
15 Cock et al 2009. Biopython: freely available Python tools for computational | |
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
18 | |
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | |
20 (formerly SCRI), Dundee, UK. All rights reserved. | |
21 | |
22 See accompanying text file for licence details (MIT/BSD style). | |
23 | |
24 This is version 0.0.3 of the script. | |
25 """ | |
26 import sys | |
27 import re | |
28 | |
29 if "-v" in sys.argv or "--version" in sys.argv: | |
30 print "v0.0.3" | |
31 sys.exit(0) | |
32 | |
33 def stop_err(msg, err=1): | |
34 sys.stderr.write(msg.rstrip() + "\n") | |
35 sys.exit(err) | |
36 | |
37 try: | |
38 from Bio.Seq import Seq, reverse_complement, translate | |
39 from Bio.SeqRecord import SeqRecord | |
40 from Bio import SeqIO | |
41 from Bio.Data import CodonTable | |
42 except ImportError: | |
43 stop_err("Missing Biopython library") | |
44 | |
45 #Parse Command Line | |
46 try: | |
47 input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] | |
48 except ValueError: | |
49 stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | |
50 | |
51 try: | |
52 table = int(table) | |
53 except ValueError: | |
54 stop_err("Expected integer for genetic code table, got %s" % table) | |
55 | |
56 try: | |
57 table_obj = CodonTable.ambiguous_generic_by_id[table] | |
58 except KeyError: | |
59 stop_err("Unknown codon table %i" % table) | |
60 | |
61 if ftype not in ["CDS", "ORF"]: | |
62 stop_err("Expected CDS or ORF, got %s" % ftype) | |
63 | |
64 if ends not in ["open", "closed"]: | |
65 stop_err("Expected open or closed for end treatment, got %s" % ends) | |
66 | |
67 try: | |
68 min_len = int(min_len) | |
69 except ValueError: | |
70 stop_err("Expected integer for min_len, got %s" % min_len) | |
71 | |
72 if seq_format.lower()=="sff": | |
73 seq_format = "sff-trim" | |
74 elif seq_format.lower()=="fasta": | |
75 seq_format = "fasta" | |
76 elif seq_format.lower().startswith("fastq"): | |
77 seq_format = "fastq" | |
78 else: | |
79 stop_err("Unsupported file type %r" % seq_format) | |
80 | |
81 print "Genetic code table %i" % table | |
82 print "Minimum length %i aa" % min_len | |
83 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) | |
84 | |
85 starts = sorted(table_obj.start_codons) | |
86 assert "NNN" not in starts | |
87 re_starts = re.compile("|".join(starts)) | |
88 | |
89 stops = sorted(table_obj.stop_codons) | |
90 assert "NNN" not in stops | |
91 re_stops = re.compile("|".join(stops)) | |
92 | |
93 def start_chop_and_trans(s, strict=True): | |
94 """Returns offset, trimmed nuc, protein.""" | |
95 if strict: | |
96 assert s[-3:] in stops, s | |
97 assert len(s) % 3 == 0 | |
98 for match in re_starts.finditer(s): | |
99 #Must check the start is in frame | |
100 start = match.start() | |
101 if start % 3 == 0: | |
102 n = s[start:] | |
103 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | |
104 if strict: | |
105 t = translate(n, table, cds=True) | |
106 else: | |
107 #Use when missing stop codon, | |
108 t = "M" + translate(n[3:], table, to_stop=True) | |
109 return start, n, t | |
110 return None, None, None | |
111 | |
112 def break_up_frame(s): | |
113 """Returns offset, nuc, protein.""" | |
114 start = 0 | |
115 for match in re_stops.finditer(s): | |
116 index = match.start() + 3 | |
117 if index % 3 != 0: | |
118 continue | |
119 n = s[start:index] | |
120 if ftype=="CDS": | |
121 offset, n, t = start_chop_and_trans(n) | |
122 else: | |
123 offset = 0 | |
124 t = translate(n, table, to_stop=True) | |
125 if n and len(t) >= min_len: | |
126 yield start + offset, n, t | |
127 start = index | |
128 if ends == "open": | |
129 #No stop codon, Biopython's strict CDS translate will fail | |
130 n = s[start:] | |
131 #Ensure we have whole codons | |
132 #TODO - Try appending N instead? | |
133 #TODO - Do the next four lines more elegantly | |
134 if len(n) % 3: | |
135 n = n[:-1] | |
136 if len(n) % 3: | |
137 n = n[:-1] | |
138 if ftype=="CDS": | |
139 offset, n, t = start_chop_and_trans(n, strict=False) | |
140 else: | |
141 offset = 0 | |
142 t = translate(n, table, to_stop=True) | |
143 if n and len(t) >= min_len: | |
144 yield start + offset, n, t | |
145 | |
146 | |
147 def get_all_peptides(nuc_seq): | |
148 """Returns start, end, strand, nucleotides, protein. | |
149 | |
150 Co-ordinates are Python style zero-based. | |
151 """ | |
152 #TODO - Refactor to use a generator function (in start order) | |
153 #rather than making a list and sorting? | |
154 answer = [] | |
155 full_len = len(nuc_seq) | |
156 if strand != "reverse": | |
157 for frame in range(0,3): | |
158 for offset, n, t in break_up_frame(nuc_seq[frame:]): | |
159 start = frame + offset #zero based | |
160 answer.append((start, start + len(n), +1, n, t)) | |
161 if strand != "forward": | |
162 rc = reverse_complement(nuc_seq) | |
163 for frame in range(0,3) : | |
164 for offset, n, t in break_up_frame(rc[frame:]): | |
165 start = full_len - frame - offset #zero based | |
166 answer.append((start - len(n), start, -1, n ,t)) | |
167 answer.sort() | |
168 return answer | |
169 | |
170 def get_top_peptides(nuc_seq): | |
171 """Returns all peptides of max length.""" | |
172 values = list(get_all_peptides(nuc_seq)) | |
173 if not values: | |
174 raise StopIteration | |
175 max_len = max(len(x[-1]) for x in values) | |
176 for x in values: | |
177 if len(x[-1]) == max_len: | |
178 yield x | |
179 | |
180 def get_one_peptide(nuc_seq): | |
181 """Returns first (left most) peptide with max length.""" | |
182 values = list(get_top_peptides(nuc_seq)) | |
183 if not values: | |
184 raise StopIteration | |
185 yield values[0] | |
186 | |
187 if mode == "all": | |
188 get_peptides = get_all_peptides | |
189 elif mode == "top": | |
190 get_peptides = get_top_peptides | |
191 elif mode == "one": | |
192 get_peptides = get_one_peptide | |
193 | |
194 in_count = 0 | |
195 out_count = 0 | |
196 if out_nuc_file == "-": | |
197 out_nuc = sys.stdout | |
198 else: | |
199 out_nuc = open(out_nuc_file, "w") | |
200 if out_prot_file == "-": | |
201 out_prot = sys.stdout | |
202 else: | |
203 out_prot = open(out_prot_file, "w") | |
204 for record in SeqIO.parse(input_file, seq_format): | |
205 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): | |
206 out_count += 1 | |
207 if f_strand == +1: | |
208 loc = "%i..%i" % (f_start+1, f_end) | |
209 else: | |
210 loc = "complement(%i..%i)" % (f_start+1, f_end) | |
211 descr = "length %i aa, %i bp, from %s of %s" \ | |
212 % (len(t), len(n), loc, record.description) | |
213 r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) | |
214 t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) | |
215 SeqIO.write(r, out_nuc, "fasta") | |
216 SeqIO.write(t, out_prot, "fasta") | |
217 in_count += 1 | |
218 if out_nuc is not sys.stdout: | |
219 out_nuc.close() | |
220 if out_prot is not sys.stdout: | |
221 out_prot.close() | |
222 | |
223 print "Found %i %ss in %i sequences" % (out_count, ftype, in_count) |