comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 5:5208c15805ec draft

Uploaded v0.0.5 dependant on Biopython 1.62
author peterjc
date Mon, 28 Oct 2013 05:19:38 -0400
parents
children 705a2e2df7fb
comparison
equal deleted inserted replaced
4:d51819d2d7e2 5:5208c15805ec
1 #!/usr/bin/env python
2 """Find ORFs in a nucleotide sequence file.
3
4 get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file
5
6 Takes ten command line options, input sequence filename, format, genetic
7 code, CDS vs ORF, end type (open, closed), selection mode (all, top, one),
8 minimum length (in amino acids), strand (both, forward, reverse), output
9 nucleotide filename, and output protein filename.
10
11 This tool is a short Python script which requires Biopython. If you use
12 this tool in scientific work leading to a publication, please cite the
13 Biopython application note:
14
15 Cock et al 2009. Biopython: freely available Python tools for computational
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
18
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
20 (formerly SCRI), Dundee, UK. All rights reserved.
21
22 See accompanying text file for licence details (MIT/BSD style).
23
24 This is version 0.0.3 of the script.
25 """
26 import sys
27 import re
28
29 if "-v" in sys.argv or "--version" in sys.argv:
30 print "v0.0.3"
31 sys.exit(0)
32
33 def stop_err(msg, err=1):
34 sys.stderr.write(msg.rstrip() + "\n")
35 sys.exit(err)
36
37 try:
38 from Bio.Seq import Seq, reverse_complement, translate
39 from Bio.SeqRecord import SeqRecord
40 from Bio import SeqIO
41 from Bio.Data import CodonTable
42 except ImportError:
43 stop_err("Missing Biopython library")
44
45 #Parse Command Line
46 try:
47 input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:]
48 except ValueError:
49 stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
50
51 try:
52 table = int(table)
53 except ValueError:
54 stop_err("Expected integer for genetic code table, got %s" % table)
55
56 try:
57 table_obj = CodonTable.ambiguous_generic_by_id[table]
58 except KeyError:
59 stop_err("Unknown codon table %i" % table)
60
61 if ftype not in ["CDS", "ORF"]:
62 stop_err("Expected CDS or ORF, got %s" % ftype)
63
64 if ends not in ["open", "closed"]:
65 stop_err("Expected open or closed for end treatment, got %s" % ends)
66
67 try:
68 min_len = int(min_len)
69 except ValueError:
70 stop_err("Expected integer for min_len, got %s" % min_len)
71
72 if seq_format.lower()=="sff":
73 seq_format = "sff-trim"
74 elif seq_format.lower()=="fasta":
75 seq_format = "fasta"
76 elif seq_format.lower().startswith("fastq"):
77 seq_format = "fastq"
78 else:
79 stop_err("Unsupported file type %r" % seq_format)
80
81 print "Genetic code table %i" % table
82 print "Minimum length %i aa" % min_len
83 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
84
85 starts = sorted(table_obj.start_codons)
86 assert "NNN" not in starts
87 re_starts = re.compile("|".join(starts))
88
89 stops = sorted(table_obj.stop_codons)
90 assert "NNN" not in stops
91 re_stops = re.compile("|".join(stops))
92
93 def start_chop_and_trans(s, strict=True):
94 """Returns offset, trimmed nuc, protein."""
95 if strict:
96 assert s[-3:] in stops, s
97 assert len(s) % 3 == 0
98 for match in re_starts.finditer(s):
99 #Must check the start is in frame
100 start = match.start()
101 if start % 3 == 0:
102 n = s[start:]
103 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
104 if strict:
105 t = translate(n, table, cds=True)
106 else:
107 #Use when missing stop codon,
108 t = "M" + translate(n[3:], table, to_stop=True)
109 return start, n, t
110 return None, None, None
111
112 def break_up_frame(s):
113 """Returns offset, nuc, protein."""
114 start = 0
115 for match in re_stops.finditer(s):
116 index = match.start() + 3
117 if index % 3 != 0:
118 continue
119 n = s[start:index]
120 if ftype=="CDS":
121 offset, n, t = start_chop_and_trans(n)
122 else:
123 offset = 0
124 t = translate(n, table, to_stop=True)
125 if n and len(t) >= min_len:
126 yield start + offset, n, t
127 start = index
128 if ends == "open":
129 #No stop codon, Biopython's strict CDS translate will fail
130 n = s[start:]
131 #Ensure we have whole codons
132 #TODO - Try appending N instead?
133 #TODO - Do the next four lines more elegantly
134 if len(n) % 3:
135 n = n[:-1]
136 if len(n) % 3:
137 n = n[:-1]
138 if ftype=="CDS":
139 offset, n, t = start_chop_and_trans(n, strict=False)
140 else:
141 offset = 0
142 t = translate(n, table, to_stop=True)
143 if n and len(t) >= min_len:
144 yield start + offset, n, t
145
146
147 def get_all_peptides(nuc_seq):
148 """Returns start, end, strand, nucleotides, protein.
149
150 Co-ordinates are Python style zero-based.
151 """
152 #TODO - Refactor to use a generator function (in start order)
153 #rather than making a list and sorting?
154 answer = []
155 full_len = len(nuc_seq)
156 if strand != "reverse":
157 for frame in range(0,3):
158 for offset, n, t in break_up_frame(nuc_seq[frame:]):
159 start = frame + offset #zero based
160 answer.append((start, start + len(n), +1, n, t))
161 if strand != "forward":
162 rc = reverse_complement(nuc_seq)
163 for frame in range(0,3) :
164 for offset, n, t in break_up_frame(rc[frame:]):
165 start = full_len - frame - offset #zero based
166 answer.append((start - len(n), start, -1, n ,t))
167 answer.sort()
168 return answer
169
170 def get_top_peptides(nuc_seq):
171 """Returns all peptides of max length."""
172 values = list(get_all_peptides(nuc_seq))
173 if not values:
174 raise StopIteration
175 max_len = max(len(x[-1]) for x in values)
176 for x in values:
177 if len(x[-1]) == max_len:
178 yield x
179
180 def get_one_peptide(nuc_seq):
181 """Returns first (left most) peptide with max length."""
182 values = list(get_top_peptides(nuc_seq))
183 if not values:
184 raise StopIteration
185 yield values[0]
186
187 if mode == "all":
188 get_peptides = get_all_peptides
189 elif mode == "top":
190 get_peptides = get_top_peptides
191 elif mode == "one":
192 get_peptides = get_one_peptide
193
194 in_count = 0
195 out_count = 0
196 if out_nuc_file == "-":
197 out_nuc = sys.stdout
198 else:
199 out_nuc = open(out_nuc_file, "w")
200 if out_prot_file == "-":
201 out_prot = sys.stdout
202 else:
203 out_prot = open(out_prot_file, "w")
204 for record in SeqIO.parse(input_file, seq_format):
205 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
206 out_count += 1
207 if f_strand == +1:
208 loc = "%i..%i" % (f_start+1, f_end)
209 else:
210 loc = "complement(%i..%i)" % (f_start+1, f_end)
211 descr = "length %i aa, %i bp, from %s of %s" \
212 % (len(t), len(n), loc, record.description)
213 r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
214 t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
215 SeqIO.write(r, out_nuc, "fasta")
216 SeqIO.write(t, out_prot, "fasta")
217 in_count += 1
218 if out_nuc is not sys.stdout:
219 out_nuc.close()
220 if out_prot is not sys.stdout:
221 out_prot.close()
222
223 print "Found %i %ss in %i sequences" % (out_count, ftype, in_count)