Mercurial > repos > peterjc > get_orfs_or_cdss
comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 7:705a2e2df7fb draft
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
author | peterjc |
---|---|
date | Thu, 30 Jul 2015 12:35:31 -0400 |
parents | 5208c15805ec |
children | 09a8be9247ca |
comparison
equal
deleted
inserted
replaced
6:64e67f172188 | 7:705a2e2df7fb |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """Find ORFs in a nucleotide sequence file. | 2 """Find ORFs in a nucleotide sequence file. |
3 | 3 |
4 get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file | 4 For more details, see the help text and argument descriptions in the |
5 | 5 accompanying get_orfs_or_cdss.xml file which defines a Galaxy interface. |
6 Takes ten command line options, input sequence filename, format, genetic | |
7 code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), | |
8 minimum length (in amino acids), strand (both, forward, reverse), output | |
9 nucleotide filename, and output protein filename. | |
10 | 6 |
11 This tool is a short Python script which requires Biopython. If you use | 7 This tool is a short Python script which requires Biopython. If you use |
12 this tool in scientific work leading to a publication, please cite the | 8 this tool in scientific work leading to a publication, please cite the |
13 Biopython application note: | 9 Biopython application note: |
14 | 10 |
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | 13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
18 | 14 |
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | 15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute |
20 (formerly SCRI), Dundee, UK. All rights reserved. | 16 (formerly SCRI), Dundee, UK. All rights reserved. |
21 | 17 |
22 See accompanying text file for licence details (MIT/BSD style). | 18 See accompanying text file for licence details (MIT licence). |
23 | 19 |
24 This is version 0.0.3 of the script. | 20 This is version 0.1.0 of the script. |
25 """ | 21 """ |
26 import sys | 22 import sys |
27 import re | 23 import re |
28 | 24 from optparse import OptionParser |
29 if "-v" in sys.argv or "--version" in sys.argv: | 25 |
30 print "v0.0.3" | 26 def sys_exit(msg, err=1): |
31 sys.exit(0) | |
32 | |
33 def stop_err(msg, err=1): | |
34 sys.stderr.write(msg.rstrip() + "\n") | 27 sys.stderr.write(msg.rstrip() + "\n") |
35 sys.exit(err) | 28 sys.exit(err) |
29 | |
30 usage = """Use as follows: | |
31 | |
32 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed | |
33 """ | |
36 | 34 |
37 try: | 35 try: |
38 from Bio.Seq import Seq, reverse_complement, translate | 36 from Bio.Seq import Seq, reverse_complement, translate |
39 from Bio.SeqRecord import SeqRecord | 37 from Bio.SeqRecord import SeqRecord |
40 from Bio import SeqIO | 38 from Bio import SeqIO |
41 from Bio.Data import CodonTable | 39 from Bio.Data import CodonTable |
42 except ImportError: | 40 except ImportError: |
43 stop_err("Missing Biopython library") | 41 sys_exit("Missing Biopython library") |
44 | 42 |
45 #Parse Command Line | 43 |
44 parser = OptionParser(usage=usage) | |
45 parser.add_option('-i', '--input', dest='input_file', | |
46 default=None, help='Input fasta file', | |
47 metavar='FILE') | |
48 parser.add_option('-f', '--format', dest='seq_format', | |
49 default='fasta', help='Sequence format (e.g. fasta, fastq, sff)') | |
50 parser.add_option('--table', dest='table', | |
51 default=1, help='NCBI Translation table', type='int') | |
52 parser.add_option('-t', '--ftype', dest='ftype', type='choice', | |
53 choices=['CDS', 'ORF'], default='ORF', | |
54 help='Find ORF or CDSs') | |
55 parser.add_option('-e', '--ends', dest='ends', type='choice', | |
56 choices=['open', 'closed'], default='closed', | |
57 help='Open or closed. Closed ensures start/stop codons are present') | |
58 parser.add_option('-m', '--mode', dest='mode', type='choice', | |
59 choices=['all', 'top', 'one'], default='all', | |
60 help='Output all ORFs/CDSs from sequence, all ORFs/CDSs ' | |
61 'with max length, or first with maximum length') | |
62 parser.add_option('--min_len', dest='min_len', | |
63 default=10, help='Minimum ORF/CDS length', type='int') | |
64 parser.add_option('-s', '--strand', dest='strand', type='choice', | |
65 choices=['forward', 'reverse', 'both'], default='both', | |
66 help='Strand to search for features on') | |
67 parser.add_option('--on', dest='out_nuc_file', | |
68 default=None, help='Output nucleotide sequences, or - for STDOUT', | |
69 metavar='FILE') | |
70 parser.add_option('--op', dest='out_prot_file', | |
71 default=None, help='Output protein sequences, or - for STDOUT', | |
72 metavar='FILE') | |
73 parser.add_option('--ob', dest='out_bed_file', | |
74 default=None, help='Output BED file, or - for STDOUT', | |
75 metavar='FILE') | |
76 parser.add_option('-v', '--version', dest='version', | |
77 default=False, action='store_true', | |
78 help='Show version and quit') | |
79 | |
80 options, args = parser.parse_args() | |
81 | |
82 if options.version: | |
83 print "v0.1.0" | |
84 sys.exit(0) | |
85 | |
46 try: | 86 try: |
47 input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] | 87 table_obj = CodonTable.ambiguous_generic_by_id[options.table] |
48 except ValueError: | |
49 stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | |
50 | |
51 try: | |
52 table = int(table) | |
53 except ValueError: | |
54 stop_err("Expected integer for genetic code table, got %s" % table) | |
55 | |
56 try: | |
57 table_obj = CodonTable.ambiguous_generic_by_id[table] | |
58 except KeyError: | 88 except KeyError: |
59 stop_err("Unknown codon table %i" % table) | 89 sys_exit("Unknown codon table %i" % options.table) |
60 | 90 |
61 if ftype not in ["CDS", "ORF"]: | 91 if options.seq_format.lower()=="sff": |
62 stop_err("Expected CDS or ORF, got %s" % ftype) | |
63 | |
64 if ends not in ["open", "closed"]: | |
65 stop_err("Expected open or closed for end treatment, got %s" % ends) | |
66 | |
67 try: | |
68 min_len = int(min_len) | |
69 except ValueError: | |
70 stop_err("Expected integer for min_len, got %s" % min_len) | |
71 | |
72 if seq_format.lower()=="sff": | |
73 seq_format = "sff-trim" | 92 seq_format = "sff-trim" |
74 elif seq_format.lower()=="fasta": | 93 elif options.seq_format.lower()=="fasta": |
75 seq_format = "fasta" | 94 seq_format = "fasta" |
76 elif seq_format.lower().startswith("fastq"): | 95 elif options.seq_format.lower().startswith("fastq"): |
77 seq_format = "fastq" | 96 seq_format = "fastq" |
78 else: | 97 else: |
79 stop_err("Unsupported file type %r" % seq_format) | 98 sys_exit("Unsupported file type %r" % options.seq_format) |
80 | 99 |
81 print "Genetic code table %i" % table | 100 print "Genetic code table %i" % options.table |
82 print "Minimum length %i aa" % min_len | 101 print "Minimum length %i aa" % options.min_len |
83 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) | 102 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) |
84 | 103 |
85 starts = sorted(table_obj.start_codons) | 104 starts = sorted(table_obj.start_codons) |
86 assert "NNN" not in starts | 105 assert "NNN" not in starts |
87 re_starts = re.compile("|".join(starts)) | 106 re_starts = re.compile("|".join(starts)) |
100 start = match.start() | 119 start = match.start() |
101 if start % 3 == 0: | 120 if start % 3 == 0: |
102 n = s[start:] | 121 n = s[start:] |
103 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | 122 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) |
104 if strict: | 123 if strict: |
105 t = translate(n, table, cds=True) | 124 t = translate(n, options.table, cds=True) |
106 else: | 125 else: |
107 #Use when missing stop codon, | 126 #Use when missing stop codon, |
108 t = "M" + translate(n[3:], table, to_stop=True) | 127 t = "M" + translate(n[3:], options.table, to_stop=True) |
109 return start, n, t | 128 return start, n, t |
110 return None, None, None | 129 return None, None, None |
111 | 130 |
112 def break_up_frame(s): | 131 def break_up_frame(s): |
113 """Returns offset, nuc, protein.""" | 132 """Returns offset, nuc, protein.""" |
115 for match in re_stops.finditer(s): | 134 for match in re_stops.finditer(s): |
116 index = match.start() + 3 | 135 index = match.start() + 3 |
117 if index % 3 != 0: | 136 if index % 3 != 0: |
118 continue | 137 continue |
119 n = s[start:index] | 138 n = s[start:index] |
120 if ftype=="CDS": | 139 if options.ftype=="CDS": |
121 offset, n, t = start_chop_and_trans(n) | 140 offset, n, t = start_chop_and_trans(n) |
122 else: | 141 else: |
123 offset = 0 | 142 offset = 0 |
124 t = translate(n, table, to_stop=True) | 143 t = translate(n, options.table, to_stop=True) |
125 if n and len(t) >= min_len: | 144 if n and len(t) >= options.min_len: |
126 yield start + offset, n, t | 145 yield start + offset, n, t |
127 start = index | 146 start = index |
128 if ends == "open": | 147 if options.ends == "open": |
129 #No stop codon, Biopython's strict CDS translate will fail | 148 #No stop codon, Biopython's strict CDS translate will fail |
130 n = s[start:] | 149 n = s[start:] |
131 #Ensure we have whole codons | 150 #Ensure we have whole codons |
132 #TODO - Try appending N instead? | 151 #TODO - Try appending N instead? |
133 #TODO - Do the next four lines more elegantly | 152 #TODO - Do the next four lines more elegantly |
134 if len(n) % 3: | 153 if len(n) % 3: |
135 n = n[:-1] | 154 n = n[:-1] |
136 if len(n) % 3: | 155 if len(n) % 3: |
137 n = n[:-1] | 156 n = n[:-1] |
138 if ftype=="CDS": | 157 if options.ftype=="CDS": |
139 offset, n, t = start_chop_and_trans(n, strict=False) | 158 offset, n, t = start_chop_and_trans(n, strict=False) |
140 else: | 159 else: |
141 offset = 0 | 160 offset = 0 |
142 t = translate(n, table, to_stop=True) | 161 t = translate(n, options.table, to_stop=True) |
143 if n and len(t) >= min_len: | 162 if n and len(t) >= options.min_len: |
144 yield start + offset, n, t | 163 yield start + offset, n, t |
145 | 164 |
146 | 165 |
147 def get_all_peptides(nuc_seq): | 166 def get_all_peptides(nuc_seq): |
148 """Returns start, end, strand, nucleotides, protein. | 167 """Returns start, end, strand, nucleotides, protein. |
149 | 168 |
150 Co-ordinates are Python style zero-based. | 169 Co-ordinates are Python style zero-based. |
151 """ | 170 """ |
152 #TODO - Refactor to use a generator function (in start order) | 171 #TODO - Refactor to use a generator function (in start order) |
153 #rather than making a list and sorting? | 172 #rather than making a list and sorting? |
154 answer = [] | 173 answer = [] |
155 full_len = len(nuc_seq) | 174 full_len = len(nuc_seq) |
156 if strand != "reverse": | 175 if options.strand != "reverse": |
157 for frame in range(0,3): | 176 for frame in range(0,3): |
158 for offset, n, t in break_up_frame(nuc_seq[frame:]): | 177 for offset, n, t in break_up_frame(nuc_seq[frame:]): |
159 start = frame + offset #zero based | 178 start = frame + offset #zero based |
160 answer.append((start, start + len(n), +1, n, t)) | 179 answer.append((start, start + len(n), +1, n, t)) |
161 if strand != "forward": | 180 if options.strand != "forward": |
162 rc = reverse_complement(nuc_seq) | 181 rc = reverse_complement(nuc_seq) |
163 for frame in range(0,3) : | 182 for frame in range(0,3) : |
164 for offset, n, t in break_up_frame(rc[frame:]): | 183 for offset, n, t in break_up_frame(rc[frame:]): |
165 start = full_len - frame - offset #zero based | 184 start = full_len - frame - offset #zero based |
166 answer.append((start - len(n), start, -1, n ,t)) | 185 answer.append((start - len(n), start, -1, n ,t)) |
182 values = list(get_top_peptides(nuc_seq)) | 201 values = list(get_top_peptides(nuc_seq)) |
183 if not values: | 202 if not values: |
184 raise StopIteration | 203 raise StopIteration |
185 yield values[0] | 204 yield values[0] |
186 | 205 |
187 if mode == "all": | 206 if options.mode == "all": |
188 get_peptides = get_all_peptides | 207 get_peptides = get_all_peptides |
189 elif mode == "top": | 208 elif options.mode == "top": |
190 get_peptides = get_top_peptides | 209 get_peptides = get_top_peptides |
191 elif mode == "one": | 210 elif options.mode == "one": |
192 get_peptides = get_one_peptide | 211 get_peptides = get_one_peptide |
193 | 212 |
194 in_count = 0 | 213 in_count = 0 |
195 out_count = 0 | 214 out_count = 0 |
196 if out_nuc_file == "-": | 215 if options.out_nuc_file == "-": |
197 out_nuc = sys.stdout | 216 out_nuc = sys.stdout |
198 else: | 217 else: |
199 out_nuc = open(out_nuc_file, "w") | 218 out_nuc = open(options.out_nuc_file, "w") |
200 if out_prot_file == "-": | 219 |
220 if options.out_prot_file == "-": | |
201 out_prot = sys.stdout | 221 out_prot = sys.stdout |
202 else: | 222 else: |
203 out_prot = open(out_prot_file, "w") | 223 out_prot = open(options.out_prot_file, "w") |
204 for record in SeqIO.parse(input_file, seq_format): | 224 |
225 if options.out_bed_file == "-": | |
226 out_bed = sys.stdout | |
227 else: | |
228 out_bed = open(options.out_bed_file, "w") | |
229 | |
230 for record in SeqIO.parse(options.input_file, seq_format): | |
205 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): | 231 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): |
206 out_count += 1 | 232 out_count += 1 |
207 if f_strand == +1: | 233 if f_strand == +1: |
208 loc = "%i..%i" % (f_start+1, f_end) | 234 loc = "%i..%i" % (f_start+1, f_end) |
209 else: | 235 else: |
210 loc = "complement(%i..%i)" % (f_start+1, f_end) | 236 loc = "complement(%i..%i)" % (f_start+1, f_end) |
211 descr = "length %i aa, %i bp, from %s of %s" \ | 237 descr = "length %i aa, %i bp, from %s of %s" \ |
212 % (len(t), len(n), loc, record.description) | 238 % (len(t), len(n), loc, record.description) |
213 r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) | 239 fid = record.id + "|%s%i" % (options.ftype, i+1) |
214 t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) | 240 r = SeqRecord(Seq(n), id = fid, name = "", description= descr) |
241 t = SeqRecord(Seq(t), id = fid, name = "", description= descr) | |
215 SeqIO.write(r, out_nuc, "fasta") | 242 SeqIO.write(r, out_nuc, "fasta") |
216 SeqIO.write(t, out_prot, "fasta") | 243 SeqIO.write(t, out_prot, "fasta") |
244 out_bed.write('\t'.join(map(str,[record.id, f_start, f_end, fid, 0, '+' if f_strand == +1 else '-'])) + '\n') | |
217 in_count += 1 | 245 in_count += 1 |
218 if out_nuc is not sys.stdout: | 246 if out_nuc is not sys.stdout: |
219 out_nuc.close() | 247 out_nuc.close() |
220 if out_prot is not sys.stdout: | 248 if out_prot is not sys.stdout: |
221 out_prot.close() | 249 out_prot.close() |
222 | 250 if out_bed is not sys.stdout: |
223 print "Found %i %ss in %i sequences" % (out_count, ftype, in_count) | 251 out_bed.close() |
252 | |
253 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count) |