Mercurial > repos > peterjc > get_orfs_or_cdss
comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 8:09a8be9247ca draft
v0.2.0 with GFF3 output
author | peterjc |
---|---|
date | Sat, 09 Jan 2016 23:42:32 -0500 |
parents | 705a2e2df7fb |
children | a06ad07431ba |
comparison
equal
deleted
inserted
replaced
7:705a2e2df7fb | 8:09a8be9247ca |
---|---|
14 | 14 |
15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | 15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute |
16 (formerly SCRI), Dundee, UK. All rights reserved. | 16 (formerly SCRI), Dundee, UK. All rights reserved. |
17 | 17 |
18 See accompanying text file for licence details (MIT licence). | 18 See accompanying text file for licence details (MIT licence). |
19 | |
20 This is version 0.1.0 of the script. | |
21 """ | 19 """ |
22 import sys | 20 import sys |
23 import re | 21 import re |
24 from optparse import OptionParser | 22 from optparse import OptionParser |
25 | 23 |
26 def sys_exit(msg, err=1): | |
27 sys.stderr.write(msg.rstrip() + "\n") | |
28 sys.exit(err) | |
29 | |
30 usage = """Use as follows: | 24 usage = """Use as follows: |
31 | 25 |
32 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed | 26 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed --og cds.gff3 |
33 """ | 27 """ |
34 | 28 |
35 try: | 29 try: |
36 from Bio.Seq import Seq, reverse_complement, translate | 30 from Bio.Seq import Seq, reverse_complement, translate |
37 from Bio.SeqRecord import SeqRecord | 31 from Bio.SeqRecord import SeqRecord |
38 from Bio import SeqIO | 32 from Bio import SeqIO |
39 from Bio.Data import CodonTable | 33 from Bio.Data import CodonTable |
40 except ImportError: | 34 except ImportError: |
41 sys_exit("Missing Biopython library") | 35 sys.exit("Missing Biopython library") |
42 | 36 |
43 | 37 |
44 parser = OptionParser(usage=usage) | 38 parser = OptionParser(usage=usage) |
45 parser.add_option('-i', '--input', dest='input_file', | 39 parser.add_option('-i', '--input', dest='input_file', |
46 default=None, help='Input fasta file', | 40 default=None, help='Input fasta file', |
71 default=None, help='Output protein sequences, or - for STDOUT', | 65 default=None, help='Output protein sequences, or - for STDOUT', |
72 metavar='FILE') | 66 metavar='FILE') |
73 parser.add_option('--ob', dest='out_bed_file', | 67 parser.add_option('--ob', dest='out_bed_file', |
74 default=None, help='Output BED file, or - for STDOUT', | 68 default=None, help='Output BED file, or - for STDOUT', |
75 metavar='FILE') | 69 metavar='FILE') |
70 parser.add_option('--og', dest='out_gff3_file', | |
71 default=None, help='Output GFF3 file, or - for STDOUT', | |
72 metavar='FILE') | |
76 parser.add_option('-v', '--version', dest='version', | 73 parser.add_option('-v', '--version', dest='version', |
77 default=False, action='store_true', | 74 default=False, action='store_true', |
78 help='Show version and quit') | 75 help='Show version and quit') |
79 | 76 |
80 options, args = parser.parse_args() | 77 options, args = parser.parse_args() |
81 | 78 |
82 if options.version: | 79 if options.version: |
83 print "v0.1.0" | 80 print("v0.2.0") |
84 sys.exit(0) | 81 sys.exit(0) |
82 | |
83 if not options.input_file: | |
84 sys.exit("Input file is required") | |
85 | |
86 if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)): | |
87 sys.exit("At least one output file is required") | |
85 | 88 |
86 try: | 89 try: |
87 table_obj = CodonTable.ambiguous_generic_by_id[options.table] | 90 table_obj = CodonTable.ambiguous_generic_by_id[options.table] |
88 except KeyError: | 91 except KeyError: |
89 sys_exit("Unknown codon table %i" % options.table) | 92 sys.exit("Unknown codon table %i" % options.table) |
90 | 93 |
91 if options.seq_format.lower()=="sff": | 94 if options.seq_format.lower() == "sff": |
92 seq_format = "sff-trim" | 95 seq_format = "sff-trim" |
93 elif options.seq_format.lower()=="fasta": | 96 elif options.seq_format.lower() == "fasta": |
94 seq_format = "fasta" | 97 seq_format = "fasta" |
95 elif options.seq_format.lower().startswith("fastq"): | 98 elif options.seq_format.lower().startswith("fastq"): |
96 seq_format = "fastq" | 99 seq_format = "fastq" |
97 else: | 100 else: |
98 sys_exit("Unsupported file type %r" % options.seq_format) | 101 sys.exit("Unsupported file type %r" % options.seq_format) |
99 | 102 |
100 print "Genetic code table %i" % options.table | 103 print "Genetic code table %i" % options.table |
101 print "Minimum length %i aa" % options.min_len | 104 print "Minimum length %i aa" % options.min_len |
102 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) | 105 # print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) |
103 | 106 |
104 starts = sorted(table_obj.start_codons) | 107 starts = sorted(table_obj.start_codons) |
105 assert "NNN" not in starts | 108 assert "NNN" not in starts |
106 re_starts = re.compile("|".join(starts)) | 109 re_starts = re.compile("|".join(starts)) |
107 | 110 |
108 stops = sorted(table_obj.stop_codons) | 111 stops = sorted(table_obj.stop_codons) |
109 assert "NNN" not in stops | 112 assert "NNN" not in stops |
110 re_stops = re.compile("|".join(stops)) | 113 re_stops = re.compile("|".join(stops)) |
114 | |
111 | 115 |
112 def start_chop_and_trans(s, strict=True): | 116 def start_chop_and_trans(s, strict=True): |
113 """Returns offset, trimmed nuc, protein.""" | 117 """Returns offset, trimmed nuc, protein.""" |
114 if strict: | 118 if strict: |
115 assert s[-3:] in stops, s | 119 assert s[-3:] in stops, s |
116 assert len(s) % 3 == 0 | 120 assert len(s) % 3 == 0 |
117 for match in re_starts.finditer(s): | 121 for match in re_starts.finditer(s): |
118 #Must check the start is in frame | 122 # Must check the start is in frame |
119 start = match.start() | 123 start = match.start() |
120 if start % 3 == 0: | 124 if start % 3 == 0: |
121 n = s[start:] | 125 n = s[start:] |
122 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | 126 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) |
123 if strict: | 127 if strict: |
124 t = translate(n, options.table, cds=True) | 128 t = translate(n, options.table, cds=True) |
125 else: | 129 else: |
126 #Use when missing stop codon, | 130 # Use when missing stop codon, |
127 t = "M" + translate(n[3:], options.table, to_stop=True) | 131 t = "M" + translate(n[3:], options.table, to_stop=True) |
128 return start, n, t | 132 return start, n, t |
129 return None, None, None | 133 return None, None, None |
134 | |
130 | 135 |
131 def break_up_frame(s): | 136 def break_up_frame(s): |
132 """Returns offset, nuc, protein.""" | 137 """Returns offset, nuc, protein.""" |
133 start = 0 | 138 start = 0 |
134 for match in re_stops.finditer(s): | 139 for match in re_stops.finditer(s): |
135 index = match.start() + 3 | 140 index = match.start() + 3 |
136 if index % 3 != 0: | 141 if index % 3 != 0: |
137 continue | 142 continue |
138 n = s[start:index] | 143 n = s[start:index] |
139 if options.ftype=="CDS": | 144 if options.ftype == "CDS": |
140 offset, n, t = start_chop_and_trans(n) | 145 offset, n, t = start_chop_and_trans(n) |
141 else: | 146 else: |
142 offset = 0 | 147 offset = 0 |
143 t = translate(n, options.table, to_stop=True) | 148 t = translate(n, options.table, to_stop=True) |
144 if n and len(t) >= options.min_len: | 149 if n and len(t) >= options.min_len: |
145 yield start + offset, n, t | 150 yield start + offset, n, t |
146 start = index | 151 start = index |
147 if options.ends == "open": | 152 if options.ends == "open": |
148 #No stop codon, Biopython's strict CDS translate will fail | 153 # No stop codon, Biopython's strict CDS translate will fail |
149 n = s[start:] | 154 n = s[start:] |
150 #Ensure we have whole codons | 155 # Ensure we have whole codons |
151 #TODO - Try appending N instead? | 156 # TODO - Try appending N instead? |
152 #TODO - Do the next four lines more elegantly | 157 # TODO - Do the next four lines more elegantly |
153 if len(n) % 3: | 158 if len(n) % 3: |
154 n = n[:-1] | 159 n = n[:-1] |
155 if len(n) % 3: | 160 if len(n) % 3: |
156 n = n[:-1] | 161 n = n[:-1] |
157 if options.ftype=="CDS": | 162 if options.ftype == "CDS": |
158 offset, n, t = start_chop_and_trans(n, strict=False) | 163 offset, n, t = start_chop_and_trans(n, strict=False) |
159 else: | 164 else: |
160 offset = 0 | 165 offset = 0 |
161 t = translate(n, options.table, to_stop=True) | 166 t = translate(n, options.table, to_stop=True) |
162 if n and len(t) >= options.min_len: | 167 if n and len(t) >= options.min_len: |
166 def get_all_peptides(nuc_seq): | 171 def get_all_peptides(nuc_seq): |
167 """Returns start, end, strand, nucleotides, protein. | 172 """Returns start, end, strand, nucleotides, protein. |
168 | 173 |
169 Co-ordinates are Python style zero-based. | 174 Co-ordinates are Python style zero-based. |
170 """ | 175 """ |
171 #TODO - Refactor to use a generator function (in start order) | 176 # TODO - Refactor to use a generator function (in start order) |
172 #rather than making a list and sorting? | 177 # rather than making a list and sorting? |
173 answer = [] | 178 answer = [] |
174 full_len = len(nuc_seq) | 179 full_len = len(nuc_seq) |
175 if options.strand != "reverse": | 180 if options.strand != "reverse": |
176 for frame in range(0,3): | 181 for frame in range(0, 3): |
177 for offset, n, t in break_up_frame(nuc_seq[frame:]): | 182 for offset, n, t in break_up_frame(nuc_seq[frame:]): |
178 start = frame + offset #zero based | 183 start = frame + offset # zero based |
179 answer.append((start, start + len(n), +1, n, t)) | 184 answer.append((start, start + len(n), +1, n, t)) |
180 if options.strand != "forward": | 185 if options.strand != "forward": |
181 rc = reverse_complement(nuc_seq) | 186 rc = reverse_complement(nuc_seq) |
182 for frame in range(0,3) : | 187 for frame in range(0, 3): |
183 for offset, n, t in break_up_frame(rc[frame:]): | 188 for offset, n, t in break_up_frame(rc[frame:]): |
184 start = full_len - frame - offset #zero based | 189 start = full_len - frame - offset # zero based |
185 answer.append((start - len(n), start, -1, n ,t)) | 190 answer.append((start - len(n), start, -1, n, t)) |
186 answer.sort() | 191 answer.sort() |
187 return answer | 192 return answer |
193 | |
188 | 194 |
189 def get_top_peptides(nuc_seq): | 195 def get_top_peptides(nuc_seq): |
190 """Returns all peptides of max length.""" | 196 """Returns all peptides of max length.""" |
191 values = list(get_all_peptides(nuc_seq)) | 197 values = list(get_all_peptides(nuc_seq)) |
192 if not values: | 198 if not values: |
194 max_len = max(len(x[-1]) for x in values) | 200 max_len = max(len(x[-1]) for x in values) |
195 for x in values: | 201 for x in values: |
196 if len(x[-1]) == max_len: | 202 if len(x[-1]) == max_len: |
197 yield x | 203 yield x |
198 | 204 |
205 | |
199 def get_one_peptide(nuc_seq): | 206 def get_one_peptide(nuc_seq): |
200 """Returns first (left most) peptide with max length.""" | 207 """Returns first (left most) peptide with max length.""" |
201 values = list(get_top_peptides(nuc_seq)) | 208 values = list(get_top_peptides(nuc_seq)) |
202 if not values: | 209 if not values: |
203 raise StopIteration | 210 raise StopIteration |
212 | 219 |
213 in_count = 0 | 220 in_count = 0 |
214 out_count = 0 | 221 out_count = 0 |
215 if options.out_nuc_file == "-": | 222 if options.out_nuc_file == "-": |
216 out_nuc = sys.stdout | 223 out_nuc = sys.stdout |
217 else: | 224 elif options.out_nuc_file: |
218 out_nuc = open(options.out_nuc_file, "w") | 225 out_nuc = open(options.out_nuc_file, "w") |
226 else: | |
227 out_nuc = None | |
219 | 228 |
220 if options.out_prot_file == "-": | 229 if options.out_prot_file == "-": |
221 out_prot = sys.stdout | 230 out_prot = sys.stdout |
222 else: | 231 elif options.out_prot_file: |
223 out_prot = open(options.out_prot_file, "w") | 232 out_prot = open(options.out_prot_file, "w") |
233 else: | |
234 out_prot = None | |
224 | 235 |
225 if options.out_bed_file == "-": | 236 if options.out_bed_file == "-": |
226 out_bed = sys.stdout | 237 out_bed = sys.stdout |
227 else: | 238 elif options.out_bed_file: |
228 out_bed = open(options.out_bed_file, "w") | 239 out_bed = open(options.out_bed_file, "w") |
240 else: | |
241 out_bed = None | |
242 | |
243 if options.out_gff3_file == "-": | |
244 out_gff3 = sys.stdout | |
245 elif options.out_gff3_file: | |
246 out_gff3 = open(options.out_gff3_file, "w") | |
247 else: | |
248 out_gff3 = None | |
249 | |
250 if out_gff3: | |
251 out_gff3.write('##gff-version 3\n') | |
229 | 252 |
230 for record in SeqIO.parse(options.input_file, seq_format): | 253 for record in SeqIO.parse(options.input_file, seq_format): |
231 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): | 254 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): |
232 out_count += 1 | 255 out_count += 1 |
233 if f_strand == +1: | 256 if f_strand == +1: |
234 loc = "%i..%i" % (f_start+1, f_end) | 257 loc = "%i..%i" % (f_start + 1, f_end) |
235 else: | 258 else: |
236 loc = "complement(%i..%i)" % (f_start+1, f_end) | 259 loc = "complement(%i..%i)" % (f_start + 1, f_end) |
237 descr = "length %i aa, %i bp, from %s of %s" \ | 260 descr = "length %i aa, %i bp, from %s of %s" \ |
238 % (len(t), len(n), loc, record.description) | 261 % (len(t), len(n), loc, record.description) |
239 fid = record.id + "|%s%i" % (options.ftype, i+1) | 262 fid = record.id + "|%s%i" % (options.ftype, i + 1) |
240 r = SeqRecord(Seq(n), id = fid, name = "", description= descr) | 263 r = SeqRecord(Seq(n), id=fid, name="", description=descr) |
241 t = SeqRecord(Seq(t), id = fid, name = "", description= descr) | 264 t = SeqRecord(Seq(t), id=fid, name="", description=descr) |
242 SeqIO.write(r, out_nuc, "fasta") | 265 if out_nuc: |
243 SeqIO.write(t, out_prot, "fasta") | 266 SeqIO.write(r, out_nuc, "fasta") |
244 out_bed.write('\t'.join(map(str,[record.id, f_start, f_end, fid, 0, '+' if f_strand == +1 else '-'])) + '\n') | 267 if out_prot: |
268 SeqIO.write(t, out_prot, "fasta") | |
269 nice_strand = '+' if f_strand == +1 else '-' | |
270 if out_bed: | |
271 out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n') | |
272 if out_gff3: | |
273 out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.', | |
274 nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n') | |
245 in_count += 1 | 275 in_count += 1 |
246 if out_nuc is not sys.stdout: | 276 if out_nuc and out_nuc is not sys.stdout: |
247 out_nuc.close() | 277 out_nuc.close() |
248 if out_prot is not sys.stdout: | 278 if out_prot and out_prot is not sys.stdout: |
249 out_prot.close() | 279 out_prot.close() |
250 if out_bed is not sys.stdout: | 280 if out_bed and out_bed is not sys.stdout: |
251 out_bed.close() | 281 out_bed.close() |
252 | 282 |
253 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count) | 283 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count) |