comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 8:09a8be9247ca draft

v0.2.0 with GFF3 output
author peterjc
date Sat, 09 Jan 2016 23:42:32 -0500
parents 705a2e2df7fb
children a06ad07431ba
comparison
equal deleted inserted replaced
7:705a2e2df7fb 8:09a8be9247ca
14 14
15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute 15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
16 (formerly SCRI), Dundee, UK. All rights reserved. 16 (formerly SCRI), Dundee, UK. All rights reserved.
17 17
18 See accompanying text file for licence details (MIT licence). 18 See accompanying text file for licence details (MIT licence).
19
20 This is version 0.1.0 of the script.
21 """ 19 """
22 import sys 20 import sys
23 import re 21 import re
24 from optparse import OptionParser 22 from optparse import OptionParser
25 23
26 def sys_exit(msg, err=1):
27 sys.stderr.write(msg.rstrip() + "\n")
28 sys.exit(err)
29
30 usage = """Use as follows: 24 usage = """Use as follows:
31 25
32 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed 26 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed --og cds.gff3
33 """ 27 """
34 28
35 try: 29 try:
36 from Bio.Seq import Seq, reverse_complement, translate 30 from Bio.Seq import Seq, reverse_complement, translate
37 from Bio.SeqRecord import SeqRecord 31 from Bio.SeqRecord import SeqRecord
38 from Bio import SeqIO 32 from Bio import SeqIO
39 from Bio.Data import CodonTable 33 from Bio.Data import CodonTable
40 except ImportError: 34 except ImportError:
41 sys_exit("Missing Biopython library") 35 sys.exit("Missing Biopython library")
42 36
43 37
44 parser = OptionParser(usage=usage) 38 parser = OptionParser(usage=usage)
45 parser.add_option('-i', '--input', dest='input_file', 39 parser.add_option('-i', '--input', dest='input_file',
46 default=None, help='Input fasta file', 40 default=None, help='Input fasta file',
71 default=None, help='Output protein sequences, or - for STDOUT', 65 default=None, help='Output protein sequences, or - for STDOUT',
72 metavar='FILE') 66 metavar='FILE')
73 parser.add_option('--ob', dest='out_bed_file', 67 parser.add_option('--ob', dest='out_bed_file',
74 default=None, help='Output BED file, or - for STDOUT', 68 default=None, help='Output BED file, or - for STDOUT',
75 metavar='FILE') 69 metavar='FILE')
70 parser.add_option('--og', dest='out_gff3_file',
71 default=None, help='Output GFF3 file, or - for STDOUT',
72 metavar='FILE')
76 parser.add_option('-v', '--version', dest='version', 73 parser.add_option('-v', '--version', dest='version',
77 default=False, action='store_true', 74 default=False, action='store_true',
78 help='Show version and quit') 75 help='Show version and quit')
79 76
80 options, args = parser.parse_args() 77 options, args = parser.parse_args()
81 78
82 if options.version: 79 if options.version:
83 print "v0.1.0" 80 print("v0.2.0")
84 sys.exit(0) 81 sys.exit(0)
82
83 if not options.input_file:
84 sys.exit("Input file is required")
85
86 if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)):
87 sys.exit("At least one output file is required")
85 88
86 try: 89 try:
87 table_obj = CodonTable.ambiguous_generic_by_id[options.table] 90 table_obj = CodonTable.ambiguous_generic_by_id[options.table]
88 except KeyError: 91 except KeyError:
89 sys_exit("Unknown codon table %i" % options.table) 92 sys.exit("Unknown codon table %i" % options.table)
90 93
91 if options.seq_format.lower()=="sff": 94 if options.seq_format.lower() == "sff":
92 seq_format = "sff-trim" 95 seq_format = "sff-trim"
93 elif options.seq_format.lower()=="fasta": 96 elif options.seq_format.lower() == "fasta":
94 seq_format = "fasta" 97 seq_format = "fasta"
95 elif options.seq_format.lower().startswith("fastq"): 98 elif options.seq_format.lower().startswith("fastq"):
96 seq_format = "fastq" 99 seq_format = "fastq"
97 else: 100 else:
98 sys_exit("Unsupported file type %r" % options.seq_format) 101 sys.exit("Unsupported file type %r" % options.seq_format)
99 102
100 print "Genetic code table %i" % options.table 103 print "Genetic code table %i" % options.table
101 print "Minimum length %i aa" % options.min_len 104 print "Minimum length %i aa" % options.min_len
102 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) 105 # print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
103 106
104 starts = sorted(table_obj.start_codons) 107 starts = sorted(table_obj.start_codons)
105 assert "NNN" not in starts 108 assert "NNN" not in starts
106 re_starts = re.compile("|".join(starts)) 109 re_starts = re.compile("|".join(starts))
107 110
108 stops = sorted(table_obj.stop_codons) 111 stops = sorted(table_obj.stop_codons)
109 assert "NNN" not in stops 112 assert "NNN" not in stops
110 re_stops = re.compile("|".join(stops)) 113 re_stops = re.compile("|".join(stops))
114
111 115
112 def start_chop_and_trans(s, strict=True): 116 def start_chop_and_trans(s, strict=True):
113 """Returns offset, trimmed nuc, protein.""" 117 """Returns offset, trimmed nuc, protein."""
114 if strict: 118 if strict:
115 assert s[-3:] in stops, s 119 assert s[-3:] in stops, s
116 assert len(s) % 3 == 0 120 assert len(s) % 3 == 0
117 for match in re_starts.finditer(s): 121 for match in re_starts.finditer(s):
118 #Must check the start is in frame 122 # Must check the start is in frame
119 start = match.start() 123 start = match.start()
120 if start % 3 == 0: 124 if start % 3 == 0:
121 n = s[start:] 125 n = s[start:]
122 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) 126 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
123 if strict: 127 if strict:
124 t = translate(n, options.table, cds=True) 128 t = translate(n, options.table, cds=True)
125 else: 129 else:
126 #Use when missing stop codon, 130 # Use when missing stop codon,
127 t = "M" + translate(n[3:], options.table, to_stop=True) 131 t = "M" + translate(n[3:], options.table, to_stop=True)
128 return start, n, t 132 return start, n, t
129 return None, None, None 133 return None, None, None
134
130 135
131 def break_up_frame(s): 136 def break_up_frame(s):
132 """Returns offset, nuc, protein.""" 137 """Returns offset, nuc, protein."""
133 start = 0 138 start = 0
134 for match in re_stops.finditer(s): 139 for match in re_stops.finditer(s):
135 index = match.start() + 3 140 index = match.start() + 3
136 if index % 3 != 0: 141 if index % 3 != 0:
137 continue 142 continue
138 n = s[start:index] 143 n = s[start:index]
139 if options.ftype=="CDS": 144 if options.ftype == "CDS":
140 offset, n, t = start_chop_and_trans(n) 145 offset, n, t = start_chop_and_trans(n)
141 else: 146 else:
142 offset = 0 147 offset = 0
143 t = translate(n, options.table, to_stop=True) 148 t = translate(n, options.table, to_stop=True)
144 if n and len(t) >= options.min_len: 149 if n and len(t) >= options.min_len:
145 yield start + offset, n, t 150 yield start + offset, n, t
146 start = index 151 start = index
147 if options.ends == "open": 152 if options.ends == "open":
148 #No stop codon, Biopython's strict CDS translate will fail 153 # No stop codon, Biopython's strict CDS translate will fail
149 n = s[start:] 154 n = s[start:]
150 #Ensure we have whole codons 155 # Ensure we have whole codons
151 #TODO - Try appending N instead? 156 # TODO - Try appending N instead?
152 #TODO - Do the next four lines more elegantly 157 # TODO - Do the next four lines more elegantly
153 if len(n) % 3: 158 if len(n) % 3:
154 n = n[:-1] 159 n = n[:-1]
155 if len(n) % 3: 160 if len(n) % 3:
156 n = n[:-1] 161 n = n[:-1]
157 if options.ftype=="CDS": 162 if options.ftype == "CDS":
158 offset, n, t = start_chop_and_trans(n, strict=False) 163 offset, n, t = start_chop_and_trans(n, strict=False)
159 else: 164 else:
160 offset = 0 165 offset = 0
161 t = translate(n, options.table, to_stop=True) 166 t = translate(n, options.table, to_stop=True)
162 if n and len(t) >= options.min_len: 167 if n and len(t) >= options.min_len:
166 def get_all_peptides(nuc_seq): 171 def get_all_peptides(nuc_seq):
167 """Returns start, end, strand, nucleotides, protein. 172 """Returns start, end, strand, nucleotides, protein.
168 173
169 Co-ordinates are Python style zero-based. 174 Co-ordinates are Python style zero-based.
170 """ 175 """
171 #TODO - Refactor to use a generator function (in start order) 176 # TODO - Refactor to use a generator function (in start order)
172 #rather than making a list and sorting? 177 # rather than making a list and sorting?
173 answer = [] 178 answer = []
174 full_len = len(nuc_seq) 179 full_len = len(nuc_seq)
175 if options.strand != "reverse": 180 if options.strand != "reverse":
176 for frame in range(0,3): 181 for frame in range(0, 3):
177 for offset, n, t in break_up_frame(nuc_seq[frame:]): 182 for offset, n, t in break_up_frame(nuc_seq[frame:]):
178 start = frame + offset #zero based 183 start = frame + offset # zero based
179 answer.append((start, start + len(n), +1, n, t)) 184 answer.append((start, start + len(n), +1, n, t))
180 if options.strand != "forward": 185 if options.strand != "forward":
181 rc = reverse_complement(nuc_seq) 186 rc = reverse_complement(nuc_seq)
182 for frame in range(0,3) : 187 for frame in range(0, 3):
183 for offset, n, t in break_up_frame(rc[frame:]): 188 for offset, n, t in break_up_frame(rc[frame:]):
184 start = full_len - frame - offset #zero based 189 start = full_len - frame - offset # zero based
185 answer.append((start - len(n), start, -1, n ,t)) 190 answer.append((start - len(n), start, -1, n, t))
186 answer.sort() 191 answer.sort()
187 return answer 192 return answer
193
188 194
189 def get_top_peptides(nuc_seq): 195 def get_top_peptides(nuc_seq):
190 """Returns all peptides of max length.""" 196 """Returns all peptides of max length."""
191 values = list(get_all_peptides(nuc_seq)) 197 values = list(get_all_peptides(nuc_seq))
192 if not values: 198 if not values:
194 max_len = max(len(x[-1]) for x in values) 200 max_len = max(len(x[-1]) for x in values)
195 for x in values: 201 for x in values:
196 if len(x[-1]) == max_len: 202 if len(x[-1]) == max_len:
197 yield x 203 yield x
198 204
205
199 def get_one_peptide(nuc_seq): 206 def get_one_peptide(nuc_seq):
200 """Returns first (left most) peptide with max length.""" 207 """Returns first (left most) peptide with max length."""
201 values = list(get_top_peptides(nuc_seq)) 208 values = list(get_top_peptides(nuc_seq))
202 if not values: 209 if not values:
203 raise StopIteration 210 raise StopIteration
212 219
213 in_count = 0 220 in_count = 0
214 out_count = 0 221 out_count = 0
215 if options.out_nuc_file == "-": 222 if options.out_nuc_file == "-":
216 out_nuc = sys.stdout 223 out_nuc = sys.stdout
217 else: 224 elif options.out_nuc_file:
218 out_nuc = open(options.out_nuc_file, "w") 225 out_nuc = open(options.out_nuc_file, "w")
226 else:
227 out_nuc = None
219 228
220 if options.out_prot_file == "-": 229 if options.out_prot_file == "-":
221 out_prot = sys.stdout 230 out_prot = sys.stdout
222 else: 231 elif options.out_prot_file:
223 out_prot = open(options.out_prot_file, "w") 232 out_prot = open(options.out_prot_file, "w")
233 else:
234 out_prot = None
224 235
225 if options.out_bed_file == "-": 236 if options.out_bed_file == "-":
226 out_bed = sys.stdout 237 out_bed = sys.stdout
227 else: 238 elif options.out_bed_file:
228 out_bed = open(options.out_bed_file, "w") 239 out_bed = open(options.out_bed_file, "w")
240 else:
241 out_bed = None
242
243 if options.out_gff3_file == "-":
244 out_gff3 = sys.stdout
245 elif options.out_gff3_file:
246 out_gff3 = open(options.out_gff3_file, "w")
247 else:
248 out_gff3 = None
249
250 if out_gff3:
251 out_gff3.write('##gff-version 3\n')
229 252
230 for record in SeqIO.parse(options.input_file, seq_format): 253 for record in SeqIO.parse(options.input_file, seq_format):
231 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): 254 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
232 out_count += 1 255 out_count += 1
233 if f_strand == +1: 256 if f_strand == +1:
234 loc = "%i..%i" % (f_start+1, f_end) 257 loc = "%i..%i" % (f_start + 1, f_end)
235 else: 258 else:
236 loc = "complement(%i..%i)" % (f_start+1, f_end) 259 loc = "complement(%i..%i)" % (f_start + 1, f_end)
237 descr = "length %i aa, %i bp, from %s of %s" \ 260 descr = "length %i aa, %i bp, from %s of %s" \
238 % (len(t), len(n), loc, record.description) 261 % (len(t), len(n), loc, record.description)
239 fid = record.id + "|%s%i" % (options.ftype, i+1) 262 fid = record.id + "|%s%i" % (options.ftype, i + 1)
240 r = SeqRecord(Seq(n), id = fid, name = "", description= descr) 263 r = SeqRecord(Seq(n), id=fid, name="", description=descr)
241 t = SeqRecord(Seq(t), id = fid, name = "", description= descr) 264 t = SeqRecord(Seq(t), id=fid, name="", description=descr)
242 SeqIO.write(r, out_nuc, "fasta") 265 if out_nuc:
243 SeqIO.write(t, out_prot, "fasta") 266 SeqIO.write(r, out_nuc, "fasta")
244 out_bed.write('\t'.join(map(str,[record.id, f_start, f_end, fid, 0, '+' if f_strand == +1 else '-'])) + '\n') 267 if out_prot:
268 SeqIO.write(t, out_prot, "fasta")
269 nice_strand = '+' if f_strand == +1 else '-'
270 if out_bed:
271 out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n')
272 if out_gff3:
273 out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.',
274 nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n')
245 in_count += 1 275 in_count += 1
246 if out_nuc is not sys.stdout: 276 if out_nuc and out_nuc is not sys.stdout:
247 out_nuc.close() 277 out_nuc.close()
248 if out_prot is not sys.stdout: 278 if out_prot and out_prot is not sys.stdout:
249 out_prot.close() 279 out_prot.close()
250 if out_bed is not sys.stdout: 280 if out_bed and out_bed is not sys.stdout:
251 out_bed.close() 281 out_bed.close()
252 282
253 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count) 283 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count)