comparison translate_bed_sequences.py @ 2:4221664a2bd0 draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/translate_bed_sequences commit 2a470e2c775a7427aa530e058510e4dc7b6d8e80"
author galaxyp
date Tue, 07 Apr 2020 11:45:53 -0400
parents d723eb657f1d
children
comparison
equal deleted inserted replaced
1:6bbce76c78c1 2:4221664a2bd0
8 # Author: 8 # Author:
9 # 9 #
10 # James E Johnson 10 # James E Johnson
11 # 11 #
12 #------------------------------------------------------------------------------ 12 #------------------------------------------------------------------------------
13 """ 13
14
15 """
16 Input: BED file (12 column) + 13th sequence column appended by extract_genomic_dna 14 Input: BED file (12 column) + 13th sequence column appended by extract_genomic_dna
17 Output: Fasta of 3-frame translations of the spliced sequence 15 Output: Fasta of 3-frame translations of the spliced sequence
18
19 """ 16 """
20 17
21 import sys,re,os.path 18 import optparse
19 import os.path
20 import sys
22 import tempfile 21 import tempfile
23 import optparse 22
24 from optparse import OptionParser 23 from Bio.Seq import (
25 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate 24 reverse_complement,
26 25 translate
27 class BedEntry( object ): 26 )
28 def __init__(self, line): 27
29 self.line = line 28
29 class BedEntry(object):
30 def __init__(self, line):
31 self.line = line
32 try:
33 fields = line.rstrip('\r\n').split('\t')
34 (chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts) = fields[0:12]
35 seq = fields[12] if len(fields) > 12 else None
36 self.chrom = chrom
37 self.chromStart = int(chromStart)
38 self.chromEnd = int(chromEnd)
39 self.name = name
40 self.score = int(score)
41 self.strand = strand
42 self.thickStart = int(thickStart)
43 self.thickEnd = int(thickEnd)
44 self.itemRgb = itemRgb
45 self.blockCount = int(blockCount)
46 self.blockSizes = [int(x) for x in blockSizes.split(',')]
47 self.blockStarts = [int(x) for x in blockStarts.split(',')]
48 self.seq = seq
49 except Exception as e:
50 sys.stderr.write("Unable to read Bed entry %s\n" % e)
51 exit(1)
52
53 def __str__(self):
54 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s%s' % (
55 self.chrom, self.chromStart, self.chromEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount,
56 ','.join([str(x) for x in self.blockSizes]),
57 ','.join([str(x) for x in self.blockStarts]),
58 '\t%s' % self.seq if self.seq else '')
59
60 def get_splice_junctions(self):
61 splice_juncs = []
62 for i in range(self.blockCount - 1):
63 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i + 1])
64 splice_juncs.append(splice_junc)
65 return splice_juncs
66
67 def get_exon_seqs(self):
68 exons = []
69 for i in range(self.blockCount):
70 # splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1])
71 exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]])
72 if self.strand == '-': # reverse complement
73 exons.reverse()
74 for i, s in enumerate(exons):
75 exons[i] = reverse_complement(s)
76 return exons
77
78 def get_spliced_seq(self):
79 seq = ''.join(self.get_exon_seqs())
80 return seq
81
82 def get_translation(self, sequence=None):
83 translation = None
84 seq = sequence if sequence else self.get_spliced_seq()
85 if seq:
86 seqlen = int(len(seq) / 3) * 3
87 if seqlen >= 3:
88 translation = translate(seq[:seqlen])
89 return translation
90
91 def get_translations(self):
92 translations = []
93 seq = self.get_spliced_seq()
94 if seq:
95 for i in range(3):
96 translation = self.get_translation(sequence=seq[i:])
97 if translation:
98 translations.append(translation)
99 return translations
100
101 def get_subrange(self, tstart, tstop):
102 """
103 (start, end)
104 """
105 chromStart = self.chromStart
106 chromEnd = self.chromEnd
107 r = range(self.blockCount)
108 if self.strand == '-':
109 r = list(r)
110 r.reverse()
111 bStart = 0
112 for x in r:
113 bEnd = bStart + self.blockSizes[x]
114 if bStart <= tstart < bEnd:
115 if self.strand == '+':
116 chromStart = self.chromStart + self.blockStarts[x] + (tstart - bStart)
117 else:
118 chromEnd = self.chromStart + self.blockStarts[x] + (tstart - bStart)
119 if bStart <= tstop < bEnd:
120 if self.strand == '+':
121 chromEnd = self.chromStart + self.blockStarts[x] + (tstop - bStart)
122 else:
123 chromStart = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstop - bStart)
124 bStart += self.blockSizes[x]
125 return(chromStart, chromEnd)
126
127 def get_blocks(self, chromStart, chromEnd):
128 """
129 get the blocks for sub range
130 """
131 tblockCount = 0
132 tblockSizes = []
133 tblockStarts = []
134 for x in range(self.blockCount):
135 bStart = self.chromStart + self.blockStarts[x]
136 bEnd = bStart + self.blockSizes[x]
137 if bStart > chromEnd:
138 break
139 if bEnd < chromStart:
140 continue
141 cStart = max(chromStart, bStart)
142 tblockStarts.append(cStart - chromStart)
143 tblockSizes.append(min(chromEnd, bEnd) - cStart)
144 tblockCount += 1
145 # print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount, tblockStarts, tblockSizes)
146 return (tblockCount, tblockSizes, tblockStarts)
147
148 def get_filterd_translations(self, untrimmed=False, filtering=True, ignore_left_bp=0, ignore_right_bp=0, debug=False):
149 """
150 [(start, end, seq, blockCount, blockSizes, blockStarts), (start, end, seq, blockCount, blockSizes, blockStarts), (start, end, seq, blockCount, blockSizes, blockStarts)]
151 filter: ignore translation if stop codon in first exon after ignore_left_bp
152 """
153 translations = [None, None, None, None, None, None]
154 seq = self.get_spliced_seq()
155 ignore = int((ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3)
156 block_sum = sum(self.blockSizes)
157 exon_sizes = [x for x in self.blockSizes]
158 if self.strand == '-':
159 exon_sizes.reverse()
160 splice_sites = [int(sum(exon_sizes[:x]) / 3) for x in range(1, len(exon_sizes))]
161 if debug:
162 sys.stderr.write("splice_sites: %s\n" % splice_sites)
163 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
164 if seq:
165 for i in range(3):
166 translation = self.get_translation(sequence=seq[i:])
167 if translation:
168 tstart = 0
169 tstop = len(translation)
170 offset = (block_sum - i) % 3
171 if debug:
172 sys.stderr.write("frame: %d\ttstart: %d tstop: %d offset: %d\t%s\n" % (i, tstart, tstop, offset, translation))
173 if not untrimmed:
174 tstart = translation.rfind('*', 0, junc) + 1
175 stop = translation.find('*', junc)
176 tstop = stop if stop >= 0 else len(translation)
177 offset = (block_sum - i) % 3
178 trimmed = translation[tstart:tstop]
179 if debug:
180 sys.stderr.write("frame: %d\ttstart: %d tstop: %d offset: %d\t%s\n" % (i, tstart, tstop, offset, trimmed))
181 if filtering and tstart > ignore:
182 continue
183 # get genomic locations for start and end
184 if self.strand == '+':
185 chromStart = self.chromStart + i + (tstart * 3)
186 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3
187 else:
188 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3
189 chromEnd = self.chromEnd - i - (tstart * 3)
190 # get the blocks for this translation
191 (tblockCount, tblockSizes, tblockStarts) = self.get_blocks(chromStart, chromEnd)
192 translations[i] = (chromStart, chromEnd, trimmed, tblockCount, tblockSizes, tblockStarts)
193 if debug:
194 sys.stderr.write("tblockCount: %d tblockStarts: %s tblockSizes: %s\n" % (tblockCount, tblockStarts, tblockSizes))
195 # translations[i] = (chromStart, chromEnd, trimmed, tblockCount, tblockSizes, tblockStarts)
196 return translations
197
198 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None):
199 """
200 # Ensembl fasta ID format
201 >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT
202 >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
203 """
204 frame_name = ''
205 chromStart = self.chromStart
206 chromEnd = self.chromEnd
207 strand = 1 if self.strand == '+' else -1
208 if frame is not None:
209 block_sum = sum(self.blockSizes)
210 offset = (block_sum - frame) % 3
211 frame_name = '_' + str(frame + 1)
212 if self.strand == '+':
213 chromStart += frame
214 chromEnd -= offset
215 else:
216 chromStart += offset
217 chromEnd -= frame
218 location = "chromosome:%s:%s:%s:%s:%s" % (reference, self.chrom, chromStart, chromEnd, strand)
219 seq_id = "%s%s %s %s" % (self.name, frame_name, seqtype, location)
220 return seq_id
221
222 def get_line(self, start_offset=0, end_offset=0):
223 if start_offset or end_offset:
224 s_offset = start_offset if start_offset else 0
225 e_offset = end_offset if end_offset else 0
226 if s_offset > self.chromStart:
227 s_offset = self.chromStart
228 chrStart = self.chromStart - s_offset
229 chrEnd = self.chromEnd + e_offset
230 blkSizes = self.blockSizes
231 blkSizes[0] += s_offset
232 blkSizes[-1] += e_offset
233 blkStarts = self.blockStarts
234 for i in range(1, self.blockCount):
235 blkStarts[i] += s_offset
236 items = [str(x) for x in [self.chrom, chrStart, chrEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount, ','.join([str(x) for x in blkSizes]), ','.join([str(x) for x in blkStarts])]]
237 return '\t'.join(items) + '\n'
238 return self.line
239
240
241 def __main__():
242 # Parse Command Line
243 parser = optparse.OptionParser()
244 parser.add_option('-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added')
245 parser.add_option('-o', '--output', dest='output', help='Translations of spliced sequence')
246 parser.add_option('-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta')
247 parser.add_option('-D', '--fa_db', dest='fa_db', default=None, help='Prefix DB identifier for fasta ID line, e.g. generic')
248 parser.add_option('-s', '--fa_sep', dest='fa_sep', default='|', help='fasta ID separator defaults to pipe char, e.g. generic|ProtID|description')
249 parser.add_option('-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation')
250 parser.add_option('-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file')
251 parser.add_option('-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line')
252 parser.add_option('-P', '--id_prefix', dest='id_prefix', default='', help='prefix for the sequence ID')
253 parser.add_option('-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ')
254 parser.add_option('-r', '--refsource', dest='refsource', default=None, help='Source for Genome Reference, e.g. Ensembl, UCSC, or NCBI')
255 parser.add_option('-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ')
256 parser.add_option('-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering')
257 parser.add_option('-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering')
258 parser.add_option('-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon')
259 parser.add_option('-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon')
260 parser.add_option('-L', '--min_length', dest='min_length', type='int', default=None, help='Minimun length (to first stop codon)')
261 parser.add_option('-M', '--max_stop_codons', dest='max_stop_codons', type='int', default=None, help='Filter out translations with more than max_stop_codons')
262 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout')
263 (options, args) = parser.parse_args()
264 # Input files
265 if options.input is not None:
266 try:
267 inputPath = os.path.abspath(options.input)
268 inputFile = open(inputPath, 'r')
269 except Exception as e:
270 sys.stderr.write("failed: %s\n" % e)
271 exit(2)
272 else:
273 inputFile = sys.stdin
274 # Output files
275 bed_fh = None
276 gff_fh = None
277 gff_fa_file = None
278 gff_fa = None
279 outFile = None
280 if options.output is None:
281 # write to stdout
282 outFile = sys.stdout
283 if options.gff:
284 gff_fa_file = tempfile.NamedTemporaryFile(prefix='gff_fasta_', suffix=".fa", dir=os.getcwd()).name
285 gff_fa = open(gff_fa_file, 'w')
286 else:
287 try:
288 outPath = os.path.abspath(options.output)
289 outFile = open(outPath, 'w')
290 except Exception as e:
291 sys.stderr.write("failed: %s\n" % e)
292 exit(3)
293 if options.gff:
294 gff_fa_file = outPath
295 if options.bed:
296 bed_fh = open(options.bed, 'w')
297 bed_fh.write('track name="%s" description="%s" \n' % ('novel_junctioni_translations', 'test'))
298 if options.gff:
299 gff_fh = open(options.gff, 'w')
300 gff_fh.write("##gff-version 3.2.1\n")
301 if options.reference:
302 gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference))
303 leading_bp = 0
304 trailing_bp = 0
305 if options.leading_bp:
306 if options.leading_bp >= 0:
307 leading_bp = options.leading_bp
308 else:
309 sys.stderr.write("failed: leading_bp must be positive\n")
310 exit(5)
311 if options.trailing_bp:
312 if options.trailing_bp >= 0:
313 trailing_bp = options.trailing_bp
314 else:
315 sys.stderr.write("failed: trailing_bp must be positive\n")
316 exit(5)
317 # Scan bed file
30 try: 318 try:
31 fields = line.rstrip('\r\n').split('\t') 319 for i, line in enumerate(inputFile):
32 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = fields[0:12] 320 if line.startswith('track'):
33 seq = fields[12] if len(fields) > 12 else None 321 if outFile and options.bed_format:
34 self.chrom = chrom 322 outFile.write(line)
35 self.chromStart = int(chromStart) 323 continue
36 self.chromEnd = int(chromEnd) 324 entry = BedEntry(line)
37 self.name = name 325 strand = 1 if entry.strand == '+' else -1
38 self.score = int(score) 326 translations = entry.get_translations()
39 self.strand = strand 327 if options.debug:
40 self.thickStart = int(thickStart) 328 exon_seqs = entry.get_exon_seqs()
41 self.thickEnd = int(thickEnd) 329 exon_sizes = [len(seq) for seq in exon_seqs]
42 self.itemRgb = itemRgb 330 splice_sites = [int(sum(exon_sizes[:x]) / 3) for x in range(1, len(exon_sizes))]
43 self.blockCount = int(blockCount) 331 sys.stderr.write("%s\n" % entry.name)
44 self.blockSizes = [int(x) for x in blockSizes.split(',')] 332 sys.stderr.write("%s\n" % line.rstrip('\r\n'))
45 self.blockStarts = [int(x) for x in blockStarts.split(',')] 333 sys.stderr.write("exons: %s\n" % exon_seqs)
46 self.seq = seq 334 sys.stderr.write("%s\n" % splice_sites)
47 except Exception, e: 335 for i, translation in enumerate(translations):
48 print >> sys.stderr, "Unable to read Bed entry" % e 336 sys.stderr.write("frame %d: %s\n" % (i + 1, translation))
49 exit(1) 337 sys.stderr.write("splice: %s\n" % (''.join(['^' if int(((j * 3) + i) / 3) in splice_sites else '-' for j in range(len(translation))])))
50 def __str__(self): 338 sys.stderr.write("\n")
51 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s%s' % ( 339 if options.bed_format:
52 self.chrom, self.chromStart, self.chromEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount, 340 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'), '\t'.join(translations))
53 ','.join([str(x) for x in self.blockSizes]), 341 outFile.write(tx_entry)
54 ','.join([str(x) for x in self.blockStarts]), 342 else:
55 '\t%s' % self.seq if self.seq else '') 343 translations = entry.get_filterd_translations(untrimmed=options.untrimmed, filtering=options.filtering, ignore_left_bp=leading_bp, ignore_right_bp=trailing_bp, debug=options.debug)
56 def get_splice_junctions(self): 344 for i, tx in enumerate(translations):
57 splice_juncs = [] 345 if tx:
58 for i in range(self.blockCount - 1): 346 (chromStart, chromEnd, translation, blockCount, blockSizes, blockStarts) = tx
59 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) 347 if options.min_length is not None and len(translation) < options.min_length:
60 splice_juncs.append(splice_junc) 348 continue
61 return splice_juncs 349 if options.max_stop_codons is not None and translation.count('*') > options.max_stop_codons:
62 def get_exon_seqs(self): 350 continue
63 exons = [] 351 frame_name = '_%s' % (i + 1)
64 for i in range(self.blockCount): 352 pep_id = "%s%s%s" % (options.id_prefix, entry.name, frame_name)
65 # splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) 353 if bed_fh:
66 exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]]) 354 bed_fh.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n' % (str(entry.chrom), chromStart, chromEnd, pep_id, entry.score, entry.strand, chromStart, chromEnd, entry.itemRgb, blockCount, ','.join([str(x) for x in blockSizes]), ','.join([str(x) for x in blockStarts]), translation))
67 if self.strand == '-': #reverse complement 355 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference, entry.chrom, chromStart, chromEnd, strand)
68 exons.reverse() 356 score = " %s:%s" % (options.score_name, entry.score) if options.score_name else ''
69 for i,s in enumerate(exons): 357 seq_description = "%s %s%s" % (options.seqtype, location, score)
70 exons[i] = reverse_complement(s) 358 seq_id = "%s " % pep_id
71 return exons 359 if options.fa_db:
72 def get_spliced_seq(self): 360 seq_id = "%s%s%s%s" % (options.fa_db, options.fa_sep, pep_id, options.fa_sep)
73 seq = ''.join(self.get_exon_seqs()) 361 fa_id = "%s%s" % (seq_id, seq_description)
74 return seq 362 fa_entry = ">%s\n%s\n" % (fa_id, translation)
75 def get_translation(self,sequence=None): 363 outFile.write(fa_entry)
76 translation = None 364 if gff_fh:
77 seq = sequence if sequence else self.get_spliced_seq() 365 if gff_fa:
78 if seq: 366 gff_fa.write(fa_entry)
79 seqlen = len(seq) / 3 * 3; 367 gff_fh.write("##sequence-region %s %d %d\n" % (entry.chrom, chromStart + 1, chromEnd - 1))
80 if seqlen >= 3: 368 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tID=%s\n" % (entry.chrom, 'splice_junc', 'gene', chromStart + 1, chromEnd - 1, entry.score, entry.strand, 0, pep_id))
81 translation = translate(seq[:seqlen]) 369 for x in range(blockCount):
82 return translation 370 start = chromStart + blockStarts[x] + 1
83 def get_translations(self): 371 end = start + blockSizes[x] - 1
84 translations = [] 372 phase = (3 - sum(blockSizes[:x]) % 3) % 3
85 seq = self.get_spliced_seq() 373 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tParent=%s;ID=%s_%d\n" % (entry.chrom, 'splice_junc', 'CDS', start, end, entry.score, entry.strand, phase, pep_id, pep_id, x))
86 if seq: 374 # ##gff-version 3
87 for i in range(3): 375 # ##sequence-region 19 1 287484
88 translation = self.get_translation(sequence=seq[i:]) 376 # 19 MassSpec peptide 282299 287484 10.0 - 0 ID=TEARLSFYSGHSSFGMYCMVFLALYVQ
89 if translation: 377 # 19 MassSpec CDS 287474 287484 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
90 translations.append(translation) 378 # 19 MassSpec CDS 282752 282809 . - 1 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
91 return translations 379 # 19 MassSpec CDS 282299 282310 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
92 ## (start,end) 380 if bed_fh:
93 def get_subrange(self,tstart,tstop): 381 bed_fh.close()
94 chromStart = self.chromStart 382 if gff_fh:
95 chromEnd = self.chromEnd 383 if gff_fa:
96 r = range(self.blockCount) 384 gff_fa.close()
97 if self.strand == '-': 385 else:
98 r.reverse() 386 outFile.close()
99 bStart = 0 387 gff_fa = open(gff_fa_file, 'r')
100 for x in r: 388 gff_fh.write("##FASTA\n")
101 bEnd = bStart + self.blockSizes[x] 389 for i, line in enumerate(gff_fa):
102 if bStart <= tstart < bEnd: 390 gff_fh.write(line)
103 if self.strand == '+': 391 gff_fh.close()
104 chromStart = self.chromStart + self.blockStarts[x] + (tstart - bStart) 392 except Exception as e:
105 else: 393 sys.stderr.write("failed: Error reading %s - %s\n" % (options.input if options.input else 'stdin', e))
106 chromEnd = self.chromStart + self.blockStarts[x] + (tstart - bStart) 394 raise
107 if bStart <= tstop < bEnd: 395 exit(1)
108 if self.strand == '+': 396
109 chromEnd = self.chromStart + self.blockStarts[x] + (tstop - bStart) 397
110 else: 398 if __name__ == "__main__":
111 chromStart = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstop - bStart) 399 __main__()
112 bStart += self.blockSizes[x]
113 return(chromStart,chromEnd)
114 #get the blocks for sub range
115 def get_blocks(self,chromStart,chromEnd):
116 tblockCount = 0
117 tblockSizes = []
118 tblockStarts = []
119 for x in range(self.blockCount):
120 bStart = self.chromStart + self.blockStarts[x]
121 bEnd = bStart + self.blockSizes[x]
122 if bStart > chromEnd:
123 break
124 if bEnd < chromStart:
125 continue
126 cStart = max(chromStart,bStart)
127 tblockStarts.append(cStart - chromStart)
128 tblockSizes.append(min(chromEnd,bEnd) - cStart)
129 tblockCount += 1
130 ## print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes)
131 return (tblockCount,tblockSizes,tblockStarts)
132 ## [(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts)]
133 ## filter: ignore translation if stop codon in first exon after ignore_left_bp
134 def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0,debug=False):
135 translations = [None,None,None,None,None,None]
136 seq = self.get_spliced_seq()
137 ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3
138 block_sum = sum(self.blockSizes)
139 exon_sizes = [x for x in self.blockSizes]
140 if self.strand == '-':
141 exon_sizes.reverse()
142 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))]
143 if debug:
144 print >> sys.stderr, "splice_sites: %s" % splice_sites
145 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
146 if seq:
147 for i in range(3):
148 translation = self.get_translation(sequence=seq[i:])
149 if translation:
150 tstart = 0
151 tstop = len(translation)
152 offset = (block_sum - i) % 3
153 if debug:
154 print >> sys.stderr, "frame: %d\ttstart: %d tstop: %d offset: %d\t%s" % (i,tstart,tstop,offset,translation)
155 if not untrimmed:
156 tstart = translation.rfind('*',0,junc) + 1
157 stop = translation.find('*',junc)
158 tstop = stop if stop >= 0 else len(translation)
159 offset = (block_sum - i) % 3
160 trimmed = translation[tstart:tstop]
161 if debug:
162 print >> sys.stderr, "frame: %d\ttstart: %d tstop: %d offset: %d\t%s" % (i,tstart,tstop,offset,trimmed)
163 if filtering and tstart > ignore:
164 continue
165 #get genomic locations for start and end
166 if self.strand == '+':
167 chromStart = self.chromStart + i + (tstart * 3)
168 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3
169 else:
170 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3
171 chromEnd = self.chromEnd - i - (tstart * 3)
172 #get the blocks for this translation
173 (tblockCount,tblockSizes,tblockStarts) = self.get_blocks(chromStart,chromEnd)
174 translations[i] = (chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts)
175 if debug:
176 print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes)
177 # translations[i] = (chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts)
178 return translations
179 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None):
180 ## Ensembl fasta ID format
181 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT
182 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
183 frame_name = ''
184 chromStart = self.chromStart
185 chromEnd = self.chromEnd
186 strand = 1 if self.strand == '+' else -1
187 if frame != None:
188 block_sum = sum(self.blockSizes)
189 offset = (block_sum - frame) % 3
190 frame_name = '_' + str(frame + 1)
191 if self.strand == '+':
192 chromStart += frame
193 chromEnd -= offset
194 else:
195 chromStart += offset
196 chromEnd -= frame
197 location = "chromosome:%s:%s:%s:%s:%s" % (reference,self.chrom,chromStart,chromEnd,strand)
198 seq_id = "%s%s %s %s" % (self.name,frame_name,seqtype,location)
199 return seq_id
200 def get_line(self, start_offset = 0, end_offset = 0):
201 if start_offset or end_offset:
202 s_offset = start_offset if start_offset else 0
203 e_offset = end_offset if end_offset else 0
204 if s_offset > self.chromStart:
205 s_offset = self.chromStart
206 chrStart = self.chromStart - s_offset
207 chrEnd = self.chromEnd + e_offset
208 blkSizes = self.blockSizes
209 blkSizes[0] += s_offset
210 blkSizes[-1] += e_offset
211 blkStarts = self.blockStarts
212 for i in range(1,self.blockCount):
213 blkStarts[i] += s_offset
214 items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]]
215 return '\t'.join(items) + '\n'
216 return self.line
217
218 def __main__():
219 #Parse Command Line
220 parser = optparse.OptionParser()
221 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' )
222 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence')
223 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' )
224 parser.add_option( '-D', '--fa_db', dest='fa_db', default=None, help='Prefix DB identifier for fasta ID line, e.g. generic' )
225 parser.add_option( '-s', '--fa_sep', dest='fa_sep', default='|', help='fasta ID separator defaults to pipe char, e.g. generic|ProtID|description' )
226 parser.add_option( '-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation' )
227 parser.add_option( '-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file' )
228 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' )
229 parser.add_option( '-P', '--id_prefix', dest='id_prefix', default='', help='prefix for the sequence ID' )
230 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' )
231 parser.add_option( '-r', '--refsource', dest='refsource', default=None, help='Source for Genome Reference, e.g. Ensembl, UCSC, or NCBI' )
232 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' )
233 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' )
234 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' )
235 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' )
236 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' )
237 parser.add_option( '-L', '--min_length', dest='min_length', type='int', default=None, help='Minimun length (to first stop codon)' )
238 parser.add_option( '-M', '--max_stop_codons', dest='max_stop_codons', type='int', default=None, help='Filter out translations with more than max_stop_codons' )
239 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
240 (options, args) = parser.parse_args()
241 # Input files
242 if options.input != None:
243 try:
244 inputPath = os.path.abspath(options.input)
245 inputFile = open(inputPath, 'r')
246 except Exception, e:
247 print >> sys.stderr, "failed: %s" % e
248 exit(2)
249 else:
250 inputFile = sys.stdin
251 # Output files
252 bed_fh = None
253 gff_fh = None
254 gff_fa_file = None
255 gff_fa = None
256 outFile = None
257 if options.output == None:
258 #write to stdout
259 outFile = sys.stdout
260 if options.gff:
261 gff_fa_file = tempfile.NamedTemporaryFile(prefix='gff_fasta_',suffix=".fa",dir=os.getcwd()).name
262 gff_fa = open(gff_fa_file,'w')
263 else:
264 try:
265 outPath = os.path.abspath(options.output)
266 outFile = open(outPath, 'w')
267 except Exception, e:
268 print >> sys.stderr, "failed: %s" % e
269 exit(3)
270 if options.gff:
271 gff_fa_file = outPath
272 if options.bed:
273 bed_fh = open(options.bed,'w')
274 bed_fh.write('track name="%s" description="%s" \n' % ('novel_junctioni_translations','test'))
275 if options.gff:
276 gff_fh = open(options.gff,'w')
277 gff_fh.write("##gff-version 3.2.1\n")
278 if options.reference:
279 gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference))
280 leading_bp = 0
281 trailing_bp = 0
282 if options.leading_bp:
283 if options.leading_bp >= 0:
284 leading_bp = options.leading_bp
285 else:
286 print >> sys.stderr, "failed: leading_bp must be positive"
287 exit(5)
288 if options.trailing_bp:
289 if options.trailing_bp >= 0:
290 trailing_bp = options.trailing_bp
291 else:
292 print >> sys.stderr, "failed: trailing_bp must be positive"
293 exit(5)
294 # Scan bed file
295 try:
296 for i, line in enumerate( inputFile ):
297 if line.startswith('track'):
298 if outFile and options.bed_format:
299 outFile.write(line)
300 continue
301 entry = BedEntry(line)
302 strand = 1 if entry.strand == '+' else -1
303 translations = entry.get_translations()
304 if options.debug:
305 exon_seqs = entry.get_exon_seqs()
306 exon_sizes = [len(seq) for seq in exon_seqs]
307 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))]
308 print >> sys.stderr, entry.name
309 print >> sys.stderr, line.rstrip('\r\n')
310 print >> sys.stderr, "exons: %s" % exon_seqs
311 print >> sys.stderr, "%s" % splice_sites
312 for i,translation in enumerate(translations):
313 print >> sys.stderr, "frame %d: %s" % (i+1,translation)
314 print >> sys.stderr, "splice: %s" % (''.join(['^' if (((j*3)+i)/3) in splice_sites else '-' for j in range(len(translation))]))
315 print >> sys.stderr, ""
316 if options.bed_format:
317 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations))
318 outFile.write(tx_entry)
319 else:
320 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp,debug=options.debug)
321 for i,tx in enumerate(translations):
322 if tx:
323 (chromStart,chromEnd,translation,blockCount,blockSizes,blockStarts) = tx
324 if options.min_length != None and len(translation) < options.min_length:
325 continue
326 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons:
327 continue
328 frame_name = '_%s' % (i + 1)
329 pep_id = "%s%s%s" % (options.id_prefix,entry.name,frame_name)
330 if bed_fh:
331 bed_fh.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n' % (str(entry.chrom),chromStart,chromEnd,pep_id,entry.score,entry.strand,chromStart,chromEnd,entry.itemRgb,blockCount,','.join([str(x) for x in blockSizes]),','.join([str(x) for x in blockStarts]),translation))
332 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand)
333 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else ''
334 seq_description = "%s %s%s" % (options.seqtype, location, score)
335 seq_id = "%s " % pep_id
336 if options.fa_db:
337 seq_id = "%s%s%s%s" % (options.fa_db,options.fa_sep,pep_id,options.fa_sep)
338 fa_id = "%s%s" % (seq_id,seq_description)
339 fa_entry = ">%s\n%s\n" % (fa_id,translation)
340 outFile.write(fa_entry)
341 if gff_fh:
342 if gff_fa:
343 gff_fa.write(fa_entry)
344 gff_fh.write("##sequence-region %s %d %d\n" % (entry.chrom,chromStart + 1,chromEnd - 1))
345 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tID=%s\n" % (entry.chrom,'splice_junc','gene',chromStart + 1,chromEnd - 1,entry.score,entry.strand,0,pep_id))
346 for x in range(blockCount):
347 start = chromStart+blockStarts[x] + 1
348 end = start + blockSizes[x] - 1
349 phase = (3 - sum(blockSizes[:x]) % 3) % 3
350 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tParent=%s;ID=%s_%d\n" % (entry.chrom,'splice_junc','CDS',start,end,entry.score,entry.strand,phase,pep_id,pep_id,x))
351 """
352 ##gff-version 3
353 ##sequence-region 19 1 287484
354 19 MassSpec peptide 282299 287484 10.0 - 0 ID=TEARLSFYSGHSSFGMYCMVFLALYVQ
355 19 MassSpec CDS 287474 287484 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
356 19 MassSpec CDS 282752 282809 . - 1 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
357 19 MassSpec CDS 282299 282310 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
358 """
359 if bed_fh:
360 bed_fh.close()
361 if gff_fh:
362 if gff_fa:
363 gff_fa.close()
364 else:
365 outFile.close()
366 gff_fa = open(gff_fa_file,'r')
367 gff_fh.write("##FASTA\n")
368 for i, line in enumerate(gff_fa):
369 gff_fh.write(line)
370 gff_fh.close()
371 except Exception, e:
372 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e)
373
374 if __name__ == "__main__" : __main__()
375