6
|
1 #
|
|
2 # Copyright INRA-URGI 2009-2010
|
|
3 #
|
|
4 # This software is governed by the CeCILL license under French law and
|
|
5 # abiding by the rules of distribution of free software. You can use,
|
|
6 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
7 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
8 # "http://www.cecill.info".
|
|
9 #
|
|
10 # As a counterpart to the access to the source code and rights to copy,
|
|
11 # modify and redistribute granted by the license, users are provided only
|
|
12 # with a limited warranty and the software's author, the holder of the
|
|
13 # economic rights, and the successive licensors have only limited
|
|
14 # liability.
|
|
15 #
|
|
16 # In this respect, the user's attention is drawn to the risks associated
|
|
17 # with loading, using, modifying and/or developing or reproducing the
|
|
18 # software by the user in light of its specific status of free software,
|
|
19 # that may mean that it is complicated to manipulate, and that also
|
|
20 # therefore means that it is reserved for developers and experienced
|
|
21 # professionals having in-depth computer knowledge. Users are therefore
|
|
22 # encouraged to load and test the software's suitability as regards their
|
|
23 # requirements in conditions enabling the security of their systems and/or
|
|
24 # data to be ensured and, more generally, to use and operate it in the
|
|
25 # same conditions as regards security.
|
|
26 #
|
|
27 # The fact that you are presently reading this means that you have had
|
|
28 # knowledge of the CeCILL license and that you accept its terms.
|
|
29 #
|
|
30 import sys
|
|
31 from SMART.Java.Python.structure.Interval import Interval
|
|
32 from SMART.Java.Python.structure.Sequence import Sequence
|
|
33
|
|
34
|
|
35 class Transcript(Interval):
|
|
36 """
|
|
37 A class that models an transcript, considered as a specialized interval (the bounds of the transcript) that contains exons (also represented as intervals)
|
|
38 @ivar exons: a list of exons (intervals)
|
|
39 @type exons: list of L{Interval{Interval}}
|
|
40 """
|
|
41
|
|
42 def __init__(self, transcript = None, verbosity = 0):
|
|
43 """
|
|
44 Constructor
|
|
45 @param transcript: transcript to be copied
|
|
46 @type transcript: class L{Transcript<Transcript>}
|
|
47 @param verbosity: verbosity
|
|
48 @type verbosity: int
|
|
49 """
|
|
50 super(Transcript, self).__init__(None, verbosity)
|
|
51 self.exons = []
|
|
52 self.introns = None
|
|
53 if transcript != None:
|
|
54 self.copy(transcript)
|
|
55
|
|
56
|
|
57 def copy(self, transcript):
|
|
58 """
|
|
59 Copy method
|
|
60 @param transcript: transcript to be copied
|
|
61 @type transcript: class L{Transcript<Transcript>} or L{Interval<Interval>}
|
|
62 """
|
|
63 super(Transcript, self).copy(transcript)
|
|
64 if transcript.__class__.__name__ == "Transcript":
|
|
65 exons = transcript.getExons()
|
|
66 if len(exons) > 1:
|
|
67 for exon in exons:
|
|
68 exonCopy = Interval(exon)
|
|
69 self.addExon(exonCopy)
|
|
70
|
|
71
|
|
72 def setDirection(self, direction):
|
|
73 """
|
|
74 Set the direction of the interval
|
|
75 Possibly parse different formats
|
|
76 Impact all exons
|
|
77 @param direction: direction of the transcript (+ / -)
|
|
78 @type direction: int or string
|
|
79 """
|
|
80 super(Transcript, self).setDirection(direction)
|
|
81 for exon in self.exons:
|
|
82 exon.setDirection(direction)
|
|
83
|
|
84
|
|
85 def setChromosome(self, chromosome):
|
|
86 """
|
|
87 Set the chromosome
|
|
88 @param chromosome: chromosome on which the transcript is
|
|
89 @type chromosome: string
|
|
90 """
|
|
91 super(Transcript, self).setChromosome(chromosome)
|
|
92 for exon in self.exons:
|
|
93 exon.setChromosome(chromosome)
|
|
94
|
|
95
|
|
96 def addExon(self, exon):
|
|
97 """
|
|
98 Add an exon to the list of exons
|
|
99 @param exon: a new exon
|
|
100 @type exon: class L{Interval<Interval>}
|
|
101 """
|
|
102 if not self.exons and not exon.overlapWith(self):
|
|
103 firstExon = Interval()
|
|
104 firstExon.setStart(self.getStart())
|
|
105 firstExon.setEnd(self.getEnd())
|
|
106 firstExon.setDirection(self.getDirection())
|
|
107 firstExon.setChromosome(self.getChromosome())
|
|
108 self.exons.append(firstExon)
|
|
109 newExon = Interval(exon)
|
|
110 newExon.setDirection(self.getDirection())
|
|
111 self.exons.append(newExon)
|
|
112 if newExon.getStart() < self.getStart():
|
|
113 self.setStart(newExon.getStart())
|
|
114 if newExon.getEnd() > self.getEnd():
|
|
115 self.setEnd(newExon.getEnd())
|
|
116
|
|
117
|
|
118 def setStart(self, start):
|
|
119 """
|
|
120 Set the new start, move the first exon accordingly (if exists)
|
|
121 @param start: the new start
|
|
122 @type start: int
|
|
123 """
|
|
124 super(Transcript, self).setStart(start)
|
|
125 if self.exons:
|
|
126 self.sortExonsIncreasing()
|
|
127 self.exons[0].setStart(start)
|
|
128
|
|
129
|
|
130 def setEnd(self, end):
|
|
131 """
|
|
132 Set the new end, move the last exon accordingly (if exists)
|
|
133 @param end: the new end
|
|
134 @type end: int
|
|
135 """
|
|
136 super(Transcript, self).setEnd(end)
|
|
137 if self.exons:
|
|
138 self.sortExonsIncreasing()
|
|
139 self.exons[-1].setEnd(end)
|
|
140
|
|
141
|
|
142 def reverse(self):
|
|
143 """
|
|
144 Reverse the strand of the transcript
|
|
145 """
|
|
146 super(Transcript, self).reverse()
|
|
147 for exon in self.exons:
|
|
148 exon.reverse()
|
|
149
|
|
150
|
|
151 def getUniqueName(self):
|
|
152 """
|
|
153 Try to give a unique name by possibly adding occurrence
|
|
154 """
|
|
155 if "nbOccurrences" in self.tags and "occurrence" in self.tags and self.tags["nbOccurrences"] != 1:
|
|
156 return "%s-%d" % (self.name, self.tags["occurrence"])
|
|
157 return self.name
|
|
158
|
|
159
|
|
160 def getNbExons(self):
|
|
161 """
|
|
162 Get the number of exons
|
|
163 """
|
|
164 return max(1, len(self.exons))
|
|
165
|
|
166
|
|
167 def getExon(self, i):
|
|
168 """
|
|
169 Get a specific exon
|
|
170 @param i: the rank of the exon
|
|
171 @type i: int
|
|
172 """
|
|
173 if len(self.exons) == 0:
|
|
174 if i != 0:
|
|
175 raise Exception("Cannot get exon #%i while there is no exon in the transcript" % (i))
|
|
176 return self
|
|
177 return self.exons[i]
|
|
178
|
|
179
|
|
180 def getExons(self):
|
|
181 """
|
|
182 Get all the exons
|
|
183 """
|
|
184 if len(self.exons) == 0:
|
|
185 return [Interval(self)]
|
|
186 return self.exons
|
|
187
|
|
188
|
|
189 def getIntrons(self):
|
|
190 """
|
|
191 Get all the introns
|
|
192 Compute introns on the fly
|
|
193 """
|
|
194 if self.introns != None:
|
|
195 return self.introns
|
|
196 self.sortExons()
|
|
197 self.introns = []
|
|
198 exonStart = self.getExon(0)
|
|
199 for cpt, exonEnd in enumerate(self.exons[1:]):
|
|
200 intron = Interval()
|
|
201 intron.setName("%s_intron%d" % (self.getName(), cpt+1))
|
|
202 intron.setChromosome(self.getChromosome())
|
|
203 intron.setDirection(self.getDirection())
|
|
204 if self.getDirection() == 1:
|
|
205 intron.setEnd(exonEnd.getStart() - 1)
|
|
206 intron.setStart(exonStart.getEnd() + 1)
|
|
207 else:
|
|
208 intron.setStart(exonEnd.getEnd() + 1)
|
|
209 intron.setEnd(exonStart.getStart() - 1)
|
|
210 intron.setDirection(self.getDirection())
|
|
211 if intron.getSize() > 0:
|
|
212 self.introns.append(intron)
|
|
213 exonStart = exonEnd
|
|
214 intron.setSize(intron.getEnd() - intron.getStart() + 1)
|
|
215 return self.introns
|
|
216
|
|
217
|
|
218 def getSize(self):
|
|
219 """
|
|
220 Get the size of the transcript (i.e. the number of nucleotides)
|
|
221 Compute size on the fly
|
|
222 """
|
|
223 if len(self.exons) == 0:
|
|
224 return self.getSizeWithIntrons()
|
|
225 size = 0
|
|
226 for exon in self.exons:
|
|
227 size += exon.getSize()
|
|
228 return size
|
|
229
|
|
230
|
|
231 def getSizeWithIntrons(self):
|
|
232 """
|
|
233 Get the size of the interval (i.e. distance from start to end)
|
|
234 """
|
|
235 return super(Transcript, self).getSize()
|
|
236
|
|
237
|
|
238 def overlapWithExon(self, transcript, nbNucleotides = 1):
|
|
239 """
|
|
240 Check if the exons of this transcript overlap with the exons of another transcript
|
|
241 @param transcript: transcript to be compared to
|
|
242 @type transcript: class L{Transcript<Transcript>}
|
|
243 @param nbNucleotides: minimum number of nucleotides to declare and overlap
|
|
244 @type nbNucleotides: int
|
|
245 """
|
|
246 if not self.overlapWith(transcript, nbNucleotides):
|
|
247 return False
|
|
248 for thisExon in self.getExons():
|
|
249 for thatExon in transcript.getExons():
|
|
250 if thisExon.overlapWith(thatExon, nbNucleotides):
|
|
251 return True
|
|
252 return False
|
|
253
|
|
254
|
|
255 def include(self, transcript):
|
|
256 """
|
|
257 Whether this transcript includes the other one
|
|
258 @param transcript: object to be compared to
|
|
259 @type transcript: class L{Transcript<Transcript>}
|
|
260 """
|
|
261 if not super(Transcript, self).include(transcript):
|
|
262 return False
|
|
263 for thatExon in transcript.getExons():
|
|
264 for thisExon in self.getExons():
|
|
265 if thisExon.include(thatExon):
|
|
266 break
|
|
267 else:
|
|
268 return False
|
|
269 return True
|
|
270
|
|
271
|
|
272 def merge(self, transcript, normalization = False):
|
|
273 """
|
|
274 Merge with another transcript
|
|
275 Merge exons if they overlap, otherwise add exons
|
|
276 @param transcript: transcript to be merged to
|
|
277 @type transcript: class L{Transcript<Transcript>}
|
|
278 @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements
|
|
279 @type normalization: boolean
|
|
280 """
|
|
281 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection():
|
|
282 raise Exception("Cannot merge '%s' with '%s'!" % (self, transcript))
|
|
283
|
|
284 theseExons = self.getExons()
|
|
285 thoseExons = transcript.getExons()
|
|
286
|
|
287 for thatExon in thoseExons:
|
|
288 toBeRemoved = []
|
|
289 for thisIndex, thisExon in enumerate(theseExons):
|
|
290 if thisExon.overlapWith(thatExon):
|
|
291 thatExon.merge(thisExon)
|
|
292 toBeRemoved.append(thisIndex)
|
|
293 theseExons.append(thatExon)
|
|
294 for thisIndex in reversed(toBeRemoved):
|
|
295 del theseExons[thisIndex]
|
|
296 self.removeExons()
|
|
297 self.setStart(min(self.getStart(), transcript.getStart()))
|
|
298 self.setEnd(max(self.getEnd(), transcript.getEnd()))
|
|
299 if len(theseExons) > 1:
|
|
300 for thisExon in theseExons:
|
|
301 self.addExon(thisExon)
|
|
302
|
|
303 self.setName("%s--%s" % (self.getUniqueName(), transcript.getUniqueName()))
|
|
304 super(Transcript, self).merge(transcript, normalization)
|
|
305
|
|
306
|
|
307 def getDifference(self, transcript, sameStrand = False):
|
|
308 """
|
|
309 Get the difference between this cluster and another one
|
|
310 @param transcript: object to be compared to
|
|
311 @type transcript: class L{Transcript<Transcript>}
|
|
312 @param sameStrand: do the comparison iff the transcripts are on the same strand
|
|
313 @type sameStrand: boolean
|
|
314 @return: a transcript
|
|
315 """
|
|
316 newTranscript = Transcript()
|
|
317 newTranscript.copy(self)
|
|
318 if self.getChromosome() != transcript.getChromosome():
|
|
319 return newTranscript
|
|
320 if not self.overlapWith(transcript):
|
|
321 return newTranscript
|
|
322 if sameStrand and self.getDirection() != transcript.getDirection():
|
|
323 return newTranscript
|
|
324 newTranscript.removeExons()
|
|
325 if transcript.getEnd() > newTranscript.getStart():
|
|
326 newTranscript.setStart(transcript.getEnd() + 1)
|
|
327 if transcript.getStart() < newTranscript.getEnd():
|
|
328 newTranscript.setEnd(transcript.getStart() + 1)
|
|
329 theseExons = []
|
|
330 for exon in self.getExons():
|
|
331 exonCopy = Interval()
|
|
332 exonCopy.copy(exon)
|
|
333 theseExons.append(exonCopy)
|
|
334 for thatExon in transcript.getExons():
|
|
335 newExons = []
|
|
336 for thisExon in theseExons:
|
|
337 newExons.extend(thisExon.getDifference(thatExon))
|
|
338 theseExons = newExons
|
|
339 if not theseExons:
|
|
340 return None
|
|
341 newStart, newEnd = theseExons[0].getStart(), theseExons[0].getEnd()
|
|
342 for thisExon in theseExons[1:]:
|
|
343 newStart = min(newStart, thisExon.getStart())
|
|
344 newEnd = max(newEnd, thisExon.getEnd())
|
|
345 newTranscript.setEnd(newEnd)
|
|
346 newTranscript.setStart(newStart)
|
|
347 newTranscript.exons = theseExons
|
|
348 return newTranscript
|
|
349
|
18
|
350
|
|
351 def getIntersection(self, transcript):
|
|
352 """
|
|
353 Get the intersection between this transcript and another one
|
|
354 @param transcript: object to be compared to
|
|
355 @type transcript: class L{Transcript<Transcript>}
|
|
356 @return: an other transcript
|
|
357 """
|
|
358 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection():
|
|
359 return None
|
|
360 newTranscript = Transcript()
|
|
361 newTranscript.setDirection(self.getDirection())
|
|
362 newTranscript.setChromosome(self.getChromosome())
|
|
363 newTranscript.setName("%s_intersect_%s" % (self.getName(), transcript.getName()))
|
|
364 newExons = []
|
|
365 for thisExon in self.getExons():
|
|
366 for thatExon in transcript.getExons():
|
|
367 newExon = thisExon.getIntersection(thatExon)
|
|
368 if newExon != None:
|
|
369 newExons.append(newExon)
|
|
370 if not newExons:
|
|
371 return None
|
|
372 newTranscript.exons = newExons
|
|
373 return newTranscript
|
|
374
|
6
|
375
|
|
376 def getSqlVariables(cls):
|
|
377 """
|
|
378 Get the properties of the object that should be saved in a database
|
|
379 """
|
|
380 variables = Interval.getSqlVariables()
|
|
381 variables.append("exons")
|
|
382 return variables
|
|
383 getSqlVariables = classmethod(getSqlVariables)
|
|
384
|
|
385
|
|
386 def setSqlValues(self, array):
|
|
387 """
|
|
388 Set the values of the properties of this object as given by a results line of a SQL query
|
|
389 @param array: the values to be copied
|
|
390 @type array: a list
|
|
391 """
|
|
392 super(Transcript, self).setSqlValues(array)
|
|
393 mergedExons = array[8]
|
|
394 if not mergedExons:
|
|
395 return
|
|
396 for exonCount, splittedExon in enumerate(mergedExons.split(",")):
|
|
397 start, end = splittedExon.split("-")
|
|
398 exon = Interval()
|
|
399 exon.setChromosome(self.getChromosome())
|
|
400 exon.setDirection(self.getDirection())
|
|
401 exon.setName("%s_exon%d" % (self.getName(), exonCount+1))
|
|
402 exon.setStart(int(start))
|
|
403 exon.setEnd(int(end))
|
|
404 self.addExon(exon)
|
|
405
|
|
406
|
|
407 def getSqlValues(self):
|
|
408 """
|
|
409 Get the values of the properties that should be saved in a database
|
|
410 """
|
|
411 values = super(Transcript, self).getSqlValues()
|
|
412 values["size"] = self.getSize()
|
|
413 if self.getNbExons() == 1:
|
|
414 values["exons"] = ""
|
|
415 else:
|
|
416 values["exons"] = ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in self.getExons()])
|
|
417 return values
|
|
418
|
|
419
|
|
420 def getSqlTypes(cls):
|
|
421 """
|
|
422 Get the types of the properties that should be saved in a database
|
|
423 """
|
|
424 types = Interval.getSqlTypes()
|
|
425 types["exons"] = "varchar"
|
|
426 return types
|
|
427 getSqlTypes = classmethod(getSqlTypes)
|
|
428
|
|
429
|
|
430 def getSqlSizes(cls):
|
|
431 """
|
|
432 Get the sizes of the properties that should be saved in a database
|
|
433 """
|
|
434 sizes = Interval.getSqlSizes()
|
|
435 sizes["exons"] = 10000
|
|
436 return sizes
|
|
437 getSqlSizes = classmethod(getSqlSizes)
|
|
438
|
|
439
|
|
440 def sortExons(self):
|
|
441 """
|
|
442 Sort the exons
|
|
443 Increasing order if transcript is on strand "+", decreasing otherwise
|
|
444 """
|
|
445 self.sortExonsIncreasing()
|
|
446 if self.getDirection() == -1:
|
|
447 exons = self.getExons()
|
|
448 exons.reverse()
|
|
449 self.exons = exons
|
|
450
|
|
451
|
|
452 def sortExonsIncreasing(self):
|
|
453 """
|
|
454 Sort the exons
|
|
455 Increasing order
|
|
456 """
|
|
457 exons = self.getExons()
|
|
458 sortedExons = []
|
|
459 while len(exons) > 0:
|
|
460 minExon = exons[0]
|
|
461 for index in range(1, len(exons)):
|
|
462 if minExon.getStart() > exons[index].getStart():
|
|
463 minExon = exons[index]
|
|
464 sortedExons.append(minExon)
|
|
465 exons.remove(minExon)
|
|
466 self.exons = sortedExons
|
|
467
|
|
468
|
|
469 def extendStart(self, size):
|
|
470 """
|
|
471 Extend the transcript by the 5' end
|
|
472 @param size: the size to be extended
|
|
473 @type size: int
|
|
474 """
|
|
475 if len(self.exons) != 0:
|
|
476 self.sortExons()
|
|
477 if self.getDirection() == 1:
|
|
478 self.exons[0].setStart(max(0, self.exons[0].getStart() - size))
|
|
479 else:
|
|
480 self.exons[0].setEnd(self.exons[0].getEnd() + size)
|
|
481 super(Transcript, self).extendStart(size)
|
|
482 self.bin = None
|
|
483
|
|
484
|
|
485 def extendEnd(self, size):
|
|
486 """
|
|
487 Extend the transcript by the 3' end
|
|
488 @param size: the size to be extended
|
|
489 @type size: int
|
|
490 """
|
|
491 if len(self.exons) != 0:
|
|
492 self.sortExons()
|
|
493 if self.getDirection() == 1:
|
|
494 self.exons[-1].setEnd(self.exons[-1].getEnd() + size)
|
|
495 else:
|
|
496 self.exons[-1].setStart(max(0, self.exons[-1].getStart() - size))
|
|
497 super(Transcript, self).extendEnd(size)
|
|
498 self.bin = None
|
|
499
|
|
500
|
|
501 def extendExons(self, size):
|
|
502 """
|
|
503 Extend all the exons
|
|
504 @param size: the size to be extended
|
|
505 @type size: int
|
|
506 """
|
|
507 if len(self.exons) != 0:
|
|
508 self.sortExons()
|
|
509 exons = []
|
|
510 previousExon = None
|
|
511 for exon in self.exons:
|
|
512 exon.extendStart(size)
|
|
513 exon.extendEnd(size)
|
|
514 exon.setDirection(self.getDirection())
|
|
515 if previousExon != None and previousExon.overlapWith(exon):
|
|
516 previousExon.merge(exon)
|
|
517 else:
|
|
518 if previousExon != None:
|
|
519 exons.append(previousExon)
|
|
520 previousExon = exon
|
|
521 exons.append(previousExon)
|
|
522 self.exons = exons
|
|
523 super(Transcript, self).extendStart(size)
|
|
524 super(Transcript, self).extendEnd(size)
|
|
525 self.bin = None
|
|
526
|
|
527
|
|
528 def restrictStart(self, size = 1):
|
|
529 """
|
|
530 Restrict the transcript by some nucleotides, start from its start position
|
|
531 Remove the exons
|
|
532 @param size: the size to be restricted to
|
|
533 @type size: int
|
|
534 """
|
|
535 newExons = []
|
|
536 if self.getDirection() == 1:
|
|
537 for exon in self.exons:
|
|
538 if exon.getStart() <= self.getStart() + size - 1:
|
|
539 if exon.getEnd() > self.getStart() + size - 1:
|
|
540 exon.setEnd(self.getStart() + size - 1)
|
|
541 newExons.append(exon)
|
|
542 else:
|
|
543 for exon in self.exons:
|
|
544 if exon.getEnd() >= self.getEnd() - size + 1:
|
|
545 if exon.getStart() < self.getEnd() - size + 1:
|
|
546 exon.setStart(self.getEnd() - size + 1)
|
|
547 newExons.append(exon)
|
|
548 super(Transcript, self).restrictStart(size)
|
|
549 self.exons = newExons
|
|
550
|
|
551
|
|
552 def restrictEnd(self, size = 1):
|
|
553 """
|
|
554 Restrict the transcript by some nucleotides, end from its end position
|
|
555 Remove the exons
|
|
556 @param size: the size to be restricted to
|
|
557 @type size: int
|
|
558 """
|
|
559 newExons = []
|
|
560 if self.getDirection() == 1:
|
|
561 for exon in self.exons:
|
|
562 if exon.getEnd() >= self.getEnd() - size + 1:
|
|
563 if exon.getStart() < self.getEnd() - size + 1:
|
|
564 exon.setStart(self.getEnd() - size + 1)
|
|
565 newExons.append(exon)
|
|
566 else:
|
|
567 for exon in self.exons:
|
|
568 if exon.getEnd() >= self.getEnd() - size + 1:
|
|
569 if exon.getStart() < self.getEnd() - size + 1:
|
|
570 exon.setEnd(self.getEnd() - size + 1)
|
|
571 newExons.append(exon)
|
|
572 super(Transcript, self).restrictEnd(size)
|
|
573 self.exons = newExons
|
|
574
|
|
575
|
|
576 def removeExons(self):
|
|
577 """
|
|
578 Remove the exons and transforms the current transcript into a mere interval
|
|
579 """
|
|
580 self.exons = []
|
|
581 self.bin = None
|
|
582
|
|
583
|
|
584 def printGtf(self, title):
|
|
585 """
|
|
586 Export this transcript using GTF2.2 format
|
|
587 @param title: the title of the transcripts
|
|
588 @type title: string
|
|
589 @return: a string
|
|
590 """
|
|
591 transcriptId = self.getUniqueName()
|
|
592 geneId = "%s_gene" % (transcriptId)
|
|
593 direction = "+"
|
|
594 if self.getDirection() == -1:
|
|
595 direction = "-"
|
|
596 self.sortExonsIncreasing()
|
|
597 string = ""
|
|
598 for i, exon in enumerate(self.getExons()):
|
|
599 exonCopy = Interval()
|
|
600 exonCopy.copy(exon)
|
|
601 if "ID" in exonCopy.getTagValues():
|
|
602 del exonCopy.tags["ID"]
|
|
603 feature = "exon"
|
|
604 if "feature" in exonCopy.getTagNames():
|
|
605 feature = exonCopy.getTagValue("feature")
|
|
606 del exonCopy.tags["feature"]
|
|
607 score = "."
|
|
608 if "score" in exonCopy.getTagNames():
|
|
609 score = "%d" % (int(exonCopy.getTagValue("score")))
|
|
610 del exonCopy.tags["score"]
|
|
611 if "Parent" in exonCopy.getTagNames():
|
|
612 del exonCopy.tags["Parent"]
|
|
613 exonCopy.setName("%s_part%d" % (self.getName(), i+1))
|
|
614 comment = exonCopy.getTagValues("; ", " ", "\"")
|
|
615 string += "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\ttranscript_id \"%s\"; gene_id \"%s\"; %s\n" % (exonCopy.getChromosome(), title, feature, exonCopy.getStart(), exonCopy.getEnd(), score, direction, transcriptId, geneId, comment)
|
|
616 return string
|
|
617
|
|
618
|
|
619 def printGff2(self, title):
|
|
620 """
|
|
621 Export this transcript using GFF2 format
|
|
622 @param title: the title of the transcripts
|
|
623 @type title: string
|
|
624 @return: a string
|
|
625 """
|
|
626 direction = "+"
|
|
627 if self.getDirection() == -1:
|
|
628 direction = "-"
|
|
629 self.sortExonsIncreasing()
|
|
630 comment = self.getTagValues()
|
|
631 if comment != None:
|
|
632 comment = ";%s" % (comment)
|
|
633 score = "."
|
|
634 if "score" in self.getTagNames():
|
|
635 score = "%d" % (int(self.getTagValue("score")))
|
|
636 feature = "transcript"
|
|
637 if "feature" in self.getTagNames():
|
|
638 feature = self.getTagValue("feature")
|
|
639 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tGENE %s%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, self.name, comment)
|
|
640 for exon in self.getExons():
|
|
641 if "score" in exon.getTagNames():
|
|
642 score = "%d" % (int(self.getTagValue("score")))
|
|
643 string += "%s\t%s\t_exon\t%d\t%d\t%s\t%s\t.\tGENE %s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.name)
|
|
644 return string
|
|
645
|
|
646
|
|
647 def printGff3(self, title):
|
|
648 """
|
|
649 Export this transcript using GFF3 format
|
|
650 @param title: the title of the transcripts
|
|
651 @type title: string
|
|
652 @return: a string
|
|
653 """
|
|
654 direction = "+"
|
|
655 if self.getDirection() == -1:
|
|
656 direction = "-"
|
|
657 self.sortExonsIncreasing()
|
|
658 if "ID" not in self.getTagValues():
|
|
659 self.setTagValue("ID", self.getUniqueName())
|
|
660 feature = "transcript"
|
|
661 tags = self.tags
|
|
662 if "feature" in self.getTagNames():
|
|
663 feature = self.getTagValue("feature")
|
|
664 del self.tags["feature"]
|
|
665 score = "."
|
|
666 if "score" in self.getTagNames():
|
|
667 score = "%d" % (int(self.getTagValue("score")))
|
|
668 del self.tags["score"]
|
|
669 comment = self.getTagValues(";", "=")
|
|
670 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, comment)
|
|
671 if len(self.exons) > 1:
|
|
672 for i, exon in enumerate(self.getExons()):
|
|
673 if "score" in exon.getTagNames():
|
|
674 score = "%d" % (int(exon.getTagValue("score")))
|
|
675 string += "%s\t%s\texon\t%d\t%d\t%s\t%s\t.\tID=%s-exon%d;Name=%s-exon%d;Parent=%s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.getTagValue("ID"), i+1, self.name, i+1, self.getTagValue("ID"))
|
|
676 self.tags = tags
|
|
677 return string
|
|
678
|
|
679
|
|
680 def printEmbl(self):
|
|
681 """
|
|
682 Export this transcript using EMBL format
|
|
683 @return: a string
|
|
684 """
|
|
685 if len(self.exons) <= 1:
|
|
686 position = "%d..%d" % (self.getStart(), self.getEnd())
|
|
687 else:
|
|
688 positions = []
|
|
689 for exon in self.getExons():
|
|
690 positions.append("%d..%d" % (self.getStart(), self.getEnd()))
|
|
691 position = ",".join(positions)
|
|
692 position = "join(%s)" % (position)
|
|
693 if self.getDirection() == -1:
|
|
694 position = "complement(%s)" % (position)
|
|
695 feature = "misc_feature"
|
|
696 if "feature" in self.getTagNames():
|
|
697 if not self.getTagValue("feature").startswith("S-MART"):
|
|
698 feature = self.getTagValue("feature")
|
|
699 string = "FT %s %s\n" % (feature, position)
|
|
700 if "Name" in self.getTagNames():
|
|
701 string += "FT /label=\"%s\"\n" % (self.getTagValue("Name"))
|
|
702 return string
|
|
703
|
|
704
|
|
705 def printBed(self):
|
|
706 """
|
|
707 Export this transcript using BED format
|
|
708 @return: a string
|
|
709 """
|
|
710 name = self.name
|
|
711 if "nbOccurrences" in self.getTagNames() and self.getTagValue("nbOccurrences") != 1 and self.getTagValue("occurrences"):
|
|
712 name = "%s-%d" % (name, self.getTagValue("occurrence"))
|
|
713 comment = self.getTagValues(";", "=")
|
|
714 sizes = []
|
|
715 starts = []
|
|
716 direction = "+"
|
|
717 if self.getDirection() == -1:
|
|
718 direction = "-"
|
|
719 self.sortExonsIncreasing()
|
|
720 for exon in self.getExons():
|
|
721 sizes.append("%d" % (exon.getSize()))
|
|
722 starts.append("%d" % (exon.getStart() - self.getStart()))
|
|
723 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome(), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts))
|
|
724
|
|
725
|
|
726 def printSam(self):
|
|
727 """
|
|
728 Export this transcript using SAM format
|
|
729 @return: a string
|
|
730 """
|
|
731 name = self.name
|
|
732 flag = 0 if self.getDirection() == 1 else 0x10
|
|
733 chromosome = self.getChromosome()
|
|
734 genomeStart = self.getStart()
|
|
735 quality = 255
|
|
736 mate = "*"
|
|
737 mateGenomeStart = 0
|
|
738 gapSize = 0
|
|
739 sequence = "*"
|
|
740 qualityString = "*"
|
|
741 tags = "NM:i:0"
|
|
742
|
|
743 lastExonEnd = None
|
|
744 self.sortExonsIncreasing()
|
|
745 exon = self.getExons()[0]
|
|
746 cigar = "%dM" % (self.getExons()[0].getSize())
|
|
747 lastExonEnd = exon.getEnd()
|
|
748 for i, exon in enumerate(self.getExons()):
|
|
749 if i == 0:
|
|
750 continue
|
|
751 cigar += "%dN" % (exon.getStart() - lastExonEnd - 1)
|
|
752 cigar += "%dM" % (exon.getSize())
|
|
753
|
|
754 return "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n" % (name, flag, chromosome, genomeStart, quality, cigar, mate, mateGenomeStart, gapSize, sequence, qualityString, tags)
|
|
755
|
|
756
|
|
757 def printUcsc(self):
|
|
758 """
|
|
759 Export this transcript using UCSC BED format
|
|
760 @return: a string
|
|
761 """
|
|
762 if self.getChromosome().find("Het") != -1:
|
|
763 return ""
|
|
764 name = self.name
|
|
765 comment = self.getTagValues(";", "")
|
|
766 sizes = []
|
|
767 starts = []
|
|
768 direction = "+"
|
|
769 if self.getDirection() == -1:
|
|
770 direction = "-"
|
|
771 self.sortExonsIncreasing()
|
|
772 for exon in self.getExons():
|
|
773 sizes.append("%d" % (exon.getSize()))
|
|
774 starts.append("%d" % (exon.getStart() - self.getStart()))
|
|
775 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome().replace("arm_", "chr"), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts))
|
|
776
|
|
777
|
|
778 def printGBrowseReference(self):
|
|
779 """
|
|
780 Export this transcript using GBrowse format (1st line only)
|
|
781 @return: a string
|
|
782 """
|
|
783 return "reference = %s\n" % (self.getChromosome())
|
|
784
|
|
785
|
|
786 def printGBrowseLine(self):
|
|
787 """
|
|
788 Export this transcript using GBrowse format (2nd line only)
|
|
789 @return: a string
|
|
790 """
|
|
791 self.sortExons()
|
|
792 coordinates = []
|
|
793 for exon in self.getExons():
|
|
794 coordinates.append(exon.printCoordinates())
|
|
795 coordinatesString = ",".join(coordinates)
|
|
796 comment = self.getTagValues(";", "=")
|
|
797 if comment:
|
|
798 comment = "\t\"%s\"" % (comment)
|
|
799 return "User_data\t%s\t%s%s\n" % (self.name, coordinatesString, comment)
|
|
800
|
|
801
|
|
802 def printGBrowse(self):
|
|
803 """
|
|
804 Export this transcript using GBrowse format
|
|
805 @return: a string
|
|
806 """
|
|
807 return "%s%s" % (self.printGBrowseReference(), self.printGBrowseLine())
|
|
808
|
|
809
|
|
810 def printCsv(self):
|
|
811 """
|
|
812 Export this transcript using CSV format
|
|
813 @return: a string
|
|
814 """
|
|
815 self.sortExons()
|
|
816 string = "%s,%d,%d,\"%s\"," % (self.getChromosome(), self.getStart(), self.getEnd(), "+" if self.getDirection() == 1 else "-")
|
|
817 if len(self.getExons()) == 1:
|
|
818 string += "None"
|
|
819 else:
|
|
820 for exon in self.getExons():
|
|
821 string += "%d-%d " % (exon.getStart(), exon.getEnd())
|
|
822 for tag in sorted(self.tags.keys()):
|
|
823 string += ",%s=%s" % (tag, str(self.tags[tag]))
|
|
824 string += "\n"
|
|
825 return string
|
|
826
|
|
827
|
|
828 def extractSequence(self, parser):
|
|
829 """
|
|
830 Get the sequence corresponding to this transcript
|
|
831 @param parser: a parser to a FASTA file
|
|
832 @type parser: class L{SequenceListParser<SequenceListParser>}
|
|
833 @return: an instance of L{Sequence<Sequence>}
|
|
834 """
|
|
835 self.sortExons()
|
|
836 name = self.name
|
|
837 if "ID" in self.getTagNames() and self.getTagValue("ID") != self.name:
|
|
838 name += ":%s" % (self.getTagValue("ID"))
|
|
839 sequence = Sequence(name)
|
|
840 for exon in self.getExons():
|
|
841 sequence.concatenate(exon.extractSequence(parser))
|
|
842 return sequence
|
|
843
|
|
844
|
|
845 def extractWigData(self, parser):
|
|
846 """
|
|
847 Get some wig data corresponding to this transcript
|
|
848 @param parser: a parser to a wig file
|
|
849 @type parser: class L{WigParser<WigParser>}
|
|
850 @return: a sequence of float
|
|
851 """
|
|
852 self.sortExons()
|
|
853 if parser.strands:
|
|
854 strands = (-1, 1)
|
|
855 values = dict([(strand, []) for strand in strands])
|
|
856 for exon in self.getExons():
|
|
857 theseValues = exon.extractWigData(parser)
|
|
858 if self.getDirection() == -1:
|
|
859 for strand in strands:
|
|
860 theseValues[strand].reverse()
|
|
861 for strand in strands:
|
|
862 values[strand].extend(theseValues[strand])
|
|
863 if self.getDirection() == -1:
|
|
864 for strand in strands:
|
|
865 values[strand].reverse()
|
|
866 return values
|
|
867 else:
|
|
868 values = []
|
|
869 for exon in self.getExons():
|
|
870 theseValues = exon.extractWigData(parser)
|
|
871 #if self.getDirection() == -1:
|
|
872 # theseValues.reverse()
|
|
873 values.extend(theseValues)
|
|
874 #if self.getDirection() == -1:
|
|
875 # values.reverse()
|
|
876 return values
|