comparison SMART/Java/Python/structure/Transcript.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children 94ab73e8a190
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 #
2 # Copyright INRA-URGI 2009-2010
3 #
4 # This software is governed by the CeCILL license under French law and
5 # abiding by the rules of distribution of free software. You can use,
6 # modify and/ or redistribute the software under the terms of the CeCILL
7 # license as circulated by CEA, CNRS and INRIA at the following URL
8 # "http://www.cecill.info".
9 #
10 # As a counterpart to the access to the source code and rights to copy,
11 # modify and redistribute granted by the license, users are provided only
12 # with a limited warranty and the software's author, the holder of the
13 # economic rights, and the successive licensors have only limited
14 # liability.
15 #
16 # In this respect, the user's attention is drawn to the risks associated
17 # with loading, using, modifying and/or developing or reproducing the
18 # software by the user in light of its specific status of free software,
19 # that may mean that it is complicated to manipulate, and that also
20 # therefore means that it is reserved for developers and experienced
21 # professionals having in-depth computer knowledge. Users are therefore
22 # encouraged to load and test the software's suitability as regards their
23 # requirements in conditions enabling the security of their systems and/or
24 # data to be ensured and, more generally, to use and operate it in the
25 # same conditions as regards security.
26 #
27 # The fact that you are presently reading this means that you have had
28 # knowledge of the CeCILL license and that you accept its terms.
29 #
30 import sys
31 from SMART.Java.Python.structure.Interval import Interval
32 from SMART.Java.Python.structure.Sequence import Sequence
33
34
35 class Transcript(Interval):
36 """
37 A class that models an transcript, considered as a specialized interval (the bounds of the transcript) that contains exons (also represented as intervals)
38 @ivar exons: a list of exons (intervals)
39 @type exons: list of L{Interval{Interval}}
40 """
41
42 def __init__(self, transcript = None, verbosity = 0):
43 """
44 Constructor
45 @param transcript: transcript to be copied
46 @type transcript: class L{Transcript<Transcript>}
47 @param verbosity: verbosity
48 @type verbosity: int
49 """
50 super(Transcript, self).__init__(None, verbosity)
51 self.exons = []
52 self.introns = None
53 if transcript != None:
54 self.copy(transcript)
55
56
57 def copy(self, transcript):
58 """
59 Copy method
60 @param transcript: transcript to be copied
61 @type transcript: class L{Transcript<Transcript>} or L{Interval<Interval>}
62 """
63 super(Transcript, self).copy(transcript)
64 if transcript.__class__.__name__ == "Transcript":
65 exons = transcript.getExons()
66 if len(exons) > 1:
67 for exon in exons:
68 exonCopy = Interval(exon)
69 self.addExon(exonCopy)
70
71
72 def setDirection(self, direction):
73 """
74 Set the direction of the interval
75 Possibly parse different formats
76 Impact all exons
77 @param direction: direction of the transcript (+ / -)
78 @type direction: int or string
79 """
80 super(Transcript, self).setDirection(direction)
81 for exon in self.exons:
82 exon.setDirection(direction)
83
84
85 def setChromosome(self, chromosome):
86 """
87 Set the chromosome
88 @param chromosome: chromosome on which the transcript is
89 @type chromosome: string
90 """
91 super(Transcript, self).setChromosome(chromosome)
92 for exon in self.exons:
93 exon.setChromosome(chromosome)
94
95
96 def addExon(self, exon):
97 """
98 Add an exon to the list of exons
99 @param exon: a new exon
100 @type exon: class L{Interval<Interval>}
101 """
102 if not self.exons and not exon.overlapWith(self):
103 firstExon = Interval()
104 firstExon.setStart(self.getStart())
105 firstExon.setEnd(self.getEnd())
106 firstExon.setDirection(self.getDirection())
107 firstExon.setChromosome(self.getChromosome())
108 self.exons.append(firstExon)
109 newExon = Interval(exon)
110 newExon.setDirection(self.getDirection())
111 self.exons.append(newExon)
112 if newExon.getStart() < self.getStart():
113 self.setStart(newExon.getStart())
114 if newExon.getEnd() > self.getEnd():
115 self.setEnd(newExon.getEnd())
116
117
118 def setStart(self, start):
119 """
120 Set the new start, move the first exon accordingly (if exists)
121 @param start: the new start
122 @type start: int
123 """
124 super(Transcript, self).setStart(start)
125 if self.exons:
126 self.sortExonsIncreasing()
127 self.exons[0].setStart(start)
128
129
130 def setEnd(self, end):
131 """
132 Set the new end, move the last exon accordingly (if exists)
133 @param end: the new end
134 @type end: int
135 """
136 super(Transcript, self).setEnd(end)
137 if self.exons:
138 self.sortExonsIncreasing()
139 self.exons[-1].setEnd(end)
140
141
142 def reverse(self):
143 """
144 Reverse the strand of the transcript
145 """
146 super(Transcript, self).reverse()
147 for exon in self.exons:
148 exon.reverse()
149
150
151 def getUniqueName(self):
152 """
153 Try to give a unique name by possibly adding occurrence
154 """
155 if "nbOccurrences" in self.tags and "occurrence" in self.tags and self.tags["nbOccurrences"] != 1:
156 return "%s-%d" % (self.name, self.tags["occurrence"])
157 return self.name
158
159
160 def getNbExons(self):
161 """
162 Get the number of exons
163 """
164 return max(1, len(self.exons))
165
166
167 def getExon(self, i):
168 """
169 Get a specific exon
170 @param i: the rank of the exon
171 @type i: int
172 """
173 if len(self.exons) == 0:
174 if i != 0:
175 raise Exception("Cannot get exon #%i while there is no exon in the transcript" % (i))
176 return self
177 return self.exons[i]
178
179
180 def getExons(self):
181 """
182 Get all the exons
183 """
184 if len(self.exons) == 0:
185 return [Interval(self)]
186 return self.exons
187
188
189 def getIntrons(self):
190 """
191 Get all the introns
192 Compute introns on the fly
193 """
194 if self.introns != None:
195 return self.introns
196 self.sortExons()
197 self.introns = []
198 exonStart = self.getExon(0)
199 for cpt, exonEnd in enumerate(self.exons[1:]):
200 intron = Interval()
201 intron.setName("%s_intron%d" % (self.getName(), cpt+1))
202 intron.setChromosome(self.getChromosome())
203 intron.setDirection(self.getDirection())
204 if self.getDirection() == 1:
205 intron.setEnd(exonEnd.getStart() - 1)
206 intron.setStart(exonStart.getEnd() + 1)
207 else:
208 intron.setStart(exonEnd.getEnd() + 1)
209 intron.setEnd(exonStart.getStart() - 1)
210 intron.setDirection(self.getDirection())
211 if intron.getSize() > 0:
212 self.introns.append(intron)
213 exonStart = exonEnd
214 intron.setSize(intron.getEnd() - intron.getStart() + 1)
215 return self.introns
216
217
218 def getSize(self):
219 """
220 Get the size of the transcript (i.e. the number of nucleotides)
221 Compute size on the fly
222 """
223 if len(self.exons) == 0:
224 return self.getSizeWithIntrons()
225 size = 0
226 for exon in self.exons:
227 size += exon.getSize()
228 return size
229
230
231 def getSizeWithIntrons(self):
232 """
233 Get the size of the interval (i.e. distance from start to end)
234 """
235 return super(Transcript, self).getSize()
236
237
238 def overlapWithExon(self, transcript, nbNucleotides = 1):
239 """
240 Check if the exons of this transcript overlap with the exons of another transcript
241 @param transcript: transcript to be compared to
242 @type transcript: class L{Transcript<Transcript>}
243 @param nbNucleotides: minimum number of nucleotides to declare and overlap
244 @type nbNucleotides: int
245 """
246 if not self.overlapWith(transcript, nbNucleotides):
247 return False
248 for thisExon in self.getExons():
249 for thatExon in transcript.getExons():
250 if thisExon.overlapWith(thatExon, nbNucleotides):
251 return True
252 return False
253
254
255 def include(self, transcript):
256 """
257 Whether this transcript includes the other one
258 @param transcript: object to be compared to
259 @type transcript: class L{Transcript<Transcript>}
260 """
261 if not super(Transcript, self).include(transcript):
262 return False
263 for thatExon in transcript.getExons():
264 for thisExon in self.getExons():
265 if thisExon.include(thatExon):
266 break
267 else:
268 return False
269 return True
270
271
272 def merge(self, transcript, normalization = False):
273 """
274 Merge with another transcript
275 Merge exons if they overlap, otherwise add exons
276 @param transcript: transcript to be merged to
277 @type transcript: class L{Transcript<Transcript>}
278 @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements
279 @type normalization: boolean
280 """
281 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection():
282 raise Exception("Cannot merge '%s' with '%s'!" % (self, transcript))
283
284 theseExons = self.getExons()
285 thoseExons = transcript.getExons()
286
287 for thatExon in thoseExons:
288 toBeRemoved = []
289 for thisIndex, thisExon in enumerate(theseExons):
290 if thisExon.overlapWith(thatExon):
291 thatExon.merge(thisExon)
292 toBeRemoved.append(thisIndex)
293 theseExons.append(thatExon)
294 for thisIndex in reversed(toBeRemoved):
295 del theseExons[thisIndex]
296 self.removeExons()
297 self.setStart(min(self.getStart(), transcript.getStart()))
298 self.setEnd(max(self.getEnd(), transcript.getEnd()))
299 if len(theseExons) > 1:
300 for thisExon in theseExons:
301 self.addExon(thisExon)
302
303 self.setName("%s--%s" % (self.getUniqueName(), transcript.getUniqueName()))
304 super(Transcript, self).merge(transcript, normalization)
305
306
307 def getDifference(self, transcript, sameStrand = False):
308 """
309 Get the difference between this cluster and another one
310 @param transcript: object to be compared to
311 @type transcript: class L{Transcript<Transcript>}
312 @param sameStrand: do the comparison iff the transcripts are on the same strand
313 @type sameStrand: boolean
314 @return: a transcript
315 """
316 newTranscript = Transcript()
317 newTranscript.copy(self)
318 if self.getChromosome() != transcript.getChromosome():
319 return newTranscript
320 if not self.overlapWith(transcript):
321 return newTranscript
322 if sameStrand and self.getDirection() != transcript.getDirection():
323 return newTranscript
324 newTranscript.removeExons()
325 if transcript.getEnd() > newTranscript.getStart():
326 newTranscript.setStart(transcript.getEnd() + 1)
327 if transcript.getStart() < newTranscript.getEnd():
328 newTranscript.setEnd(transcript.getStart() + 1)
329 theseExons = []
330 for exon in self.getExons():
331 exonCopy = Interval()
332 exonCopy.copy(exon)
333 theseExons.append(exonCopy)
334 for thatExon in transcript.getExons():
335 newExons = []
336 for thisExon in theseExons:
337 newExons.extend(thisExon.getDifference(thatExon))
338 theseExons = newExons
339 if not theseExons:
340 return None
341 newStart, newEnd = theseExons[0].getStart(), theseExons[0].getEnd()
342 for thisExon in theseExons[1:]:
343 newStart = min(newStart, thisExon.getStart())
344 newEnd = max(newEnd, thisExon.getEnd())
345 newTranscript.setEnd(newEnd)
346 newTranscript.setStart(newStart)
347 newTranscript.exons = theseExons
348 return newTranscript
349
350
351 def getSqlVariables(cls):
352 """
353 Get the properties of the object that should be saved in a database
354 """
355 variables = Interval.getSqlVariables()
356 variables.append("exons")
357 return variables
358 getSqlVariables = classmethod(getSqlVariables)
359
360
361 def setSqlValues(self, array):
362 """
363 Set the values of the properties of this object as given by a results line of a SQL query
364 @param array: the values to be copied
365 @type array: a list
366 """
367 super(Transcript, self).setSqlValues(array)
368 mergedExons = array[8]
369 if not mergedExons:
370 return
371 for exonCount, splittedExon in enumerate(mergedExons.split(",")):
372 start, end = splittedExon.split("-")
373 exon = Interval()
374 exon.setChromosome(self.getChromosome())
375 exon.setDirection(self.getDirection())
376 exon.setName("%s_exon%d" % (self.getName(), exonCount+1))
377 exon.setStart(int(start))
378 exon.setEnd(int(end))
379 self.addExon(exon)
380
381
382 def getSqlValues(self):
383 """
384 Get the values of the properties that should be saved in a database
385 """
386 values = super(Transcript, self).getSqlValues()
387 values["size"] = self.getSize()
388 if self.getNbExons() == 1:
389 values["exons"] = ""
390 else:
391 values["exons"] = ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in self.getExons()])
392 return values
393
394
395 def getSqlTypes(cls):
396 """
397 Get the types of the properties that should be saved in a database
398 """
399 types = Interval.getSqlTypes()
400 types["exons"] = "varchar"
401 return types
402 getSqlTypes = classmethod(getSqlTypes)
403
404
405 def getSqlSizes(cls):
406 """
407 Get the sizes of the properties that should be saved in a database
408 """
409 sizes = Interval.getSqlSizes()
410 sizes["exons"] = 10000
411 return sizes
412 getSqlSizes = classmethod(getSqlSizes)
413
414
415 def sortExons(self):
416 """
417 Sort the exons
418 Increasing order if transcript is on strand "+", decreasing otherwise
419 """
420 self.sortExonsIncreasing()
421 if self.getDirection() == -1:
422 exons = self.getExons()
423 exons.reverse()
424 self.exons = exons
425
426
427 def sortExonsIncreasing(self):
428 """
429 Sort the exons
430 Increasing order
431 """
432 exons = self.getExons()
433 sortedExons = []
434 while len(exons) > 0:
435 minExon = exons[0]
436 for index in range(1, len(exons)):
437 if minExon.getStart() > exons[index].getStart():
438 minExon = exons[index]
439 sortedExons.append(minExon)
440 exons.remove(minExon)
441 self.exons = sortedExons
442
443
444 def extendStart(self, size):
445 """
446 Extend the transcript by the 5' end
447 @param size: the size to be extended
448 @type size: int
449 """
450 if len(self.exons) != 0:
451 self.sortExons()
452 if self.getDirection() == 1:
453 self.exons[0].setStart(max(0, self.exons[0].getStart() - size))
454 else:
455 self.exons[0].setEnd(self.exons[0].getEnd() + size)
456 super(Transcript, self).extendStart(size)
457 self.bin = None
458
459
460 def extendEnd(self, size):
461 """
462 Extend the transcript by the 3' end
463 @param size: the size to be extended
464 @type size: int
465 """
466 if len(self.exons) != 0:
467 self.sortExons()
468 if self.getDirection() == 1:
469 self.exons[-1].setEnd(self.exons[-1].getEnd() + size)
470 else:
471 self.exons[-1].setStart(max(0, self.exons[-1].getStart() - size))
472 super(Transcript, self).extendEnd(size)
473 self.bin = None
474
475
476 def extendExons(self, size):
477 """
478 Extend all the exons
479 @param size: the size to be extended
480 @type size: int
481 """
482 if len(self.exons) != 0:
483 self.sortExons()
484 exons = []
485 previousExon = None
486 for exon in self.exons:
487 exon.extendStart(size)
488 exon.extendEnd(size)
489 exon.setDirection(self.getDirection())
490 if previousExon != None and previousExon.overlapWith(exon):
491 previousExon.merge(exon)
492 else:
493 if previousExon != None:
494 exons.append(previousExon)
495 previousExon = exon
496 exons.append(previousExon)
497 self.exons = exons
498 super(Transcript, self).extendStart(size)
499 super(Transcript, self).extendEnd(size)
500 self.bin = None
501
502
503 def restrictStart(self, size = 1):
504 """
505 Restrict the transcript by some nucleotides, start from its start position
506 Remove the exons
507 @param size: the size to be restricted to
508 @type size: int
509 """
510 newExons = []
511 if self.getDirection() == 1:
512 for exon in self.exons:
513 if exon.getStart() <= self.getStart() + size - 1:
514 if exon.getEnd() > self.getStart() + size - 1:
515 exon.setEnd(self.getStart() + size - 1)
516 newExons.append(exon)
517 else:
518 for exon in self.exons:
519 if exon.getEnd() >= self.getEnd() - size + 1:
520 if exon.getStart() < self.getEnd() - size + 1:
521 exon.setStart(self.getEnd() - size + 1)
522 newExons.append(exon)
523 super(Transcript, self).restrictStart(size)
524 self.exons = newExons
525
526
527 def restrictEnd(self, size = 1):
528 """
529 Restrict the transcript by some nucleotides, end from its end position
530 Remove the exons
531 @param size: the size to be restricted to
532 @type size: int
533 """
534 newExons = []
535 if self.getDirection() == 1:
536 for exon in self.exons:
537 if exon.getEnd() >= self.getEnd() - size + 1:
538 if exon.getStart() < self.getEnd() - size + 1:
539 exon.setStart(self.getEnd() - size + 1)
540 newExons.append(exon)
541 else:
542 for exon in self.exons:
543 if exon.getEnd() >= self.getEnd() - size + 1:
544 if exon.getStart() < self.getEnd() - size + 1:
545 exon.setEnd(self.getEnd() - size + 1)
546 newExons.append(exon)
547 super(Transcript, self).restrictEnd(size)
548 self.exons = newExons
549
550
551 def removeExons(self):
552 """
553 Remove the exons and transforms the current transcript into a mere interval
554 """
555 self.exons = []
556 self.bin = None
557
558
559 def printGtf(self, title):
560 """
561 Export this transcript using GTF2.2 format
562 @param title: the title of the transcripts
563 @type title: string
564 @return: a string
565 """
566 transcriptId = self.getUniqueName()
567 geneId = "%s_gene" % (transcriptId)
568 direction = "+"
569 if self.getDirection() == -1:
570 direction = "-"
571 self.sortExonsIncreasing()
572 string = ""
573 for i, exon in enumerate(self.getExons()):
574 exonCopy = Interval()
575 exonCopy.copy(exon)
576 if "ID" in exonCopy.getTagValues():
577 del exonCopy.tags["ID"]
578 feature = "exon"
579 if "feature" in exonCopy.getTagNames():
580 feature = exonCopy.getTagValue("feature")
581 del exonCopy.tags["feature"]
582 score = "."
583 if "score" in exonCopy.getTagNames():
584 score = "%d" % (int(exonCopy.getTagValue("score")))
585 del exonCopy.tags["score"]
586 if "Parent" in exonCopy.getTagNames():
587 del exonCopy.tags["Parent"]
588 exonCopy.setName("%s_part%d" % (self.getName(), i+1))
589 comment = exonCopy.getTagValues("; ", " ", "\"")
590 string += "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\ttranscript_id \"%s\"; gene_id \"%s\"; %s\n" % (exonCopy.getChromosome(), title, feature, exonCopy.getStart(), exonCopy.getEnd(), score, direction, transcriptId, geneId, comment)
591 return string
592
593
594 def printGff2(self, title):
595 """
596 Export this transcript using GFF2 format
597 @param title: the title of the transcripts
598 @type title: string
599 @return: a string
600 """
601 direction = "+"
602 if self.getDirection() == -1:
603 direction = "-"
604 self.sortExonsIncreasing()
605 comment = self.getTagValues()
606 if comment != None:
607 comment = ";%s" % (comment)
608 score = "."
609 if "score" in self.getTagNames():
610 score = "%d" % (int(self.getTagValue("score")))
611 feature = "transcript"
612 if "feature" in self.getTagNames():
613 feature = self.getTagValue("feature")
614 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tGENE %s%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, self.name, comment)
615 for exon in self.getExons():
616 if "score" in exon.getTagNames():
617 score = "%d" % (int(self.getTagValue("score")))
618 string += "%s\t%s\t_exon\t%d\t%d\t%s\t%s\t.\tGENE %s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.name)
619 return string
620
621
622 def printGff3(self, title):
623 """
624 Export this transcript using GFF3 format
625 @param title: the title of the transcripts
626 @type title: string
627 @return: a string
628 """
629 direction = "+"
630 if self.getDirection() == -1:
631 direction = "-"
632 self.sortExonsIncreasing()
633 if "ID" not in self.getTagValues():
634 self.setTagValue("ID", self.getUniqueName())
635 feature = "transcript"
636 tags = self.tags
637 if "feature" in self.getTagNames():
638 feature = self.getTagValue("feature")
639 del self.tags["feature"]
640 score = "."
641 if "score" in self.getTagNames():
642 score = "%d" % (int(self.getTagValue("score")))
643 del self.tags["score"]
644 comment = self.getTagValues(";", "=")
645 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, comment)
646 if len(self.exons) > 1:
647 for i, exon in enumerate(self.getExons()):
648 if "score" in exon.getTagNames():
649 score = "%d" % (int(exon.getTagValue("score")))
650 string += "%s\t%s\texon\t%d\t%d\t%s\t%s\t.\tID=%s-exon%d;Name=%s-exon%d;Parent=%s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.getTagValue("ID"), i+1, self.name, i+1, self.getTagValue("ID"))
651 self.tags = tags
652 return string
653
654
655 def printEmbl(self):
656 """
657 Export this transcript using EMBL format
658 @return: a string
659 """
660 if len(self.exons) <= 1:
661 position = "%d..%d" % (self.getStart(), self.getEnd())
662 else:
663 positions = []
664 for exon in self.getExons():
665 positions.append("%d..%d" % (self.getStart(), self.getEnd()))
666 position = ",".join(positions)
667 position = "join(%s)" % (position)
668 if self.getDirection() == -1:
669 position = "complement(%s)" % (position)
670 feature = "misc_feature"
671 if "feature" in self.getTagNames():
672 if not self.getTagValue("feature").startswith("S-MART"):
673 feature = self.getTagValue("feature")
674 string = "FT %s %s\n" % (feature, position)
675 if "Name" in self.getTagNames():
676 string += "FT /label=\"%s\"\n" % (self.getTagValue("Name"))
677 return string
678
679
680 def printBed(self):
681 """
682 Export this transcript using BED format
683 @return: a string
684 """
685 name = self.name
686 if "nbOccurrences" in self.getTagNames() and self.getTagValue("nbOccurrences") != 1 and self.getTagValue("occurrences"):
687 name = "%s-%d" % (name, self.getTagValue("occurrence"))
688 comment = self.getTagValues(";", "=")
689 sizes = []
690 starts = []
691 direction = "+"
692 if self.getDirection() == -1:
693 direction = "-"
694 self.sortExonsIncreasing()
695 for exon in self.getExons():
696 sizes.append("%d" % (exon.getSize()))
697 starts.append("%d" % (exon.getStart() - self.getStart()))
698 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome(), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts))
699
700
701 def printSam(self):
702 """
703 Export this transcript using SAM format
704 @return: a string
705 """
706 name = self.name
707 flag = 0 if self.getDirection() == 1 else 0x10
708 chromosome = self.getChromosome()
709 genomeStart = self.getStart()
710 quality = 255
711 mate = "*"
712 mateGenomeStart = 0
713 gapSize = 0
714 sequence = "*"
715 qualityString = "*"
716 tags = "NM:i:0"
717
718 lastExonEnd = None
719 self.sortExonsIncreasing()
720 exon = self.getExons()[0]
721 cigar = "%dM" % (self.getExons()[0].getSize())
722 lastExonEnd = exon.getEnd()
723 for i, exon in enumerate(self.getExons()):
724 if i == 0:
725 continue
726 cigar += "%dN" % (exon.getStart() - lastExonEnd - 1)
727 cigar += "%dM" % (exon.getSize())
728
729 return "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n" % (name, flag, chromosome, genomeStart, quality, cigar, mate, mateGenomeStart, gapSize, sequence, qualityString, tags)
730
731
732 def printUcsc(self):
733 """
734 Export this transcript using UCSC BED format
735 @return: a string
736 """
737 if self.getChromosome().find("Het") != -1:
738 return ""
739 name = self.name
740 comment = self.getTagValues(";", "")
741 sizes = []
742 starts = []
743 direction = "+"
744 if self.getDirection() == -1:
745 direction = "-"
746 self.sortExonsIncreasing()
747 for exon in self.getExons():
748 sizes.append("%d" % (exon.getSize()))
749 starts.append("%d" % (exon.getStart() - self.getStart()))
750 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome().replace("arm_", "chr"), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts))
751
752
753 def printGBrowseReference(self):
754 """
755 Export this transcript using GBrowse format (1st line only)
756 @return: a string
757 """
758 return "reference = %s\n" % (self.getChromosome())
759
760
761 def printGBrowseLine(self):
762 """
763 Export this transcript using GBrowse format (2nd line only)
764 @return: a string
765 """
766 self.sortExons()
767 coordinates = []
768 for exon in self.getExons():
769 coordinates.append(exon.printCoordinates())
770 coordinatesString = ",".join(coordinates)
771 comment = self.getTagValues(";", "=")
772 if comment:
773 comment = "\t\"%s\"" % (comment)
774 return "User_data\t%s\t%s%s\n" % (self.name, coordinatesString, comment)
775
776
777 def printGBrowse(self):
778 """
779 Export this transcript using GBrowse format
780 @return: a string
781 """
782 return "%s%s" % (self.printGBrowseReference(), self.printGBrowseLine())
783
784
785 def printCsv(self):
786 """
787 Export this transcript using CSV format
788 @return: a string
789 """
790 self.sortExons()
791 string = "%s,%d,%d,\"%s\"," % (self.getChromosome(), self.getStart(), self.getEnd(), "+" if self.getDirection() == 1 else "-")
792 if len(self.getExons()) == 1:
793 string += "None"
794 else:
795 for exon in self.getExons():
796 string += "%d-%d " % (exon.getStart(), exon.getEnd())
797 for tag in sorted(self.tags.keys()):
798 string += ",%s=%s" % (tag, str(self.tags[tag]))
799 string += "\n"
800 return string
801
802
803 def extractSequence(self, parser):
804 """
805 Get the sequence corresponding to this transcript
806 @param parser: a parser to a FASTA file
807 @type parser: class L{SequenceListParser<SequenceListParser>}
808 @return: an instance of L{Sequence<Sequence>}
809 """
810 self.sortExons()
811 name = self.name
812 if "ID" in self.getTagNames() and self.getTagValue("ID") != self.name:
813 name += ":%s" % (self.getTagValue("ID"))
814 sequence = Sequence(name)
815 for exon in self.getExons():
816 sequence.concatenate(exon.extractSequence(parser))
817 return sequence
818
819
820 def extractWigData(self, parser):
821 """
822 Get some wig data corresponding to this transcript
823 @param parser: a parser to a wig file
824 @type parser: class L{WigParser<WigParser>}
825 @return: a sequence of float
826 """
827 self.sortExons()
828 if parser.strands:
829 strands = (-1, 1)
830 values = dict([(strand, []) for strand in strands])
831 for exon in self.getExons():
832 theseValues = exon.extractWigData(parser)
833 if self.getDirection() == -1:
834 for strand in strands:
835 theseValues[strand].reverse()
836 for strand in strands:
837 values[strand].extend(theseValues[strand])
838 if self.getDirection() == -1:
839 for strand in strands:
840 values[strand].reverse()
841 return values
842 else:
843 values = []
844 for exon in self.getExons():
845 theseValues = exon.extractWigData(parser)
846 #if self.getDirection() == -1:
847 # theseValues.reverse()
848 values.extend(theseValues)
849 #if self.getDirection() == -1:
850 # values.reverse()
851 return values