Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/structure/Transcript.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children | 94ab73e8a190 |
comparison
equal
deleted
inserted
replaced
5:ea3082881bf8 | 6:769e306b7933 |
---|---|
1 # | |
2 # Copyright INRA-URGI 2009-2010 | |
3 # | |
4 # This software is governed by the CeCILL license under French law and | |
5 # abiding by the rules of distribution of free software. You can use, | |
6 # modify and/ or redistribute the software under the terms of the CeCILL | |
7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
8 # "http://www.cecill.info". | |
9 # | |
10 # As a counterpart to the access to the source code and rights to copy, | |
11 # modify and redistribute granted by the license, users are provided only | |
12 # with a limited warranty and the software's author, the holder of the | |
13 # economic rights, and the successive licensors have only limited | |
14 # liability. | |
15 # | |
16 # In this respect, the user's attention is drawn to the risks associated | |
17 # with loading, using, modifying and/or developing or reproducing the | |
18 # software by the user in light of its specific status of free software, | |
19 # that may mean that it is complicated to manipulate, and that also | |
20 # therefore means that it is reserved for developers and experienced | |
21 # professionals having in-depth computer knowledge. Users are therefore | |
22 # encouraged to load and test the software's suitability as regards their | |
23 # requirements in conditions enabling the security of their systems and/or | |
24 # data to be ensured and, more generally, to use and operate it in the | |
25 # same conditions as regards security. | |
26 # | |
27 # The fact that you are presently reading this means that you have had | |
28 # knowledge of the CeCILL license and that you accept its terms. | |
29 # | |
30 import sys | |
31 from SMART.Java.Python.structure.Interval import Interval | |
32 from SMART.Java.Python.structure.Sequence import Sequence | |
33 | |
34 | |
35 class Transcript(Interval): | |
36 """ | |
37 A class that models an transcript, considered as a specialized interval (the bounds of the transcript) that contains exons (also represented as intervals) | |
38 @ivar exons: a list of exons (intervals) | |
39 @type exons: list of L{Interval{Interval}} | |
40 """ | |
41 | |
42 def __init__(self, transcript = None, verbosity = 0): | |
43 """ | |
44 Constructor | |
45 @param transcript: transcript to be copied | |
46 @type transcript: class L{Transcript<Transcript>} | |
47 @param verbosity: verbosity | |
48 @type verbosity: int | |
49 """ | |
50 super(Transcript, self).__init__(None, verbosity) | |
51 self.exons = [] | |
52 self.introns = None | |
53 if transcript != None: | |
54 self.copy(transcript) | |
55 | |
56 | |
57 def copy(self, transcript): | |
58 """ | |
59 Copy method | |
60 @param transcript: transcript to be copied | |
61 @type transcript: class L{Transcript<Transcript>} or L{Interval<Interval>} | |
62 """ | |
63 super(Transcript, self).copy(transcript) | |
64 if transcript.__class__.__name__ == "Transcript": | |
65 exons = transcript.getExons() | |
66 if len(exons) > 1: | |
67 for exon in exons: | |
68 exonCopy = Interval(exon) | |
69 self.addExon(exonCopy) | |
70 | |
71 | |
72 def setDirection(self, direction): | |
73 """ | |
74 Set the direction of the interval | |
75 Possibly parse different formats | |
76 Impact all exons | |
77 @param direction: direction of the transcript (+ / -) | |
78 @type direction: int or string | |
79 """ | |
80 super(Transcript, self).setDirection(direction) | |
81 for exon in self.exons: | |
82 exon.setDirection(direction) | |
83 | |
84 | |
85 def setChromosome(self, chromosome): | |
86 """ | |
87 Set the chromosome | |
88 @param chromosome: chromosome on which the transcript is | |
89 @type chromosome: string | |
90 """ | |
91 super(Transcript, self).setChromosome(chromosome) | |
92 for exon in self.exons: | |
93 exon.setChromosome(chromosome) | |
94 | |
95 | |
96 def addExon(self, exon): | |
97 """ | |
98 Add an exon to the list of exons | |
99 @param exon: a new exon | |
100 @type exon: class L{Interval<Interval>} | |
101 """ | |
102 if not self.exons and not exon.overlapWith(self): | |
103 firstExon = Interval() | |
104 firstExon.setStart(self.getStart()) | |
105 firstExon.setEnd(self.getEnd()) | |
106 firstExon.setDirection(self.getDirection()) | |
107 firstExon.setChromosome(self.getChromosome()) | |
108 self.exons.append(firstExon) | |
109 newExon = Interval(exon) | |
110 newExon.setDirection(self.getDirection()) | |
111 self.exons.append(newExon) | |
112 if newExon.getStart() < self.getStart(): | |
113 self.setStart(newExon.getStart()) | |
114 if newExon.getEnd() > self.getEnd(): | |
115 self.setEnd(newExon.getEnd()) | |
116 | |
117 | |
118 def setStart(self, start): | |
119 """ | |
120 Set the new start, move the first exon accordingly (if exists) | |
121 @param start: the new start | |
122 @type start: int | |
123 """ | |
124 super(Transcript, self).setStart(start) | |
125 if self.exons: | |
126 self.sortExonsIncreasing() | |
127 self.exons[0].setStart(start) | |
128 | |
129 | |
130 def setEnd(self, end): | |
131 """ | |
132 Set the new end, move the last exon accordingly (if exists) | |
133 @param end: the new end | |
134 @type end: int | |
135 """ | |
136 super(Transcript, self).setEnd(end) | |
137 if self.exons: | |
138 self.sortExonsIncreasing() | |
139 self.exons[-1].setEnd(end) | |
140 | |
141 | |
142 def reverse(self): | |
143 """ | |
144 Reverse the strand of the transcript | |
145 """ | |
146 super(Transcript, self).reverse() | |
147 for exon in self.exons: | |
148 exon.reverse() | |
149 | |
150 | |
151 def getUniqueName(self): | |
152 """ | |
153 Try to give a unique name by possibly adding occurrence | |
154 """ | |
155 if "nbOccurrences" in self.tags and "occurrence" in self.tags and self.tags["nbOccurrences"] != 1: | |
156 return "%s-%d" % (self.name, self.tags["occurrence"]) | |
157 return self.name | |
158 | |
159 | |
160 def getNbExons(self): | |
161 """ | |
162 Get the number of exons | |
163 """ | |
164 return max(1, len(self.exons)) | |
165 | |
166 | |
167 def getExon(self, i): | |
168 """ | |
169 Get a specific exon | |
170 @param i: the rank of the exon | |
171 @type i: int | |
172 """ | |
173 if len(self.exons) == 0: | |
174 if i != 0: | |
175 raise Exception("Cannot get exon #%i while there is no exon in the transcript" % (i)) | |
176 return self | |
177 return self.exons[i] | |
178 | |
179 | |
180 def getExons(self): | |
181 """ | |
182 Get all the exons | |
183 """ | |
184 if len(self.exons) == 0: | |
185 return [Interval(self)] | |
186 return self.exons | |
187 | |
188 | |
189 def getIntrons(self): | |
190 """ | |
191 Get all the introns | |
192 Compute introns on the fly | |
193 """ | |
194 if self.introns != None: | |
195 return self.introns | |
196 self.sortExons() | |
197 self.introns = [] | |
198 exonStart = self.getExon(0) | |
199 for cpt, exonEnd in enumerate(self.exons[1:]): | |
200 intron = Interval() | |
201 intron.setName("%s_intron%d" % (self.getName(), cpt+1)) | |
202 intron.setChromosome(self.getChromosome()) | |
203 intron.setDirection(self.getDirection()) | |
204 if self.getDirection() == 1: | |
205 intron.setEnd(exonEnd.getStart() - 1) | |
206 intron.setStart(exonStart.getEnd() + 1) | |
207 else: | |
208 intron.setStart(exonEnd.getEnd() + 1) | |
209 intron.setEnd(exonStart.getStart() - 1) | |
210 intron.setDirection(self.getDirection()) | |
211 if intron.getSize() > 0: | |
212 self.introns.append(intron) | |
213 exonStart = exonEnd | |
214 intron.setSize(intron.getEnd() - intron.getStart() + 1) | |
215 return self.introns | |
216 | |
217 | |
218 def getSize(self): | |
219 """ | |
220 Get the size of the transcript (i.e. the number of nucleotides) | |
221 Compute size on the fly | |
222 """ | |
223 if len(self.exons) == 0: | |
224 return self.getSizeWithIntrons() | |
225 size = 0 | |
226 for exon in self.exons: | |
227 size += exon.getSize() | |
228 return size | |
229 | |
230 | |
231 def getSizeWithIntrons(self): | |
232 """ | |
233 Get the size of the interval (i.e. distance from start to end) | |
234 """ | |
235 return super(Transcript, self).getSize() | |
236 | |
237 | |
238 def overlapWithExon(self, transcript, nbNucleotides = 1): | |
239 """ | |
240 Check if the exons of this transcript overlap with the exons of another transcript | |
241 @param transcript: transcript to be compared to | |
242 @type transcript: class L{Transcript<Transcript>} | |
243 @param nbNucleotides: minimum number of nucleotides to declare and overlap | |
244 @type nbNucleotides: int | |
245 """ | |
246 if not self.overlapWith(transcript, nbNucleotides): | |
247 return False | |
248 for thisExon in self.getExons(): | |
249 for thatExon in transcript.getExons(): | |
250 if thisExon.overlapWith(thatExon, nbNucleotides): | |
251 return True | |
252 return False | |
253 | |
254 | |
255 def include(self, transcript): | |
256 """ | |
257 Whether this transcript includes the other one | |
258 @param transcript: object to be compared to | |
259 @type transcript: class L{Transcript<Transcript>} | |
260 """ | |
261 if not super(Transcript, self).include(transcript): | |
262 return False | |
263 for thatExon in transcript.getExons(): | |
264 for thisExon in self.getExons(): | |
265 if thisExon.include(thatExon): | |
266 break | |
267 else: | |
268 return False | |
269 return True | |
270 | |
271 | |
272 def merge(self, transcript, normalization = False): | |
273 """ | |
274 Merge with another transcript | |
275 Merge exons if they overlap, otherwise add exons | |
276 @param transcript: transcript to be merged to | |
277 @type transcript: class L{Transcript<Transcript>} | |
278 @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements | |
279 @type normalization: boolean | |
280 """ | |
281 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection(): | |
282 raise Exception("Cannot merge '%s' with '%s'!" % (self, transcript)) | |
283 | |
284 theseExons = self.getExons() | |
285 thoseExons = transcript.getExons() | |
286 | |
287 for thatExon in thoseExons: | |
288 toBeRemoved = [] | |
289 for thisIndex, thisExon in enumerate(theseExons): | |
290 if thisExon.overlapWith(thatExon): | |
291 thatExon.merge(thisExon) | |
292 toBeRemoved.append(thisIndex) | |
293 theseExons.append(thatExon) | |
294 for thisIndex in reversed(toBeRemoved): | |
295 del theseExons[thisIndex] | |
296 self.removeExons() | |
297 self.setStart(min(self.getStart(), transcript.getStart())) | |
298 self.setEnd(max(self.getEnd(), transcript.getEnd())) | |
299 if len(theseExons) > 1: | |
300 for thisExon in theseExons: | |
301 self.addExon(thisExon) | |
302 | |
303 self.setName("%s--%s" % (self.getUniqueName(), transcript.getUniqueName())) | |
304 super(Transcript, self).merge(transcript, normalization) | |
305 | |
306 | |
307 def getDifference(self, transcript, sameStrand = False): | |
308 """ | |
309 Get the difference between this cluster and another one | |
310 @param transcript: object to be compared to | |
311 @type transcript: class L{Transcript<Transcript>} | |
312 @param sameStrand: do the comparison iff the transcripts are on the same strand | |
313 @type sameStrand: boolean | |
314 @return: a transcript | |
315 """ | |
316 newTranscript = Transcript() | |
317 newTranscript.copy(self) | |
318 if self.getChromosome() != transcript.getChromosome(): | |
319 return newTranscript | |
320 if not self.overlapWith(transcript): | |
321 return newTranscript | |
322 if sameStrand and self.getDirection() != transcript.getDirection(): | |
323 return newTranscript | |
324 newTranscript.removeExons() | |
325 if transcript.getEnd() > newTranscript.getStart(): | |
326 newTranscript.setStart(transcript.getEnd() + 1) | |
327 if transcript.getStart() < newTranscript.getEnd(): | |
328 newTranscript.setEnd(transcript.getStart() + 1) | |
329 theseExons = [] | |
330 for exon in self.getExons(): | |
331 exonCopy = Interval() | |
332 exonCopy.copy(exon) | |
333 theseExons.append(exonCopy) | |
334 for thatExon in transcript.getExons(): | |
335 newExons = [] | |
336 for thisExon in theseExons: | |
337 newExons.extend(thisExon.getDifference(thatExon)) | |
338 theseExons = newExons | |
339 if not theseExons: | |
340 return None | |
341 newStart, newEnd = theseExons[0].getStart(), theseExons[0].getEnd() | |
342 for thisExon in theseExons[1:]: | |
343 newStart = min(newStart, thisExon.getStart()) | |
344 newEnd = max(newEnd, thisExon.getEnd()) | |
345 newTranscript.setEnd(newEnd) | |
346 newTranscript.setStart(newStart) | |
347 newTranscript.exons = theseExons | |
348 return newTranscript | |
349 | |
350 | |
351 def getSqlVariables(cls): | |
352 """ | |
353 Get the properties of the object that should be saved in a database | |
354 """ | |
355 variables = Interval.getSqlVariables() | |
356 variables.append("exons") | |
357 return variables | |
358 getSqlVariables = classmethod(getSqlVariables) | |
359 | |
360 | |
361 def setSqlValues(self, array): | |
362 """ | |
363 Set the values of the properties of this object as given by a results line of a SQL query | |
364 @param array: the values to be copied | |
365 @type array: a list | |
366 """ | |
367 super(Transcript, self).setSqlValues(array) | |
368 mergedExons = array[8] | |
369 if not mergedExons: | |
370 return | |
371 for exonCount, splittedExon in enumerate(mergedExons.split(",")): | |
372 start, end = splittedExon.split("-") | |
373 exon = Interval() | |
374 exon.setChromosome(self.getChromosome()) | |
375 exon.setDirection(self.getDirection()) | |
376 exon.setName("%s_exon%d" % (self.getName(), exonCount+1)) | |
377 exon.setStart(int(start)) | |
378 exon.setEnd(int(end)) | |
379 self.addExon(exon) | |
380 | |
381 | |
382 def getSqlValues(self): | |
383 """ | |
384 Get the values of the properties that should be saved in a database | |
385 """ | |
386 values = super(Transcript, self).getSqlValues() | |
387 values["size"] = self.getSize() | |
388 if self.getNbExons() == 1: | |
389 values["exons"] = "" | |
390 else: | |
391 values["exons"] = ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in self.getExons()]) | |
392 return values | |
393 | |
394 | |
395 def getSqlTypes(cls): | |
396 """ | |
397 Get the types of the properties that should be saved in a database | |
398 """ | |
399 types = Interval.getSqlTypes() | |
400 types["exons"] = "varchar" | |
401 return types | |
402 getSqlTypes = classmethod(getSqlTypes) | |
403 | |
404 | |
405 def getSqlSizes(cls): | |
406 """ | |
407 Get the sizes of the properties that should be saved in a database | |
408 """ | |
409 sizes = Interval.getSqlSizes() | |
410 sizes["exons"] = 10000 | |
411 return sizes | |
412 getSqlSizes = classmethod(getSqlSizes) | |
413 | |
414 | |
415 def sortExons(self): | |
416 """ | |
417 Sort the exons | |
418 Increasing order if transcript is on strand "+", decreasing otherwise | |
419 """ | |
420 self.sortExonsIncreasing() | |
421 if self.getDirection() == -1: | |
422 exons = self.getExons() | |
423 exons.reverse() | |
424 self.exons = exons | |
425 | |
426 | |
427 def sortExonsIncreasing(self): | |
428 """ | |
429 Sort the exons | |
430 Increasing order | |
431 """ | |
432 exons = self.getExons() | |
433 sortedExons = [] | |
434 while len(exons) > 0: | |
435 minExon = exons[0] | |
436 for index in range(1, len(exons)): | |
437 if minExon.getStart() > exons[index].getStart(): | |
438 minExon = exons[index] | |
439 sortedExons.append(minExon) | |
440 exons.remove(minExon) | |
441 self.exons = sortedExons | |
442 | |
443 | |
444 def extendStart(self, size): | |
445 """ | |
446 Extend the transcript by the 5' end | |
447 @param size: the size to be extended | |
448 @type size: int | |
449 """ | |
450 if len(self.exons) != 0: | |
451 self.sortExons() | |
452 if self.getDirection() == 1: | |
453 self.exons[0].setStart(max(0, self.exons[0].getStart() - size)) | |
454 else: | |
455 self.exons[0].setEnd(self.exons[0].getEnd() + size) | |
456 super(Transcript, self).extendStart(size) | |
457 self.bin = None | |
458 | |
459 | |
460 def extendEnd(self, size): | |
461 """ | |
462 Extend the transcript by the 3' end | |
463 @param size: the size to be extended | |
464 @type size: int | |
465 """ | |
466 if len(self.exons) != 0: | |
467 self.sortExons() | |
468 if self.getDirection() == 1: | |
469 self.exons[-1].setEnd(self.exons[-1].getEnd() + size) | |
470 else: | |
471 self.exons[-1].setStart(max(0, self.exons[-1].getStart() - size)) | |
472 super(Transcript, self).extendEnd(size) | |
473 self.bin = None | |
474 | |
475 | |
476 def extendExons(self, size): | |
477 """ | |
478 Extend all the exons | |
479 @param size: the size to be extended | |
480 @type size: int | |
481 """ | |
482 if len(self.exons) != 0: | |
483 self.sortExons() | |
484 exons = [] | |
485 previousExon = None | |
486 for exon in self.exons: | |
487 exon.extendStart(size) | |
488 exon.extendEnd(size) | |
489 exon.setDirection(self.getDirection()) | |
490 if previousExon != None and previousExon.overlapWith(exon): | |
491 previousExon.merge(exon) | |
492 else: | |
493 if previousExon != None: | |
494 exons.append(previousExon) | |
495 previousExon = exon | |
496 exons.append(previousExon) | |
497 self.exons = exons | |
498 super(Transcript, self).extendStart(size) | |
499 super(Transcript, self).extendEnd(size) | |
500 self.bin = None | |
501 | |
502 | |
503 def restrictStart(self, size = 1): | |
504 """ | |
505 Restrict the transcript by some nucleotides, start from its start position | |
506 Remove the exons | |
507 @param size: the size to be restricted to | |
508 @type size: int | |
509 """ | |
510 newExons = [] | |
511 if self.getDirection() == 1: | |
512 for exon in self.exons: | |
513 if exon.getStart() <= self.getStart() + size - 1: | |
514 if exon.getEnd() > self.getStart() + size - 1: | |
515 exon.setEnd(self.getStart() + size - 1) | |
516 newExons.append(exon) | |
517 else: | |
518 for exon in self.exons: | |
519 if exon.getEnd() >= self.getEnd() - size + 1: | |
520 if exon.getStart() < self.getEnd() - size + 1: | |
521 exon.setStart(self.getEnd() - size + 1) | |
522 newExons.append(exon) | |
523 super(Transcript, self).restrictStart(size) | |
524 self.exons = newExons | |
525 | |
526 | |
527 def restrictEnd(self, size = 1): | |
528 """ | |
529 Restrict the transcript by some nucleotides, end from its end position | |
530 Remove the exons | |
531 @param size: the size to be restricted to | |
532 @type size: int | |
533 """ | |
534 newExons = [] | |
535 if self.getDirection() == 1: | |
536 for exon in self.exons: | |
537 if exon.getEnd() >= self.getEnd() - size + 1: | |
538 if exon.getStart() < self.getEnd() - size + 1: | |
539 exon.setStart(self.getEnd() - size + 1) | |
540 newExons.append(exon) | |
541 else: | |
542 for exon in self.exons: | |
543 if exon.getEnd() >= self.getEnd() - size + 1: | |
544 if exon.getStart() < self.getEnd() - size + 1: | |
545 exon.setEnd(self.getEnd() - size + 1) | |
546 newExons.append(exon) | |
547 super(Transcript, self).restrictEnd(size) | |
548 self.exons = newExons | |
549 | |
550 | |
551 def removeExons(self): | |
552 """ | |
553 Remove the exons and transforms the current transcript into a mere interval | |
554 """ | |
555 self.exons = [] | |
556 self.bin = None | |
557 | |
558 | |
559 def printGtf(self, title): | |
560 """ | |
561 Export this transcript using GTF2.2 format | |
562 @param title: the title of the transcripts | |
563 @type title: string | |
564 @return: a string | |
565 """ | |
566 transcriptId = self.getUniqueName() | |
567 geneId = "%s_gene" % (transcriptId) | |
568 direction = "+" | |
569 if self.getDirection() == -1: | |
570 direction = "-" | |
571 self.sortExonsIncreasing() | |
572 string = "" | |
573 for i, exon in enumerate(self.getExons()): | |
574 exonCopy = Interval() | |
575 exonCopy.copy(exon) | |
576 if "ID" in exonCopy.getTagValues(): | |
577 del exonCopy.tags["ID"] | |
578 feature = "exon" | |
579 if "feature" in exonCopy.getTagNames(): | |
580 feature = exonCopy.getTagValue("feature") | |
581 del exonCopy.tags["feature"] | |
582 score = "." | |
583 if "score" in exonCopy.getTagNames(): | |
584 score = "%d" % (int(exonCopy.getTagValue("score"))) | |
585 del exonCopy.tags["score"] | |
586 if "Parent" in exonCopy.getTagNames(): | |
587 del exonCopy.tags["Parent"] | |
588 exonCopy.setName("%s_part%d" % (self.getName(), i+1)) | |
589 comment = exonCopy.getTagValues("; ", " ", "\"") | |
590 string += "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\ttranscript_id \"%s\"; gene_id \"%s\"; %s\n" % (exonCopy.getChromosome(), title, feature, exonCopy.getStart(), exonCopy.getEnd(), score, direction, transcriptId, geneId, comment) | |
591 return string | |
592 | |
593 | |
594 def printGff2(self, title): | |
595 """ | |
596 Export this transcript using GFF2 format | |
597 @param title: the title of the transcripts | |
598 @type title: string | |
599 @return: a string | |
600 """ | |
601 direction = "+" | |
602 if self.getDirection() == -1: | |
603 direction = "-" | |
604 self.sortExonsIncreasing() | |
605 comment = self.getTagValues() | |
606 if comment != None: | |
607 comment = ";%s" % (comment) | |
608 score = "." | |
609 if "score" in self.getTagNames(): | |
610 score = "%d" % (int(self.getTagValue("score"))) | |
611 feature = "transcript" | |
612 if "feature" in self.getTagNames(): | |
613 feature = self.getTagValue("feature") | |
614 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tGENE %s%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, self.name, comment) | |
615 for exon in self.getExons(): | |
616 if "score" in exon.getTagNames(): | |
617 score = "%d" % (int(self.getTagValue("score"))) | |
618 string += "%s\t%s\t_exon\t%d\t%d\t%s\t%s\t.\tGENE %s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.name) | |
619 return string | |
620 | |
621 | |
622 def printGff3(self, title): | |
623 """ | |
624 Export this transcript using GFF3 format | |
625 @param title: the title of the transcripts | |
626 @type title: string | |
627 @return: a string | |
628 """ | |
629 direction = "+" | |
630 if self.getDirection() == -1: | |
631 direction = "-" | |
632 self.sortExonsIncreasing() | |
633 if "ID" not in self.getTagValues(): | |
634 self.setTagValue("ID", self.getUniqueName()) | |
635 feature = "transcript" | |
636 tags = self.tags | |
637 if "feature" in self.getTagNames(): | |
638 feature = self.getTagValue("feature") | |
639 del self.tags["feature"] | |
640 score = "." | |
641 if "score" in self.getTagNames(): | |
642 score = "%d" % (int(self.getTagValue("score"))) | |
643 del self.tags["score"] | |
644 comment = self.getTagValues(";", "=") | |
645 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, comment) | |
646 if len(self.exons) > 1: | |
647 for i, exon in enumerate(self.getExons()): | |
648 if "score" in exon.getTagNames(): | |
649 score = "%d" % (int(exon.getTagValue("score"))) | |
650 string += "%s\t%s\texon\t%d\t%d\t%s\t%s\t.\tID=%s-exon%d;Name=%s-exon%d;Parent=%s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.getTagValue("ID"), i+1, self.name, i+1, self.getTagValue("ID")) | |
651 self.tags = tags | |
652 return string | |
653 | |
654 | |
655 def printEmbl(self): | |
656 """ | |
657 Export this transcript using EMBL format | |
658 @return: a string | |
659 """ | |
660 if len(self.exons) <= 1: | |
661 position = "%d..%d" % (self.getStart(), self.getEnd()) | |
662 else: | |
663 positions = [] | |
664 for exon in self.getExons(): | |
665 positions.append("%d..%d" % (self.getStart(), self.getEnd())) | |
666 position = ",".join(positions) | |
667 position = "join(%s)" % (position) | |
668 if self.getDirection() == -1: | |
669 position = "complement(%s)" % (position) | |
670 feature = "misc_feature" | |
671 if "feature" in self.getTagNames(): | |
672 if not self.getTagValue("feature").startswith("S-MART"): | |
673 feature = self.getTagValue("feature") | |
674 string = "FT %s %s\n" % (feature, position) | |
675 if "Name" in self.getTagNames(): | |
676 string += "FT /label=\"%s\"\n" % (self.getTagValue("Name")) | |
677 return string | |
678 | |
679 | |
680 def printBed(self): | |
681 """ | |
682 Export this transcript using BED format | |
683 @return: a string | |
684 """ | |
685 name = self.name | |
686 if "nbOccurrences" in self.getTagNames() and self.getTagValue("nbOccurrences") != 1 and self.getTagValue("occurrences"): | |
687 name = "%s-%d" % (name, self.getTagValue("occurrence")) | |
688 comment = self.getTagValues(";", "=") | |
689 sizes = [] | |
690 starts = [] | |
691 direction = "+" | |
692 if self.getDirection() == -1: | |
693 direction = "-" | |
694 self.sortExonsIncreasing() | |
695 for exon in self.getExons(): | |
696 sizes.append("%d" % (exon.getSize())) | |
697 starts.append("%d" % (exon.getStart() - self.getStart())) | |
698 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome(), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) | |
699 | |
700 | |
701 def printSam(self): | |
702 """ | |
703 Export this transcript using SAM format | |
704 @return: a string | |
705 """ | |
706 name = self.name | |
707 flag = 0 if self.getDirection() == 1 else 0x10 | |
708 chromosome = self.getChromosome() | |
709 genomeStart = self.getStart() | |
710 quality = 255 | |
711 mate = "*" | |
712 mateGenomeStart = 0 | |
713 gapSize = 0 | |
714 sequence = "*" | |
715 qualityString = "*" | |
716 tags = "NM:i:0" | |
717 | |
718 lastExonEnd = None | |
719 self.sortExonsIncreasing() | |
720 exon = self.getExons()[0] | |
721 cigar = "%dM" % (self.getExons()[0].getSize()) | |
722 lastExonEnd = exon.getEnd() | |
723 for i, exon in enumerate(self.getExons()): | |
724 if i == 0: | |
725 continue | |
726 cigar += "%dN" % (exon.getStart() - lastExonEnd - 1) | |
727 cigar += "%dM" % (exon.getSize()) | |
728 | |
729 return "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n" % (name, flag, chromosome, genomeStart, quality, cigar, mate, mateGenomeStart, gapSize, sequence, qualityString, tags) | |
730 | |
731 | |
732 def printUcsc(self): | |
733 """ | |
734 Export this transcript using UCSC BED format | |
735 @return: a string | |
736 """ | |
737 if self.getChromosome().find("Het") != -1: | |
738 return "" | |
739 name = self.name | |
740 comment = self.getTagValues(";", "") | |
741 sizes = [] | |
742 starts = [] | |
743 direction = "+" | |
744 if self.getDirection() == -1: | |
745 direction = "-" | |
746 self.sortExonsIncreasing() | |
747 for exon in self.getExons(): | |
748 sizes.append("%d" % (exon.getSize())) | |
749 starts.append("%d" % (exon.getStart() - self.getStart())) | |
750 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome().replace("arm_", "chr"), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) | |
751 | |
752 | |
753 def printGBrowseReference(self): | |
754 """ | |
755 Export this transcript using GBrowse format (1st line only) | |
756 @return: a string | |
757 """ | |
758 return "reference = %s\n" % (self.getChromosome()) | |
759 | |
760 | |
761 def printGBrowseLine(self): | |
762 """ | |
763 Export this transcript using GBrowse format (2nd line only) | |
764 @return: a string | |
765 """ | |
766 self.sortExons() | |
767 coordinates = [] | |
768 for exon in self.getExons(): | |
769 coordinates.append(exon.printCoordinates()) | |
770 coordinatesString = ",".join(coordinates) | |
771 comment = self.getTagValues(";", "=") | |
772 if comment: | |
773 comment = "\t\"%s\"" % (comment) | |
774 return "User_data\t%s\t%s%s\n" % (self.name, coordinatesString, comment) | |
775 | |
776 | |
777 def printGBrowse(self): | |
778 """ | |
779 Export this transcript using GBrowse format | |
780 @return: a string | |
781 """ | |
782 return "%s%s" % (self.printGBrowseReference(), self.printGBrowseLine()) | |
783 | |
784 | |
785 def printCsv(self): | |
786 """ | |
787 Export this transcript using CSV format | |
788 @return: a string | |
789 """ | |
790 self.sortExons() | |
791 string = "%s,%d,%d,\"%s\"," % (self.getChromosome(), self.getStart(), self.getEnd(), "+" if self.getDirection() == 1 else "-") | |
792 if len(self.getExons()) == 1: | |
793 string += "None" | |
794 else: | |
795 for exon in self.getExons(): | |
796 string += "%d-%d " % (exon.getStart(), exon.getEnd()) | |
797 for tag in sorted(self.tags.keys()): | |
798 string += ",%s=%s" % (tag, str(self.tags[tag])) | |
799 string += "\n" | |
800 return string | |
801 | |
802 | |
803 def extractSequence(self, parser): | |
804 """ | |
805 Get the sequence corresponding to this transcript | |
806 @param parser: a parser to a FASTA file | |
807 @type parser: class L{SequenceListParser<SequenceListParser>} | |
808 @return: an instance of L{Sequence<Sequence>} | |
809 """ | |
810 self.sortExons() | |
811 name = self.name | |
812 if "ID" in self.getTagNames() and self.getTagValue("ID") != self.name: | |
813 name += ":%s" % (self.getTagValue("ID")) | |
814 sequence = Sequence(name) | |
815 for exon in self.getExons(): | |
816 sequence.concatenate(exon.extractSequence(parser)) | |
817 return sequence | |
818 | |
819 | |
820 def extractWigData(self, parser): | |
821 """ | |
822 Get some wig data corresponding to this transcript | |
823 @param parser: a parser to a wig file | |
824 @type parser: class L{WigParser<WigParser>} | |
825 @return: a sequence of float | |
826 """ | |
827 self.sortExons() | |
828 if parser.strands: | |
829 strands = (-1, 1) | |
830 values = dict([(strand, []) for strand in strands]) | |
831 for exon in self.getExons(): | |
832 theseValues = exon.extractWigData(parser) | |
833 if self.getDirection() == -1: | |
834 for strand in strands: | |
835 theseValues[strand].reverse() | |
836 for strand in strands: | |
837 values[strand].extend(theseValues[strand]) | |
838 if self.getDirection() == -1: | |
839 for strand in strands: | |
840 values[strand].reverse() | |
841 return values | |
842 else: | |
843 values = [] | |
844 for exon in self.getExons(): | |
845 theseValues = exon.extractWigData(parser) | |
846 #if self.getDirection() == -1: | |
847 # theseValues.reverse() | |
848 values.extend(theseValues) | |
849 #if self.getDirection() == -1: | |
850 # values.reverse() | |
851 return values |