Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/structure/Transcript.py @ 36:44d5973c188c
Uploaded
| author | m-zytnicki |
|---|---|
| date | Tue, 30 Apr 2013 15:02:29 -0400 |
| parents | |
| children | 169d364ddd91 |
comparison
equal
deleted
inserted
replaced
| 35:d94018ca4ada | 36:44d5973c188c |
|---|---|
| 1 # | |
| 2 # Copyright INRA-URGI 2009-2010 | |
| 3 # | |
| 4 # This software is governed by the CeCILL license under French law and | |
| 5 # abiding by the rules of distribution of free software. You can use, | |
| 6 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 8 # "http://www.cecill.info". | |
| 9 # | |
| 10 # As a counterpart to the access to the source code and rights to copy, | |
| 11 # modify and redistribute granted by the license, users are provided only | |
| 12 # with a limited warranty and the software's author, the holder of the | |
| 13 # economic rights, and the successive licensors have only limited | |
| 14 # liability. | |
| 15 # | |
| 16 # In this respect, the user's attention is drawn to the risks associated | |
| 17 # with loading, using, modifying and/or developing or reproducing the | |
| 18 # software by the user in light of its specific status of free software, | |
| 19 # that may mean that it is complicated to manipulate, and that also | |
| 20 # therefore means that it is reserved for developers and experienced | |
| 21 # professionals having in-depth computer knowledge. Users are therefore | |
| 22 # encouraged to load and test the software's suitability as regards their | |
| 23 # requirements in conditions enabling the security of their systems and/or | |
| 24 # data to be ensured and, more generally, to use and operate it in the | |
| 25 # same conditions as regards security. | |
| 26 # | |
| 27 # The fact that you are presently reading this means that you have had | |
| 28 # knowledge of the CeCILL license and that you accept its terms. | |
| 29 # | |
| 30 import sys | |
| 31 from SMART.Java.Python.structure.Interval import Interval | |
| 32 from SMART.Java.Python.structure.Sequence import Sequence | |
| 33 | |
| 34 | |
| 35 class Transcript(Interval): | |
| 36 """ | |
| 37 A class that models an transcript, considered as a specialized interval (the bounds of the transcript) that contains exons (also represented as intervals) | |
| 38 @ivar exons: a list of exons (intervals) | |
| 39 @type exons: list of L{Interval{Interval}} | |
| 40 """ | |
| 41 | |
| 42 def __init__(self, transcript = None, verbosity = 0): | |
| 43 """ | |
| 44 Constructor | |
| 45 @param transcript: transcript to be copied | |
| 46 @type transcript: class L{Transcript<Transcript>} | |
| 47 @param verbosity: verbosity | |
| 48 @type verbosity: int | |
| 49 """ | |
| 50 super(Transcript, self).__init__(None, verbosity) | |
| 51 self.exons = [] | |
| 52 self.introns = None | |
| 53 if transcript != None: | |
| 54 self.copy(transcript) | |
| 55 | |
| 56 | |
| 57 def copy(self, transcript): | |
| 58 """ | |
| 59 Copy method | |
| 60 @param transcript: transcript to be copied | |
| 61 @type transcript: class L{Transcript<Transcript>} or L{Interval<Interval>} | |
| 62 """ | |
| 63 super(Transcript, self).copy(transcript) | |
| 64 if transcript.__class__.__name__ == "Transcript": | |
| 65 exons = transcript.getExons() | |
| 66 if len(exons) > 1: | |
| 67 for exon in exons: | |
| 68 exonCopy = Interval(exon) | |
| 69 self.addExon(exonCopy) | |
| 70 | |
| 71 | |
| 72 def setDirection(self, direction): | |
| 73 """ | |
| 74 Set the direction of the interval | |
| 75 Possibly parse different formats | |
| 76 Impact all exons | |
| 77 @param direction: direction of the transcript (+ / -) | |
| 78 @type direction: int or string | |
| 79 """ | |
| 80 super(Transcript, self).setDirection(direction) | |
| 81 for exon in self.exons: | |
| 82 exon.setDirection(direction) | |
| 83 | |
| 84 | |
| 85 def setChromosome(self, chromosome): | |
| 86 """ | |
| 87 Set the chromosome | |
| 88 @param chromosome: chromosome on which the transcript is | |
| 89 @type chromosome: string | |
| 90 """ | |
| 91 super(Transcript, self).setChromosome(chromosome) | |
| 92 for exon in self.exons: | |
| 93 exon.setChromosome(chromosome) | |
| 94 | |
| 95 | |
| 96 def addExon(self, exon): | |
| 97 """ | |
| 98 Add an exon to the list of exons | |
| 99 @param exon: a new exon | |
| 100 @type exon: class L{Interval<Interval>} | |
| 101 """ | |
| 102 if not self.exons and not exon.overlapWith(self): | |
| 103 firstExon = Interval() | |
| 104 firstExon.setStart(self.getStart()) | |
| 105 firstExon.setEnd(self.getEnd()) | |
| 106 firstExon.setDirection(self.getDirection()) | |
| 107 firstExon.setChromosome(self.getChromosome()) | |
| 108 self.exons.append(firstExon) | |
| 109 newExon = Interval(exon) | |
| 110 newExon.setDirection(self.getDirection()) | |
| 111 self.exons.append(newExon) | |
| 112 if newExon.getStart() < self.getStart(): | |
| 113 self.setStart(newExon.getStart()) | |
| 114 if newExon.getEnd() > self.getEnd(): | |
| 115 self.setEnd(newExon.getEnd()) | |
| 116 | |
| 117 | |
| 118 def setStart(self, start): | |
| 119 """ | |
| 120 Set the new start, move the first exon accordingly (if exists) | |
| 121 @param start: the new start | |
| 122 @type start: int | |
| 123 """ | |
| 124 super(Transcript, self).setStart(start) | |
| 125 if self.exons: | |
| 126 self.sortExonsIncreasing() | |
| 127 self.exons[0].setStart(start) | |
| 128 | |
| 129 | |
| 130 def setEnd(self, end): | |
| 131 """ | |
| 132 Set the new end, move the last exon accordingly (if exists) | |
| 133 @param end: the new end | |
| 134 @type end: int | |
| 135 """ | |
| 136 super(Transcript, self).setEnd(end) | |
| 137 if self.exons: | |
| 138 self.sortExonsIncreasing() | |
| 139 self.exons[-1].setEnd(end) | |
| 140 | |
| 141 | |
| 142 def reverse(self): | |
| 143 """ | |
| 144 Reverse the strand of the transcript | |
| 145 """ | |
| 146 super(Transcript, self).reverse() | |
| 147 for exon in self.exons: | |
| 148 exon.reverse() | |
| 149 | |
| 150 | |
| 151 def getUniqueName(self): | |
| 152 """ | |
| 153 Try to give a unique name by possibly adding occurrence | |
| 154 """ | |
| 155 if "nbOccurrences" in self.tags and "occurrence" in self.tags and self.tags["nbOccurrences"] != 1: | |
| 156 return "%s-%d" % (self.name, self.tags["occurrence"]) | |
| 157 return self.name | |
| 158 | |
| 159 | |
| 160 def getNbExons(self): | |
| 161 """ | |
| 162 Get the number of exons | |
| 163 """ | |
| 164 return max(1, len(self.exons)) | |
| 165 | |
| 166 | |
| 167 def getExon(self, i): | |
| 168 """ | |
| 169 Get a specific exon | |
| 170 @param i: the rank of the exon | |
| 171 @type i: int | |
| 172 """ | |
| 173 if len(self.exons) == 0: | |
| 174 if i != 0: | |
| 175 raise Exception("Cannot get exon #%i while there is no exon in the transcript" % (i)) | |
| 176 return self | |
| 177 return self.exons[i] | |
| 178 | |
| 179 | |
| 180 def getExons(self): | |
| 181 """ | |
| 182 Get all the exons | |
| 183 """ | |
| 184 if len(self.exons) == 0: | |
| 185 return [Interval(self)] | |
| 186 return self.exons | |
| 187 | |
| 188 | |
| 189 def getIntrons(self): | |
| 190 """ | |
| 191 Get all the introns | |
| 192 Compute introns on the fly | |
| 193 """ | |
| 194 if self.introns != None: | |
| 195 return self.introns | |
| 196 self.sortExons() | |
| 197 self.introns = [] | |
| 198 exonStart = self.getExon(0) | |
| 199 for cpt, exonEnd in enumerate(self.exons[1:]): | |
| 200 intron = Interval() | |
| 201 intron.setName("%s_intron%d" % (self.getName(), cpt+1)) | |
| 202 intron.setChromosome(self.getChromosome()) | |
| 203 intron.setDirection(self.getDirection()) | |
| 204 if self.getDirection() == 1: | |
| 205 intron.setEnd(exonEnd.getStart() - 1) | |
| 206 intron.setStart(exonStart.getEnd() + 1) | |
| 207 else: | |
| 208 intron.setStart(exonEnd.getEnd() + 1) | |
| 209 intron.setEnd(exonStart.getStart() - 1) | |
| 210 intron.setDirection(self.getDirection()) | |
| 211 if intron.getSize() > 0: | |
| 212 self.introns.append(intron) | |
| 213 exonStart = exonEnd | |
| 214 intron.setSize(intron.getEnd() - intron.getStart() + 1) | |
| 215 return self.introns | |
| 216 | |
| 217 | |
| 218 def getSize(self): | |
| 219 """ | |
| 220 Get the size of the transcript (i.e. the number of nucleotides) | |
| 221 Compute size on the fly | |
| 222 """ | |
| 223 if len(self.exons) == 0: | |
| 224 return self.getSizeWithIntrons() | |
| 225 size = 0 | |
| 226 for exon in self.exons: | |
| 227 size += exon.getSize() | |
| 228 return size | |
| 229 | |
| 230 | |
| 231 def getSizeWithIntrons(self): | |
| 232 """ | |
| 233 Get the size of the interval (i.e. distance from start to end) | |
| 234 """ | |
| 235 return super(Transcript, self).getSize() | |
| 236 | |
| 237 | |
| 238 def overlapWithExon(self, transcript, nbNucleotides = 1): | |
| 239 """ | |
| 240 Check if the exons of this transcript overlap with the exons of another transcript | |
| 241 @param transcript: transcript to be compared to | |
| 242 @type transcript: class L{Transcript<Transcript>} | |
| 243 @param nbNucleotides: minimum number of nucleotides to declare and overlap | |
| 244 @type nbNucleotides: int | |
| 245 """ | |
| 246 if not self.overlapWith(transcript, nbNucleotides): | |
| 247 return False | |
| 248 for thisExon in self.getExons(): | |
| 249 for thatExon in transcript.getExons(): | |
| 250 if thisExon.overlapWith(thatExon, nbNucleotides): | |
| 251 return True | |
| 252 return False | |
| 253 | |
| 254 | |
| 255 def include(self, transcript): | |
| 256 """ | |
| 257 Whether this transcript includes the other one | |
| 258 @param transcript: object to be compared to | |
| 259 @type transcript: class L{Transcript<Transcript>} | |
| 260 """ | |
| 261 if not super(Transcript, self).include(transcript): | |
| 262 return False | |
| 263 for thatExon in transcript.getExons(): | |
| 264 for thisExon in self.getExons(): | |
| 265 if thisExon.include(thatExon): | |
| 266 break | |
| 267 else: | |
| 268 return False | |
| 269 return True | |
| 270 | |
| 271 | |
| 272 def merge(self, transcript, normalization = False): | |
| 273 """ | |
| 274 Merge with another transcript | |
| 275 Merge exons if they overlap, otherwise add exons | |
| 276 @param transcript: transcript to be merged to | |
| 277 @type transcript: class L{Transcript<Transcript>} | |
| 278 @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements | |
| 279 @type normalization: boolean | |
| 280 """ | |
| 281 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection(): | |
| 282 raise Exception("Cannot merge '%s' with '%s'!" % (self, transcript)) | |
| 283 | |
| 284 theseExons = self.getExons() | |
| 285 thoseExons = transcript.getExons() | |
| 286 | |
| 287 for thatExon in thoseExons: | |
| 288 toBeRemoved = [] | |
| 289 for thisIndex, thisExon in enumerate(theseExons): | |
| 290 if thisExon.overlapWith(thatExon): | |
| 291 thatExon.merge(thisExon) | |
| 292 toBeRemoved.append(thisIndex) | |
| 293 theseExons.append(thatExon) | |
| 294 for thisIndex in reversed(toBeRemoved): | |
| 295 del theseExons[thisIndex] | |
| 296 self.removeExons() | |
| 297 self.setStart(min(self.getStart(), transcript.getStart())) | |
| 298 self.setEnd(max(self.getEnd(), transcript.getEnd())) | |
| 299 if len(theseExons) > 1: | |
| 300 for thisExon in theseExons: | |
| 301 self.addExon(thisExon) | |
| 302 | |
| 303 self.setName("%s--%s" % (self.getUniqueName(), transcript.getUniqueName())) | |
| 304 super(Transcript, self).merge(transcript, normalization) | |
| 305 | |
| 306 | |
| 307 def getDifference(self, transcript, sameStrand = False): | |
| 308 """ | |
| 309 Get the difference between this cluster and another one | |
| 310 @param transcript: object to be compared to | |
| 311 @type transcript: class L{Transcript<Transcript>} | |
| 312 @param sameStrand: do the comparison iff the transcripts are on the same strand | |
| 313 @type sameStrand: boolean | |
| 314 @return: a transcript | |
| 315 """ | |
| 316 newTranscript = Transcript() | |
| 317 newTranscript.copy(self) | |
| 318 if self.getChromosome() != transcript.getChromosome(): | |
| 319 return newTranscript | |
| 320 if not self.overlapWith(transcript): | |
| 321 return newTranscript | |
| 322 if sameStrand and self.getDirection() != transcript.getDirection(): | |
| 323 return newTranscript | |
| 324 newTranscript.removeExons() | |
| 325 if transcript.getEnd() > newTranscript.getStart(): | |
| 326 newTranscript.setStart(transcript.getEnd() + 1) | |
| 327 if transcript.getStart() < newTranscript.getEnd(): | |
| 328 newTranscript.setEnd(transcript.getStart() + 1) | |
| 329 theseExons = [] | |
| 330 for exon in self.getExons(): | |
| 331 exonCopy = Interval() | |
| 332 exonCopy.copy(exon) | |
| 333 theseExons.append(exonCopy) | |
| 334 for thatExon in transcript.getExons(): | |
| 335 newExons = [] | |
| 336 for thisExon in theseExons: | |
| 337 newExons.extend(thisExon.getDifference(thatExon)) | |
| 338 theseExons = newExons | |
| 339 if not theseExons: | |
| 340 return None | |
| 341 newStart, newEnd = theseExons[0].getStart(), theseExons[0].getEnd() | |
| 342 for thisExon in theseExons[1:]: | |
| 343 newStart = min(newStart, thisExon.getStart()) | |
| 344 newEnd = max(newEnd, thisExon.getEnd()) | |
| 345 newTranscript.setEnd(newEnd) | |
| 346 newTranscript.setStart(newStart) | |
| 347 newTranscript.exons = theseExons | |
| 348 return newTranscript | |
| 349 | |
| 350 | |
| 351 def getIntersection(self, transcript): | |
| 352 """ | |
| 353 Get the intersection between this transcript and another one | |
| 354 @param transcript: object to be compared to | |
| 355 @type transcript: class L{Transcript<Transcript>} | |
| 356 @return: an other transcript | |
| 357 """ | |
| 358 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection(): | |
| 359 return None | |
| 360 newTranscript = Transcript() | |
| 361 newTranscript.setDirection(self.getDirection()) | |
| 362 newTranscript.setChromosome(self.getChromosome()) | |
| 363 newTranscript.setName("%s_intersect_%s" % (self.getName(), transcript.getName())) | |
| 364 newExons = [] | |
| 365 for thisExon in self.getExons(): | |
| 366 for thatExon in transcript.getExons(): | |
| 367 newExon = thisExon.getIntersection(thatExon) | |
| 368 if newExon != None: | |
| 369 newExons.append(newExon) | |
| 370 if not newExons: | |
| 371 return None | |
| 372 newTranscript.exons = newExons | |
| 373 return newTranscript | |
| 374 | |
| 375 | |
| 376 def getSqlVariables(cls): | |
| 377 """ | |
| 378 Get the properties of the object that should be saved in a database | |
| 379 """ | |
| 380 variables = Interval.getSqlVariables() | |
| 381 variables.append("exons") | |
| 382 return variables | |
| 383 getSqlVariables = classmethod(getSqlVariables) | |
| 384 | |
| 385 | |
| 386 def setSqlValues(self, array): | |
| 387 """ | |
| 388 Set the values of the properties of this object as given by a results line of a SQL query | |
| 389 @param array: the values to be copied | |
| 390 @type array: a list | |
| 391 """ | |
| 392 super(Transcript, self).setSqlValues(array) | |
| 393 mergedExons = array[8] | |
| 394 if not mergedExons: | |
| 395 return | |
| 396 for exonCount, splittedExon in enumerate(mergedExons.split(",")): | |
| 397 start, end = splittedExon.split("-") | |
| 398 exon = Interval() | |
| 399 exon.setChromosome(self.getChromosome()) | |
| 400 exon.setDirection(self.getDirection()) | |
| 401 exon.setName("%s_exon%d" % (self.getName(), exonCount+1)) | |
| 402 exon.setStart(int(start)) | |
| 403 exon.setEnd(int(end)) | |
| 404 self.addExon(exon) | |
| 405 | |
| 406 | |
| 407 def getSqlValues(self): | |
| 408 """ | |
| 409 Get the values of the properties that should be saved in a database | |
| 410 """ | |
| 411 values = super(Transcript, self).getSqlValues() | |
| 412 values["size"] = self.getSize() | |
| 413 if self.getNbExons() == 1: | |
| 414 values["exons"] = "" | |
| 415 else: | |
| 416 values["exons"] = ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in self.getExons()]) | |
| 417 return values | |
| 418 | |
| 419 | |
| 420 def getSqlTypes(cls): | |
| 421 """ | |
| 422 Get the types of the properties that should be saved in a database | |
| 423 """ | |
| 424 types = Interval.getSqlTypes() | |
| 425 types["exons"] = "varchar" | |
| 426 return types | |
| 427 getSqlTypes = classmethod(getSqlTypes) | |
| 428 | |
| 429 | |
| 430 def getSqlSizes(cls): | |
| 431 """ | |
| 432 Get the sizes of the properties that should be saved in a database | |
| 433 """ | |
| 434 sizes = Interval.getSqlSizes() | |
| 435 sizes["exons"] = 10000 | |
| 436 return sizes | |
| 437 getSqlSizes = classmethod(getSqlSizes) | |
| 438 | |
| 439 | |
| 440 def sortExons(self): | |
| 441 """ | |
| 442 Sort the exons | |
| 443 Increasing order if transcript is on strand "+", decreasing otherwise | |
| 444 """ | |
| 445 self.sortExonsIncreasing() | |
| 446 if self.getDirection() == -1: | |
| 447 exons = self.getExons() | |
| 448 exons.reverse() | |
| 449 self.exons = exons | |
| 450 | |
| 451 | |
| 452 def sortExonsIncreasing(self): | |
| 453 """ | |
| 454 Sort the exons | |
| 455 Increasing order | |
| 456 """ | |
| 457 exons = self.getExons() | |
| 458 sortedExons = [] | |
| 459 while len(exons) > 0: | |
| 460 minExon = exons[0] | |
| 461 for index in range(1, len(exons)): | |
| 462 if minExon.getStart() > exons[index].getStart(): | |
| 463 minExon = exons[index] | |
| 464 sortedExons.append(minExon) | |
| 465 exons.remove(minExon) | |
| 466 self.exons = sortedExons | |
| 467 | |
| 468 | |
| 469 def extendStart(self, size): | |
| 470 """ | |
| 471 Extend the transcript by the 5' end | |
| 472 @param size: the size to be extended | |
| 473 @type size: int | |
| 474 """ | |
| 475 if len(self.exons) != 0: | |
| 476 self.sortExons() | |
| 477 if self.getDirection() == 1: | |
| 478 self.exons[0].setStart(max(0, self.exons[0].getStart() - size)) | |
| 479 else: | |
| 480 self.exons[0].setEnd(self.exons[0].getEnd() + size) | |
| 481 super(Transcript, self).extendStart(size) | |
| 482 self.bin = None | |
| 483 | |
| 484 | |
| 485 def extendEnd(self, size): | |
| 486 """ | |
| 487 Extend the transcript by the 3' end | |
| 488 @param size: the size to be extended | |
| 489 @type size: int | |
| 490 """ | |
| 491 if len(self.exons) != 0: | |
| 492 self.sortExons() | |
| 493 if self.getDirection() == 1: | |
| 494 self.exons[-1].setEnd(self.exons[-1].getEnd() + size) | |
| 495 else: | |
| 496 self.exons[-1].setStart(max(0, self.exons[-1].getStart() - size)) | |
| 497 super(Transcript, self).extendEnd(size) | |
| 498 self.bin = None | |
| 499 | |
| 500 | |
| 501 def extendExons(self, size): | |
| 502 """ | |
| 503 Extend all the exons | |
| 504 @param size: the size to be extended | |
| 505 @type size: int | |
| 506 """ | |
| 507 if len(self.exons) != 0: | |
| 508 self.sortExons() | |
| 509 exons = [] | |
| 510 previousExon = None | |
| 511 for exon in self.exons: | |
| 512 exon.extendStart(size) | |
| 513 exon.extendEnd(size) | |
| 514 exon.setDirection(self.getDirection()) | |
| 515 if previousExon != None and previousExon.overlapWith(exon): | |
| 516 previousExon.merge(exon) | |
| 517 else: | |
| 518 if previousExon != None: | |
| 519 exons.append(previousExon) | |
| 520 previousExon = exon | |
| 521 exons.append(previousExon) | |
| 522 self.exons = exons | |
| 523 super(Transcript, self).extendStart(size) | |
| 524 super(Transcript, self).extendEnd(size) | |
| 525 self.bin = None | |
| 526 | |
| 527 | |
| 528 def restrictStart(self, size = 1): | |
| 529 """ | |
| 530 Restrict the transcript by some nucleotides, start from its start position | |
| 531 Remove the exons | |
| 532 @param size: the size to be restricted to | |
| 533 @type size: int | |
| 534 """ | |
| 535 newExons = [] | |
| 536 if self.getDirection() == 1: | |
| 537 for exon in self.exons: | |
| 538 if exon.getStart() <= self.getStart() + size - 1: | |
| 539 if exon.getEnd() > self.getStart() + size - 1: | |
| 540 exon.setEnd(self.getStart() + size - 1) | |
| 541 newExons.append(exon) | |
| 542 else: | |
| 543 for exon in self.exons: | |
| 544 if exon.getEnd() >= self.getEnd() - size + 1: | |
| 545 if exon.getStart() < self.getEnd() - size + 1: | |
| 546 exon.setStart(self.getEnd() - size + 1) | |
| 547 newExons.append(exon) | |
| 548 super(Transcript, self).restrictStart(size) | |
| 549 self.exons = newExons | |
| 550 | |
| 551 | |
| 552 def restrictEnd(self, size = 1): | |
| 553 """ | |
| 554 Restrict the transcript by some nucleotides, end from its end position | |
| 555 Remove the exons | |
| 556 @param size: the size to be restricted to | |
| 557 @type size: int | |
| 558 """ | |
| 559 newExons = [] | |
| 560 if self.getDirection() == 1: | |
| 561 for exon in self.exons: | |
| 562 if exon.getEnd() >= self.getEnd() - size + 1: | |
| 563 if exon.getStart() < self.getEnd() - size + 1: | |
| 564 exon.setStart(self.getEnd() - size + 1) | |
| 565 newExons.append(exon) | |
| 566 else: | |
| 567 for exon in self.exons: | |
| 568 if exon.getEnd() >= self.getEnd() - size + 1: | |
| 569 if exon.getStart() < self.getEnd() - size + 1: | |
| 570 exon.setEnd(self.getEnd() - size + 1) | |
| 571 newExons.append(exon) | |
| 572 super(Transcript, self).restrictEnd(size) | |
| 573 self.exons = newExons | |
| 574 | |
| 575 | |
| 576 def removeExons(self): | |
| 577 """ | |
| 578 Remove the exons and transforms the current transcript into a mere interval | |
| 579 """ | |
| 580 self.exons = [] | |
| 581 self.bin = None | |
| 582 | |
| 583 | |
| 584 def printGtf(self, title): | |
| 585 """ | |
| 586 Export this transcript using GTF2.2 format | |
| 587 @param title: the title of the transcripts | |
| 588 @type title: string | |
| 589 @return: a string | |
| 590 """ | |
| 591 transcriptId = self.getUniqueName() | |
| 592 geneId = "%s_gene" % (transcriptId) | |
| 593 direction = "+" | |
| 594 if self.getDirection() == -1: | |
| 595 direction = "-" | |
| 596 self.sortExonsIncreasing() | |
| 597 string = "" | |
| 598 for i, exon in enumerate(self.getExons()): | |
| 599 exonCopy = Interval() | |
| 600 exonCopy.copy(exon) | |
| 601 if "ID" in exonCopy.getTagValues(): | |
| 602 del exonCopy.tags["ID"] | |
| 603 feature = "exon" | |
| 604 if "feature" in exonCopy.getTagNames(): | |
| 605 feature = exonCopy.getTagValue("feature") | |
| 606 del exonCopy.tags["feature"] | |
| 607 score = "." | |
| 608 if "score" in exonCopy.getTagNames(): | |
| 609 score = "%d" % (int(exonCopy.getTagValue("score"))) | |
| 610 del exonCopy.tags["score"] | |
| 611 if "Parent" in exonCopy.getTagNames(): | |
| 612 del exonCopy.tags["Parent"] | |
| 613 exonCopy.setName("%s_part%d" % (self.getName(), i+1)) | |
| 614 comment = exonCopy.getTagValues("; ", " ", "\"") | |
| 615 string += "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\ttranscript_id \"%s\"; gene_id \"%s\"; %s\n" % (exonCopy.getChromosome(), title, feature, exonCopy.getStart(), exonCopy.getEnd(), score, direction, transcriptId, geneId, comment) | |
| 616 return string | |
| 617 | |
| 618 | |
| 619 def printGff2(self, title): | |
| 620 """ | |
| 621 Export this transcript using GFF2 format | |
| 622 @param title: the title of the transcripts | |
| 623 @type title: string | |
| 624 @return: a string | |
| 625 """ | |
| 626 direction = "+" | |
| 627 if self.getDirection() == -1: | |
| 628 direction = "-" | |
| 629 self.sortExonsIncreasing() | |
| 630 comment = self.getTagValues() | |
| 631 if comment != None: | |
| 632 comment = ";%s" % (comment) | |
| 633 score = "." | |
| 634 if "score" in self.getTagNames(): | |
| 635 score = "%d" % (int(self.getTagValue("score"))) | |
| 636 feature = "transcript" | |
| 637 if "feature" in self.getTagNames(): | |
| 638 feature = self.getTagValue("feature") | |
| 639 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tGENE %s%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, self.name, comment) | |
| 640 for exon in self.getExons(): | |
| 641 if "score" in exon.getTagNames(): | |
| 642 score = "%d" % (int(self.getTagValue("score"))) | |
| 643 string += "%s\t%s\t_exon\t%d\t%d\t%s\t%s\t.\tGENE %s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.name) | |
| 644 return string | |
| 645 | |
| 646 | |
| 647 def printGff3(self, title): | |
| 648 """ | |
| 649 Export this transcript using GFF3 format | |
| 650 @param title: the title of the transcripts | |
| 651 @type title: string | |
| 652 @return: a string | |
| 653 """ | |
| 654 direction = "+" | |
| 655 if self.getDirection() == -1: | |
| 656 direction = "-" | |
| 657 self.sortExonsIncreasing() | |
| 658 if "ID" not in self.getTagValues(): | |
| 659 self.setTagValue("ID", self.getUniqueName()) | |
| 660 feature = "transcript" | |
| 661 tags = self.tags | |
| 662 if "feature" in self.getTagNames(): | |
| 663 feature = self.getTagValue("feature") | |
| 664 del self.tags["feature"] | |
| 665 score = "." | |
| 666 if "score" in self.getTagNames(): | |
| 667 score = "%d" % (int(self.getTagValue("score"))) | |
| 668 del self.tags["score"] | |
| 669 comment = self.getTagValues(";", "=") | |
| 670 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, comment) | |
| 671 if len(self.exons) > 1: | |
| 672 for i, exon in enumerate(self.getExons()): | |
| 673 if "score" in exon.getTagNames(): | |
| 674 score = "%d" % (int(exon.getTagValue("score"))) | |
| 675 string += "%s\t%s\texon\t%d\t%d\t%s\t%s\t.\tID=%s-exon%d;Name=%s-exon%d;Parent=%s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.getTagValue("ID"), i+1, self.name, i+1, self.getTagValue("ID")) | |
| 676 self.tags = tags | |
| 677 return string | |
| 678 | |
| 679 | |
| 680 def printEmbl(self): | |
| 681 """ | |
| 682 Export this transcript using EMBL format | |
| 683 @return: a string | |
| 684 """ | |
| 685 if len(self.exons) <= 1: | |
| 686 position = "%d..%d" % (self.getStart(), self.getEnd()) | |
| 687 else: | |
| 688 positions = [] | |
| 689 for exon in self.getExons(): | |
| 690 positions.append("%d..%d" % (self.getStart(), self.getEnd())) | |
| 691 position = ",".join(positions) | |
| 692 position = "join(%s)" % (position) | |
| 693 if self.getDirection() == -1: | |
| 694 position = "complement(%s)" % (position) | |
| 695 feature = "misc_feature" | |
| 696 if "feature" in self.getTagNames(): | |
| 697 if not self.getTagValue("feature").startswith("S-MART"): | |
| 698 feature = self.getTagValue("feature") | |
| 699 string = "FT %s %s\n" % (feature, position) | |
| 700 if "Name" in self.getTagNames(): | |
| 701 string += "FT /label=\"%s\"\n" % (self.getTagValue("Name")) | |
| 702 return string | |
| 703 | |
| 704 | |
| 705 def printBed(self): | |
| 706 """ | |
| 707 Export this transcript using BED format | |
| 708 @return: a string | |
| 709 """ | |
| 710 name = self.name | |
| 711 if "nbOccurrences" in self.getTagNames() and self.getTagValue("nbOccurrences") != 1 and self.getTagValue("occurrences"): | |
| 712 name = "%s-%d" % (name, self.getTagValue("occurrence")) | |
| 713 comment = self.getTagValues(";", "=") | |
| 714 sizes = [] | |
| 715 starts = [] | |
| 716 direction = "+" | |
| 717 if self.getDirection() == -1: | |
| 718 direction = "-" | |
| 719 self.sortExonsIncreasing() | |
| 720 for exon in self.getExons(): | |
| 721 sizes.append("%d" % (exon.getSize())) | |
| 722 starts.append("%d" % (exon.getStart() - self.getStart())) | |
| 723 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome(), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) | |
| 724 | |
| 725 | |
| 726 def printSam(self): | |
| 727 """ | |
| 728 Export this transcript using SAM format | |
| 729 @return: a string | |
| 730 """ | |
| 731 name = self.name | |
| 732 flag = 0 if self.getDirection() == 1 else 0x10 | |
| 733 chromosome = self.getChromosome() | |
| 734 genomeStart = self.getStart() | |
| 735 quality = 255 | |
| 736 mate = "*" | |
| 737 mateGenomeStart = 0 | |
| 738 gapSize = 0 | |
| 739 sequence = "*" | |
| 740 qualityString = "*" | |
| 741 tags = "NM:i:0" | |
| 742 | |
| 743 lastExonEnd = None | |
| 744 self.sortExonsIncreasing() | |
| 745 exon = self.getExons()[0] | |
| 746 cigar = "%dM" % (self.getExons()[0].getSize()) | |
| 747 lastExonEnd = exon.getEnd() | |
| 748 for i, exon in enumerate(self.getExons()): | |
| 749 if i == 0: | |
| 750 continue | |
| 751 cigar += "%dN" % (exon.getStart() - lastExonEnd - 1) | |
| 752 cigar += "%dM" % (exon.getSize()) | |
| 753 | |
| 754 return "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n" % (name, flag, chromosome, genomeStart, quality, cigar, mate, mateGenomeStart, gapSize, sequence, qualityString, tags) | |
| 755 | |
| 756 | |
| 757 def printUcsc(self): | |
| 758 """ | |
| 759 Export this transcript using UCSC BED format | |
| 760 @return: a string | |
| 761 """ | |
| 762 if self.getChromosome().find("Het") != -1: | |
| 763 return "" | |
| 764 name = self.name | |
| 765 comment = self.getTagValues(";", "") | |
| 766 sizes = [] | |
| 767 starts = [] | |
| 768 direction = "+" | |
| 769 if self.getDirection() == -1: | |
| 770 direction = "-" | |
| 771 self.sortExonsIncreasing() | |
| 772 for exon in self.getExons(): | |
| 773 sizes.append("%d" % (exon.getSize())) | |
| 774 starts.append("%d" % (exon.getStart() - self.getStart())) | |
| 775 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome().replace("arm_", "chr"), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) | |
| 776 | |
| 777 | |
| 778 def printGBrowseReference(self): | |
| 779 """ | |
| 780 Export this transcript using GBrowse format (1st line only) | |
| 781 @return: a string | |
| 782 """ | |
| 783 return "reference = %s\n" % (self.getChromosome()) | |
| 784 | |
| 785 | |
| 786 def printGBrowseLine(self): | |
| 787 """ | |
| 788 Export this transcript using GBrowse format (2nd line only) | |
| 789 @return: a string | |
| 790 """ | |
| 791 self.sortExons() | |
| 792 coordinates = [] | |
| 793 for exon in self.getExons(): | |
| 794 coordinates.append(exon.printCoordinates()) | |
| 795 coordinatesString = ",".join(coordinates) | |
| 796 comment = self.getTagValues(";", "=") | |
| 797 if comment: | |
| 798 comment = "\t\"%s\"" % (comment) | |
| 799 return "User_data\t%s\t%s%s\n" % (self.name, coordinatesString, comment) | |
| 800 | |
| 801 | |
| 802 def printGBrowse(self): | |
| 803 """ | |
| 804 Export this transcript using GBrowse format | |
| 805 @return: a string | |
| 806 """ | |
| 807 return "%s%s" % (self.printGBrowseReference(), self.printGBrowseLine()) | |
| 808 | |
| 809 | |
| 810 def printCsv(self): | |
| 811 """ | |
| 812 Export this transcript using CSV format | |
| 813 @return: a string | |
| 814 """ | |
| 815 self.sortExons() | |
| 816 string = "%s,%d,%d,\"%s\"," % (self.getChromosome(), self.getStart(), self.getEnd(), "+" if self.getDirection() == 1 else "-") | |
| 817 if len(self.getExons()) == 1: | |
| 818 string += "None" | |
| 819 else: | |
| 820 for exon in self.getExons(): | |
| 821 string += "%d-%d " % (exon.getStart(), exon.getEnd()) | |
| 822 for tag in sorted(self.tags.keys()): | |
| 823 string += ",%s=%s" % (tag, str(self.tags[tag])) | |
| 824 string += "\n" | |
| 825 return string | |
| 826 | |
| 827 | |
| 828 def extractSequence(self, parser): | |
| 829 """ | |
| 830 Get the sequence corresponding to this transcript | |
| 831 @param parser: a parser to a FASTA file | |
| 832 @type parser: class L{SequenceListParser<SequenceListParser>} | |
| 833 @return: an instance of L{Sequence<Sequence>} | |
| 834 """ | |
| 835 self.sortExons() | |
| 836 name = self.name | |
| 837 if "ID" in self.getTagNames() and self.getTagValue("ID") != self.name: | |
| 838 name += ":%s" % (self.getTagValue("ID")) | |
| 839 sequence = Sequence(name) | |
| 840 for exon in self.getExons(): | |
| 841 sequence.concatenate(exon.extractSequence(parser)) | |
| 842 return sequence | |
| 843 | |
| 844 | |
| 845 def extractWigData(self, parser): | |
| 846 """ | |
| 847 Get some wig data corresponding to this transcript | |
| 848 @param parser: a parser to a wig file | |
| 849 @type parser: class L{WigParser<WigParser>} | |
| 850 @return: a sequence of float | |
| 851 """ | |
| 852 self.sortExons() | |
| 853 if parser.strands: | |
| 854 strands = (-1, 1) | |
| 855 values = dict([(strand, []) for strand in strands]) | |
| 856 for exon in self.getExons(): | |
| 857 theseValues = exon.extractWigData(parser) | |
| 858 if self.getDirection() == -1: | |
| 859 for strand in strands: | |
| 860 theseValues[strand].reverse() | |
| 861 for strand in strands: | |
| 862 values[strand].extend(theseValues[strand]) | |
| 863 if self.getDirection() == -1: | |
| 864 for strand in strands: | |
| 865 values[strand].reverse() | |
| 866 return values | |
| 867 else: | |
| 868 values = [] | |
| 869 for exon in self.getExons(): | |
| 870 theseValues = exon.extractWigData(parser) | |
| 871 #if self.getDirection() == -1: | |
| 872 # theseValues.reverse() | |
| 873 values.extend(theseValues) | |
| 874 #if self.getDirection() == -1: | |
| 875 # values.reverse() | |
| 876 return values |
