Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/structure/Transcript.py @ 6:769e306b7933
Change the repository level.
| author | yufei-luo |
|---|---|
| date | Fri, 18 Jan 2013 04:54:14 -0500 |
| parents | |
| children | 94ab73e8a190 |
comparison
equal
deleted
inserted
replaced
| 5:ea3082881bf8 | 6:769e306b7933 |
|---|---|
| 1 # | |
| 2 # Copyright INRA-URGI 2009-2010 | |
| 3 # | |
| 4 # This software is governed by the CeCILL license under French law and | |
| 5 # abiding by the rules of distribution of free software. You can use, | |
| 6 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 8 # "http://www.cecill.info". | |
| 9 # | |
| 10 # As a counterpart to the access to the source code and rights to copy, | |
| 11 # modify and redistribute granted by the license, users are provided only | |
| 12 # with a limited warranty and the software's author, the holder of the | |
| 13 # economic rights, and the successive licensors have only limited | |
| 14 # liability. | |
| 15 # | |
| 16 # In this respect, the user's attention is drawn to the risks associated | |
| 17 # with loading, using, modifying and/or developing or reproducing the | |
| 18 # software by the user in light of its specific status of free software, | |
| 19 # that may mean that it is complicated to manipulate, and that also | |
| 20 # therefore means that it is reserved for developers and experienced | |
| 21 # professionals having in-depth computer knowledge. Users are therefore | |
| 22 # encouraged to load and test the software's suitability as regards their | |
| 23 # requirements in conditions enabling the security of their systems and/or | |
| 24 # data to be ensured and, more generally, to use and operate it in the | |
| 25 # same conditions as regards security. | |
| 26 # | |
| 27 # The fact that you are presently reading this means that you have had | |
| 28 # knowledge of the CeCILL license and that you accept its terms. | |
| 29 # | |
| 30 import sys | |
| 31 from SMART.Java.Python.structure.Interval import Interval | |
| 32 from SMART.Java.Python.structure.Sequence import Sequence | |
| 33 | |
| 34 | |
| 35 class Transcript(Interval): | |
| 36 """ | |
| 37 A class that models an transcript, considered as a specialized interval (the bounds of the transcript) that contains exons (also represented as intervals) | |
| 38 @ivar exons: a list of exons (intervals) | |
| 39 @type exons: list of L{Interval{Interval}} | |
| 40 """ | |
| 41 | |
| 42 def __init__(self, transcript = None, verbosity = 0): | |
| 43 """ | |
| 44 Constructor | |
| 45 @param transcript: transcript to be copied | |
| 46 @type transcript: class L{Transcript<Transcript>} | |
| 47 @param verbosity: verbosity | |
| 48 @type verbosity: int | |
| 49 """ | |
| 50 super(Transcript, self).__init__(None, verbosity) | |
| 51 self.exons = [] | |
| 52 self.introns = None | |
| 53 if transcript != None: | |
| 54 self.copy(transcript) | |
| 55 | |
| 56 | |
| 57 def copy(self, transcript): | |
| 58 """ | |
| 59 Copy method | |
| 60 @param transcript: transcript to be copied | |
| 61 @type transcript: class L{Transcript<Transcript>} or L{Interval<Interval>} | |
| 62 """ | |
| 63 super(Transcript, self).copy(transcript) | |
| 64 if transcript.__class__.__name__ == "Transcript": | |
| 65 exons = transcript.getExons() | |
| 66 if len(exons) > 1: | |
| 67 for exon in exons: | |
| 68 exonCopy = Interval(exon) | |
| 69 self.addExon(exonCopy) | |
| 70 | |
| 71 | |
| 72 def setDirection(self, direction): | |
| 73 """ | |
| 74 Set the direction of the interval | |
| 75 Possibly parse different formats | |
| 76 Impact all exons | |
| 77 @param direction: direction of the transcript (+ / -) | |
| 78 @type direction: int or string | |
| 79 """ | |
| 80 super(Transcript, self).setDirection(direction) | |
| 81 for exon in self.exons: | |
| 82 exon.setDirection(direction) | |
| 83 | |
| 84 | |
| 85 def setChromosome(self, chromosome): | |
| 86 """ | |
| 87 Set the chromosome | |
| 88 @param chromosome: chromosome on which the transcript is | |
| 89 @type chromosome: string | |
| 90 """ | |
| 91 super(Transcript, self).setChromosome(chromosome) | |
| 92 for exon in self.exons: | |
| 93 exon.setChromosome(chromosome) | |
| 94 | |
| 95 | |
| 96 def addExon(self, exon): | |
| 97 """ | |
| 98 Add an exon to the list of exons | |
| 99 @param exon: a new exon | |
| 100 @type exon: class L{Interval<Interval>} | |
| 101 """ | |
| 102 if not self.exons and not exon.overlapWith(self): | |
| 103 firstExon = Interval() | |
| 104 firstExon.setStart(self.getStart()) | |
| 105 firstExon.setEnd(self.getEnd()) | |
| 106 firstExon.setDirection(self.getDirection()) | |
| 107 firstExon.setChromosome(self.getChromosome()) | |
| 108 self.exons.append(firstExon) | |
| 109 newExon = Interval(exon) | |
| 110 newExon.setDirection(self.getDirection()) | |
| 111 self.exons.append(newExon) | |
| 112 if newExon.getStart() < self.getStart(): | |
| 113 self.setStart(newExon.getStart()) | |
| 114 if newExon.getEnd() > self.getEnd(): | |
| 115 self.setEnd(newExon.getEnd()) | |
| 116 | |
| 117 | |
| 118 def setStart(self, start): | |
| 119 """ | |
| 120 Set the new start, move the first exon accordingly (if exists) | |
| 121 @param start: the new start | |
| 122 @type start: int | |
| 123 """ | |
| 124 super(Transcript, self).setStart(start) | |
| 125 if self.exons: | |
| 126 self.sortExonsIncreasing() | |
| 127 self.exons[0].setStart(start) | |
| 128 | |
| 129 | |
| 130 def setEnd(self, end): | |
| 131 """ | |
| 132 Set the new end, move the last exon accordingly (if exists) | |
| 133 @param end: the new end | |
| 134 @type end: int | |
| 135 """ | |
| 136 super(Transcript, self).setEnd(end) | |
| 137 if self.exons: | |
| 138 self.sortExonsIncreasing() | |
| 139 self.exons[-1].setEnd(end) | |
| 140 | |
| 141 | |
| 142 def reverse(self): | |
| 143 """ | |
| 144 Reverse the strand of the transcript | |
| 145 """ | |
| 146 super(Transcript, self).reverse() | |
| 147 for exon in self.exons: | |
| 148 exon.reverse() | |
| 149 | |
| 150 | |
| 151 def getUniqueName(self): | |
| 152 """ | |
| 153 Try to give a unique name by possibly adding occurrence | |
| 154 """ | |
| 155 if "nbOccurrences" in self.tags and "occurrence" in self.tags and self.tags["nbOccurrences"] != 1: | |
| 156 return "%s-%d" % (self.name, self.tags["occurrence"]) | |
| 157 return self.name | |
| 158 | |
| 159 | |
| 160 def getNbExons(self): | |
| 161 """ | |
| 162 Get the number of exons | |
| 163 """ | |
| 164 return max(1, len(self.exons)) | |
| 165 | |
| 166 | |
| 167 def getExon(self, i): | |
| 168 """ | |
| 169 Get a specific exon | |
| 170 @param i: the rank of the exon | |
| 171 @type i: int | |
| 172 """ | |
| 173 if len(self.exons) == 0: | |
| 174 if i != 0: | |
| 175 raise Exception("Cannot get exon #%i while there is no exon in the transcript" % (i)) | |
| 176 return self | |
| 177 return self.exons[i] | |
| 178 | |
| 179 | |
| 180 def getExons(self): | |
| 181 """ | |
| 182 Get all the exons | |
| 183 """ | |
| 184 if len(self.exons) == 0: | |
| 185 return [Interval(self)] | |
| 186 return self.exons | |
| 187 | |
| 188 | |
| 189 def getIntrons(self): | |
| 190 """ | |
| 191 Get all the introns | |
| 192 Compute introns on the fly | |
| 193 """ | |
| 194 if self.introns != None: | |
| 195 return self.introns | |
| 196 self.sortExons() | |
| 197 self.introns = [] | |
| 198 exonStart = self.getExon(0) | |
| 199 for cpt, exonEnd in enumerate(self.exons[1:]): | |
| 200 intron = Interval() | |
| 201 intron.setName("%s_intron%d" % (self.getName(), cpt+1)) | |
| 202 intron.setChromosome(self.getChromosome()) | |
| 203 intron.setDirection(self.getDirection()) | |
| 204 if self.getDirection() == 1: | |
| 205 intron.setEnd(exonEnd.getStart() - 1) | |
| 206 intron.setStart(exonStart.getEnd() + 1) | |
| 207 else: | |
| 208 intron.setStart(exonEnd.getEnd() + 1) | |
| 209 intron.setEnd(exonStart.getStart() - 1) | |
| 210 intron.setDirection(self.getDirection()) | |
| 211 if intron.getSize() > 0: | |
| 212 self.introns.append(intron) | |
| 213 exonStart = exonEnd | |
| 214 intron.setSize(intron.getEnd() - intron.getStart() + 1) | |
| 215 return self.introns | |
| 216 | |
| 217 | |
| 218 def getSize(self): | |
| 219 """ | |
| 220 Get the size of the transcript (i.e. the number of nucleotides) | |
| 221 Compute size on the fly | |
| 222 """ | |
| 223 if len(self.exons) == 0: | |
| 224 return self.getSizeWithIntrons() | |
| 225 size = 0 | |
| 226 for exon in self.exons: | |
| 227 size += exon.getSize() | |
| 228 return size | |
| 229 | |
| 230 | |
| 231 def getSizeWithIntrons(self): | |
| 232 """ | |
| 233 Get the size of the interval (i.e. distance from start to end) | |
| 234 """ | |
| 235 return super(Transcript, self).getSize() | |
| 236 | |
| 237 | |
| 238 def overlapWithExon(self, transcript, nbNucleotides = 1): | |
| 239 """ | |
| 240 Check if the exons of this transcript overlap with the exons of another transcript | |
| 241 @param transcript: transcript to be compared to | |
| 242 @type transcript: class L{Transcript<Transcript>} | |
| 243 @param nbNucleotides: minimum number of nucleotides to declare and overlap | |
| 244 @type nbNucleotides: int | |
| 245 """ | |
| 246 if not self.overlapWith(transcript, nbNucleotides): | |
| 247 return False | |
| 248 for thisExon in self.getExons(): | |
| 249 for thatExon in transcript.getExons(): | |
| 250 if thisExon.overlapWith(thatExon, nbNucleotides): | |
| 251 return True | |
| 252 return False | |
| 253 | |
| 254 | |
| 255 def include(self, transcript): | |
| 256 """ | |
| 257 Whether this transcript includes the other one | |
| 258 @param transcript: object to be compared to | |
| 259 @type transcript: class L{Transcript<Transcript>} | |
| 260 """ | |
| 261 if not super(Transcript, self).include(transcript): | |
| 262 return False | |
| 263 for thatExon in transcript.getExons(): | |
| 264 for thisExon in self.getExons(): | |
| 265 if thisExon.include(thatExon): | |
| 266 break | |
| 267 else: | |
| 268 return False | |
| 269 return True | |
| 270 | |
| 271 | |
| 272 def merge(self, transcript, normalization = False): | |
| 273 """ | |
| 274 Merge with another transcript | |
| 275 Merge exons if they overlap, otherwise add exons | |
| 276 @param transcript: transcript to be merged to | |
| 277 @type transcript: class L{Transcript<Transcript>} | |
| 278 @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements | |
| 279 @type normalization: boolean | |
| 280 """ | |
| 281 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection(): | |
| 282 raise Exception("Cannot merge '%s' with '%s'!" % (self, transcript)) | |
| 283 | |
| 284 theseExons = self.getExons() | |
| 285 thoseExons = transcript.getExons() | |
| 286 | |
| 287 for thatExon in thoseExons: | |
| 288 toBeRemoved = [] | |
| 289 for thisIndex, thisExon in enumerate(theseExons): | |
| 290 if thisExon.overlapWith(thatExon): | |
| 291 thatExon.merge(thisExon) | |
| 292 toBeRemoved.append(thisIndex) | |
| 293 theseExons.append(thatExon) | |
| 294 for thisIndex in reversed(toBeRemoved): | |
| 295 del theseExons[thisIndex] | |
| 296 self.removeExons() | |
| 297 self.setStart(min(self.getStart(), transcript.getStart())) | |
| 298 self.setEnd(max(self.getEnd(), transcript.getEnd())) | |
| 299 if len(theseExons) > 1: | |
| 300 for thisExon in theseExons: | |
| 301 self.addExon(thisExon) | |
| 302 | |
| 303 self.setName("%s--%s" % (self.getUniqueName(), transcript.getUniqueName())) | |
| 304 super(Transcript, self).merge(transcript, normalization) | |
| 305 | |
| 306 | |
| 307 def getDifference(self, transcript, sameStrand = False): | |
| 308 """ | |
| 309 Get the difference between this cluster and another one | |
| 310 @param transcript: object to be compared to | |
| 311 @type transcript: class L{Transcript<Transcript>} | |
| 312 @param sameStrand: do the comparison iff the transcripts are on the same strand | |
| 313 @type sameStrand: boolean | |
| 314 @return: a transcript | |
| 315 """ | |
| 316 newTranscript = Transcript() | |
| 317 newTranscript.copy(self) | |
| 318 if self.getChromosome() != transcript.getChromosome(): | |
| 319 return newTranscript | |
| 320 if not self.overlapWith(transcript): | |
| 321 return newTranscript | |
| 322 if sameStrand and self.getDirection() != transcript.getDirection(): | |
| 323 return newTranscript | |
| 324 newTranscript.removeExons() | |
| 325 if transcript.getEnd() > newTranscript.getStart(): | |
| 326 newTranscript.setStart(transcript.getEnd() + 1) | |
| 327 if transcript.getStart() < newTranscript.getEnd(): | |
| 328 newTranscript.setEnd(transcript.getStart() + 1) | |
| 329 theseExons = [] | |
| 330 for exon in self.getExons(): | |
| 331 exonCopy = Interval() | |
| 332 exonCopy.copy(exon) | |
| 333 theseExons.append(exonCopy) | |
| 334 for thatExon in transcript.getExons(): | |
| 335 newExons = [] | |
| 336 for thisExon in theseExons: | |
| 337 newExons.extend(thisExon.getDifference(thatExon)) | |
| 338 theseExons = newExons | |
| 339 if not theseExons: | |
| 340 return None | |
| 341 newStart, newEnd = theseExons[0].getStart(), theseExons[0].getEnd() | |
| 342 for thisExon in theseExons[1:]: | |
| 343 newStart = min(newStart, thisExon.getStart()) | |
| 344 newEnd = max(newEnd, thisExon.getEnd()) | |
| 345 newTranscript.setEnd(newEnd) | |
| 346 newTranscript.setStart(newStart) | |
| 347 newTranscript.exons = theseExons | |
| 348 return newTranscript | |
| 349 | |
| 350 | |
| 351 def getSqlVariables(cls): | |
| 352 """ | |
| 353 Get the properties of the object that should be saved in a database | |
| 354 """ | |
| 355 variables = Interval.getSqlVariables() | |
| 356 variables.append("exons") | |
| 357 return variables | |
| 358 getSqlVariables = classmethod(getSqlVariables) | |
| 359 | |
| 360 | |
| 361 def setSqlValues(self, array): | |
| 362 """ | |
| 363 Set the values of the properties of this object as given by a results line of a SQL query | |
| 364 @param array: the values to be copied | |
| 365 @type array: a list | |
| 366 """ | |
| 367 super(Transcript, self).setSqlValues(array) | |
| 368 mergedExons = array[8] | |
| 369 if not mergedExons: | |
| 370 return | |
| 371 for exonCount, splittedExon in enumerate(mergedExons.split(",")): | |
| 372 start, end = splittedExon.split("-") | |
| 373 exon = Interval() | |
| 374 exon.setChromosome(self.getChromosome()) | |
| 375 exon.setDirection(self.getDirection()) | |
| 376 exon.setName("%s_exon%d" % (self.getName(), exonCount+1)) | |
| 377 exon.setStart(int(start)) | |
| 378 exon.setEnd(int(end)) | |
| 379 self.addExon(exon) | |
| 380 | |
| 381 | |
| 382 def getSqlValues(self): | |
| 383 """ | |
| 384 Get the values of the properties that should be saved in a database | |
| 385 """ | |
| 386 values = super(Transcript, self).getSqlValues() | |
| 387 values["size"] = self.getSize() | |
| 388 if self.getNbExons() == 1: | |
| 389 values["exons"] = "" | |
| 390 else: | |
| 391 values["exons"] = ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in self.getExons()]) | |
| 392 return values | |
| 393 | |
| 394 | |
| 395 def getSqlTypes(cls): | |
| 396 """ | |
| 397 Get the types of the properties that should be saved in a database | |
| 398 """ | |
| 399 types = Interval.getSqlTypes() | |
| 400 types["exons"] = "varchar" | |
| 401 return types | |
| 402 getSqlTypes = classmethod(getSqlTypes) | |
| 403 | |
| 404 | |
| 405 def getSqlSizes(cls): | |
| 406 """ | |
| 407 Get the sizes of the properties that should be saved in a database | |
| 408 """ | |
| 409 sizes = Interval.getSqlSizes() | |
| 410 sizes["exons"] = 10000 | |
| 411 return sizes | |
| 412 getSqlSizes = classmethod(getSqlSizes) | |
| 413 | |
| 414 | |
| 415 def sortExons(self): | |
| 416 """ | |
| 417 Sort the exons | |
| 418 Increasing order if transcript is on strand "+", decreasing otherwise | |
| 419 """ | |
| 420 self.sortExonsIncreasing() | |
| 421 if self.getDirection() == -1: | |
| 422 exons = self.getExons() | |
| 423 exons.reverse() | |
| 424 self.exons = exons | |
| 425 | |
| 426 | |
| 427 def sortExonsIncreasing(self): | |
| 428 """ | |
| 429 Sort the exons | |
| 430 Increasing order | |
| 431 """ | |
| 432 exons = self.getExons() | |
| 433 sortedExons = [] | |
| 434 while len(exons) > 0: | |
| 435 minExon = exons[0] | |
| 436 for index in range(1, len(exons)): | |
| 437 if minExon.getStart() > exons[index].getStart(): | |
| 438 minExon = exons[index] | |
| 439 sortedExons.append(minExon) | |
| 440 exons.remove(minExon) | |
| 441 self.exons = sortedExons | |
| 442 | |
| 443 | |
| 444 def extendStart(self, size): | |
| 445 """ | |
| 446 Extend the transcript by the 5' end | |
| 447 @param size: the size to be extended | |
| 448 @type size: int | |
| 449 """ | |
| 450 if len(self.exons) != 0: | |
| 451 self.sortExons() | |
| 452 if self.getDirection() == 1: | |
| 453 self.exons[0].setStart(max(0, self.exons[0].getStart() - size)) | |
| 454 else: | |
| 455 self.exons[0].setEnd(self.exons[0].getEnd() + size) | |
| 456 super(Transcript, self).extendStart(size) | |
| 457 self.bin = None | |
| 458 | |
| 459 | |
| 460 def extendEnd(self, size): | |
| 461 """ | |
| 462 Extend the transcript by the 3' end | |
| 463 @param size: the size to be extended | |
| 464 @type size: int | |
| 465 """ | |
| 466 if len(self.exons) != 0: | |
| 467 self.sortExons() | |
| 468 if self.getDirection() == 1: | |
| 469 self.exons[-1].setEnd(self.exons[-1].getEnd() + size) | |
| 470 else: | |
| 471 self.exons[-1].setStart(max(0, self.exons[-1].getStart() - size)) | |
| 472 super(Transcript, self).extendEnd(size) | |
| 473 self.bin = None | |
| 474 | |
| 475 | |
| 476 def extendExons(self, size): | |
| 477 """ | |
| 478 Extend all the exons | |
| 479 @param size: the size to be extended | |
| 480 @type size: int | |
| 481 """ | |
| 482 if len(self.exons) != 0: | |
| 483 self.sortExons() | |
| 484 exons = [] | |
| 485 previousExon = None | |
| 486 for exon in self.exons: | |
| 487 exon.extendStart(size) | |
| 488 exon.extendEnd(size) | |
| 489 exon.setDirection(self.getDirection()) | |
| 490 if previousExon != None and previousExon.overlapWith(exon): | |
| 491 previousExon.merge(exon) | |
| 492 else: | |
| 493 if previousExon != None: | |
| 494 exons.append(previousExon) | |
| 495 previousExon = exon | |
| 496 exons.append(previousExon) | |
| 497 self.exons = exons | |
| 498 super(Transcript, self).extendStart(size) | |
| 499 super(Transcript, self).extendEnd(size) | |
| 500 self.bin = None | |
| 501 | |
| 502 | |
| 503 def restrictStart(self, size = 1): | |
| 504 """ | |
| 505 Restrict the transcript by some nucleotides, start from its start position | |
| 506 Remove the exons | |
| 507 @param size: the size to be restricted to | |
| 508 @type size: int | |
| 509 """ | |
| 510 newExons = [] | |
| 511 if self.getDirection() == 1: | |
| 512 for exon in self.exons: | |
| 513 if exon.getStart() <= self.getStart() + size - 1: | |
| 514 if exon.getEnd() > self.getStart() + size - 1: | |
| 515 exon.setEnd(self.getStart() + size - 1) | |
| 516 newExons.append(exon) | |
| 517 else: | |
| 518 for exon in self.exons: | |
| 519 if exon.getEnd() >= self.getEnd() - size + 1: | |
| 520 if exon.getStart() < self.getEnd() - size + 1: | |
| 521 exon.setStart(self.getEnd() - size + 1) | |
| 522 newExons.append(exon) | |
| 523 super(Transcript, self).restrictStart(size) | |
| 524 self.exons = newExons | |
| 525 | |
| 526 | |
| 527 def restrictEnd(self, size = 1): | |
| 528 """ | |
| 529 Restrict the transcript by some nucleotides, end from its end position | |
| 530 Remove the exons | |
| 531 @param size: the size to be restricted to | |
| 532 @type size: int | |
| 533 """ | |
| 534 newExons = [] | |
| 535 if self.getDirection() == 1: | |
| 536 for exon in self.exons: | |
| 537 if exon.getEnd() >= self.getEnd() - size + 1: | |
| 538 if exon.getStart() < self.getEnd() - size + 1: | |
| 539 exon.setStart(self.getEnd() - size + 1) | |
| 540 newExons.append(exon) | |
| 541 else: | |
| 542 for exon in self.exons: | |
| 543 if exon.getEnd() >= self.getEnd() - size + 1: | |
| 544 if exon.getStart() < self.getEnd() - size + 1: | |
| 545 exon.setEnd(self.getEnd() - size + 1) | |
| 546 newExons.append(exon) | |
| 547 super(Transcript, self).restrictEnd(size) | |
| 548 self.exons = newExons | |
| 549 | |
| 550 | |
| 551 def removeExons(self): | |
| 552 """ | |
| 553 Remove the exons and transforms the current transcript into a mere interval | |
| 554 """ | |
| 555 self.exons = [] | |
| 556 self.bin = None | |
| 557 | |
| 558 | |
| 559 def printGtf(self, title): | |
| 560 """ | |
| 561 Export this transcript using GTF2.2 format | |
| 562 @param title: the title of the transcripts | |
| 563 @type title: string | |
| 564 @return: a string | |
| 565 """ | |
| 566 transcriptId = self.getUniqueName() | |
| 567 geneId = "%s_gene" % (transcriptId) | |
| 568 direction = "+" | |
| 569 if self.getDirection() == -1: | |
| 570 direction = "-" | |
| 571 self.sortExonsIncreasing() | |
| 572 string = "" | |
| 573 for i, exon in enumerate(self.getExons()): | |
| 574 exonCopy = Interval() | |
| 575 exonCopy.copy(exon) | |
| 576 if "ID" in exonCopy.getTagValues(): | |
| 577 del exonCopy.tags["ID"] | |
| 578 feature = "exon" | |
| 579 if "feature" in exonCopy.getTagNames(): | |
| 580 feature = exonCopy.getTagValue("feature") | |
| 581 del exonCopy.tags["feature"] | |
| 582 score = "." | |
| 583 if "score" in exonCopy.getTagNames(): | |
| 584 score = "%d" % (int(exonCopy.getTagValue("score"))) | |
| 585 del exonCopy.tags["score"] | |
| 586 if "Parent" in exonCopy.getTagNames(): | |
| 587 del exonCopy.tags["Parent"] | |
| 588 exonCopy.setName("%s_part%d" % (self.getName(), i+1)) | |
| 589 comment = exonCopy.getTagValues("; ", " ", "\"") | |
| 590 string += "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\ttranscript_id \"%s\"; gene_id \"%s\"; %s\n" % (exonCopy.getChromosome(), title, feature, exonCopy.getStart(), exonCopy.getEnd(), score, direction, transcriptId, geneId, comment) | |
| 591 return string | |
| 592 | |
| 593 | |
| 594 def printGff2(self, title): | |
| 595 """ | |
| 596 Export this transcript using GFF2 format | |
| 597 @param title: the title of the transcripts | |
| 598 @type title: string | |
| 599 @return: a string | |
| 600 """ | |
| 601 direction = "+" | |
| 602 if self.getDirection() == -1: | |
| 603 direction = "-" | |
| 604 self.sortExonsIncreasing() | |
| 605 comment = self.getTagValues() | |
| 606 if comment != None: | |
| 607 comment = ";%s" % (comment) | |
| 608 score = "." | |
| 609 if "score" in self.getTagNames(): | |
| 610 score = "%d" % (int(self.getTagValue("score"))) | |
| 611 feature = "transcript" | |
| 612 if "feature" in self.getTagNames(): | |
| 613 feature = self.getTagValue("feature") | |
| 614 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tGENE %s%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, self.name, comment) | |
| 615 for exon in self.getExons(): | |
| 616 if "score" in exon.getTagNames(): | |
| 617 score = "%d" % (int(self.getTagValue("score"))) | |
| 618 string += "%s\t%s\t_exon\t%d\t%d\t%s\t%s\t.\tGENE %s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.name) | |
| 619 return string | |
| 620 | |
| 621 | |
| 622 def printGff3(self, title): | |
| 623 """ | |
| 624 Export this transcript using GFF3 format | |
| 625 @param title: the title of the transcripts | |
| 626 @type title: string | |
| 627 @return: a string | |
| 628 """ | |
| 629 direction = "+" | |
| 630 if self.getDirection() == -1: | |
| 631 direction = "-" | |
| 632 self.sortExonsIncreasing() | |
| 633 if "ID" not in self.getTagValues(): | |
| 634 self.setTagValue("ID", self.getUniqueName()) | |
| 635 feature = "transcript" | |
| 636 tags = self.tags | |
| 637 if "feature" in self.getTagNames(): | |
| 638 feature = self.getTagValue("feature") | |
| 639 del self.tags["feature"] | |
| 640 score = "." | |
| 641 if "score" in self.getTagNames(): | |
| 642 score = "%d" % (int(self.getTagValue("score"))) | |
| 643 del self.tags["score"] | |
| 644 comment = self.getTagValues(";", "=") | |
| 645 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, comment) | |
| 646 if len(self.exons) > 1: | |
| 647 for i, exon in enumerate(self.getExons()): | |
| 648 if "score" in exon.getTagNames(): | |
| 649 score = "%d" % (int(exon.getTagValue("score"))) | |
| 650 string += "%s\t%s\texon\t%d\t%d\t%s\t%s\t.\tID=%s-exon%d;Name=%s-exon%d;Parent=%s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.getTagValue("ID"), i+1, self.name, i+1, self.getTagValue("ID")) | |
| 651 self.tags = tags | |
| 652 return string | |
| 653 | |
| 654 | |
| 655 def printEmbl(self): | |
| 656 """ | |
| 657 Export this transcript using EMBL format | |
| 658 @return: a string | |
| 659 """ | |
| 660 if len(self.exons) <= 1: | |
| 661 position = "%d..%d" % (self.getStart(), self.getEnd()) | |
| 662 else: | |
| 663 positions = [] | |
| 664 for exon in self.getExons(): | |
| 665 positions.append("%d..%d" % (self.getStart(), self.getEnd())) | |
| 666 position = ",".join(positions) | |
| 667 position = "join(%s)" % (position) | |
| 668 if self.getDirection() == -1: | |
| 669 position = "complement(%s)" % (position) | |
| 670 feature = "misc_feature" | |
| 671 if "feature" in self.getTagNames(): | |
| 672 if not self.getTagValue("feature").startswith("S-MART"): | |
| 673 feature = self.getTagValue("feature") | |
| 674 string = "FT %s %s\n" % (feature, position) | |
| 675 if "Name" in self.getTagNames(): | |
| 676 string += "FT /label=\"%s\"\n" % (self.getTagValue("Name")) | |
| 677 return string | |
| 678 | |
| 679 | |
| 680 def printBed(self): | |
| 681 """ | |
| 682 Export this transcript using BED format | |
| 683 @return: a string | |
| 684 """ | |
| 685 name = self.name | |
| 686 if "nbOccurrences" in self.getTagNames() and self.getTagValue("nbOccurrences") != 1 and self.getTagValue("occurrences"): | |
| 687 name = "%s-%d" % (name, self.getTagValue("occurrence")) | |
| 688 comment = self.getTagValues(";", "=") | |
| 689 sizes = [] | |
| 690 starts = [] | |
| 691 direction = "+" | |
| 692 if self.getDirection() == -1: | |
| 693 direction = "-" | |
| 694 self.sortExonsIncreasing() | |
| 695 for exon in self.getExons(): | |
| 696 sizes.append("%d" % (exon.getSize())) | |
| 697 starts.append("%d" % (exon.getStart() - self.getStart())) | |
| 698 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome(), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) | |
| 699 | |
| 700 | |
| 701 def printSam(self): | |
| 702 """ | |
| 703 Export this transcript using SAM format | |
| 704 @return: a string | |
| 705 """ | |
| 706 name = self.name | |
| 707 flag = 0 if self.getDirection() == 1 else 0x10 | |
| 708 chromosome = self.getChromosome() | |
| 709 genomeStart = self.getStart() | |
| 710 quality = 255 | |
| 711 mate = "*" | |
| 712 mateGenomeStart = 0 | |
| 713 gapSize = 0 | |
| 714 sequence = "*" | |
| 715 qualityString = "*" | |
| 716 tags = "NM:i:0" | |
| 717 | |
| 718 lastExonEnd = None | |
| 719 self.sortExonsIncreasing() | |
| 720 exon = self.getExons()[0] | |
| 721 cigar = "%dM" % (self.getExons()[0].getSize()) | |
| 722 lastExonEnd = exon.getEnd() | |
| 723 for i, exon in enumerate(self.getExons()): | |
| 724 if i == 0: | |
| 725 continue | |
| 726 cigar += "%dN" % (exon.getStart() - lastExonEnd - 1) | |
| 727 cigar += "%dM" % (exon.getSize()) | |
| 728 | |
| 729 return "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n" % (name, flag, chromosome, genomeStart, quality, cigar, mate, mateGenomeStart, gapSize, sequence, qualityString, tags) | |
| 730 | |
| 731 | |
| 732 def printUcsc(self): | |
| 733 """ | |
| 734 Export this transcript using UCSC BED format | |
| 735 @return: a string | |
| 736 """ | |
| 737 if self.getChromosome().find("Het") != -1: | |
| 738 return "" | |
| 739 name = self.name | |
| 740 comment = self.getTagValues(";", "") | |
| 741 sizes = [] | |
| 742 starts = [] | |
| 743 direction = "+" | |
| 744 if self.getDirection() == -1: | |
| 745 direction = "-" | |
| 746 self.sortExonsIncreasing() | |
| 747 for exon in self.getExons(): | |
| 748 sizes.append("%d" % (exon.getSize())) | |
| 749 starts.append("%d" % (exon.getStart() - self.getStart())) | |
| 750 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome().replace("arm_", "chr"), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) | |
| 751 | |
| 752 | |
| 753 def printGBrowseReference(self): | |
| 754 """ | |
| 755 Export this transcript using GBrowse format (1st line only) | |
| 756 @return: a string | |
| 757 """ | |
| 758 return "reference = %s\n" % (self.getChromosome()) | |
| 759 | |
| 760 | |
| 761 def printGBrowseLine(self): | |
| 762 """ | |
| 763 Export this transcript using GBrowse format (2nd line only) | |
| 764 @return: a string | |
| 765 """ | |
| 766 self.sortExons() | |
| 767 coordinates = [] | |
| 768 for exon in self.getExons(): | |
| 769 coordinates.append(exon.printCoordinates()) | |
| 770 coordinatesString = ",".join(coordinates) | |
| 771 comment = self.getTagValues(";", "=") | |
| 772 if comment: | |
| 773 comment = "\t\"%s\"" % (comment) | |
| 774 return "User_data\t%s\t%s%s\n" % (self.name, coordinatesString, comment) | |
| 775 | |
| 776 | |
| 777 def printGBrowse(self): | |
| 778 """ | |
| 779 Export this transcript using GBrowse format | |
| 780 @return: a string | |
| 781 """ | |
| 782 return "%s%s" % (self.printGBrowseReference(), self.printGBrowseLine()) | |
| 783 | |
| 784 | |
| 785 def printCsv(self): | |
| 786 """ | |
| 787 Export this transcript using CSV format | |
| 788 @return: a string | |
| 789 """ | |
| 790 self.sortExons() | |
| 791 string = "%s,%d,%d,\"%s\"," % (self.getChromosome(), self.getStart(), self.getEnd(), "+" if self.getDirection() == 1 else "-") | |
| 792 if len(self.getExons()) == 1: | |
| 793 string += "None" | |
| 794 else: | |
| 795 for exon in self.getExons(): | |
| 796 string += "%d-%d " % (exon.getStart(), exon.getEnd()) | |
| 797 for tag in sorted(self.tags.keys()): | |
| 798 string += ",%s=%s" % (tag, str(self.tags[tag])) | |
| 799 string += "\n" | |
| 800 return string | |
| 801 | |
| 802 | |
| 803 def extractSequence(self, parser): | |
| 804 """ | |
| 805 Get the sequence corresponding to this transcript | |
| 806 @param parser: a parser to a FASTA file | |
| 807 @type parser: class L{SequenceListParser<SequenceListParser>} | |
| 808 @return: an instance of L{Sequence<Sequence>} | |
| 809 """ | |
| 810 self.sortExons() | |
| 811 name = self.name | |
| 812 if "ID" in self.getTagNames() and self.getTagValue("ID") != self.name: | |
| 813 name += ":%s" % (self.getTagValue("ID")) | |
| 814 sequence = Sequence(name) | |
| 815 for exon in self.getExons(): | |
| 816 sequence.concatenate(exon.extractSequence(parser)) | |
| 817 return sequence | |
| 818 | |
| 819 | |
| 820 def extractWigData(self, parser): | |
| 821 """ | |
| 822 Get some wig data corresponding to this transcript | |
| 823 @param parser: a parser to a wig file | |
| 824 @type parser: class L{WigParser<WigParser>} | |
| 825 @return: a sequence of float | |
| 826 """ | |
| 827 self.sortExons() | |
| 828 if parser.strands: | |
| 829 strands = (-1, 1) | |
| 830 values = dict([(strand, []) for strand in strands]) | |
| 831 for exon in self.getExons(): | |
| 832 theseValues = exon.extractWigData(parser) | |
| 833 if self.getDirection() == -1: | |
| 834 for strand in strands: | |
| 835 theseValues[strand].reverse() | |
| 836 for strand in strands: | |
| 837 values[strand].extend(theseValues[strand]) | |
| 838 if self.getDirection() == -1: | |
| 839 for strand in strands: | |
| 840 values[strand].reverse() | |
| 841 return values | |
| 842 else: | |
| 843 values = [] | |
| 844 for exon in self.getExons(): | |
| 845 theseValues = exon.extractWigData(parser) | |
| 846 #if self.getDirection() == -1: | |
| 847 # theseValues.reverse() | |
| 848 values.extend(theseValues) | |
| 849 #if self.getDirection() == -1: | |
| 850 # values.reverse() | |
| 851 return values |
