6
+ − 1 #
+ − 2 # Copyright INRA-URGI 2009-2010
+ − 3 #
+ − 4 # This software is governed by the CeCILL license under French law and
+ − 5 # abiding by the rules of distribution of free software. You can use,
+ − 6 # modify and/ or redistribute the software under the terms of the CeCILL
+ − 7 # license as circulated by CEA, CNRS and INRIA at the following URL
+ − 8 # "http://www.cecill.info".
+ − 9 #
+ − 10 # As a counterpart to the access to the source code and rights to copy,
+ − 11 # modify and redistribute granted by the license, users are provided only
+ − 12 # with a limited warranty and the software's author, the holder of the
+ − 13 # economic rights, and the successive licensors have only limited
+ − 14 # liability.
+ − 15 #
+ − 16 # In this respect, the user's attention is drawn to the risks associated
+ − 17 # with loading, using, modifying and/or developing or reproducing the
+ − 18 # software by the user in light of its specific status of free software,
+ − 19 # that may mean that it is complicated to manipulate, and that also
+ − 20 # therefore means that it is reserved for developers and experienced
+ − 21 # professionals having in-depth computer knowledge. Users are therefore
+ − 22 # encouraged to load and test the software's suitability as regards their
+ − 23 # requirements in conditions enabling the security of their systems and/or
+ − 24 # data to be ensured and, more generally, to use and operate it in the
+ − 25 # same conditions as regards security.
+ − 26 #
+ − 27 # The fact that you are presently reading this means that you have had
+ − 28 # knowledge of the CeCILL license and that you accept its terms.
+ − 29 #
+ − 30 import sys
+ − 31 import random
+ − 32 from SMART.Java.Python.misc import Utils
+ − 33 from SMART.Java.Python.structure.Transcript import Transcript
+ − 34 from SMART.Java.Python.structure.TranscriptList import TranscriptList
+ − 35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+ − 36 from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
+ − 37 from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable
+ − 38 from SMART.Java.Python.misc.Progress import Progress
+ − 39 from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
+ − 40
+ − 41
+ − 42
+ − 43 class TranscriptListsComparator(object):
+ − 44 """
+ − 45 Compare two transcript lists, using a database for one of the list
+ − 46 Uses one TranscriptContainer for query data,
+ − 47 one TranscriptContainer exported to MySqlTranscriptTable for reference data,
+ − 48 one MySqlTranscriptTable for transformed reference data
+ − 49 @ivar inputTranscriptContainers: parsers to the list of query transcripts
+ − 50 @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>}
+ − 51 @ivar writer: transcript list writer
+ − 52 @type writer: class L{TranscriptListWriter<TranscriptListWriter>}
+ − 53 @ivar mySqlConnection: connection to a MySQL database (to compute the ovelapping efficiently)
+ − 54 @type mySqlConnection: class L{MySqlConnection<MySqlConnection>}
+ − 55 @ivar introns: compare transcripts or exons only
+ − 56 @type introns: list of 2 boolean
+ − 57 @ivar starts: restrict the query transcripts to first nucleotides
+ − 58 @type starts: list of 2 int or None
+ − 59 @ivar fivePrimes: extend a list of transcripts by their 5' end
+ − 60 @type fivePrimes: list of 2 int or None
+ − 61 @ivar threePrimes: extend a list of transcripts by their 3' end
+ − 62 @type threePrimes: list of 2 int or None
+ − 63 @ivar minDistance: min distance between two transcripts [default: 0]
+ − 64 @type minDistance: int
+ − 65 @ivar maxDistance: max distance between two transcripts [default: 0]
+ − 66 @type maxDistance: int
+ − 67 @ivar minOverlap: minimum number of overlapping nucleotides to declare an overlap
+ − 68 @type minOverlap: int
+ − 69 @ivar pcOverlap: percentage of overlapping nucleotides to declare an overlap
+ − 70 @type pcOverlap: int
+ − 71 @ivar upstreams: consider distances with elements which are upstream of the transcripts
+ − 72 @type upstreams: boolean
+ − 73 @ivar downstreams: consider distances with elements which are downstream of the transcripts
+ − 74 @type downstreams: boolean
+ − 75 @ivar colinear: whether transcripts should overlap in the same direction
+ − 76 @type colinear: boolean
+ − 77 @ivar antisense: whether transcripts should overlap in the opposite direction
+ − 78 @type antisense: boolean
+ − 79 @ivar outputDistance: output distance between query and reference instead of query transcript
+ − 80 @type outputDistance: boolean
+ − 81 @ivar absolute: do not consider the strand while computing distance
+ − 82 @type absolute: boolean
+ − 83 @ivar strandedDistance: return a line per strand while computing distances
+ − 84 @type strandedDistance: boolean
+ − 85 @ivar QUERY: constant specifying the query objects
+ − 86 @type QUERY: int
+ − 87 @ivar REFERENCE: constant specifying the reference objects
+ − 88 @type REFERENCE: int
+ − 89 @ivar INPUTTYPES: set of input types of data (query or reference) objects
+ − 90 @type INPUTTYPES: list of 2 int
+ − 91 @ivar typeToString: string representation of the previous types
+ − 92 @type typeToString: dict
+ − 93 @ivar tableNames: name of the transcript tables
+ − 94 @type tableNames: dict of strings
+ − 95 @ivar nbTranscripts: number of transcript in the query/reference set
+ − 96 @type nbTranscripts: list of 2 int or None
+ − 97 @ivar nbNucleotides: number of nucleotides in the query/reference set
+ − 98 @type nbNucleotides: list of 2 int or None
+ − 99 @ivar transcriptsToBeStored: transcripts that will be stored into database
+ − 100 @type transcriptsToBeStored: dict of class L{TranscriptList<TranscriptList>}
+ − 101 @ivar multiple: in merge mode, aggregate multiple transcripts
+ − 102 @type multiple: boolean
+ − 103 @ivar normalization: normalize each element by the number of mappings of this element
+ − 104 @type normalization: boolean
+ − 105 @ivar invert: invert the current comparison
+ − 106 @type invert: boolean
+ − 107 @ivar splitDifference: split into intervals when computing difference
+ − 108 @type splitDifference: boolean
+ − 109 @ivar odds: whether odds about the comparison should be computed
+ − 110 @type odds: boolean
+ − 111 @ivar overlapResults: count the number of overlaps
+ − 112 @type overlapResults: dictionary
+ − 113 @ivar oddResults: compute the number of times each transcript overlaps (or is merged with) another one
+ − 114 @type oddResults: dictionary
+ − 115 @ivar outputContainer: container of the output transcripts
+ − 116 @type outputContainer: class L{TranscriptContainer<TranscriptContainer>}
+ − 117 @ivar logHandle: log handle
+ − 118 @type logHandle: file
+ − 119 @ivar verbosity: verbosity
+ − 120 @type verbosity: int
+ − 121 """
+ − 122
+ − 123 def __init__(self, logHandle = None, verbosity = 0):
+ − 124 """
+ − 125 Constructor
+ − 126 @param transcriptListParser2: parser to the list of reference transcripts
+ − 127 @type transcriptListParser2: class L{TranscriptListParser<TranscriptListParser>}
+ − 128 @param logHandle: log handle
+ − 129 @type logHandle: file
+ − 130 @param verbosity: verbosity
+ − 131 @type verbosity: int
+ − 132 """
+ − 133 self.QUERY = 0
+ − 134 self.REFERENCE = 1
+ − 135 self.WORKING = 2
+ − 136 self.INPUTTYPES = (self.QUERY, self.REFERENCE)
+ − 137 self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING)
+ − 138 self.typeToString = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"}
+ − 139
+ − 140 self.logHandle = logHandle
+ − 141 self.verbosity = verbosity
+ − 142 self.mySqlConnection = MySqlConnection(self.verbosity-1)
+ − 143 self.inputTranscriptContainers = [None, None]
+ − 144 self.tableNames = ["tmpQueryTable_%d" % (random.randint(0, 100000)), "tmpReferenceTable_%d" % (random.randint(0, 100000)), "tmpOutputTable_%d" % (random.randint(0, 100000)), "tmpWorkingTable_%d" % (random.randint(0, 100000))]
+ − 145 self.mySqlTranscriptWriters = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames]
+ − 146 self.writer = None
+ − 147 self.introns = [False, False]
+ − 148 self.starts = [None, None]
+ − 149 self.ends = [None, None]
+ − 150 self.fivePrimes = [None, None]
+ − 151 self.threePrimes = [None, None]
+ − 152 self.minDistance = None
+ − 153 self.maxDistance = 0
+ − 154 self.minOverlap = 1
+ − 155 self.pcOverlap = None
+ − 156 self.colinear = False
+ − 157 self.antisense = False
+ − 158 self.downstreams = [False, False]
+ − 159 self.upstreams = [False, False]
+ − 160 self.outputDistance = False
+ − 161 self.absolute = False
+ − 162 self.strandedDistance = False
+ − 163 self.nbTranscripts = [None, None]
+ − 164 self.nbNucleotides = [None, None]
+ − 165 self.normalization = False
+ − 166 self.included = False
+ − 167 self.including = False
+ − 168 self.invert = False
+ − 169 self.notOverlapping = False
+ − 170 self.splitDifference = False
+ − 171 self.multiple = False
+ − 172 self.odds = False
+ − 173 self.overlapResults = None
+ − 174 self.oddResults = None
+ − 175 self.outputContainer = None
+ − 176 self.transcriptsToBeStored = dict([(type, TranscriptList()) for type in self.INPUTWORKINGTYPES])
+ − 177 self.nbPrinted = 0
+ − 178
+ − 179 self.mySqlConnection.createDatabase()
+ − 180
+ − 181
+ − 182 def __del__(self):
+ − 183 """
+ − 184 Destructor
+ − 185 Remove all temporary tables
+ − 186 """
+ − 187 for type in self.INPUTWORKINGTYPES:
+ − 188 self.mySqlTranscriptWriters[type].removeTables()
+ − 189 self.mySqlConnection.deleteDatabase()
+ − 190
+ − 191 def acceptIntrons(self, type, bool):
+ − 192 """
+ − 193 Compare transcripts or exons only
+ − 194 @param type: whether use query/reference data
+ − 195 @type type: int
+ − 196 @param bool: include introns or not
+ − 197 @type bool: boolean
+ − 198 """
+ − 199 self.introns[type] = bool
+ − 200
+ − 201
+ − 202 def restrictToStart(self, type, size):
+ − 203 """
+ − 204 Restrict a list of transcripts to first nucleotides
+ − 205 @param type: whether use query/reference data
+ − 206 @type type: int
+ − 207 @param size: the size of the transcript to be considered
+ − 208 @type size: int
+ − 209 """
+ − 210 self.starts[type] = size
+ − 211 self.introns[type] = False
+ − 212
+ − 213
+ − 214 def restrictToEnd(self, type, size):
+ − 215 """
+ − 216 Restrict a list of transcripts to first nucleotides
+ − 217 @param type: whether use query/reference data
+ − 218 @type type: int
+ − 219 @param size: the size of the transcript to be considered
+ − 220 @type size: int
+ − 221 """
+ − 222 self.ends[type] = size
+ − 223 self.introns[type] = False
+ − 224
+ − 225
+ − 226 def extendFivePrime(self, type, size):
+ − 227 """
+ − 228 Extend a list of transcripts by their 5' end
+ − 229 @param type: whether use query/reference data
+ − 230 @type type: int
+ − 231 @param size: size of the extension
+ − 232 @type size: int
+ − 233 """
+ − 234 self.fivePrimes[type] = size
+ − 235
+ − 236
+ − 237 def extendThreePrime(self, type, size):
+ − 238 """
+ − 239 Extend the list of query transcripts by their 3' end
+ − 240 @param type: whether use query/reference data
+ − 241 @type type: int
+ − 242 @param size: size of the extension
+ − 243 @type size: int
+ − 244 """
+ − 245 self.threePrimes[type] = size
+ − 246
+ − 247
+ − 248 def setMinDistance(self, distance):
+ − 249 """
+ − 250 Set the min distance between two transcripts
+ − 251 @param distance: distance
+ − 252 @type distance: int
+ − 253 """
+ − 254 self.minDistance = distance
+ − 255
+ − 256
+ − 257 def setMaxDistance(self, distance):
+ − 258 """
+ − 259 Set the max distance between two transcripts
+ − 260 @param distance: distance
+ − 261 @type distance: int
+ − 262 """
+ − 263 self.maxDistance = distance
+ − 264
+ − 265
+ − 266 def setMinOverlap(self, overlap):
+ − 267 """
+ − 268 Set the minimum number of nucleotides to declare an overlap
+ − 269 @param overlap: minimum number of nucleotides
+ − 270 @type overlap: int
+ − 271 """
+ − 272 self.minOverlap = overlap
+ − 273
+ − 274
+ − 275 def setPcOverlap(self, overlap):
+ − 276 """
+ − 277 Set the percentage of nucleotides to declare an overlap
+ − 278 @param overlap: percentage of nucleotides
+ − 279 @type overlap: int
+ − 280 """
+ − 281 self.pcOverlap = overlap
+ − 282
+ − 283
+ − 284 def setUpstream(self, type, boolean):
+ − 285 """
+ − 286 Consider transcripts which are upstream of some transcripts
+ − 287 @param type: whether use query/reference data
+ − 288 @type type: int
+ − 289 @param boolean: consider only these transcripts or not
+ − 290 @type boolean: boolean
+ − 291 """
+ − 292 self.upstreams[type] = boolean
+ − 293
+ − 294
+ − 295 def setDownstream(self, type, boolean):
+ − 296 """
+ − 297 Consider transcripts which are downstream of some transcripts
+ − 298 @param type: whether use query/reference data
+ − 299 @type type: int
+ − 300 @param boolean: consider only these transcripts or not
+ − 301 @type boolean: boolean
+ − 302 """
+ − 303 self.downstreams[type] = boolean
+ − 304
+ − 305
+ − 306 def setOutputDistance(self, boolean):
+ − 307 """
+ − 308 Output distance between query and reference instead of query transcript
+ − 309 @param boolean: whether distance should be output
+ − 310 @type boolean: boolean
+ − 311 """
+ − 312 self.outputDistance = boolean
+ − 313
+ − 314
+ − 315 def setAbsolute(self, boolean):
+ − 316 """
+ − 317 Do not consider strand when computing distance (thus, having only non-negative values)
+ − 318 @param boolean: whether we should consider strands
+ − 319 @type boolean: boolean
+ − 320 """
+ − 321 self.absolute = boolean
+ − 322
+ − 323
+ − 324 def setStrandedDistance(self, boolean):
+ − 325 """
+ − 326 Return two distance distributions, one per strand
+ − 327 @param boolean: whether we should return 2 distance distance
+ − 328 @type boolean: boolean
+ − 329 """
+ − 330 self.strandedDistance = boolean
+ − 331
+ − 332
+ − 333 def getColinearOnly(self, boolean):
+ − 334 """
+ − 335 Only consider transcripts that overlap in the same direction
+ − 336 @param boolean: whether transcripts should overlap in the same direction
+ − 337 @type boolean: boolean
+ − 338 """
+ − 339 self.colinear = boolean
+ − 340
+ − 341
+ − 342 def getAntisenseOnly(self, boolean):
+ − 343 """
+ − 344 Only consider transcripts that overlap in the opposite direction
+ − 345 @param boolean: whether transcripts should overlap in the opposite direction
+ − 346 @type boolean: boolean
+ − 347 """
+ − 348 self.antisense = boolean
+ − 349
+ − 350
+ − 351 def setIncludedOnly(self, boolean):
+ − 352 """
+ − 353 Keep the elements from first set which are included in the second set
+ − 354 @param boolean: whether to keep included elements only
+ − 355 @type boolean: boolean
+ − 356 """
+ − 357 self.included = boolean
+ − 358
+ − 359
+ − 360 def setIncludingOnly(self, boolean):
+ − 361 """
+ − 362 Keep the elements from second set which are included in the first set
+ − 363 @param boolean: whether to keep included elements only
+ − 364 @type boolean: boolean
+ − 365 """
+ − 366 self.including = boolean
+ − 367
+ − 368
+ − 369 def setNormalization(self, boolean):
+ − 370 """
+ − 371 Normalize the elements by the number of mappings in the genome
+ − 372 @param boolean: whether normalize
+ − 373 @type boolean: boolean
+ − 374 """
+ − 375 self.normalization = boolean
+ − 376
+ − 377
+ − 378 def getInvert(self, boolean):
+ − 379 """
+ − 380 Only consider transcripts that do not overlap
+ − 381 @param boolean: whether invert the selection
+ − 382 @type boolean: boolean
+ − 383 """
+ − 384 self.invert = boolean
+ − 385
+ − 386
+ − 387 def includeNotOverlapping(self, boolean):
+ − 388 """
+ − 389 Also output the elements which do not overlap
+ − 390 @param boolean: whether output the elements which do not overlap
+ − 391 @type boolean: boolean
+ − 392 """
+ − 393 self.notOverlapping = boolean
+ − 394
+ − 395
+ − 396 def setSplitDifference(self, boolean):
+ − 397 """
+ − 398 Split into intervals when computing difference
+ − 399 @param boolean: whether to split
+ − 400 @type boolean: boolean
+ − 401 """
+ − 402 self.splitDifference = boolean
+ − 403
+ − 404
+ − 405 def aggregate(self, boolean):
+ − 406 """
+ − 407 In merge mode, aggregate multiple transcripts
+ − 408 @param boolean: aggregate multiple transcripts
+ − 409 @type boolean: boolean
+ − 410 """
+ − 411 self.multiple = boolean
+ − 412
+ − 413
+ − 414 def getTables(self, type):
+ − 415 """
+ − 416 Get the SQL tables
+ − 417 @param type: type of the table (query, reference, etc.)
+ − 418 @type type: int
+ − 419 """
+ − 420 return self.mySqlTranscriptWriters[type].getTables()
+ − 421
+ − 422
+ − 423 def computeOdds(self, boolean):
+ − 424 """
+ − 425 Compute odds
+ − 426 @param boolean: whether odds should be computed
+ − 427 @type boolean: boolean
+ − 428 """
+ − 429 self.odds = boolean
+ − 430 if self.odds:
+ − 431 self.overlapResults = dict()
+ − 432
+ − 433
+ − 434 def computeOddsPerTranscript(self, boolean):
+ − 435 """
+ − 436 Compute odds for each transcript
+ − 437 @param boolean: whether odds for each transcript should be computed
+ − 438 @type boolean: boolean
+ − 439 """
+ − 440 self.odds = boolean
+ − 441 if self.odds:
+ − 442 self.overlapResults = dict()
+ − 443
+ − 444
+ − 445 def removeTables(self):
+ − 446 """
+ − 447 Remove the temporary MySQL tables
+ − 448 """
+ − 449 for type in self.INPUTWORKINGTYPES:
+ − 450 for chromosome in self.getTables(type):
+ − 451 self.getTables(type)[chromosome].remove()
+ − 452
+ − 453
+ − 454 def clearTables(self):
+ − 455 """
+ − 456 Empty the content of the databases
+ − 457 """
+ − 458 for type in self.INPUTWORKINGTYPES:
+ − 459 if self.transcriptListParsers[type] != None:
+ − 460 for chromosome in self.getTables(type):
+ − 461 self.getTables(type)[chromosome].clear()
+ − 462
+ − 463
+ − 464 def extendTranscript(self, type, transcript):
+ − 465 """
+ − 466 Extend a transcript corresponding to the parameters of the class
+ − 467 @param transcript: a transcript
+ − 468 @type transcript: class L{Transcript<Transcript>}
+ − 469 @return: the possibly extended transcript
+ − 470 """
+ − 471 extendedTranscript = Transcript()
+ − 472 extendedTranscript.copy(transcript)
+ − 473 if self.starts[type] != None:
+ − 474 extendedTranscript.restrictStart(self.starts[type])
+ − 475 if self.ends[type] != None:
+ − 476 extendedTranscript.restrictEnd(self.ends[type])
+ − 477 if self.fivePrimes[type] != None:
+ − 478 extendedTranscript.extendStart(self.fivePrimes[type])
+ − 479 if self.threePrimes[type] != None:
+ − 480 extendedTranscript.extendEnd(self.threePrimes[type])
+ − 481 return extendedTranscript
+ − 482
+ − 483
+ − 484 def storeTranscript(self, type, transcript, now = True):
+ − 485 """
+ − 486 Add a transcript to a MySQL database, or postpone the store
+ − 487 @param type: whether use query/reference table
+ − 488 @type type: int
+ − 489 @param transcript: a transcript
+ − 490 @type transcript: class L{Transcript<Transcript>}
+ − 491 @param now: whether transcript should be stored now (or stored can be postponed)
+ − 492 @type now: bool
+ − 493 """
+ − 494 self.mySqlTranscriptWriters[type].addTranscript(transcript)
+ − 495 if type == self.REFERENCE:
+ − 496 self.mySqlTranscriptWriters[self.WORKING].addTranscript(transcript)
+ − 497 if now:
+ − 498 self.mySqlTranscriptWriters[type].write()
+ − 499 if type == self.REFERENCE:
+ − 500 self.mySqlTranscriptWriters[self.WORKING].write()
+ − 501
+ − 502
+ − 503 def writeTranscript(self, transcript):
+ − 504 """
+ − 505 Write a transcript in the output file
+ − 506 @param transcript: a transcript
+ − 507 @type transcript: class L{Transcript<Transcript>}
+ − 508 """
+ − 509 if self.writer != None:
+ − 510 self.writer.addTranscript(transcript)
+ − 511 self.nbPrinted += 1
+ − 512
+ − 513
+ − 514 def flushData(self, type = None):
+ − 515 """
+ − 516 Store the remaining transcripts
+ − 517 @param type: whether use query/reference table (None for all)
+ − 518 @type type: int or None
+ − 519 """
+ − 520 if type == None:
+ − 521 types = self.INPUTWORKINGTYPES
+ − 522 else:
+ − 523 types = [type]
+ − 524 for type in types:
+ − 525 self.mySqlTranscriptWriters[type].write()
+ − 526 if self.writer != None:
+ − 527 self.writer.write()
+ − 528
+ − 529
+ − 530 def unstoreTranscript(self, type, transcript):
+ − 531 """
+ − 532 Remove a transcript from a MySQL database
+ − 533 @param type: whether use query/reference table
+ − 534 @type type: int
+ − 535 @param transcript: a transcript
+ − 536 @type transcript: class L{Transcript<Transcript>}
+ − 537 """
+ − 538 self.getTables(type)[transcript.getChromosome()].removeTranscript(transcript)
+ − 539 if type == self.REFERENCE:
+ − 540 self.getTables(self.WORKING)[transcript.getChromosome()].removeTranscript(transcript)
+ − 541
+ − 542
+ − 543 def addIndexes(self, tables):
+ − 544 """
+ − 545 Add useful indexes to the tables
+ − 546 @param tables: which tables should be indexed
+ − 547 @type tables: list of int
+ − 548 """
+ − 549 for type in tables:
+ − 550 for chromosome in self.getTables(type):
+ − 551 self.getTables(type)[chromosome].createIndex("iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["start"])
+ − 552 self.getTables(type)[chromosome].exonsTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["transcriptId"])
+ − 553
+ − 554
+ − 555 def storeTranscriptList(self, type, transcriptListParser, extension):
+ − 556 """
+ − 557 Store a transcript list into database
+ − 558 @param type: whether use query/reference parser
+ − 559 @type type: int
+ − 560 @param parser: a parser of transcript list
+ − 561 @type parser: class L{TranscriptContainer<TranscriptContainer>}
+ − 562 @param extension: extend (or not) the transcripts
+ − 563 @type extension: boolean
+ − 564 """
+ − 565 progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % ("query" if type == self.QUERY else "reference"), self.verbosity-1)
+ − 566 for transcript in transcriptListParser.getIterator():
+ − 567 if extension:
+ − 568 transcript = self.extendTranscript(type, transcript)
+ − 569 self.mySqlTranscriptWriters[type].addTranscript(transcript)
+ − 570 progress.inc()
+ − 571 self.mySqlTranscriptWriters[type].write()
+ − 572 progress.done()
+ − 573 if type == self.REFERENCE:
+ − 574 for chromosome in self.getTables(self.REFERENCE):
+ − 575 self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1)
+ − 576 self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome])
+ − 577
+ − 578
+ − 579 def setInputTranscriptContainer(self, type, inputTranscriptContainer):
+ − 580 """
+ − 581 Set an input transcript list container
+ − 582 @param type: whether use query/reference parser
+ − 583 @type type: int
+ − 584 @param inputTranscriptContainer: a container
+ − 585 @type inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>}
+ − 586 """
+ − 587 self.inputTranscriptContainers[type] = inputTranscriptContainer
+ − 588 self.nbTranscripts[type] = self.inputTranscriptContainers[type].getNbTranscripts()
+ − 589 self.nbNucleotides[type] = self.inputTranscriptContainers[type].getNbNucleotides()
+ − 590
+ − 591
+ − 592 def setOutputWriter(self, writer):
+ − 593 """
+ − 594 Set an output transcript list writer
+ − 595 @param writer: a writer
+ − 596 @type writer: class L{TranscriptListWriter<TranscriptListWriter>}
+ − 597 """
+ − 598 self.writer = writer
+ − 599
+ − 600
+ − 601 def compareTranscript(self, transcript1, transcript2, includeDistance = False):
+ − 602 """
+ − 603 Compare two transcripts, using user defined parameters
+ − 604 @param transcript1: a transcript from the query set (already extended)
+ − 605 @type transcript1: class L{Transcript<Transcript>}
+ − 606 @param transcript2: a transcript from the reference set (already extended)
+ − 607 @type transcript2: class L{Transcript<Transcript>}
+ − 608 @param includeDistance: take into account the distance too
+ − 609 @type includeDistance: boolean
+ − 610 @return: true, if they overlap
+ − 611 """
+ − 612 extendedTranscript1 = Transcript()
+ − 613 extendedTranscript1.copy(transcript1)
+ − 614 if includeDistance:
+ − 615 if self.maxDistance > 0:
+ − 616 extendedTranscript1.extendStart(self.maxDistance)
+ − 617 extendedTranscript1.extendEnd(self.maxDistance)
+ − 618
+ − 619 minOverlap = self.minOverlap
+ − 620 if self.pcOverlap != None:
+ − 621 minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap)
+ − 622 if not extendedTranscript1.overlapWith(transcript2, self.minOverlap):
+ − 623 return False
+ − 624 if (self.downstreams[self.QUERY] and transcript2.getStart() > extendedTranscript1.getStart()) or \
+ − 625 (self.upstreams[self.QUERY] and transcript2.getEnd() < extendedTranscript1.getEnd()) or \
+ − 626 (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \
+ − 627 (self.upstreams[self.REFERENCE] and extendedTranscript1.getEnd() < transcript2.getEnd()):
+ − 628 return False
+ − 629 if (self.antisense and extendedTranscript1.getDirection() == transcript2.getDirection()) or (self.colinear and extendedTranscript1.getDirection() != transcript2.getDirection()):
+ − 630 return False
+ − 631 if self.included and not transcript2.include(extendedTranscript1):
+ − 632 return False
+ − 633 if self.including and not extendedTranscript1.include(transcript2):
+ − 634 return False
+ − 635 if self.introns[self.REFERENCE] and self.introns[self.QUERY]:
+ − 636 if self.logHandle != None:
+ − 637 self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2)))
+ − 638 return True
+ − 639 if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap):
+ − 640 if self.logHandle != None:
+ − 641 self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2)))
+ − 642 return True
+ − 643 return False
+ − 644
+ − 645
+ − 646 def compareTranscriptToList(self, transcript1):
+ − 647 """
+ − 648 Compare a transcript to the reference list of transcripts
+ − 649 (Do not extend the transcripts, except for the distance)
+ − 650 @param transcript1: a transcript (from the query set)
+ − 651 @type transcript1: class L{Transcript<Transcript>}
+ − 652 @return: the reference transcripts overlapping
+ − 653 """
+ − 654 # no transcript in the reference table
+ − 655 if transcript1.getChromosome() not in self.getTables(self.WORKING):
+ − 656 return
+ − 657
+ − 658 # retrieve the the transcripts that may overlap in the working tables
+ − 659 clauses = []
+ − 660 extendedTranscript1 = Transcript()
+ − 661 extendedTranscript1.copy(transcript1)
+ − 662 if self.maxDistance > 0:
+ − 663 extendedTranscript1.extendStart(self.maxDistance)
+ − 664 if self.maxDistance > 0:
+ − 665 extendedTranscript1.extendEnd(self.maxDistance)
+ − 666 command = "SELECT * FROM %s WHERE (" % (self.getTables(self.WORKING)[transcript1.getChromosome()].getName())
+ − 667 for binPair in extendedTranscript1.getBins():
+ − 668 clause = "bin "
+ − 669 if binPair[0] == binPair[1]:
+ − 670 clause += "= %i" % (binPair[0])
+ − 671 else:
+ − 672 clause += "BETWEEN %i AND %i" % (binPair[0], binPair[1])
+ − 673 clauses.append(clause)
+ − 674 command += " OR ".join(clauses)
+ − 675 command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart())
+ − 676
+ − 677 for index2, transcript2 in self.getTables(self.REFERENCE)[transcript1.getChromosome()].selectTranscripts(command):
+ − 678 if self.compareTranscript(extendedTranscript1, transcript2):
+ − 679 yield transcript2
+ − 680
+ − 681
+ − 682 def compareTranscriptList(self):
+ − 683 """
+ − 684 Compare a list of transcript to the reference one
+ − 685 @return: the transcripts that overlap with the reference set
+ − 686 """
+ − 687 distance = 0
+ − 688 nbClustersIn = 0
+ − 689 nbClustersOut = 0
+ − 690 if self.maxDistance != None:
+ − 691 distance = self.maxDistance
+ − 692
+ − 693 self.addIndexes([self.QUERY, self.REFERENCE])
+ − 694
+ − 695 # export the container into tables
+ − 696 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
+ − 697 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
+ − 698
+ − 699 # looping
+ − 700 for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
+ − 701 # get range of transcripts
+ − 702 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
+ − 703 query = self.mySqlConnection.executeQuery(command)
+ − 704 result = query.getLine()
+ − 705 first = result[0]
+ − 706 last = result[1]
+ − 707 nb = result[2]
+ − 708
+ − 709 transcripts1 = []
+ − 710 toBeRemoved1 = []
+ − 711 transcripts2 = []
+ − 712 toBeRemoved2 = []
+ − 713 overlapsWith = []
+ − 714 nbOverlaps = []
+ − 715 nbChunks = max(1, nb / 100)
+ − 716 chunkSize = (last - first) / nbChunks
+ − 717 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
+ − 718 for chunk in range(nbChunks + 1):
+ − 719
+ − 720 # load transcripts
+ − 721 start = first + chunk * chunkSize
+ − 722 end = start + chunkSize - 1
+ − 723 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
+ − 724 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
+ − 725 transcripts1.append(transcript1)
+ − 726 overlapsWith.append([])
+ − 727 nbOverlaps.append(0)
+ − 728 nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
+ − 729 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
+ − 730 self.mySqlConnection.executeQuery(command)
+ − 731 if chromosome1 in self.getTables(self.REFERENCE):
+ − 732 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
+ − 733 if chunk == 0:
+ − 734 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
+ − 735 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
+ − 736 transcripts2.append(transcript2)
+ − 737 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
+ − 738 self.mySqlConnection.executeQuery(command)
+ − 739
+ − 740 # compare sets
+ − 741 for index1, transcript1 in enumerate(transcripts1):
+ − 742 overlappingNames = []
+ − 743 nbElements1 = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
+ − 744 for transcript2 in transcripts2:
+ − 745 if self.compareTranscript(transcript1, transcript2, True):
+ − 746 id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
+ − 747 if id2 not in overlapsWith[index1]:
+ − 748 overlapsWith[index1].append(id2)
+ − 749 nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements")
+ − 750 if self.odds:
+ − 751 if transcript2.getName() not in self.overlapResults:
+ − 752 self.overlapResults[transcript2.getName()] = 0
+ − 753 self.overlapResults[transcript2.getName()] += nbElements1
+ − 754
+ − 755 # check if query transcript extends bounds of the chunk
+ − 756 if transcript1.getEnd() < end:
+ − 757 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
+ − 758 if overlapsWith[index1]:
+ − 759 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
+ − 760 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
+ − 761 elif self.notOverlapping:
+ − 762 transcript1.setTagValue("nbOverlaps", "0")
+ − 763 self.writeTranscript(transcript1)
+ − 764 nbClustersOut += nbElements1
+ − 765 toBeRemoved1.append(index1)
+ − 766
+ − 767 # update list of query transcripts
+ − 768 for index1 in reversed(toBeRemoved1):
+ − 769 del transcripts1[index1]
+ − 770 del overlapsWith[index1]
+ − 771 del nbOverlaps[index1]
+ − 772 toBeRemoved1 = []
+ − 773
+ − 774 # check if the reference transcripts extends bounds of the chunk
+ − 775 for index2, transcript2 in enumerate(transcripts2):
+ − 776 if transcript2.getEnd() + distance < end:
+ − 777 toBeRemoved2.append(index2)
+ − 778 for index2 in reversed(toBeRemoved2):
+ − 779 del transcripts2[index2]
+ − 780 toBeRemoved2 = []
+ − 781
+ − 782 progress.inc()
+ − 783
+ − 784 for index1, transcript1 in enumerate(transcripts1):
+ − 785 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
+ − 786 if overlapsWith[index1]:
+ − 787 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
+ − 788 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
+ − 789 elif self.notOverlapping:
+ − 790 transcript1.setTagValue("nbOverlaps", "0")
+ − 791 self.writeTranscript(transcript1)
+ − 792 nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
+ − 793 progress.done()
+ − 794 self.getTables(self.QUERY)[chromosome1].remove()
+ − 795 if chromosome1 in self.getTables(self.REFERENCE):
+ − 796 self.getTables(self.REFERENCE)[chromosome1].remove()
+ − 797 self.getTables(self.WORKING)[chromosome1].remove()
+ − 798
+ − 799 self.flushData()
+ − 800 if self.writer != None:
+ − 801 self.writer.close()
+ − 802 self.writer = None
+ − 803
+ − 804 if self.verbosity > 0:
+ − 805 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])
+ − 806 print "query: %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn)
+ − 807 if self.nbTranscripts[self.QUERY] != 0:
+ − 808 print "output: %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100),
+ − 809 if nbClustersOut != 0:
+ − 810 print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100)
+ − 811
+ − 812
+ − 813 def compareTranscriptListDistance(self):
+ − 814 """
+ − 815 Compare a list of transcript to the reference one
+ − 816 @return: the distance distributions in a hash
+ − 817 """
+ − 818 nbDistances = 0
+ − 819 distances = {}
+ − 820 absDistances = {}
+ − 821 strandedDistances = dict([(strand, {}) for strand in (1, -1)])
+ − 822
+ − 823 # export the container into tables
+ − 824 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
+ − 825 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
+ − 826
+ − 827 progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1)
+ − 828 for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator():
+ − 829 # get the distance
+ − 830 transcript1 = self.extendTranscript(self.QUERY, transcript1)
+ − 831 distance = self.maxDistance + 1
+ − 832 strand = None
+ − 833 closestElement = "None"
+ − 834 for transcript2 in self.compareTranscriptToList(transcript1):
+ − 835 thisStrand = transcript1.getDirection() * transcript2.getDirection()
+ − 836 if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()):
+ − 837 transcript2.reverse()
+ − 838 if self.absolute:
+ − 839 transcript2.setDirection(transcript1.getDirection())
+ − 840 if transcript2.getDirection() == transcript1.getDirection():
+ − 841 if self.starts[self.REFERENCE] != None:
+ − 842 transcript2.restrictStart(self.starts[self.REFERENCE])
+ − 843 if self.ends[self.REFERENCE] != None:
+ − 844 transcript2.restrictEnd(self.ends[self.REFERENCE])
+ − 845 thisDistance = transcript1.getRelativeDistance(transcript2)
+ − 846 if (self.absolute):
+ − 847 thisDistance = abs(thisDistance)
+ − 848 if abs(thisDistance) < abs(distance):
+ − 849 distance = thisDistance
+ − 850 strand = thisStrand
+ − 851 closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
+ − 852 if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance):
+ − 853 nbDistances += 1
+ − 854 distances[distance] = distances.get(distance, 0) + 1
+ − 855 absDistance = abs(distance)
+ − 856 absDistances[absDistance] = absDistances.get(absDistance, 0) + 1
+ − 857 strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0)
+ − 858 if distance not in strandedDistances[-strand]:
+ − 859 strandedDistances[-strand][distance] = 0
+ − 860
+ − 861 # write transcript
+ − 862 if distance == self.maxDistance + 1:
+ − 863 distance = "None"
+ − 864 tmpTranscript = Transcript()
+ − 865 tmpTranscript.copy(transcript1)
+ − 866 tmpTranscript.setTagValue("distance", distance)
+ − 867 tmpTranscript.setTagValue("closestElement", closestElement)
+ − 868 self.writeTranscript(tmpTranscript)
+ − 869
+ − 870 progress.inc()
+ − 871 progress.done()
+ − 872
+ − 873 self.flushData()
+ − 874
+ − 875 if self.verbosity > 0:
+ − 876 print "reference: %d sequences" % (self.nbTranscripts[self.REFERENCE])
+ − 877 print "query: %d sequences" % (self.nbTranscripts[self.QUERY])
+ − 878 if nbDistances == 0:
+ − 879 print "Nothing matches"
+ − 880 else:
+ − 881 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances)
+ − 882 print "for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100)
+ − 883
+ − 884 if self.strandedDistance:
+ − 885 return strandedDistances
+ − 886 return distances
+ − 887
+ − 888
+ − 889 def compareTranscriptListMerge(self):
+ − 890 """
+ − 891 Merge the query list of transcript with itself
+ − 892 @return: the merged transcripts in a transcript list database
+ − 893 """
+ − 894 nbMerges = 0
+ − 895
+ − 896 for type in (self.QUERY, self.REFERENCE):
+ − 897 self.storeTranscriptList(type, self.inputTranscriptContainers[type], True)
+ − 898 self.flushData()
+ − 899
+ − 900 # Loop on the chromosomes
+ − 901 for chromosome in sorted(self.getTables(self.QUERY).keys()):
+ − 902 if chromosome not in self.getTables(self.REFERENCE):
+ − 903 continue
+ − 904
+ − 905 # Get the size of the chromosome
+ − 906 maxEnd = 0
+ − 907 nbChunks = 0
+ − 908 for type in (self.QUERY, self.REFERENCE):
+ − 909 command = "SELECT MAX(end) from %s" % (self.getTables(type)[chromosome].getName())
+ − 910 query = self.mySqlConnection.executeQuery(command)
+ − 911 maxEnd = max(maxEnd, int(query.getLine()[0]))
+ − 912 nbChunks = max(nbChunks, self.getTables(type)[chromosome].getNbElements())
+ − 913
+ − 914 mergedTranscripts = {}
+ − 915 transcripts = {self.QUERY: [], self.REFERENCE: []}
+ − 916 progress = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1)
+ − 917 for i in range(nbChunks):
+ − 918 rangeStart = int(i * (float(maxEnd) / nbChunks)) + 1
+ − 919 rangeEnd = int((i+1) * (float(maxEnd) / nbChunks))
+ − 920
+ − 921 # Get all transcripts in query and reference from chunk
+ − 922 for type in (self.QUERY, self.REFERENCE):
+ − 923 correction = 0 if self.QUERY else self.maxDistance
+ − 924 command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd + correction)
+ − 925 for index, transcript in self.getTables(type)[chromosome].selectTranscripts(command):
+ − 926 transcripts[type].append(transcript)
+ − 927
+ − 928 # Merge elements between the two samples
+ − 929 for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]):
+ − 930 for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]):
+ − 931 if referenceTranscript == None: continue
+ − 932 if self.compareTranscript(queryTranscript, referenceTranscript, True):
+ − 933 if queryTranscript.getDirection() != referenceTranscript.getDirection():
+ − 934 referenceTranscript.setDirection(queryTranscript.getDirection())
+ − 935 queryTranscript.merge(referenceTranscript, self.normalization)
+ − 936 nbMerges += 1
+ − 937 transcripts[self.REFERENCE][iReference] = None
+ − 938 if not self.multiple:
+ − 939 mergedTranscripts[iQuery] = 0
+ − 940
+ − 941 # Remove transcripts from database
+ − 942 for type in (self.QUERY, self.REFERENCE):
+ − 943 correction = 0 if self.QUERY else self.maxDistance
+ − 944 command = "DELETE FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd - correction)
+ − 945 query = self.mySqlConnection.executeQuery(command)
+ − 946
+ − 947 # Just in case, self-merge the elements in the query (beware of mergedTranscripts!)
+ − 948 if (self.multiple):
+ − 949 for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]):
+ − 950 if queryTranscript1 == None: continue
+ − 951 for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]):
+ − 952 if iQuery2 <= iQuery1 or queryTranscript2 == None: continue
+ − 953 minOverlap = self.minOverlap
+ − 954 if self.pcOverlap != None:
+ − 955 minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap)
+ − 956 if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear):
+ − 957 if queryTranscript1.getDirection() != queryTranscript2.getDirection():
+ − 958 queryTranscript2.setDirection(queryTranscript1.getDirection())
+ − 959 queryTranscript1.merge(queryTranscript2, self.normalization)
+ − 960 transcripts[self.QUERY][iQuery2] = None
+ − 961 nbMerges += 1
+ − 962 if not self.multiple:
+ − 963 mergedTranscripts[iQuery1] = 0
+ − 964
+ − 965 # Update the sets of transcripts and write into database (also update mergedTranscripts)
+ − 966 newTranscripts = {self.QUERY: [], self.REFERENCE: []}
+ − 967 newMergedTranscripts = {}
+ − 968 for type in (self.QUERY, self.REFERENCE):
+ − 969 for i, transcript in enumerate(transcripts[type]):
+ − 970 if transcript == None: continue
+ − 971 correction = 0 if self.QUERY else self.maxDistance
+ − 972 if transcript.getEnd() < rangeEnd - correction:
+ − 973 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)):
+ − 974 self.writeTranscript(transcripts[type][i])
+ − 975 else:
+ − 976 if type == self.QUERY and i in mergedTranscripts:
+ − 977 newMergedTranscripts[len(newTranscripts[type])] = 0
+ − 978 newTranscripts[type].append(transcript)
+ − 979 transcripts = newTranscripts
+ − 980 mergedTranscripts = newMergedTranscripts
+ − 981
+ − 982 progress.inc()
+ − 983 progress.done()
+ − 984
+ − 985 for type in (self.QUERY, self.REFERENCE):
+ − 986 for i, transcript in enumerate(transcripts[type]):
+ − 987 if transcripts == None: continue
+ − 988 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)):
+ − 989 self.writeTranscript(transcripts[type][i])
+ − 990
+ − 991 # Manage chromosomes with no corresponding data
+ − 992 if self.multiple:
+ − 993 for type in self.INPUTTYPES:
+ − 994 for chromosome in self.getTables(type):
+ − 995 if chromosome in self.getTables(1 - type):
+ − 996 continue
+ − 997 for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator():
+ − 998 self.writeTranscript(transcript)
+ − 999
+ − 1000 self.flushData()
+ − 1001 if self.writer != None:
+ − 1002 self.writer.close()
+ − 1003 self.writer = None
+ − 1004
+ − 1005 if self.verbosity > 0:
+ − 1006 print "query: %d sequences" % (self.nbTranscripts[self.QUERY])
+ − 1007 print "# merges: %d" % (nbMerges)
+ − 1008 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
+ − 1009
+ − 1010
+ − 1011 def compareTranscriptListSelfMerge(self):
+ − 1012 """
+ − 1013 Merge the query list of transcript with itself
+ − 1014 @return: the merged transcripts in a transcript list database
+ − 1015 """
+ − 1016 nbMerges = 0
+ − 1017 distance = self.maxDistance if self.maxDistance != None else 0
+ − 1018
+ − 1019 self.addIndexes([self.QUERY])
+ − 1020 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
+ − 1021 self.flushData()
+ − 1022
+ − 1023 # looping
+ − 1024 for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
+ − 1025 transcripts2 = []
+ − 1026
+ − 1027 # get range of transcripts
+ − 1028 progress = Progress(self.getTables(self.QUERY)[chromosome1].getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
+ − 1029 command = "SELECT * FROM %s ORDER BY start" % (self.getTables(self.QUERY)[chromosome1].getName())
+ − 1030 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
+ − 1031
+ − 1032 # compare sets
+ − 1033 toBeRemoved = set()
+ − 1034 toBePrinted = set()
+ − 1035 for index2, transcript2 in enumerate(transcripts2):
+ − 1036
+ − 1037 if self.compareTranscript(transcript1, transcript2, True):
+ − 1038 if transcript1.getDirection() != transcript2.getDirection():
+ − 1039 transcript2.setDirection(transcript1.getDirection())
+ − 1040 transcript1.merge(transcript2, self.normalization)
+ − 1041 toBeRemoved.add(index2)
+ − 1042 nbMerges += 1
+ − 1043 elif transcript2.getEnd() + distance < transcript1.getStart():
+ − 1044 toBePrinted.add(index2)
+ − 1045 transcripts2.append(transcript1)
+ − 1046
+ − 1047 for index2 in sorted(toBePrinted):
+ − 1048 self.writeTranscript(transcripts2[index2])
+ − 1049 transcripts2 = [transcripts2[index2] for index2 in range(len(transcripts2)) if index2 not in (toBeRemoved | toBePrinted)]
+ − 1050
+ − 1051 for transcript2 in transcripts2:
+ − 1052 self.writeTranscript(transcript2)
+ − 1053 progress.done()
+ − 1054 self.getTables(self.QUERY)[chromosome1].remove()
+ − 1055
+ − 1056 self.flushData()
+ − 1057 if self.writer != None:
+ − 1058 self.writer.close()
+ − 1059 self.writer = None
+ − 1060
+ − 1061 if self.verbosity > 0:
+ − 1062 print "query: %d sequences" % (self.nbTranscripts[self.QUERY])
+ − 1063 print "# merges: %d" % (nbMerges)
+ − 1064 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
+ − 1065
+ − 1066
+ − 1067 def getDifferenceTranscriptList(self):
+ − 1068 """
+ − 1069 Get the elements of the first list which do not overlap the second list (at the nucleotide level)
+ − 1070 @return: the transcripts that overlap with the reference set
+ − 1071 """
+ − 1072 distance = 0 if self.maxDistance == None else self.maxDistance
+ − 1073
+ − 1074 self.addIndexes([self.QUERY, self.REFERENCE])
+ − 1075
+ − 1076 # export the container into tables
+ − 1077 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
+ − 1078 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
+ − 1079
+ − 1080 # looping
+ − 1081 for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
+ − 1082 # get range of transcripts
+ − 1083 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
+ − 1084 query = self.mySqlConnection.executeQuery(command)
+ − 1085 result = query.getLine()
+ − 1086 first = result[0]
+ − 1087 last = result[1]
+ − 1088 nb = result[2]
+ − 1089
+ − 1090 transcripts1 = []
+ − 1091 transcripts2 = []
+ − 1092 nbChunks = max(1, nb / 100)
+ − 1093 chunkSize = (last - first) / nbChunks
+ − 1094 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
+ − 1095 for chunk in range(nbChunks + 1):
+ − 1096
+ − 1097 # load transcripts
+ − 1098 start = first + chunk * chunkSize
+ − 1099 end = start + chunkSize - 1
+ − 1100 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
+ − 1101 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
+ − 1102 transcripts1.append(transcript1)
+ − 1103 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
+ − 1104 self.mySqlConnection.executeQuery(command)
+ − 1105 if chromosome1 in self.getTables(self.REFERENCE):
+ − 1106 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
+ − 1107 if chunk == 0:
+ − 1108 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
+ − 1109 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
+ − 1110 transcripts2.append(transcript2)
+ − 1111 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
+ − 1112 self.mySqlConnection.executeQuery(command)
+ − 1113
+ − 1114 # compare sets
+ − 1115 toBeRemoved1 = []
+ − 1116 for index1, transcript1 in enumerate(transcripts1):
+ − 1117 newTranscript1 = Transcript()
+ − 1118 newTranscript1.copy(transcript1)
+ − 1119 for transcript2 in transcripts2:
+ − 1120 newTranscript1 = newTranscript1.getDifference(transcript2)
+ − 1121 if newTranscript1 == None:
+ − 1122 toBeRemoved1.append(index1)
+ − 1123 break
+ − 1124 transcripts1[index1] = newTranscript1
+ − 1125
+ − 1126 # check if query transcript extends bounds of the chunk
+ − 1127 if newTranscript1 != None and newTranscript1.getEnd() < end:
+ − 1128 if self.splitDifference:
+ − 1129 for exon in newTranscript1.getExons():
+ − 1130 transcript = Transcript()
+ − 1131 transcript.copy(exon)
+ − 1132 self.writeTranscript(transcript)
+ − 1133 else:
+ − 1134 self.writeTranscript(newTranscript1)
+ − 1135 toBeRemoved1.append(index1)
+ − 1136
+ − 1137 # update list of query transcripts
+ − 1138 for index1 in reversed(toBeRemoved1):
+ − 1139 del transcripts1[index1]
+ − 1140
+ − 1141 # check if the reference transcripts extends bounds of the chunk
+ − 1142 toBeRemoved2 = []
+ − 1143 for index2, transcript2 in enumerate(transcripts2):
+ − 1144 if transcript2.getEnd() + distance < end:
+ − 1145 toBeRemoved2.append(index2)
+ − 1146 for index2 in reversed(toBeRemoved2):
+ − 1147 del transcripts2[index2]
+ − 1148
+ − 1149 progress.inc()
+ − 1150
+ − 1151 for transcript1 in transcripts1:
+ − 1152 if self.splitDifference:
+ − 1153 for exon in transcript1.getExons():
+ − 1154 transcript = Transcript()
+ − 1155 transcript.copy(exon)
+ − 1156 self.writeTranscript(transcript)
+ − 1157 else:
+ − 1158 self.writeTranscript(transcript1)
+ − 1159 progress.done()
+ − 1160 self.getTables(self.QUERY)[chromosome1].remove()
+ − 1161 if chromosome1 in self.getTables(self.REFERENCE):
+ − 1162 self.getTables(self.REFERENCE)[chromosome1].remove()
+ − 1163 self.getTables(self.WORKING)[chromosome1].remove()
+ − 1164
+ − 1165 self.flushData()
+ − 1166 if self.writer != None:
+ − 1167 self.writer.close()
+ − 1168 self.writer = None
+ − 1169
+ − 1170 if self.verbosity > 0:
+ − 1171 print "query: %d elements" % (self.nbTranscripts[self.QUERY])
+ − 1172 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])
+ − 1173 print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
+ − 1174
+ − 1175
+ − 1176 def getOddsPerTranscript(self):
+ − 1177 """
+ − 1178 Return overlap results
+ − 1179 @return a dict of data
+ − 1180 """
+ − 1181 if not self.odds:
+ − 1182 raise Exception("Did not compute odds!")
+ − 1183 return self.overlapResults
+ − 1184
+ − 1185
+ − 1186 def getOdds(self):
+ − 1187 """
+ − 1188 Return odds about the overlap
+ − 1189 @return a dict of data
+ − 1190 """
+ − 1191 if not self.odds:
+ − 1192 raise Exception("Did not compute odds!")
+ − 1193 if self.oddResults != None:
+ − 1194 return self.oddResults
+ − 1195 self.oddResults = {}
+ − 1196 for name, value in self.overlapResults.iteritems():
+ − 1197 self.oddResults[value] = self.oddResults.get(value, 0) + 1
+ − 1198 return self.oddResults