Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/structure/TranscriptListsComparator.py @ 6:769e306b7933
Change the repository level.
| author | yufei-luo |
|---|---|
| date | Fri, 18 Jan 2013 04:54:14 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:ea3082881bf8 | 6:769e306b7933 |
|---|---|
| 1 # | |
| 2 # Copyright INRA-URGI 2009-2010 | |
| 3 # | |
| 4 # This software is governed by the CeCILL license under French law and | |
| 5 # abiding by the rules of distribution of free software. You can use, | |
| 6 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 8 # "http://www.cecill.info". | |
| 9 # | |
| 10 # As a counterpart to the access to the source code and rights to copy, | |
| 11 # modify and redistribute granted by the license, users are provided only | |
| 12 # with a limited warranty and the software's author, the holder of the | |
| 13 # economic rights, and the successive licensors have only limited | |
| 14 # liability. | |
| 15 # | |
| 16 # In this respect, the user's attention is drawn to the risks associated | |
| 17 # with loading, using, modifying and/or developing or reproducing the | |
| 18 # software by the user in light of its specific status of free software, | |
| 19 # that may mean that it is complicated to manipulate, and that also | |
| 20 # therefore means that it is reserved for developers and experienced | |
| 21 # professionals having in-depth computer knowledge. Users are therefore | |
| 22 # encouraged to load and test the software's suitability as regards their | |
| 23 # requirements in conditions enabling the security of their systems and/or | |
| 24 # data to be ensured and, more generally, to use and operate it in the | |
| 25 # same conditions as regards security. | |
| 26 # | |
| 27 # The fact that you are presently reading this means that you have had | |
| 28 # knowledge of the CeCILL license and that you accept its terms. | |
| 29 # | |
| 30 import sys | |
| 31 import random | |
| 32 from SMART.Java.Python.misc import Utils | |
| 33 from SMART.Java.Python.structure.Transcript import Transcript | |
| 34 from SMART.Java.Python.structure.TranscriptList import TranscriptList | |
| 35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | |
| 36 from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection | |
| 37 from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable | |
| 38 from SMART.Java.Python.misc.Progress import Progress | |
| 39 from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter | |
| 40 | |
| 41 | |
| 42 | |
| 43 class TranscriptListsComparator(object): | |
| 44 """ | |
| 45 Compare two transcript lists, using a database for one of the list | |
| 46 Uses one TranscriptContainer for query data, | |
| 47 one TranscriptContainer exported to MySqlTranscriptTable for reference data, | |
| 48 one MySqlTranscriptTable for transformed reference data | |
| 49 @ivar inputTranscriptContainers: parsers to the list of query transcripts | |
| 50 @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>} | |
| 51 @ivar writer: transcript list writer | |
| 52 @type writer: class L{TranscriptListWriter<TranscriptListWriter>} | |
| 53 @ivar mySqlConnection: connection to a MySQL database (to compute the ovelapping efficiently) | |
| 54 @type mySqlConnection: class L{MySqlConnection<MySqlConnection>} | |
| 55 @ivar introns: compare transcripts or exons only | |
| 56 @type introns: list of 2 boolean | |
| 57 @ivar starts: restrict the query transcripts to first nucleotides | |
| 58 @type starts: list of 2 int or None | |
| 59 @ivar fivePrimes: extend a list of transcripts by their 5' end | |
| 60 @type fivePrimes: list of 2 int or None | |
| 61 @ivar threePrimes: extend a list of transcripts by their 3' end | |
| 62 @type threePrimes: list of 2 int or None | |
| 63 @ivar minDistance: min distance between two transcripts [default: 0] | |
| 64 @type minDistance: int | |
| 65 @ivar maxDistance: max distance between two transcripts [default: 0] | |
| 66 @type maxDistance: int | |
| 67 @ivar minOverlap: minimum number of overlapping nucleotides to declare an overlap | |
| 68 @type minOverlap: int | |
| 69 @ivar pcOverlap: percentage of overlapping nucleotides to declare an overlap | |
| 70 @type pcOverlap: int | |
| 71 @ivar upstreams: consider distances with elements which are upstream of the transcripts | |
| 72 @type upstreams: boolean | |
| 73 @ivar downstreams: consider distances with elements which are downstream of the transcripts | |
| 74 @type downstreams: boolean | |
| 75 @ivar colinear: whether transcripts should overlap in the same direction | |
| 76 @type colinear: boolean | |
| 77 @ivar antisense: whether transcripts should overlap in the opposite direction | |
| 78 @type antisense: boolean | |
| 79 @ivar outputDistance: output distance between query and reference instead of query transcript | |
| 80 @type outputDistance: boolean | |
| 81 @ivar absolute: do not consider the strand while computing distance | |
| 82 @type absolute: boolean | |
| 83 @ivar strandedDistance: return a line per strand while computing distances | |
| 84 @type strandedDistance: boolean | |
| 85 @ivar QUERY: constant specifying the query objects | |
| 86 @type QUERY: int | |
| 87 @ivar REFERENCE: constant specifying the reference objects | |
| 88 @type REFERENCE: int | |
| 89 @ivar INPUTTYPES: set of input types of data (query or reference) objects | |
| 90 @type INPUTTYPES: list of 2 int | |
| 91 @ivar typeToString: string representation of the previous types | |
| 92 @type typeToString: dict | |
| 93 @ivar tableNames: name of the transcript tables | |
| 94 @type tableNames: dict of strings | |
| 95 @ivar nbTranscripts: number of transcript in the query/reference set | |
| 96 @type nbTranscripts: list of 2 int or None | |
| 97 @ivar nbNucleotides: number of nucleotides in the query/reference set | |
| 98 @type nbNucleotides: list of 2 int or None | |
| 99 @ivar transcriptsToBeStored: transcripts that will be stored into database | |
| 100 @type transcriptsToBeStored: dict of class L{TranscriptList<TranscriptList>} | |
| 101 @ivar multiple: in merge mode, aggregate multiple transcripts | |
| 102 @type multiple: boolean | |
| 103 @ivar normalization: normalize each element by the number of mappings of this element | |
| 104 @type normalization: boolean | |
| 105 @ivar invert: invert the current comparison | |
| 106 @type invert: boolean | |
| 107 @ivar splitDifference: split into intervals when computing difference | |
| 108 @type splitDifference: boolean | |
| 109 @ivar odds: whether odds about the comparison should be computed | |
| 110 @type odds: boolean | |
| 111 @ivar overlapResults: count the number of overlaps | |
| 112 @type overlapResults: dictionary | |
| 113 @ivar oddResults: compute the number of times each transcript overlaps (or is merged with) another one | |
| 114 @type oddResults: dictionary | |
| 115 @ivar outputContainer: container of the output transcripts | |
| 116 @type outputContainer: class L{TranscriptContainer<TranscriptContainer>} | |
| 117 @ivar logHandle: log handle | |
| 118 @type logHandle: file | |
| 119 @ivar verbosity: verbosity | |
| 120 @type verbosity: int | |
| 121 """ | |
| 122 | |
| 123 def __init__(self, logHandle = None, verbosity = 0): | |
| 124 """ | |
| 125 Constructor | |
| 126 @param transcriptListParser2: parser to the list of reference transcripts | |
| 127 @type transcriptListParser2: class L{TranscriptListParser<TranscriptListParser>} | |
| 128 @param logHandle: log handle | |
| 129 @type logHandle: file | |
| 130 @param verbosity: verbosity | |
| 131 @type verbosity: int | |
| 132 """ | |
| 133 self.QUERY = 0 | |
| 134 self.REFERENCE = 1 | |
| 135 self.WORKING = 2 | |
| 136 self.INPUTTYPES = (self.QUERY, self.REFERENCE) | |
| 137 self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING) | |
| 138 self.typeToString = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"} | |
| 139 | |
| 140 self.logHandle = logHandle | |
| 141 self.verbosity = verbosity | |
| 142 self.mySqlConnection = MySqlConnection(self.verbosity-1) | |
| 143 self.inputTranscriptContainers = [None, None] | |
| 144 self.tableNames = ["tmpQueryTable_%d" % (random.randint(0, 100000)), "tmpReferenceTable_%d" % (random.randint(0, 100000)), "tmpOutputTable_%d" % (random.randint(0, 100000)), "tmpWorkingTable_%d" % (random.randint(0, 100000))] | |
| 145 self.mySqlTranscriptWriters = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames] | |
| 146 self.writer = None | |
| 147 self.introns = [False, False] | |
| 148 self.starts = [None, None] | |
| 149 self.ends = [None, None] | |
| 150 self.fivePrimes = [None, None] | |
| 151 self.threePrimes = [None, None] | |
| 152 self.minDistance = None | |
| 153 self.maxDistance = 0 | |
| 154 self.minOverlap = 1 | |
| 155 self.pcOverlap = None | |
| 156 self.colinear = False | |
| 157 self.antisense = False | |
| 158 self.downstreams = [False, False] | |
| 159 self.upstreams = [False, False] | |
| 160 self.outputDistance = False | |
| 161 self.absolute = False | |
| 162 self.strandedDistance = False | |
| 163 self.nbTranscripts = [None, None] | |
| 164 self.nbNucleotides = [None, None] | |
| 165 self.normalization = False | |
| 166 self.included = False | |
| 167 self.including = False | |
| 168 self.invert = False | |
| 169 self.notOverlapping = False | |
| 170 self.splitDifference = False | |
| 171 self.multiple = False | |
| 172 self.odds = False | |
| 173 self.overlapResults = None | |
| 174 self.oddResults = None | |
| 175 self.outputContainer = None | |
| 176 self.transcriptsToBeStored = dict([(type, TranscriptList()) for type in self.INPUTWORKINGTYPES]) | |
| 177 self.nbPrinted = 0 | |
| 178 | |
| 179 self.mySqlConnection.createDatabase() | |
| 180 | |
| 181 | |
| 182 def __del__(self): | |
| 183 """ | |
| 184 Destructor | |
| 185 Remove all temporary tables | |
| 186 """ | |
| 187 for type in self.INPUTWORKINGTYPES: | |
| 188 self.mySqlTranscriptWriters[type].removeTables() | |
| 189 self.mySqlConnection.deleteDatabase() | |
| 190 | |
| 191 def acceptIntrons(self, type, bool): | |
| 192 """ | |
| 193 Compare transcripts or exons only | |
| 194 @param type: whether use query/reference data | |
| 195 @type type: int | |
| 196 @param bool: include introns or not | |
| 197 @type bool: boolean | |
| 198 """ | |
| 199 self.introns[type] = bool | |
| 200 | |
| 201 | |
| 202 def restrictToStart(self, type, size): | |
| 203 """ | |
| 204 Restrict a list of transcripts to first nucleotides | |
| 205 @param type: whether use query/reference data | |
| 206 @type type: int | |
| 207 @param size: the size of the transcript to be considered | |
| 208 @type size: int | |
| 209 """ | |
| 210 self.starts[type] = size | |
| 211 self.introns[type] = False | |
| 212 | |
| 213 | |
| 214 def restrictToEnd(self, type, size): | |
| 215 """ | |
| 216 Restrict a list of transcripts to first nucleotides | |
| 217 @param type: whether use query/reference data | |
| 218 @type type: int | |
| 219 @param size: the size of the transcript to be considered | |
| 220 @type size: int | |
| 221 """ | |
| 222 self.ends[type] = size | |
| 223 self.introns[type] = False | |
| 224 | |
| 225 | |
| 226 def extendFivePrime(self, type, size): | |
| 227 """ | |
| 228 Extend a list of transcripts by their 5' end | |
| 229 @param type: whether use query/reference data | |
| 230 @type type: int | |
| 231 @param size: size of the extension | |
| 232 @type size: int | |
| 233 """ | |
| 234 self.fivePrimes[type] = size | |
| 235 | |
| 236 | |
| 237 def extendThreePrime(self, type, size): | |
| 238 """ | |
| 239 Extend the list of query transcripts by their 3' end | |
| 240 @param type: whether use query/reference data | |
| 241 @type type: int | |
| 242 @param size: size of the extension | |
| 243 @type size: int | |
| 244 """ | |
| 245 self.threePrimes[type] = size | |
| 246 | |
| 247 | |
| 248 def setMinDistance(self, distance): | |
| 249 """ | |
| 250 Set the min distance between two transcripts | |
| 251 @param distance: distance | |
| 252 @type distance: int | |
| 253 """ | |
| 254 self.minDistance = distance | |
| 255 | |
| 256 | |
| 257 def setMaxDistance(self, distance): | |
| 258 """ | |
| 259 Set the max distance between two transcripts | |
| 260 @param distance: distance | |
| 261 @type distance: int | |
| 262 """ | |
| 263 self.maxDistance = distance | |
| 264 | |
| 265 | |
| 266 def setMinOverlap(self, overlap): | |
| 267 """ | |
| 268 Set the minimum number of nucleotides to declare an overlap | |
| 269 @param overlap: minimum number of nucleotides | |
| 270 @type overlap: int | |
| 271 """ | |
| 272 self.minOverlap = overlap | |
| 273 | |
| 274 | |
| 275 def setPcOverlap(self, overlap): | |
| 276 """ | |
| 277 Set the percentage of nucleotides to declare an overlap | |
| 278 @param overlap: percentage of nucleotides | |
| 279 @type overlap: int | |
| 280 """ | |
| 281 self.pcOverlap = overlap | |
| 282 | |
| 283 | |
| 284 def setUpstream(self, type, boolean): | |
| 285 """ | |
| 286 Consider transcripts which are upstream of some transcripts | |
| 287 @param type: whether use query/reference data | |
| 288 @type type: int | |
| 289 @param boolean: consider only these transcripts or not | |
| 290 @type boolean: boolean | |
| 291 """ | |
| 292 self.upstreams[type] = boolean | |
| 293 | |
| 294 | |
| 295 def setDownstream(self, type, boolean): | |
| 296 """ | |
| 297 Consider transcripts which are downstream of some transcripts | |
| 298 @param type: whether use query/reference data | |
| 299 @type type: int | |
| 300 @param boolean: consider only these transcripts or not | |
| 301 @type boolean: boolean | |
| 302 """ | |
| 303 self.downstreams[type] = boolean | |
| 304 | |
| 305 | |
| 306 def setOutputDistance(self, boolean): | |
| 307 """ | |
| 308 Output distance between query and reference instead of query transcript | |
| 309 @param boolean: whether distance should be output | |
| 310 @type boolean: boolean | |
| 311 """ | |
| 312 self.outputDistance = boolean | |
| 313 | |
| 314 | |
| 315 def setAbsolute(self, boolean): | |
| 316 """ | |
| 317 Do not consider strand when computing distance (thus, having only non-negative values) | |
| 318 @param boolean: whether we should consider strands | |
| 319 @type boolean: boolean | |
| 320 """ | |
| 321 self.absolute = boolean | |
| 322 | |
| 323 | |
| 324 def setStrandedDistance(self, boolean): | |
| 325 """ | |
| 326 Return two distance distributions, one per strand | |
| 327 @param boolean: whether we should return 2 distance distance | |
| 328 @type boolean: boolean | |
| 329 """ | |
| 330 self.strandedDistance = boolean | |
| 331 | |
| 332 | |
| 333 def getColinearOnly(self, boolean): | |
| 334 """ | |
| 335 Only consider transcripts that overlap in the same direction | |
| 336 @param boolean: whether transcripts should overlap in the same direction | |
| 337 @type boolean: boolean | |
| 338 """ | |
| 339 self.colinear = boolean | |
| 340 | |
| 341 | |
| 342 def getAntisenseOnly(self, boolean): | |
| 343 """ | |
| 344 Only consider transcripts that overlap in the opposite direction | |
| 345 @param boolean: whether transcripts should overlap in the opposite direction | |
| 346 @type boolean: boolean | |
| 347 """ | |
| 348 self.antisense = boolean | |
| 349 | |
| 350 | |
| 351 def setIncludedOnly(self, boolean): | |
| 352 """ | |
| 353 Keep the elements from first set which are included in the second set | |
| 354 @param boolean: whether to keep included elements only | |
| 355 @type boolean: boolean | |
| 356 """ | |
| 357 self.included = boolean | |
| 358 | |
| 359 | |
| 360 def setIncludingOnly(self, boolean): | |
| 361 """ | |
| 362 Keep the elements from second set which are included in the first set | |
| 363 @param boolean: whether to keep included elements only | |
| 364 @type boolean: boolean | |
| 365 """ | |
| 366 self.including = boolean | |
| 367 | |
| 368 | |
| 369 def setNormalization(self, boolean): | |
| 370 """ | |
| 371 Normalize the elements by the number of mappings in the genome | |
| 372 @param boolean: whether normalize | |
| 373 @type boolean: boolean | |
| 374 """ | |
| 375 self.normalization = boolean | |
| 376 | |
| 377 | |
| 378 def getInvert(self, boolean): | |
| 379 """ | |
| 380 Only consider transcripts that do not overlap | |
| 381 @param boolean: whether invert the selection | |
| 382 @type boolean: boolean | |
| 383 """ | |
| 384 self.invert = boolean | |
| 385 | |
| 386 | |
| 387 def includeNotOverlapping(self, boolean): | |
| 388 """ | |
| 389 Also output the elements which do not overlap | |
| 390 @param boolean: whether output the elements which do not overlap | |
| 391 @type boolean: boolean | |
| 392 """ | |
| 393 self.notOverlapping = boolean | |
| 394 | |
| 395 | |
| 396 def setSplitDifference(self, boolean): | |
| 397 """ | |
| 398 Split into intervals when computing difference | |
| 399 @param boolean: whether to split | |
| 400 @type boolean: boolean | |
| 401 """ | |
| 402 self.splitDifference = boolean | |
| 403 | |
| 404 | |
| 405 def aggregate(self, boolean): | |
| 406 """ | |
| 407 In merge mode, aggregate multiple transcripts | |
| 408 @param boolean: aggregate multiple transcripts | |
| 409 @type boolean: boolean | |
| 410 """ | |
| 411 self.multiple = boolean | |
| 412 | |
| 413 | |
| 414 def getTables(self, type): | |
| 415 """ | |
| 416 Get the SQL tables | |
| 417 @param type: type of the table (query, reference, etc.) | |
| 418 @type type: int | |
| 419 """ | |
| 420 return self.mySqlTranscriptWriters[type].getTables() | |
| 421 | |
| 422 | |
| 423 def computeOdds(self, boolean): | |
| 424 """ | |
| 425 Compute odds | |
| 426 @param boolean: whether odds should be computed | |
| 427 @type boolean: boolean | |
| 428 """ | |
| 429 self.odds = boolean | |
| 430 if self.odds: | |
| 431 self.overlapResults = dict() | |
| 432 | |
| 433 | |
| 434 def computeOddsPerTranscript(self, boolean): | |
| 435 """ | |
| 436 Compute odds for each transcript | |
| 437 @param boolean: whether odds for each transcript should be computed | |
| 438 @type boolean: boolean | |
| 439 """ | |
| 440 self.odds = boolean | |
| 441 if self.odds: | |
| 442 self.overlapResults = dict() | |
| 443 | |
| 444 | |
| 445 def removeTables(self): | |
| 446 """ | |
| 447 Remove the temporary MySQL tables | |
| 448 """ | |
| 449 for type in self.INPUTWORKINGTYPES: | |
| 450 for chromosome in self.getTables(type): | |
| 451 self.getTables(type)[chromosome].remove() | |
| 452 | |
| 453 | |
| 454 def clearTables(self): | |
| 455 """ | |
| 456 Empty the content of the databases | |
| 457 """ | |
| 458 for type in self.INPUTWORKINGTYPES: | |
| 459 if self.transcriptListParsers[type] != None: | |
| 460 for chromosome in self.getTables(type): | |
| 461 self.getTables(type)[chromosome].clear() | |
| 462 | |
| 463 | |
| 464 def extendTranscript(self, type, transcript): | |
| 465 """ | |
| 466 Extend a transcript corresponding to the parameters of the class | |
| 467 @param transcript: a transcript | |
| 468 @type transcript: class L{Transcript<Transcript>} | |
| 469 @return: the possibly extended transcript | |
| 470 """ | |
| 471 extendedTranscript = Transcript() | |
| 472 extendedTranscript.copy(transcript) | |
| 473 if self.starts[type] != None: | |
| 474 extendedTranscript.restrictStart(self.starts[type]) | |
| 475 if self.ends[type] != None: | |
| 476 extendedTranscript.restrictEnd(self.ends[type]) | |
| 477 if self.fivePrimes[type] != None: | |
| 478 extendedTranscript.extendStart(self.fivePrimes[type]) | |
| 479 if self.threePrimes[type] != None: | |
| 480 extendedTranscript.extendEnd(self.threePrimes[type]) | |
| 481 return extendedTranscript | |
| 482 | |
| 483 | |
| 484 def storeTranscript(self, type, transcript, now = True): | |
| 485 """ | |
| 486 Add a transcript to a MySQL database, or postpone the store | |
| 487 @param type: whether use query/reference table | |
| 488 @type type: int | |
| 489 @param transcript: a transcript | |
| 490 @type transcript: class L{Transcript<Transcript>} | |
| 491 @param now: whether transcript should be stored now (or stored can be postponed) | |
| 492 @type now: bool | |
| 493 """ | |
| 494 self.mySqlTranscriptWriters[type].addTranscript(transcript) | |
| 495 if type == self.REFERENCE: | |
| 496 self.mySqlTranscriptWriters[self.WORKING].addTranscript(transcript) | |
| 497 if now: | |
| 498 self.mySqlTranscriptWriters[type].write() | |
| 499 if type == self.REFERENCE: | |
| 500 self.mySqlTranscriptWriters[self.WORKING].write() | |
| 501 | |
| 502 | |
| 503 def writeTranscript(self, transcript): | |
| 504 """ | |
| 505 Write a transcript in the output file | |
| 506 @param transcript: a transcript | |
| 507 @type transcript: class L{Transcript<Transcript>} | |
| 508 """ | |
| 509 if self.writer != None: | |
| 510 self.writer.addTranscript(transcript) | |
| 511 self.nbPrinted += 1 | |
| 512 | |
| 513 | |
| 514 def flushData(self, type = None): | |
| 515 """ | |
| 516 Store the remaining transcripts | |
| 517 @param type: whether use query/reference table (None for all) | |
| 518 @type type: int or None | |
| 519 """ | |
| 520 if type == None: | |
| 521 types = self.INPUTWORKINGTYPES | |
| 522 else: | |
| 523 types = [type] | |
| 524 for type in types: | |
| 525 self.mySqlTranscriptWriters[type].write() | |
| 526 if self.writer != None: | |
| 527 self.writer.write() | |
| 528 | |
| 529 | |
| 530 def unstoreTranscript(self, type, transcript): | |
| 531 """ | |
| 532 Remove a transcript from a MySQL database | |
| 533 @param type: whether use query/reference table | |
| 534 @type type: int | |
| 535 @param transcript: a transcript | |
| 536 @type transcript: class L{Transcript<Transcript>} | |
| 537 """ | |
| 538 self.getTables(type)[transcript.getChromosome()].removeTranscript(transcript) | |
| 539 if type == self.REFERENCE: | |
| 540 self.getTables(self.WORKING)[transcript.getChromosome()].removeTranscript(transcript) | |
| 541 | |
| 542 | |
| 543 def addIndexes(self, tables): | |
| 544 """ | |
| 545 Add useful indexes to the tables | |
| 546 @param tables: which tables should be indexed | |
| 547 @type tables: list of int | |
| 548 """ | |
| 549 for type in tables: | |
| 550 for chromosome in self.getTables(type): | |
| 551 self.getTables(type)[chromosome].createIndex("iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["start"]) | |
| 552 self.getTables(type)[chromosome].exonsTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["transcriptId"]) | |
| 553 | |
| 554 | |
| 555 def storeTranscriptList(self, type, transcriptListParser, extension): | |
| 556 """ | |
| 557 Store a transcript list into database | |
| 558 @param type: whether use query/reference parser | |
| 559 @type type: int | |
| 560 @param parser: a parser of transcript list | |
| 561 @type parser: class L{TranscriptContainer<TranscriptContainer>} | |
| 562 @param extension: extend (or not) the transcripts | |
| 563 @type extension: boolean | |
| 564 """ | |
| 565 progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % ("query" if type == self.QUERY else "reference"), self.verbosity-1) | |
| 566 for transcript in transcriptListParser.getIterator(): | |
| 567 if extension: | |
| 568 transcript = self.extendTranscript(type, transcript) | |
| 569 self.mySqlTranscriptWriters[type].addTranscript(transcript) | |
| 570 progress.inc() | |
| 571 self.mySqlTranscriptWriters[type].write() | |
| 572 progress.done() | |
| 573 if type == self.REFERENCE: | |
| 574 for chromosome in self.getTables(self.REFERENCE): | |
| 575 self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1) | |
| 576 self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome]) | |
| 577 | |
| 578 | |
| 579 def setInputTranscriptContainer(self, type, inputTranscriptContainer): | |
| 580 """ | |
| 581 Set an input transcript list container | |
| 582 @param type: whether use query/reference parser | |
| 583 @type type: int | |
| 584 @param inputTranscriptContainer: a container | |
| 585 @type inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>} | |
| 586 """ | |
| 587 self.inputTranscriptContainers[type] = inputTranscriptContainer | |
| 588 self.nbTranscripts[type] = self.inputTranscriptContainers[type].getNbTranscripts() | |
| 589 self.nbNucleotides[type] = self.inputTranscriptContainers[type].getNbNucleotides() | |
| 590 | |
| 591 | |
| 592 def setOutputWriter(self, writer): | |
| 593 """ | |
| 594 Set an output transcript list writer | |
| 595 @param writer: a writer | |
| 596 @type writer: class L{TranscriptListWriter<TranscriptListWriter>} | |
| 597 """ | |
| 598 self.writer = writer | |
| 599 | |
| 600 | |
| 601 def compareTranscript(self, transcript1, transcript2, includeDistance = False): | |
| 602 """ | |
| 603 Compare two transcripts, using user defined parameters | |
| 604 @param transcript1: a transcript from the query set (already extended) | |
| 605 @type transcript1: class L{Transcript<Transcript>} | |
| 606 @param transcript2: a transcript from the reference set (already extended) | |
| 607 @type transcript2: class L{Transcript<Transcript>} | |
| 608 @param includeDistance: take into account the distance too | |
| 609 @type includeDistance: boolean | |
| 610 @return: true, if they overlap | |
| 611 """ | |
| 612 extendedTranscript1 = Transcript() | |
| 613 extendedTranscript1.copy(transcript1) | |
| 614 if includeDistance: | |
| 615 if self.maxDistance > 0: | |
| 616 extendedTranscript1.extendStart(self.maxDistance) | |
| 617 extendedTranscript1.extendEnd(self.maxDistance) | |
| 618 | |
| 619 minOverlap = self.minOverlap | |
| 620 if self.pcOverlap != None: | |
| 621 minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap) | |
| 622 if not extendedTranscript1.overlapWith(transcript2, self.minOverlap): | |
| 623 return False | |
| 624 if (self.downstreams[self.QUERY] and transcript2.getStart() > extendedTranscript1.getStart()) or \ | |
| 625 (self.upstreams[self.QUERY] and transcript2.getEnd() < extendedTranscript1.getEnd()) or \ | |
| 626 (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \ | |
| 627 (self.upstreams[self.REFERENCE] and extendedTranscript1.getEnd() < transcript2.getEnd()): | |
| 628 return False | |
| 629 if (self.antisense and extendedTranscript1.getDirection() == transcript2.getDirection()) or (self.colinear and extendedTranscript1.getDirection() != transcript2.getDirection()): | |
| 630 return False | |
| 631 if self.included and not transcript2.include(extendedTranscript1): | |
| 632 return False | |
| 633 if self.including and not extendedTranscript1.include(transcript2): | |
| 634 return False | |
| 635 if self.introns[self.REFERENCE] and self.introns[self.QUERY]: | |
| 636 if self.logHandle != None: | |
| 637 self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2))) | |
| 638 return True | |
| 639 if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap): | |
| 640 if self.logHandle != None: | |
| 641 self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2))) | |
| 642 return True | |
| 643 return False | |
| 644 | |
| 645 | |
| 646 def compareTranscriptToList(self, transcript1): | |
| 647 """ | |
| 648 Compare a transcript to the reference list of transcripts | |
| 649 (Do not extend the transcripts, except for the distance) | |
| 650 @param transcript1: a transcript (from the query set) | |
| 651 @type transcript1: class L{Transcript<Transcript>} | |
| 652 @return: the reference transcripts overlapping | |
| 653 """ | |
| 654 # no transcript in the reference table | |
| 655 if transcript1.getChromosome() not in self.getTables(self.WORKING): | |
| 656 return | |
| 657 | |
| 658 # retrieve the the transcripts that may overlap in the working tables | |
| 659 clauses = [] | |
| 660 extendedTranscript1 = Transcript() | |
| 661 extendedTranscript1.copy(transcript1) | |
| 662 if self.maxDistance > 0: | |
| 663 extendedTranscript1.extendStart(self.maxDistance) | |
| 664 if self.maxDistance > 0: | |
| 665 extendedTranscript1.extendEnd(self.maxDistance) | |
| 666 command = "SELECT * FROM %s WHERE (" % (self.getTables(self.WORKING)[transcript1.getChromosome()].getName()) | |
| 667 for binPair in extendedTranscript1.getBins(): | |
| 668 clause = "bin " | |
| 669 if binPair[0] == binPair[1]: | |
| 670 clause += "= %i" % (binPair[0]) | |
| 671 else: | |
| 672 clause += "BETWEEN %i AND %i" % (binPair[0], binPair[1]) | |
| 673 clauses.append(clause) | |
| 674 command += " OR ".join(clauses) | |
| 675 command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart()) | |
| 676 | |
| 677 for index2, transcript2 in self.getTables(self.REFERENCE)[transcript1.getChromosome()].selectTranscripts(command): | |
| 678 if self.compareTranscript(extendedTranscript1, transcript2): | |
| 679 yield transcript2 | |
| 680 | |
| 681 | |
| 682 def compareTranscriptList(self): | |
| 683 """ | |
| 684 Compare a list of transcript to the reference one | |
| 685 @return: the transcripts that overlap with the reference set | |
| 686 """ | |
| 687 distance = 0 | |
| 688 nbClustersIn = 0 | |
| 689 nbClustersOut = 0 | |
| 690 if self.maxDistance != None: | |
| 691 distance = self.maxDistance | |
| 692 | |
| 693 self.addIndexes([self.QUERY, self.REFERENCE]) | |
| 694 | |
| 695 # export the container into tables | |
| 696 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | |
| 697 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) | |
| 698 | |
| 699 # looping | |
| 700 for chromosome1 in sorted(self.getTables(self.QUERY).keys()): | |
| 701 # get range of transcripts | |
| 702 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) | |
| 703 query = self.mySqlConnection.executeQuery(command) | |
| 704 result = query.getLine() | |
| 705 first = result[0] | |
| 706 last = result[1] | |
| 707 nb = result[2] | |
| 708 | |
| 709 transcripts1 = [] | |
| 710 toBeRemoved1 = [] | |
| 711 transcripts2 = [] | |
| 712 toBeRemoved2 = [] | |
| 713 overlapsWith = [] | |
| 714 nbOverlaps = [] | |
| 715 nbChunks = max(1, nb / 100) | |
| 716 chunkSize = (last - first) / nbChunks | |
| 717 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) | |
| 718 for chunk in range(nbChunks + 1): | |
| 719 | |
| 720 # load transcripts | |
| 721 start = first + chunk * chunkSize | |
| 722 end = start + chunkSize - 1 | |
| 723 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) | |
| 724 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): | |
| 725 transcripts1.append(transcript1) | |
| 726 overlapsWith.append([]) | |
| 727 nbOverlaps.append(0) | |
| 728 nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") | |
| 729 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end) | |
| 730 self.mySqlConnection.executeQuery(command) | |
| 731 if chromosome1 in self.getTables(self.REFERENCE): | |
| 732 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) | |
| 733 if chunk == 0: | |
| 734 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | |
| 735 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): | |
| 736 transcripts2.append(transcript2) | |
| 737 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | |
| 738 self.mySqlConnection.executeQuery(command) | |
| 739 | |
| 740 # compare sets | |
| 741 for index1, transcript1 in enumerate(transcripts1): | |
| 742 overlappingNames = [] | |
| 743 nbElements1 = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") | |
| 744 for transcript2 in transcripts2: | |
| 745 if self.compareTranscript(transcript1, transcript2, True): | |
| 746 id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() | |
| 747 if id2 not in overlapsWith[index1]: | |
| 748 overlapsWith[index1].append(id2) | |
| 749 nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements") | |
| 750 if self.odds: | |
| 751 if transcript2.getName() not in self.overlapResults: | |
| 752 self.overlapResults[transcript2.getName()] = 0 | |
| 753 self.overlapResults[transcript2.getName()] += nbElements1 | |
| 754 | |
| 755 # check if query transcript extends bounds of the chunk | |
| 756 if transcript1.getEnd() < end: | |
| 757 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: | |
| 758 if overlapsWith[index1]: | |
| 759 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) | |
| 760 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) | |
| 761 elif self.notOverlapping: | |
| 762 transcript1.setTagValue("nbOverlaps", "0") | |
| 763 self.writeTranscript(transcript1) | |
| 764 nbClustersOut += nbElements1 | |
| 765 toBeRemoved1.append(index1) | |
| 766 | |
| 767 # update list of query transcripts | |
| 768 for index1 in reversed(toBeRemoved1): | |
| 769 del transcripts1[index1] | |
| 770 del overlapsWith[index1] | |
| 771 del nbOverlaps[index1] | |
| 772 toBeRemoved1 = [] | |
| 773 | |
| 774 # check if the reference transcripts extends bounds of the chunk | |
| 775 for index2, transcript2 in enumerate(transcripts2): | |
| 776 if transcript2.getEnd() + distance < end: | |
| 777 toBeRemoved2.append(index2) | |
| 778 for index2 in reversed(toBeRemoved2): | |
| 779 del transcripts2[index2] | |
| 780 toBeRemoved2 = [] | |
| 781 | |
| 782 progress.inc() | |
| 783 | |
| 784 for index1, transcript1 in enumerate(transcripts1): | |
| 785 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: | |
| 786 if overlapsWith[index1]: | |
| 787 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) | |
| 788 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) | |
| 789 elif self.notOverlapping: | |
| 790 transcript1.setTagValue("nbOverlaps", "0") | |
| 791 self.writeTranscript(transcript1) | |
| 792 nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") | |
| 793 progress.done() | |
| 794 self.getTables(self.QUERY)[chromosome1].remove() | |
| 795 if chromosome1 in self.getTables(self.REFERENCE): | |
| 796 self.getTables(self.REFERENCE)[chromosome1].remove() | |
| 797 self.getTables(self.WORKING)[chromosome1].remove() | |
| 798 | |
| 799 self.flushData() | |
| 800 if self.writer != None: | |
| 801 self.writer.close() | |
| 802 self.writer = None | |
| 803 | |
| 804 if self.verbosity > 0: | |
| 805 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) | |
| 806 print "query: %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn) | |
| 807 if self.nbTranscripts[self.QUERY] != 0: | |
| 808 print "output: %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100), | |
| 809 if nbClustersOut != 0: | |
| 810 print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100) | |
| 811 | |
| 812 | |
| 813 def compareTranscriptListDistance(self): | |
| 814 """ | |
| 815 Compare a list of transcript to the reference one | |
| 816 @return: the distance distributions in a hash | |
| 817 """ | |
| 818 nbDistances = 0 | |
| 819 distances = {} | |
| 820 absDistances = {} | |
| 821 strandedDistances = dict([(strand, {}) for strand in (1, -1)]) | |
| 822 | |
| 823 # export the container into tables | |
| 824 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | |
| 825 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) | |
| 826 | |
| 827 progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1) | |
| 828 for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator(): | |
| 829 # get the distance | |
| 830 transcript1 = self.extendTranscript(self.QUERY, transcript1) | |
| 831 distance = self.maxDistance + 1 | |
| 832 strand = None | |
| 833 closestElement = "None" | |
| 834 for transcript2 in self.compareTranscriptToList(transcript1): | |
| 835 thisStrand = transcript1.getDirection() * transcript2.getDirection() | |
| 836 if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()): | |
| 837 transcript2.reverse() | |
| 838 if self.absolute: | |
| 839 transcript2.setDirection(transcript1.getDirection()) | |
| 840 if transcript2.getDirection() == transcript1.getDirection(): | |
| 841 if self.starts[self.REFERENCE] != None: | |
| 842 transcript2.restrictStart(self.starts[self.REFERENCE]) | |
| 843 if self.ends[self.REFERENCE] != None: | |
| 844 transcript2.restrictEnd(self.ends[self.REFERENCE]) | |
| 845 thisDistance = transcript1.getRelativeDistance(transcript2) | |
| 846 if (self.absolute): | |
| 847 thisDistance = abs(thisDistance) | |
| 848 if abs(thisDistance) < abs(distance): | |
| 849 distance = thisDistance | |
| 850 strand = thisStrand | |
| 851 closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() | |
| 852 if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance): | |
| 853 nbDistances += 1 | |
| 854 distances[distance] = distances.get(distance, 0) + 1 | |
| 855 absDistance = abs(distance) | |
| 856 absDistances[absDistance] = absDistances.get(absDistance, 0) + 1 | |
| 857 strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0) | |
| 858 if distance not in strandedDistances[-strand]: | |
| 859 strandedDistances[-strand][distance] = 0 | |
| 860 | |
| 861 # write transcript | |
| 862 if distance == self.maxDistance + 1: | |
| 863 distance = "None" | |
| 864 tmpTranscript = Transcript() | |
| 865 tmpTranscript.copy(transcript1) | |
| 866 tmpTranscript.setTagValue("distance", distance) | |
| 867 tmpTranscript.setTagValue("closestElement", closestElement) | |
| 868 self.writeTranscript(tmpTranscript) | |
| 869 | |
| 870 progress.inc() | |
| 871 progress.done() | |
| 872 | |
| 873 self.flushData() | |
| 874 | |
| 875 if self.verbosity > 0: | |
| 876 print "reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]) | |
| 877 print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) | |
| 878 if nbDistances == 0: | |
| 879 print "Nothing matches" | |
| 880 else: | |
| 881 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances) | |
| 882 print "for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100) | |
| 883 | |
| 884 if self.strandedDistance: | |
| 885 return strandedDistances | |
| 886 return distances | |
| 887 | |
| 888 | |
| 889 def compareTranscriptListMerge(self): | |
| 890 """ | |
| 891 Merge the query list of transcript with itself | |
| 892 @return: the merged transcripts in a transcript list database | |
| 893 """ | |
| 894 nbMerges = 0 | |
| 895 | |
| 896 for type in (self.QUERY, self.REFERENCE): | |
| 897 self.storeTranscriptList(type, self.inputTranscriptContainers[type], True) | |
| 898 self.flushData() | |
| 899 | |
| 900 # Loop on the chromosomes | |
| 901 for chromosome in sorted(self.getTables(self.QUERY).keys()): | |
| 902 if chromosome not in self.getTables(self.REFERENCE): | |
| 903 continue | |
| 904 | |
| 905 # Get the size of the chromosome | |
| 906 maxEnd = 0 | |
| 907 nbChunks = 0 | |
| 908 for type in (self.QUERY, self.REFERENCE): | |
| 909 command = "SELECT MAX(end) from %s" % (self.getTables(type)[chromosome].getName()) | |
| 910 query = self.mySqlConnection.executeQuery(command) | |
| 911 maxEnd = max(maxEnd, int(query.getLine()[0])) | |
| 912 nbChunks = max(nbChunks, self.getTables(type)[chromosome].getNbElements()) | |
| 913 | |
| 914 mergedTranscripts = {} | |
| 915 transcripts = {self.QUERY: [], self.REFERENCE: []} | |
| 916 progress = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1) | |
| 917 for i in range(nbChunks): | |
| 918 rangeStart = int(i * (float(maxEnd) / nbChunks)) + 1 | |
| 919 rangeEnd = int((i+1) * (float(maxEnd) / nbChunks)) | |
| 920 | |
| 921 # Get all transcripts in query and reference from chunk | |
| 922 for type in (self.QUERY, self.REFERENCE): | |
| 923 correction = 0 if self.QUERY else self.maxDistance | |
| 924 command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd + correction) | |
| 925 for index, transcript in self.getTables(type)[chromosome].selectTranscripts(command): | |
| 926 transcripts[type].append(transcript) | |
| 927 | |
| 928 # Merge elements between the two samples | |
| 929 for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]): | |
| 930 for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]): | |
| 931 if referenceTranscript == None: continue | |
| 932 if self.compareTranscript(queryTranscript, referenceTranscript, True): | |
| 933 if queryTranscript.getDirection() != referenceTranscript.getDirection(): | |
| 934 referenceTranscript.setDirection(queryTranscript.getDirection()) | |
| 935 queryTranscript.merge(referenceTranscript, self.normalization) | |
| 936 nbMerges += 1 | |
| 937 transcripts[self.REFERENCE][iReference] = None | |
| 938 if not self.multiple: | |
| 939 mergedTranscripts[iQuery] = 0 | |
| 940 | |
| 941 # Remove transcripts from database | |
| 942 for type in (self.QUERY, self.REFERENCE): | |
| 943 correction = 0 if self.QUERY else self.maxDistance | |
| 944 command = "DELETE FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd - correction) | |
| 945 query = self.mySqlConnection.executeQuery(command) | |
| 946 | |
| 947 # Just in case, self-merge the elements in the query (beware of mergedTranscripts!) | |
| 948 if (self.multiple): | |
| 949 for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]): | |
| 950 if queryTranscript1 == None: continue | |
| 951 for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]): | |
| 952 if iQuery2 <= iQuery1 or queryTranscript2 == None: continue | |
| 953 minOverlap = self.minOverlap | |
| 954 if self.pcOverlap != None: | |
| 955 minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap) | |
| 956 if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear): | |
| 957 if queryTranscript1.getDirection() != queryTranscript2.getDirection(): | |
| 958 queryTranscript2.setDirection(queryTranscript1.getDirection()) | |
| 959 queryTranscript1.merge(queryTranscript2, self.normalization) | |
| 960 transcripts[self.QUERY][iQuery2] = None | |
| 961 nbMerges += 1 | |
| 962 if not self.multiple: | |
| 963 mergedTranscripts[iQuery1] = 0 | |
| 964 | |
| 965 # Update the sets of transcripts and write into database (also update mergedTranscripts) | |
| 966 newTranscripts = {self.QUERY: [], self.REFERENCE: []} | |
| 967 newMergedTranscripts = {} | |
| 968 for type in (self.QUERY, self.REFERENCE): | |
| 969 for i, transcript in enumerate(transcripts[type]): | |
| 970 if transcript == None: continue | |
| 971 correction = 0 if self.QUERY else self.maxDistance | |
| 972 if transcript.getEnd() < rangeEnd - correction: | |
| 973 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): | |
| 974 self.writeTranscript(transcripts[type][i]) | |
| 975 else: | |
| 976 if type == self.QUERY and i in mergedTranscripts: | |
| 977 newMergedTranscripts[len(newTranscripts[type])] = 0 | |
| 978 newTranscripts[type].append(transcript) | |
| 979 transcripts = newTranscripts | |
| 980 mergedTranscripts = newMergedTranscripts | |
| 981 | |
| 982 progress.inc() | |
| 983 progress.done() | |
| 984 | |
| 985 for type in (self.QUERY, self.REFERENCE): | |
| 986 for i, transcript in enumerate(transcripts[type]): | |
| 987 if transcripts == None: continue | |
| 988 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): | |
| 989 self.writeTranscript(transcripts[type][i]) | |
| 990 | |
| 991 # Manage chromosomes with no corresponding data | |
| 992 if self.multiple: | |
| 993 for type in self.INPUTTYPES: | |
| 994 for chromosome in self.getTables(type): | |
| 995 if chromosome in self.getTables(1 - type): | |
| 996 continue | |
| 997 for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator(): | |
| 998 self.writeTranscript(transcript) | |
| 999 | |
| 1000 self.flushData() | |
| 1001 if self.writer != None: | |
| 1002 self.writer.close() | |
| 1003 self.writer = None | |
| 1004 | |
| 1005 if self.verbosity > 0: | |
| 1006 print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) | |
| 1007 print "# merges: %d" % (nbMerges) | |
| 1008 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) | |
| 1009 | |
| 1010 | |
| 1011 def compareTranscriptListSelfMerge(self): | |
| 1012 """ | |
| 1013 Merge the query list of transcript with itself | |
| 1014 @return: the merged transcripts in a transcript list database | |
| 1015 """ | |
| 1016 nbMerges = 0 | |
| 1017 distance = self.maxDistance if self.maxDistance != None else 0 | |
| 1018 | |
| 1019 self.addIndexes([self.QUERY]) | |
| 1020 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | |
| 1021 self.flushData() | |
| 1022 | |
| 1023 # looping | |
| 1024 for chromosome1 in sorted(self.getTables(self.QUERY).keys()): | |
| 1025 transcripts2 = [] | |
| 1026 | |
| 1027 # get range of transcripts | |
| 1028 progress = Progress(self.getTables(self.QUERY)[chromosome1].getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) | |
| 1029 command = "SELECT * FROM %s ORDER BY start" % (self.getTables(self.QUERY)[chromosome1].getName()) | |
| 1030 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): | |
| 1031 | |
| 1032 # compare sets | |
| 1033 toBeRemoved = set() | |
| 1034 toBePrinted = set() | |
| 1035 for index2, transcript2 in enumerate(transcripts2): | |
| 1036 | |
| 1037 if self.compareTranscript(transcript1, transcript2, True): | |
| 1038 if transcript1.getDirection() != transcript2.getDirection(): | |
| 1039 transcript2.setDirection(transcript1.getDirection()) | |
| 1040 transcript1.merge(transcript2, self.normalization) | |
| 1041 toBeRemoved.add(index2) | |
| 1042 nbMerges += 1 | |
| 1043 elif transcript2.getEnd() + distance < transcript1.getStart(): | |
| 1044 toBePrinted.add(index2) | |
| 1045 transcripts2.append(transcript1) | |
| 1046 | |
| 1047 for index2 in sorted(toBePrinted): | |
| 1048 self.writeTranscript(transcripts2[index2]) | |
| 1049 transcripts2 = [transcripts2[index2] for index2 in range(len(transcripts2)) if index2 not in (toBeRemoved | toBePrinted)] | |
| 1050 | |
| 1051 for transcript2 in transcripts2: | |
| 1052 self.writeTranscript(transcript2) | |
| 1053 progress.done() | |
| 1054 self.getTables(self.QUERY)[chromosome1].remove() | |
| 1055 | |
| 1056 self.flushData() | |
| 1057 if self.writer != None: | |
| 1058 self.writer.close() | |
| 1059 self.writer = None | |
| 1060 | |
| 1061 if self.verbosity > 0: | |
| 1062 print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) | |
| 1063 print "# merges: %d" % (nbMerges) | |
| 1064 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) | |
| 1065 | |
| 1066 | |
| 1067 def getDifferenceTranscriptList(self): | |
| 1068 """ | |
| 1069 Get the elements of the first list which do not overlap the second list (at the nucleotide level) | |
| 1070 @return: the transcripts that overlap with the reference set | |
| 1071 """ | |
| 1072 distance = 0 if self.maxDistance == None else self.maxDistance | |
| 1073 | |
| 1074 self.addIndexes([self.QUERY, self.REFERENCE]) | |
| 1075 | |
| 1076 # export the container into tables | |
| 1077 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | |
| 1078 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) | |
| 1079 | |
| 1080 # looping | |
| 1081 for chromosome1 in sorted(self.getTables(self.QUERY).keys()): | |
| 1082 # get range of transcripts | |
| 1083 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) | |
| 1084 query = self.mySqlConnection.executeQuery(command) | |
| 1085 result = query.getLine() | |
| 1086 first = result[0] | |
| 1087 last = result[1] | |
| 1088 nb = result[2] | |
| 1089 | |
| 1090 transcripts1 = [] | |
| 1091 transcripts2 = [] | |
| 1092 nbChunks = max(1, nb / 100) | |
| 1093 chunkSize = (last - first) / nbChunks | |
| 1094 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) | |
| 1095 for chunk in range(nbChunks + 1): | |
| 1096 | |
| 1097 # load transcripts | |
| 1098 start = first + chunk * chunkSize | |
| 1099 end = start + chunkSize - 1 | |
| 1100 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) | |
| 1101 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): | |
| 1102 transcripts1.append(transcript1) | |
| 1103 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end) | |
| 1104 self.mySqlConnection.executeQuery(command) | |
| 1105 if chromosome1 in self.getTables(self.REFERENCE): | |
| 1106 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) | |
| 1107 if chunk == 0: | |
| 1108 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | |
| 1109 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): | |
| 1110 transcripts2.append(transcript2) | |
| 1111 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | |
| 1112 self.mySqlConnection.executeQuery(command) | |
| 1113 | |
| 1114 # compare sets | |
| 1115 toBeRemoved1 = [] | |
| 1116 for index1, transcript1 in enumerate(transcripts1): | |
| 1117 newTranscript1 = Transcript() | |
| 1118 newTranscript1.copy(transcript1) | |
| 1119 for transcript2 in transcripts2: | |
| 1120 newTranscript1 = newTranscript1.getDifference(transcript2) | |
| 1121 if newTranscript1 == None: | |
| 1122 toBeRemoved1.append(index1) | |
| 1123 break | |
| 1124 transcripts1[index1] = newTranscript1 | |
| 1125 | |
| 1126 # check if query transcript extends bounds of the chunk | |
| 1127 if newTranscript1 != None and newTranscript1.getEnd() < end: | |
| 1128 if self.splitDifference: | |
| 1129 for exon in newTranscript1.getExons(): | |
| 1130 transcript = Transcript() | |
| 1131 transcript.copy(exon) | |
| 1132 self.writeTranscript(transcript) | |
| 1133 else: | |
| 1134 self.writeTranscript(newTranscript1) | |
| 1135 toBeRemoved1.append(index1) | |
| 1136 | |
| 1137 # update list of query transcripts | |
| 1138 for index1 in reversed(toBeRemoved1): | |
| 1139 del transcripts1[index1] | |
| 1140 | |
| 1141 # check if the reference transcripts extends bounds of the chunk | |
| 1142 toBeRemoved2 = [] | |
| 1143 for index2, transcript2 in enumerate(transcripts2): | |
| 1144 if transcript2.getEnd() + distance < end: | |
| 1145 toBeRemoved2.append(index2) | |
| 1146 for index2 in reversed(toBeRemoved2): | |
| 1147 del transcripts2[index2] | |
| 1148 | |
| 1149 progress.inc() | |
| 1150 | |
| 1151 for transcript1 in transcripts1: | |
| 1152 if self.splitDifference: | |
| 1153 for exon in transcript1.getExons(): | |
| 1154 transcript = Transcript() | |
| 1155 transcript.copy(exon) | |
| 1156 self.writeTranscript(transcript) | |
| 1157 else: | |
| 1158 self.writeTranscript(transcript1) | |
| 1159 progress.done() | |
| 1160 self.getTables(self.QUERY)[chromosome1].remove() | |
| 1161 if chromosome1 in self.getTables(self.REFERENCE): | |
| 1162 self.getTables(self.REFERENCE)[chromosome1].remove() | |
| 1163 self.getTables(self.WORKING)[chromosome1].remove() | |
| 1164 | |
| 1165 self.flushData() | |
| 1166 if self.writer != None: | |
| 1167 self.writer.close() | |
| 1168 self.writer = None | |
| 1169 | |
| 1170 if self.verbosity > 0: | |
| 1171 print "query: %d elements" % (self.nbTranscripts[self.QUERY]) | |
| 1172 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) | |
| 1173 print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) | |
| 1174 | |
| 1175 | |
| 1176 def getOddsPerTranscript(self): | |
| 1177 """ | |
| 1178 Return overlap results | |
| 1179 @return a dict of data | |
| 1180 """ | |
| 1181 if not self.odds: | |
| 1182 raise Exception("Did not compute odds!") | |
| 1183 return self.overlapResults | |
| 1184 | |
| 1185 | |
| 1186 def getOdds(self): | |
| 1187 """ | |
| 1188 Return odds about the overlap | |
| 1189 @return a dict of data | |
| 1190 """ | |
| 1191 if not self.odds: | |
| 1192 raise Exception("Did not compute odds!") | |
| 1193 if self.oddResults != None: | |
| 1194 return self.oddResults | |
| 1195 self.oddResults = {} | |
| 1196 for name, value in self.overlapResults.iteritems(): | |
| 1197 self.oddResults[value] = self.oddResults.get(value, 0) + 1 | |
| 1198 return self.oddResults |
