| 6 | 1 # | 
|  | 2 # Copyright INRA-URGI 2009-2010 | 
|  | 3 # | 
|  | 4 # This software is governed by the CeCILL license under French law and | 
|  | 5 # abiding by the rules of distribution of free software. You can use, | 
|  | 6 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 7 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 8 # "http://www.cecill.info". | 
|  | 9 # | 
|  | 10 # As a counterpart to the access to the source code and rights to copy, | 
|  | 11 # modify and redistribute granted by the license, users are provided only | 
|  | 12 # with a limited warranty and the software's author, the holder of the | 
|  | 13 # economic rights, and the successive licensors have only limited | 
|  | 14 # liability. | 
|  | 15 # | 
|  | 16 # In this respect, the user's attention is drawn to the risks associated | 
|  | 17 # with loading, using, modifying and/or developing or reproducing the | 
|  | 18 # software by the user in light of its specific status of free software, | 
|  | 19 # that may mean that it is complicated to manipulate, and that also | 
|  | 20 # therefore means that it is reserved for developers and experienced | 
|  | 21 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 22 # encouraged to load and test the software's suitability as regards their | 
|  | 23 # requirements in conditions enabling the security of their systems and/or | 
|  | 24 # data to be ensured and, more generally, to use and operate it in the | 
|  | 25 # same conditions as regards security. | 
|  | 26 # | 
|  | 27 # The fact that you are presently reading this means that you have had | 
|  | 28 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 29 # | 
|  | 30 import sys | 
|  | 31 import random | 
|  | 32 from SMART.Java.Python.misc import Utils | 
|  | 33 from SMART.Java.Python.structure.Transcript import Transcript | 
|  | 34 from SMART.Java.Python.structure.TranscriptList import TranscriptList | 
|  | 35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | 
|  | 36 from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection | 
|  | 37 from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable | 
|  | 38 from SMART.Java.Python.misc.Progress import Progress | 
|  | 39 from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter | 
|  | 40 | 
|  | 41 | 
|  | 42 | 
|  | 43 class TranscriptListsComparator(object): | 
|  | 44     """ | 
|  | 45     Compare two transcript lists, using a database for one of the list | 
|  | 46     Uses one TranscriptContainer for query data, | 
|  | 47              one TranscriptContainer exported to MySqlTranscriptTable for reference data, | 
|  | 48              one MySqlTranscriptTable for transformed reference data | 
|  | 49     @ivar inputTranscriptContainers: parsers to the list of query transcripts | 
|  | 50     @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>} | 
|  | 51     @ivar writer:                    transcript list writer | 
|  | 52     @type writer:                    class L{TranscriptListWriter<TranscriptListWriter>} | 
|  | 53     @ivar mySqlConnection:           connection to a MySQL database (to compute the ovelapping efficiently) | 
|  | 54     @type mySqlConnection:           class L{MySqlConnection<MySqlConnection>} | 
|  | 55     @ivar introns:                   compare transcripts or exons only | 
|  | 56     @type introns:                   list of 2 boolean | 
|  | 57     @ivar starts:                    restrict the query transcripts to first nucleotides | 
|  | 58     @type starts:                    list of 2 int or None | 
|  | 59     @ivar fivePrimes:                extend a list of transcripts by their 5' end | 
|  | 60     @type fivePrimes:                list of 2 int or None | 
|  | 61     @ivar threePrimes:               extend a list of transcripts by their 3' end | 
|  | 62     @type threePrimes:               list of 2 int or None | 
|  | 63     @ivar minDistance:               min distance between two transcripts [default: 0] | 
|  | 64     @type minDistance:               int | 
|  | 65     @ivar maxDistance:               max distance between two transcripts [default: 0] | 
|  | 66     @type maxDistance:               int | 
|  | 67     @ivar minOverlap:                minimum number of overlapping nucleotides to declare an overlap | 
|  | 68     @type minOverlap:                int | 
|  | 69     @ivar pcOverlap:                 percentage of overlapping nucleotides to declare an overlap | 
|  | 70     @type pcOverlap:                 int | 
|  | 71     @ivar upstreams:                 consider distances with elements which are upstream of the transcripts | 
|  | 72     @type upstreams:                 boolean | 
|  | 73     @ivar downstreams:               consider distances with elements which are downstream of the transcripts | 
|  | 74     @type downstreams:               boolean | 
|  | 75     @ivar colinear:                  whether transcripts should overlap in the same direction | 
|  | 76     @type colinear:                  boolean | 
|  | 77     @ivar antisense:                 whether transcripts should overlap in the opposite direction | 
|  | 78     @type antisense:                 boolean | 
|  | 79     @ivar outputDistance:            output distance between query and reference instead of query transcript | 
|  | 80     @type outputDistance:            boolean | 
|  | 81     @ivar absolute:                  do not consider the strand while computing distance | 
|  | 82     @type absolute:                  boolean | 
|  | 83     @ivar strandedDistance:          return a line per strand while computing distances | 
|  | 84     @type strandedDistance:          boolean | 
|  | 85     @ivar QUERY:                     constant specifying the query objects | 
|  | 86     @type QUERY:                     int | 
|  | 87     @ivar REFERENCE:                 constant specifying the reference objects | 
|  | 88     @type REFERENCE:                 int | 
|  | 89     @ivar INPUTTYPES:                set of input types of data (query or reference) objects | 
|  | 90     @type INPUTTYPES:                list of 2 int | 
|  | 91     @ivar typeToString:              string representation of the previous types | 
|  | 92     @type typeToString:              dict | 
|  | 93     @ivar tableNames:                name of the transcript tables | 
|  | 94     @type tableNames:                dict of strings | 
|  | 95     @ivar nbTranscripts:             number of transcript in the query/reference set | 
|  | 96     @type nbTranscripts:             list of 2 int or None | 
|  | 97     @ivar nbNucleotides:             number of nucleotides in the query/reference set | 
|  | 98     @type nbNucleotides:             list of 2 int or None | 
|  | 99     @ivar transcriptsToBeStored:     transcripts that will be stored into database | 
|  | 100     @type transcriptsToBeStored:     dict of class L{TranscriptList<TranscriptList>} | 
|  | 101     @ivar multiple:                  in merge mode, aggregate multiple transcripts | 
|  | 102     @type multiple:                  boolean | 
|  | 103     @ivar normalization:             normalize each element by the number of mappings of this element | 
|  | 104     @type normalization:             boolean | 
|  | 105     @ivar invert:                    invert the current comparison | 
|  | 106     @type invert:                    boolean | 
|  | 107     @ivar splitDifference:           split into intervals when computing difference | 
|  | 108     @type splitDifference:           boolean | 
|  | 109     @ivar odds:                      whether odds about the comparison should be computed | 
|  | 110     @type odds:                      boolean | 
|  | 111     @ivar overlapResults:            count the number of overlaps | 
|  | 112     @type overlapResults:            dictionary | 
|  | 113     @ivar oddResults:                compute the number of times each transcript overlaps (or is merged with) another one | 
|  | 114     @type oddResults:                dictionary | 
|  | 115     @ivar outputContainer:           container of the output transcripts | 
|  | 116     @type outputContainer:           class L{TranscriptContainer<TranscriptContainer>} | 
|  | 117     @ivar logHandle:                 log handle | 
|  | 118     @type logHandle:                 file | 
|  | 119     @ivar verbosity:                 verbosity | 
|  | 120     @type verbosity:                 int | 
|  | 121     """ | 
|  | 122 | 
|  | 123     def __init__(self, logHandle = None, verbosity = 0): | 
|  | 124         """ | 
|  | 125         Constructor | 
|  | 126         @param transcriptListParser2: parser to the list of reference transcripts | 
|  | 127         @type  transcriptListParser2: class L{TranscriptListParser<TranscriptListParser>} | 
|  | 128         @param logHandle:             log handle | 
|  | 129         @type  logHandle:             file | 
|  | 130         @param verbosity:             verbosity | 
|  | 131         @type  verbosity:             int | 
|  | 132         """ | 
|  | 133         self.QUERY                     = 0 | 
|  | 134         self.REFERENCE                 = 1 | 
|  | 135         self.WORKING                   = 2 | 
|  | 136         self.INPUTTYPES                = (self.QUERY, self.REFERENCE) | 
|  | 137         self.INPUTWORKINGTYPES         = (self.QUERY, self.REFERENCE, self.WORKING) | 
|  | 138         self.typeToString              = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"} | 
|  | 139 | 
|  | 140         self.logHandle                 = logHandle | 
|  | 141         self.verbosity                 = verbosity | 
|  | 142         self.mySqlConnection           = MySqlConnection(self.verbosity-1) | 
|  | 143         self.inputTranscriptContainers = [None, None] | 
|  | 144         self.tableNames                = ["tmpQueryTable_%d" % (random.randint(0, 100000)), "tmpReferenceTable_%d" % (random.randint(0, 100000)), "tmpOutputTable_%d" % (random.randint(0, 100000)), "tmpWorkingTable_%d" % (random.randint(0, 100000))] | 
|  | 145         self.mySqlTranscriptWriters    = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames] | 
|  | 146         self.writer                    = None | 
|  | 147         self.introns                   = [False, False] | 
|  | 148         self.starts                    = [None, None] | 
|  | 149         self.ends                      = [None, None] | 
|  | 150         self.fivePrimes                = [None, None] | 
|  | 151         self.threePrimes               = [None, None] | 
|  | 152         self.minDistance               = None | 
|  | 153         self.maxDistance               = 0 | 
|  | 154         self.minOverlap                = 1 | 
|  | 155         self.pcOverlap                 = None | 
|  | 156         self.colinear                  = False | 
|  | 157         self.antisense                 = False | 
|  | 158         self.downstreams               = [False, False] | 
|  | 159         self.upstreams                 = [False, False] | 
|  | 160         self.outputDistance            = False | 
|  | 161         self.absolute                  = False | 
|  | 162         self.strandedDistance          = False | 
|  | 163         self.nbTranscripts             = [None, None] | 
|  | 164         self.nbNucleotides             = [None, None] | 
|  | 165         self.normalization             = False | 
|  | 166         self.included                  = False | 
|  | 167         self.including                 = False | 
|  | 168         self.invert                    = False | 
|  | 169         self.notOverlapping            = False | 
|  | 170         self.splitDifference           = False | 
|  | 171         self.multiple                  = False | 
|  | 172         self.odds                      = False | 
|  | 173         self.overlapResults            = None | 
|  | 174         self.oddResults                = None | 
|  | 175         self.outputContainer           = None | 
|  | 176         self.transcriptsToBeStored     = dict([(type, TranscriptList()) for type in self.INPUTWORKINGTYPES]) | 
|  | 177         self.nbPrinted                 = 0 | 
|  | 178 | 
|  | 179         self.mySqlConnection.createDatabase() | 
|  | 180 | 
|  | 181 | 
|  | 182     def __del__(self): | 
|  | 183         """ | 
|  | 184         Destructor | 
|  | 185         Remove all temporary tables | 
|  | 186         """ | 
|  | 187         for type in self.INPUTWORKINGTYPES: | 
|  | 188             self.mySqlTranscriptWriters[type].removeTables() | 
|  | 189         self.mySqlConnection.deleteDatabase() | 
|  | 190 | 
|  | 191     def acceptIntrons(self, type, bool): | 
|  | 192         """ | 
|  | 193         Compare transcripts or exons only | 
|  | 194         @param type: whether use query/reference data | 
|  | 195         @type  type: int | 
|  | 196         @param bool: include introns or not | 
|  | 197         @type  bool: boolean | 
|  | 198         """ | 
|  | 199         self.introns[type] = bool | 
|  | 200 | 
|  | 201 | 
|  | 202     def restrictToStart(self, type, size): | 
|  | 203         """ | 
|  | 204         Restrict a list of transcripts to first nucleotides | 
|  | 205         @param type: whether use query/reference data | 
|  | 206         @type  type: int | 
|  | 207         @param size: the size of the transcript to be considered | 
|  | 208         @type  size: int | 
|  | 209         """ | 
|  | 210         self.starts[type]  = size | 
|  | 211         self.introns[type] = False | 
|  | 212 | 
|  | 213 | 
|  | 214     def restrictToEnd(self, type, size): | 
|  | 215         """ | 
|  | 216         Restrict a list of transcripts to first nucleotides | 
|  | 217         @param type: whether use query/reference data | 
|  | 218         @type  type: int | 
|  | 219         @param size: the size of the transcript to be considered | 
|  | 220         @type  size: int | 
|  | 221         """ | 
|  | 222         self.ends[type]    = size | 
|  | 223         self.introns[type] = False | 
|  | 224 | 
|  | 225 | 
|  | 226     def extendFivePrime(self, type, size): | 
|  | 227         """ | 
|  | 228         Extend a list of transcripts by their 5' end | 
|  | 229         @param type: whether use query/reference data | 
|  | 230         @type  type: int | 
|  | 231         @param size: size of the extension | 
|  | 232         @type  size: int | 
|  | 233         """ | 
|  | 234         self.fivePrimes[type] = size | 
|  | 235 | 
|  | 236 | 
|  | 237     def extendThreePrime(self, type, size): | 
|  | 238         """ | 
|  | 239         Extend the list of query transcripts by their 3' end | 
|  | 240         @param type: whether use query/reference data | 
|  | 241         @type  type: int | 
|  | 242         @param size: size of the extension | 
|  | 243         @type  size: int | 
|  | 244         """ | 
|  | 245         self.threePrimes[type] = size | 
|  | 246 | 
|  | 247 | 
|  | 248     def setMinDistance(self, distance): | 
|  | 249         """ | 
|  | 250         Set the min distance between two transcripts | 
|  | 251         @param distance: distance | 
|  | 252         @type  distance: int | 
|  | 253         """ | 
|  | 254         self.minDistance = distance | 
|  | 255 | 
|  | 256 | 
|  | 257     def setMaxDistance(self, distance): | 
|  | 258         """ | 
|  | 259         Set the max distance between two transcripts | 
|  | 260         @param distance: distance | 
|  | 261         @type  distance: int | 
|  | 262         """ | 
|  | 263         self.maxDistance = distance | 
|  | 264 | 
|  | 265 | 
|  | 266     def setMinOverlap(self, overlap): | 
|  | 267         """ | 
|  | 268         Set the minimum number of nucleotides to declare an overlap | 
|  | 269         @param overlap: minimum number of nucleotides | 
|  | 270         @type  overlap: int | 
|  | 271         """ | 
|  | 272         self.minOverlap = overlap | 
|  | 273 | 
|  | 274 | 
|  | 275     def setPcOverlap(self, overlap): | 
|  | 276         """ | 
|  | 277         Set the percentage of nucleotides to declare an overlap | 
|  | 278         @param overlap: percentage of nucleotides | 
|  | 279         @type  overlap: int | 
|  | 280         """ | 
|  | 281         self.pcOverlap = overlap | 
|  | 282 | 
|  | 283 | 
|  | 284     def setUpstream(self, type, boolean): | 
|  | 285         """ | 
|  | 286         Consider transcripts which are upstream of some transcripts | 
|  | 287         @param type:        whether use query/reference data | 
|  | 288         @type  type:        int | 
|  | 289         @param boolean: consider only these transcripts or not | 
|  | 290         @type  boolean: boolean | 
|  | 291         """ | 
|  | 292         self.upstreams[type] = boolean | 
|  | 293 | 
|  | 294 | 
|  | 295     def setDownstream(self, type, boolean): | 
|  | 296         """ | 
|  | 297         Consider transcripts which are downstream of some transcripts | 
|  | 298         @param type:        whether use query/reference data | 
|  | 299         @type  type:        int | 
|  | 300         @param boolean: consider only these transcripts or not | 
|  | 301         @type  boolean: boolean | 
|  | 302         """ | 
|  | 303         self.downstreams[type] = boolean | 
|  | 304 | 
|  | 305 | 
|  | 306     def setOutputDistance(self, boolean): | 
|  | 307         """ | 
|  | 308         Output distance between query and reference instead of query transcript | 
|  | 309         @param boolean: whether distance should be output | 
|  | 310         @type  boolean: boolean | 
|  | 311         """ | 
|  | 312         self.outputDistance = boolean | 
|  | 313 | 
|  | 314 | 
|  | 315     def setAbsolute(self, boolean): | 
|  | 316         """ | 
|  | 317         Do not consider strand when computing distance (thus, having only non-negative values) | 
|  | 318         @param boolean: whether we should consider strands | 
|  | 319         @type  boolean: boolean | 
|  | 320         """ | 
|  | 321         self.absolute = boolean | 
|  | 322 | 
|  | 323 | 
|  | 324     def setStrandedDistance(self, boolean): | 
|  | 325         """ | 
|  | 326         Return two distance distributions, one per strand | 
|  | 327         @param boolean: whether we should return 2 distance distance | 
|  | 328         @type  boolean: boolean | 
|  | 329         """ | 
|  | 330         self.strandedDistance = boolean | 
|  | 331 | 
|  | 332 | 
|  | 333     def getColinearOnly(self, boolean): | 
|  | 334         """ | 
|  | 335         Only consider transcripts that overlap in the same direction | 
|  | 336         @param boolean: whether transcripts should overlap in the same direction | 
|  | 337         @type  boolean: boolean | 
|  | 338         """ | 
|  | 339         self.colinear = boolean | 
|  | 340 | 
|  | 341 | 
|  | 342     def getAntisenseOnly(self, boolean): | 
|  | 343         """ | 
|  | 344         Only consider transcripts that overlap in the opposite direction | 
|  | 345         @param boolean: whether transcripts should overlap in the opposite direction | 
|  | 346         @type  boolean: boolean | 
|  | 347         """ | 
|  | 348         self.antisense = boolean | 
|  | 349 | 
|  | 350 | 
|  | 351     def setIncludedOnly(self, boolean): | 
|  | 352         """ | 
|  | 353         Keep the elements from first set which are included in the second set | 
|  | 354         @param boolean: whether to keep included elements only | 
|  | 355         @type  boolean: boolean | 
|  | 356         """ | 
|  | 357         self.included = boolean | 
|  | 358 | 
|  | 359 | 
|  | 360     def setIncludingOnly(self, boolean): | 
|  | 361         """ | 
|  | 362         Keep the elements from second set which are included in the first set | 
|  | 363         @param boolean: whether to keep included elements only | 
|  | 364         @type  boolean: boolean | 
|  | 365         """ | 
|  | 366         self.including = boolean | 
|  | 367 | 
|  | 368 | 
|  | 369     def setNormalization(self, boolean): | 
|  | 370         """ | 
|  | 371         Normalize the elements by the number of mappings in the genome | 
|  | 372         @param boolean: whether normalize | 
|  | 373         @type  boolean: boolean | 
|  | 374         """ | 
|  | 375         self.normalization = boolean | 
|  | 376 | 
|  | 377 | 
|  | 378     def getInvert(self, boolean): | 
|  | 379         """ | 
|  | 380         Only consider transcripts that do not overlap | 
|  | 381         @param boolean: whether invert the selection | 
|  | 382         @type  boolean: boolean | 
|  | 383         """ | 
|  | 384         self.invert = boolean | 
|  | 385 | 
|  | 386 | 
|  | 387     def includeNotOverlapping(self, boolean): | 
|  | 388         """ | 
|  | 389         Also output the elements which do not overlap | 
|  | 390         @param boolean: whether output the elements which do not overlap | 
|  | 391         @type  boolean: boolean | 
|  | 392         """ | 
|  | 393         self.notOverlapping = boolean | 
|  | 394 | 
|  | 395 | 
|  | 396     def setSplitDifference(self, boolean): | 
|  | 397         """ | 
|  | 398         Split into intervals when computing difference | 
|  | 399         @param boolean: whether to split | 
|  | 400         @type  boolean: boolean | 
|  | 401         """ | 
|  | 402         self.splitDifference = boolean | 
|  | 403 | 
|  | 404 | 
|  | 405     def aggregate(self, boolean): | 
|  | 406         """ | 
|  | 407         In merge mode, aggregate multiple transcripts | 
|  | 408         @param boolean: aggregate multiple transcripts | 
|  | 409         @type  boolean: boolean | 
|  | 410         """ | 
|  | 411         self.multiple = boolean | 
|  | 412 | 
|  | 413 | 
|  | 414     def getTables(self, type): | 
|  | 415         """ | 
|  | 416         Get the SQL tables | 
|  | 417         @param type: type of the table (query, reference, etc.) | 
|  | 418         @type  type: int | 
|  | 419         """ | 
|  | 420         return self.mySqlTranscriptWriters[type].getTables() | 
|  | 421 | 
|  | 422 | 
|  | 423     def computeOdds(self, boolean): | 
|  | 424         """ | 
|  | 425         Compute odds | 
|  | 426         @param boolean: whether odds should be computed | 
|  | 427         @type  boolean: boolean | 
|  | 428         """ | 
|  | 429         self.odds = boolean | 
|  | 430         if self.odds: | 
|  | 431             self.overlapResults = dict() | 
|  | 432 | 
|  | 433 | 
|  | 434     def computeOddsPerTranscript(self, boolean): | 
|  | 435         """ | 
|  | 436         Compute odds for each transcript | 
|  | 437         @param boolean: whether odds for each transcript should be computed | 
|  | 438         @type  boolean: boolean | 
|  | 439         """ | 
|  | 440         self.odds = boolean | 
|  | 441         if self.odds: | 
|  | 442             self.overlapResults = dict() | 
|  | 443 | 
|  | 444 | 
|  | 445     def removeTables(self): | 
|  | 446         """ | 
|  | 447         Remove the temporary MySQL tables | 
|  | 448         """ | 
|  | 449         for type in self.INPUTWORKINGTYPES: | 
|  | 450             for chromosome in self.getTables(type): | 
|  | 451                 self.getTables(type)[chromosome].remove() | 
|  | 452 | 
|  | 453 | 
|  | 454     def clearTables(self): | 
|  | 455         """ | 
|  | 456         Empty the content of the databases | 
|  | 457         """ | 
|  | 458         for type in self.INPUTWORKINGTYPES: | 
|  | 459             if self.transcriptListParsers[type] != None: | 
|  | 460                 for chromosome in self.getTables(type): | 
|  | 461                     self.getTables(type)[chromosome].clear() | 
|  | 462 | 
|  | 463 | 
|  | 464     def extendTranscript(self, type, transcript): | 
|  | 465         """ | 
|  | 466         Extend a transcript corresponding to the parameters of the class | 
|  | 467         @param transcript: a transcript | 
|  | 468         @type  transcript: class L{Transcript<Transcript>} | 
|  | 469         @return: the possibly extended transcript | 
|  | 470         """ | 
|  | 471         extendedTranscript = Transcript() | 
|  | 472         extendedTranscript.copy(transcript) | 
|  | 473         if self.starts[type] != None: | 
|  | 474             extendedTranscript.restrictStart(self.starts[type]) | 
|  | 475         if self.ends[type] != None: | 
|  | 476             extendedTranscript.restrictEnd(self.ends[type]) | 
|  | 477         if self.fivePrimes[type] != None: | 
|  | 478             extendedTranscript.extendStart(self.fivePrimes[type]) | 
|  | 479         if self.threePrimes[type] != None: | 
|  | 480             extendedTranscript.extendEnd(self.threePrimes[type]) | 
|  | 481         return extendedTranscript | 
|  | 482 | 
|  | 483 | 
|  | 484     def storeTranscript(self, type, transcript, now = True): | 
|  | 485         """ | 
|  | 486         Add a transcript to a MySQL database, or postpone the store | 
|  | 487         @param type:       whether use query/reference table | 
|  | 488         @type  type:       int | 
|  | 489         @param transcript: a transcript | 
|  | 490         @type  transcript: class L{Transcript<Transcript>} | 
|  | 491         @param now:        whether transcript should be stored now (or stored can be postponed) | 
|  | 492         @type  now:        bool | 
|  | 493         """ | 
|  | 494         self.mySqlTranscriptWriters[type].addTranscript(transcript) | 
|  | 495         if type == self.REFERENCE: | 
|  | 496             self.mySqlTranscriptWriters[self.WORKING].addTranscript(transcript) | 
|  | 497         if now: | 
|  | 498             self.mySqlTranscriptWriters[type].write() | 
|  | 499             if type == self.REFERENCE: | 
|  | 500                 self.mySqlTranscriptWriters[self.WORKING].write() | 
|  | 501 | 
|  | 502 | 
|  | 503     def writeTranscript(self, transcript): | 
|  | 504         """ | 
|  | 505         Write a transcript in the output file | 
|  | 506         @param transcript: a transcript | 
|  | 507         @type  transcript: class L{Transcript<Transcript>} | 
|  | 508         """ | 
|  | 509         if self.writer != None: | 
|  | 510             self.writer.addTranscript(transcript) | 
|  | 511             self.nbPrinted += 1 | 
|  | 512 | 
|  | 513 | 
|  | 514     def flushData(self, type = None): | 
|  | 515         """ | 
|  | 516         Store the remaining transcripts | 
|  | 517         @param type: whether use query/reference table (None for all) | 
|  | 518         @type  type: int or None | 
|  | 519         """ | 
|  | 520         if type == None: | 
|  | 521             types = self.INPUTWORKINGTYPES | 
|  | 522         else: | 
|  | 523             types = [type] | 
|  | 524         for type in types: | 
|  | 525             self.mySqlTranscriptWriters[type].write() | 
|  | 526         if self.writer != None: | 
|  | 527             self.writer.write() | 
|  | 528 | 
|  | 529 | 
|  | 530     def unstoreTranscript(self, type, transcript): | 
|  | 531         """ | 
|  | 532         Remove a transcript from a MySQL database | 
|  | 533         @param type:       whether use query/reference table | 
|  | 534         @type  type:       int | 
|  | 535         @param transcript: a transcript | 
|  | 536         @type  transcript: class L{Transcript<Transcript>} | 
|  | 537         """ | 
|  | 538         self.getTables(type)[transcript.getChromosome()].removeTranscript(transcript) | 
|  | 539         if type == self.REFERENCE: | 
|  | 540             self.getTables(self.WORKING)[transcript.getChromosome()].removeTranscript(transcript) | 
|  | 541 | 
|  | 542 | 
|  | 543     def addIndexes(self, tables): | 
|  | 544         """ | 
|  | 545         Add useful indexes to the tables | 
|  | 546         @param tables: which tables should be indexed | 
|  | 547         @type  tables: list of int | 
|  | 548         """ | 
|  | 549         for type in tables: | 
|  | 550             for chromosome in self.getTables(type): | 
|  | 551                 self.getTables(type)[chromosome].createIndex("iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["start"]) | 
|  | 552                 self.getTables(type)[chromosome].exonsTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["transcriptId"]) | 
|  | 553 | 
|  | 554 | 
|  | 555     def storeTranscriptList(self, type, transcriptListParser, extension): | 
|  | 556         """ | 
|  | 557         Store a transcript list into database | 
|  | 558         @param type:      whether use query/reference parser | 
|  | 559         @type  type:      int | 
|  | 560         @param parser:    a parser of transcript list | 
|  | 561         @type  parser:    class L{TranscriptContainer<TranscriptContainer>} | 
|  | 562         @param extension: extend (or not) the transcripts | 
|  | 563         @type  extension: boolean | 
|  | 564         """ | 
|  | 565         progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % ("query" if type == self.QUERY else "reference"), self.verbosity-1) | 
|  | 566         for transcript in transcriptListParser.getIterator(): | 
|  | 567             if extension: | 
|  | 568                 transcript = self.extendTranscript(type, transcript) | 
|  | 569             self.mySqlTranscriptWriters[type].addTranscript(transcript) | 
|  | 570             progress.inc() | 
|  | 571         self.mySqlTranscriptWriters[type].write() | 
|  | 572         progress.done() | 
|  | 573         if type == self.REFERENCE: | 
|  | 574             for chromosome in self.getTables(self.REFERENCE): | 
|  | 575                 self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1) | 
|  | 576                 self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome]) | 
|  | 577 | 
|  | 578 | 
|  | 579     def setInputTranscriptContainer(self, type, inputTranscriptContainer): | 
|  | 580         """ | 
|  | 581         Set an input transcript list container | 
|  | 582         @param type:                     whether use query/reference parser | 
|  | 583         @type  type:                     int | 
|  | 584         @param inputTranscriptContainer: a container | 
|  | 585         @type  inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>} | 
|  | 586         """ | 
|  | 587         self.inputTranscriptContainers[type] = inputTranscriptContainer | 
|  | 588         self.nbTranscripts[type]             = self.inputTranscriptContainers[type].getNbTranscripts() | 
|  | 589         self.nbNucleotides[type]             = self.inputTranscriptContainers[type].getNbNucleotides() | 
|  | 590 | 
|  | 591 | 
|  | 592     def setOutputWriter(self, writer): | 
|  | 593         """ | 
|  | 594         Set an output transcript list writer | 
|  | 595         @param writer: a writer | 
|  | 596         @type  writer: class L{TranscriptListWriter<TranscriptListWriter>} | 
|  | 597         """ | 
|  | 598         self.writer = writer | 
|  | 599 | 
|  | 600 | 
|  | 601     def compareTranscript(self, transcript1, transcript2, includeDistance = False): | 
|  | 602         """ | 
|  | 603         Compare two transcripts, using user defined parameters | 
|  | 604         @param transcript1:     a transcript from the query set (already extended) | 
|  | 605         @type  transcript1:     class L{Transcript<Transcript>} | 
|  | 606         @param transcript2:     a transcript from the reference set (already extended) | 
|  | 607         @type  transcript2:     class L{Transcript<Transcript>} | 
|  | 608         @param includeDistance: take into account the distance too | 
|  | 609         @type  includeDistance: boolean | 
|  | 610         @return:                true, if they overlap | 
|  | 611         """ | 
|  | 612         extendedTranscript1 = Transcript() | 
|  | 613         extendedTranscript1.copy(transcript1) | 
|  | 614         if includeDistance: | 
|  | 615             if self.maxDistance > 0: | 
|  | 616                 extendedTranscript1.extendStart(self.maxDistance) | 
|  | 617                 extendedTranscript1.extendEnd(self.maxDistance) | 
|  | 618 | 
|  | 619         minOverlap = self.minOverlap | 
|  | 620         if self.pcOverlap != None: | 
|  | 621             minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap) | 
|  | 622         if not extendedTranscript1.overlapWith(transcript2, self.minOverlap): | 
|  | 623             return False | 
|  | 624         if (self.downstreams[self.QUERY]       and transcript2.getStart() > extendedTranscript1.getStart()) or \ | 
|  | 625              (self.upstreams[self.QUERY]       and transcript2.getEnd()   < extendedTranscript1.getEnd())   or \ | 
|  | 626              (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \ | 
|  | 627              (self.upstreams[self.REFERENCE]   and extendedTranscript1.getEnd()   < transcript2.getEnd()): | 
|  | 628             return False | 
|  | 629         if (self.antisense and extendedTranscript1.getDirection() == transcript2.getDirection()) or (self.colinear and extendedTranscript1.getDirection() != transcript2.getDirection()): | 
|  | 630             return False | 
|  | 631         if self.included and not transcript2.include(extendedTranscript1): | 
|  | 632             return False | 
|  | 633         if self.including and not extendedTranscript1.include(transcript2): | 
|  | 634             return False | 
|  | 635         if self.introns[self.REFERENCE] and self.introns[self.QUERY]: | 
|  | 636             if self.logHandle != None: | 
|  | 637                 self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2))) | 
|  | 638             return True | 
|  | 639         if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap): | 
|  | 640             if self.logHandle != None: | 
|  | 641                 self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2))) | 
|  | 642             return True | 
|  | 643         return False | 
|  | 644 | 
|  | 645 | 
|  | 646     def compareTranscriptToList(self, transcript1): | 
|  | 647         """ | 
|  | 648         Compare a transcript to the reference list of transcripts | 
|  | 649         (Do not extend the transcripts, except for the distance) | 
|  | 650         @param transcript1: a transcript (from the query set) | 
|  | 651         @type  transcript1: class L{Transcript<Transcript>} | 
|  | 652         @return:            the reference transcripts overlapping | 
|  | 653         """ | 
|  | 654         # no transcript in the reference table | 
|  | 655         if transcript1.getChromosome() not in self.getTables(self.WORKING): | 
|  | 656             return | 
|  | 657 | 
|  | 658         # retrieve the the transcripts that may overlap in the working tables | 
|  | 659         clauses = [] | 
|  | 660         extendedTranscript1 = Transcript() | 
|  | 661         extendedTranscript1.copy(transcript1) | 
|  | 662         if self.maxDistance > 0: | 
|  | 663             extendedTranscript1.extendStart(self.maxDistance) | 
|  | 664         if self.maxDistance > 0: | 
|  | 665             extendedTranscript1.extendEnd(self.maxDistance) | 
|  | 666         command = "SELECT * FROM %s WHERE (" % (self.getTables(self.WORKING)[transcript1.getChromosome()].getName()) | 
|  | 667         for binPair in extendedTranscript1.getBins(): | 
|  | 668             clause = "bin " | 
|  | 669             if binPair[0] == binPair[1]: | 
|  | 670                 clause += "= %i" % (binPair[0]) | 
|  | 671             else: | 
|  | 672                 clause += "BETWEEN %i AND %i" % (binPair[0], binPair[1]) | 
|  | 673             clauses.append(clause) | 
|  | 674         command += " OR ".join(clauses) | 
|  | 675         command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart()) | 
|  | 676 | 
|  | 677         for index2, transcript2 in self.getTables(self.REFERENCE)[transcript1.getChromosome()].selectTranscripts(command): | 
|  | 678             if self.compareTranscript(extendedTranscript1, transcript2): | 
|  | 679                 yield transcript2 | 
|  | 680 | 
|  | 681 | 
|  | 682     def compareTranscriptList(self): | 
|  | 683         """ | 
|  | 684         Compare a list of transcript to the reference one | 
|  | 685         @return: the transcripts that overlap with the reference set | 
|  | 686         """ | 
|  | 687         distance      = 0 | 
|  | 688         nbClustersIn  = 0 | 
|  | 689         nbClustersOut = 0 | 
|  | 690         if self.maxDistance != None: | 
|  | 691             distance = self.maxDistance | 
|  | 692 | 
|  | 693         self.addIndexes([self.QUERY, self.REFERENCE]) | 
|  | 694 | 
|  | 695         # export the container into tables | 
|  | 696         self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | 
|  | 697         self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) | 
|  | 698 | 
|  | 699         # looping | 
|  | 700         for chromosome1 in sorted(self.getTables(self.QUERY).keys()): | 
|  | 701             # get range of transcripts | 
|  | 702             command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) | 
|  | 703             query     = self.mySqlConnection.executeQuery(command) | 
|  | 704             result    = query.getLine() | 
|  | 705             first     = result[0] | 
|  | 706             last      = result[1] | 
|  | 707             nb        = result[2] | 
|  | 708 | 
|  | 709             transcripts1 = [] | 
|  | 710             toBeRemoved1 = [] | 
|  | 711             transcripts2 = [] | 
|  | 712             toBeRemoved2 = [] | 
|  | 713             overlapsWith = [] | 
|  | 714             nbOverlaps   = [] | 
|  | 715             nbChunks     = max(1, nb / 100) | 
|  | 716             chunkSize    = (last - first) / nbChunks | 
|  | 717             progress     = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) | 
|  | 718             for chunk in range(nbChunks + 1): | 
|  | 719 | 
|  | 720                 # load transcripts | 
|  | 721                 start   = first + chunk * chunkSize | 
|  | 722                 end     = start + chunkSize - 1 | 
|  | 723                 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) | 
|  | 724                 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): | 
|  | 725                     transcripts1.append(transcript1) | 
|  | 726                     overlapsWith.append([]) | 
|  | 727                     nbOverlaps.append(0) | 
|  | 728                     nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") | 
|  | 729                 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end) | 
|  | 730                 self.mySqlConnection.executeQuery(command) | 
|  | 731                 if chromosome1 in self.getTables(self.REFERENCE): | 
|  | 732                     command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) | 
|  | 733                     if chunk == 0: | 
|  | 734                         command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | 
|  | 735                     for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): | 
|  | 736                         transcripts2.append(transcript2) | 
|  | 737                     command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | 
|  | 738                     self.mySqlConnection.executeQuery(command) | 
|  | 739 | 
|  | 740                 # compare sets | 
|  | 741                 for index1, transcript1 in enumerate(transcripts1): | 
|  | 742                     overlappingNames = [] | 
|  | 743                     nbElements1      = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") | 
|  | 744                     for transcript2 in transcripts2: | 
|  | 745                         if self.compareTranscript(transcript1, transcript2, True): | 
|  | 746                             id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() | 
|  | 747                             if id2 not in overlapsWith[index1]: | 
|  | 748                                 overlapsWith[index1].append(id2) | 
|  | 749                                 nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements") | 
|  | 750                                 if self.odds: | 
|  | 751                                     if transcript2.getName() not in self.overlapResults: | 
|  | 752                                         self.overlapResults[transcript2.getName()] = 0 | 
|  | 753                                     self.overlapResults[transcript2.getName()] += nbElements1 | 
|  | 754 | 
|  | 755                     # check if query transcript extends bounds of the chunk | 
|  | 756                     if transcript1.getEnd() < end: | 
|  | 757                         if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: | 
|  | 758                             if overlapsWith[index1]: | 
|  | 759                                 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) | 
|  | 760                                 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) | 
|  | 761                             elif self.notOverlapping: | 
|  | 762                                 transcript1.setTagValue("nbOverlaps", "0") | 
|  | 763                             self.writeTranscript(transcript1) | 
|  | 764                             nbClustersOut += nbElements1 | 
|  | 765                         toBeRemoved1.append(index1) | 
|  | 766 | 
|  | 767                 # update list of query transcripts | 
|  | 768                 for index1 in reversed(toBeRemoved1): | 
|  | 769                     del transcripts1[index1] | 
|  | 770                     del overlapsWith[index1] | 
|  | 771                     del nbOverlaps[index1] | 
|  | 772                 toBeRemoved1 = [] | 
|  | 773 | 
|  | 774                 # check if the reference transcripts extends bounds of the chunk | 
|  | 775                 for index2, transcript2 in enumerate(transcripts2): | 
|  | 776                     if transcript2.getEnd() + distance < end: | 
|  | 777                         toBeRemoved2.append(index2) | 
|  | 778                 for index2 in reversed(toBeRemoved2): | 
|  | 779                     del transcripts2[index2] | 
|  | 780                 toBeRemoved2 = [] | 
|  | 781 | 
|  | 782                 progress.inc() | 
|  | 783 | 
|  | 784             for index1, transcript1 in enumerate(transcripts1): | 
|  | 785                 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: | 
|  | 786                     if overlapsWith[index1]: | 
|  | 787                         transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) | 
|  | 788                         transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) | 
|  | 789                     elif self.notOverlapping: | 
|  | 790                         transcript1.setTagValue("nbOverlaps", "0") | 
|  | 791                     self.writeTranscript(transcript1) | 
|  | 792                     nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") | 
|  | 793             progress.done() | 
|  | 794             self.getTables(self.QUERY)[chromosome1].remove() | 
|  | 795             if chromosome1 in self.getTables(self.REFERENCE): | 
|  | 796                 self.getTables(self.REFERENCE)[chromosome1].remove() | 
|  | 797                 self.getTables(self.WORKING)[chromosome1].remove() | 
|  | 798 | 
|  | 799         self.flushData() | 
|  | 800         if self.writer != None: | 
|  | 801             self.writer.close() | 
|  | 802             self.writer = None | 
|  | 803 | 
|  | 804         if self.verbosity > 0: | 
|  | 805             print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) | 
|  | 806             print "query:     %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn) | 
|  | 807             if self.nbTranscripts[self.QUERY] != 0: | 
|  | 808                 print "output:    %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100), | 
|  | 809                 if nbClustersOut != 0: | 
|  | 810                     print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100) | 
|  | 811 | 
|  | 812 | 
|  | 813     def compareTranscriptListDistance(self): | 
|  | 814         """ | 
|  | 815         Compare a list of transcript to the reference one | 
|  | 816         @return: the distance distributions in a hash | 
|  | 817         """ | 
|  | 818         nbDistances  = 0 | 
|  | 819         distances    = {} | 
|  | 820         absDistances = {} | 
|  | 821         strandedDistances = dict([(strand, {}) for strand in (1, -1)]) | 
|  | 822 | 
|  | 823         # export the container into tables | 
|  | 824         self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | 
|  | 825         self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) | 
|  | 826 | 
|  | 827         progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1) | 
|  | 828         for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator(): | 
|  | 829             # get the distance | 
|  | 830             transcript1    = self.extendTranscript(self.QUERY, transcript1) | 
|  | 831             distance       = self.maxDistance + 1 | 
|  | 832             strand         = None | 
|  | 833             closestElement = "None" | 
|  | 834             for transcript2 in self.compareTranscriptToList(transcript1): | 
|  | 835                 thisStrand = transcript1.getDirection() * transcript2.getDirection() | 
|  | 836                 if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()): | 
|  | 837                     transcript2.reverse() | 
|  | 838                 if self.absolute: | 
|  | 839                     transcript2.setDirection(transcript1.getDirection()) | 
|  | 840                 if transcript2.getDirection() == transcript1.getDirection(): | 
|  | 841                     if self.starts[self.REFERENCE] != None: | 
|  | 842                         transcript2.restrictStart(self.starts[self.REFERENCE]) | 
|  | 843                     if self.ends[self.REFERENCE] != None: | 
|  | 844                         transcript2.restrictEnd(self.ends[self.REFERENCE]) | 
|  | 845                     thisDistance = transcript1.getRelativeDistance(transcript2) | 
|  | 846                     if (self.absolute): | 
|  | 847                         thisDistance = abs(thisDistance) | 
|  | 848                     if abs(thisDistance) < abs(distance): | 
|  | 849                         distance       = thisDistance | 
|  | 850                         strand         = thisStrand | 
|  | 851                         closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() | 
|  | 852             if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance): | 
|  | 853                 nbDistances                        += 1 | 
|  | 854                 distances[distance]                 = distances.get(distance, 0) + 1 | 
|  | 855                 absDistance                         = abs(distance) | 
|  | 856                 absDistances[absDistance]           = absDistances.get(absDistance, 0) + 1 | 
|  | 857                 strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0) | 
|  | 858                 if distance not in strandedDistances[-strand]: | 
|  | 859                     strandedDistances[-strand][distance] = 0 | 
|  | 860 | 
|  | 861             # write transcript | 
|  | 862             if distance == self.maxDistance + 1: | 
|  | 863                 distance = "None" | 
|  | 864             tmpTranscript = Transcript() | 
|  | 865             tmpTranscript.copy(transcript1) | 
|  | 866             tmpTranscript.setTagValue("distance", distance) | 
|  | 867             tmpTranscript.setTagValue("closestElement", closestElement) | 
|  | 868             self.writeTranscript(tmpTranscript) | 
|  | 869 | 
|  | 870             progress.inc() | 
|  | 871         progress.done() | 
|  | 872 | 
|  | 873         self.flushData() | 
|  | 874 | 
|  | 875         if self.verbosity > 0: | 
|  | 876             print "reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]) | 
|  | 877             print "query:     %d sequences" % (self.nbTranscripts[self.QUERY]) | 
|  | 878             if nbDistances == 0: | 
|  | 879                 print "Nothing matches" | 
|  | 880             else: | 
|  | 881                 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances) | 
|  | 882                 print "for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100) | 
|  | 883 | 
|  | 884         if self.strandedDistance: | 
|  | 885             return strandedDistances | 
|  | 886         return distances | 
|  | 887 | 
|  | 888 | 
|  | 889     def compareTranscriptListMerge(self): | 
|  | 890         """ | 
|  | 891         Merge the query list of transcript with itself | 
|  | 892         @return: the merged transcripts in a transcript list database | 
|  | 893         """ | 
|  | 894         nbMerges  = 0 | 
|  | 895 | 
|  | 896         for type in (self.QUERY, self.REFERENCE): | 
|  | 897             self.storeTranscriptList(type, self.inputTranscriptContainers[type], True) | 
|  | 898         self.flushData() | 
|  | 899 | 
|  | 900         # Loop on the chromosomes | 
|  | 901         for chromosome in sorted(self.getTables(self.QUERY).keys()): | 
|  | 902             if chromosome not in self.getTables(self.REFERENCE): | 
|  | 903                 continue | 
|  | 904 | 
|  | 905             # Get the size of the chromosome | 
|  | 906             maxEnd   = 0 | 
|  | 907             nbChunks = 0 | 
|  | 908             for type in (self.QUERY, self.REFERENCE): | 
|  | 909                 command  = "SELECT MAX(end) from %s" % (self.getTables(type)[chromosome].getName()) | 
|  | 910                 query    = self.mySqlConnection.executeQuery(command) | 
|  | 911                 maxEnd   = max(maxEnd, int(query.getLine()[0])) | 
|  | 912                 nbChunks = max(nbChunks, self.getTables(type)[chromosome].getNbElements()) | 
|  | 913 | 
|  | 914             mergedTranscripts = {} | 
|  | 915             transcripts = {self.QUERY: [], self.REFERENCE: []} | 
|  | 916             progress    = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1) | 
|  | 917             for i in range(nbChunks): | 
|  | 918                 rangeStart = int(i * (float(maxEnd) / nbChunks)) + 1 | 
|  | 919                 rangeEnd   = int((i+1) * (float(maxEnd) / nbChunks)) | 
|  | 920 | 
|  | 921                 # Get all transcripts in query and reference from chunk | 
|  | 922                 for type in (self.QUERY, self.REFERENCE): | 
|  | 923                     correction = 0 if self.QUERY else self.maxDistance | 
|  | 924                     command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd + correction) | 
|  | 925                     for index, transcript in self.getTables(type)[chromosome].selectTranscripts(command): | 
|  | 926                         transcripts[type].append(transcript) | 
|  | 927 | 
|  | 928                 # Merge elements between the two samples | 
|  | 929                 for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]): | 
|  | 930                     for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]): | 
|  | 931                         if referenceTranscript == None: continue | 
|  | 932                         if self.compareTranscript(queryTranscript, referenceTranscript, True): | 
|  | 933                             if queryTranscript.getDirection() != referenceTranscript.getDirection(): | 
|  | 934                                 referenceTranscript.setDirection(queryTranscript.getDirection()) | 
|  | 935                             queryTranscript.merge(referenceTranscript, self.normalization) | 
|  | 936                             nbMerges += 1 | 
|  | 937                             transcripts[self.REFERENCE][iReference] = None | 
|  | 938                             if not self.multiple: | 
|  | 939                                 mergedTranscripts[iQuery] = 0 | 
|  | 940 | 
|  | 941                 # Remove transcripts from database | 
|  | 942                 for type in (self.QUERY, self.REFERENCE): | 
|  | 943                     correction = 0 if self.QUERY else self.maxDistance | 
|  | 944                     command    = "DELETE FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd - correction) | 
|  | 945                     query      = self.mySqlConnection.executeQuery(command) | 
|  | 946 | 
|  | 947                 # Just in case, self-merge the elements in the query (beware of mergedTranscripts!) | 
|  | 948                 if (self.multiple): | 
|  | 949                     for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]): | 
|  | 950                         if queryTranscript1 == None: continue | 
|  | 951                         for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]): | 
|  | 952                             if iQuery2 <= iQuery1 or queryTranscript2 == None: continue | 
|  | 953                             minOverlap = self.minOverlap | 
|  | 954                             if self.pcOverlap != None: | 
|  | 955                                 minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap) | 
|  | 956                             if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear): | 
|  | 957                                 if queryTranscript1.getDirection() != queryTranscript2.getDirection(): | 
|  | 958                                     queryTranscript2.setDirection(queryTranscript1.getDirection()) | 
|  | 959                                 queryTranscript1.merge(queryTranscript2, self.normalization) | 
|  | 960                                 transcripts[self.QUERY][iQuery2] = None | 
|  | 961                                 nbMerges += 1 | 
|  | 962                                 if not self.multiple: | 
|  | 963                                     mergedTranscripts[iQuery1] = 0 | 
|  | 964 | 
|  | 965                 # Update the sets of transcripts and write into database (also update mergedTranscripts) | 
|  | 966                 newTranscripts = {self.QUERY: [], self.REFERENCE: []} | 
|  | 967                 newMergedTranscripts = {} | 
|  | 968                 for type in (self.QUERY, self.REFERENCE): | 
|  | 969                     for i, transcript in enumerate(transcripts[type]): | 
|  | 970                         if transcript == None: continue | 
|  | 971                         correction = 0 if self.QUERY else self.maxDistance | 
|  | 972                         if transcript.getEnd() < rangeEnd - correction: | 
|  | 973                             if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): | 
|  | 974                                 self.writeTranscript(transcripts[type][i]) | 
|  | 975                         else: | 
|  | 976                             if type == self.QUERY and i in mergedTranscripts: | 
|  | 977                                 newMergedTranscripts[len(newTranscripts[type])] = 0 | 
|  | 978                             newTranscripts[type].append(transcript) | 
|  | 979                 transcripts = newTranscripts | 
|  | 980                 mergedTranscripts = newMergedTranscripts | 
|  | 981 | 
|  | 982                 progress.inc() | 
|  | 983             progress.done() | 
|  | 984 | 
|  | 985             for type in (self.QUERY, self.REFERENCE): | 
|  | 986                 for i, transcript in enumerate(transcripts[type]): | 
|  | 987                     if transcripts == None: continue | 
|  | 988                     if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): | 
|  | 989                         self.writeTranscript(transcripts[type][i]) | 
|  | 990 | 
|  | 991         # Manage chromosomes with no corresponding data | 
|  | 992         if self.multiple: | 
|  | 993             for type in self.INPUTTYPES: | 
|  | 994                 for chromosome in self.getTables(type): | 
|  | 995                     if chromosome in self.getTables(1 - type): | 
|  | 996                         continue | 
|  | 997                     for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator(): | 
|  | 998                         self.writeTranscript(transcript) | 
|  | 999 | 
|  | 1000         self.flushData() | 
|  | 1001         if self.writer != None: | 
|  | 1002             self.writer.close() | 
|  | 1003             self.writer = None | 
|  | 1004 | 
|  | 1005         if self.verbosity > 0: | 
|  | 1006             print "query:    %d sequences" % (self.nbTranscripts[self.QUERY]) | 
|  | 1007             print "# merges: %d" % (nbMerges) | 
|  | 1008             print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) | 
|  | 1009 | 
|  | 1010 | 
|  | 1011     def compareTranscriptListSelfMerge(self): | 
|  | 1012         """ | 
|  | 1013         Merge the query list of transcript with itself | 
|  | 1014         @return: the merged transcripts in a transcript list database | 
|  | 1015         """ | 
|  | 1016         nbMerges  = 0 | 
|  | 1017         distance  = self.maxDistance if self.maxDistance != None else 0 | 
|  | 1018 | 
|  | 1019         self.addIndexes([self.QUERY]) | 
|  | 1020         self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | 
|  | 1021         self.flushData() | 
|  | 1022 | 
|  | 1023         # looping | 
|  | 1024         for chromosome1 in sorted(self.getTables(self.QUERY).keys()): | 
|  | 1025             transcripts2 = [] | 
|  | 1026 | 
|  | 1027             # get range of transcripts | 
|  | 1028             progress = Progress(self.getTables(self.QUERY)[chromosome1].getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) | 
|  | 1029             command  = "SELECT * FROM %s ORDER BY start" % (self.getTables(self.QUERY)[chromosome1].getName()) | 
|  | 1030             for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): | 
|  | 1031 | 
|  | 1032                 # compare sets | 
|  | 1033                 toBeRemoved = set() | 
|  | 1034                 toBePrinted = set() | 
|  | 1035                 for index2, transcript2 in enumerate(transcripts2): | 
|  | 1036 | 
|  | 1037                     if self.compareTranscript(transcript1, transcript2, True): | 
|  | 1038                         if transcript1.getDirection() != transcript2.getDirection(): | 
|  | 1039                             transcript2.setDirection(transcript1.getDirection()) | 
|  | 1040                         transcript1.merge(transcript2, self.normalization) | 
|  | 1041                         toBeRemoved.add(index2) | 
|  | 1042                         nbMerges += 1 | 
|  | 1043                     elif transcript2.getEnd() + distance < transcript1.getStart(): | 
|  | 1044                         toBePrinted.add(index2) | 
|  | 1045                 transcripts2.append(transcript1) | 
|  | 1046 | 
|  | 1047                 for index2 in sorted(toBePrinted): | 
|  | 1048                     self.writeTranscript(transcripts2[index2]) | 
|  | 1049                 transcripts2 = [transcripts2[index2] for index2 in range(len(transcripts2)) if index2 not in (toBeRemoved | toBePrinted)] | 
|  | 1050 | 
|  | 1051             for transcript2 in transcripts2: | 
|  | 1052                 self.writeTranscript(transcript2) | 
|  | 1053             progress.done() | 
|  | 1054             self.getTables(self.QUERY)[chromosome1].remove() | 
|  | 1055 | 
|  | 1056         self.flushData() | 
|  | 1057         if self.writer != None: | 
|  | 1058             self.writer.close() | 
|  | 1059             self.writer = None | 
|  | 1060 | 
|  | 1061         if self.verbosity > 0: | 
|  | 1062             print "query:    %d sequences" % (self.nbTranscripts[self.QUERY]) | 
|  | 1063             print "# merges: %d" % (nbMerges) | 
|  | 1064             print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) | 
|  | 1065 | 
|  | 1066 | 
|  | 1067     def getDifferenceTranscriptList(self): | 
|  | 1068         """ | 
|  | 1069         Get the elements of the first list which do not overlap the second list (at the nucleotide level) | 
|  | 1070         @return: the transcripts that overlap with the reference set | 
|  | 1071         """ | 
|  | 1072         distance = 0 if self.maxDistance == None else self.maxDistance | 
|  | 1073 | 
|  | 1074         self.addIndexes([self.QUERY, self.REFERENCE]) | 
|  | 1075 | 
|  | 1076         # export the container into tables | 
|  | 1077         self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | 
|  | 1078         self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) | 
|  | 1079 | 
|  | 1080         # looping | 
|  | 1081         for chromosome1 in sorted(self.getTables(self.QUERY).keys()): | 
|  | 1082             # get range of transcripts | 
|  | 1083             command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) | 
|  | 1084             query   = self.mySqlConnection.executeQuery(command) | 
|  | 1085             result  = query.getLine() | 
|  | 1086             first   = result[0] | 
|  | 1087             last    = result[1] | 
|  | 1088             nb      = result[2] | 
|  | 1089 | 
|  | 1090             transcripts1 = [] | 
|  | 1091             transcripts2 = [] | 
|  | 1092             nbChunks     = max(1, nb / 100) | 
|  | 1093             chunkSize    = (last - first) / nbChunks | 
|  | 1094             progress     = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) | 
|  | 1095             for chunk in range(nbChunks + 1): | 
|  | 1096 | 
|  | 1097                 # load transcripts | 
|  | 1098                 start   = first + chunk * chunkSize | 
|  | 1099                 end     = start + chunkSize - 1 | 
|  | 1100                 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) | 
|  | 1101                 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): | 
|  | 1102                     transcripts1.append(transcript1) | 
|  | 1103                 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end) | 
|  | 1104                 self.mySqlConnection.executeQuery(command) | 
|  | 1105                 if chromosome1 in self.getTables(self.REFERENCE): | 
|  | 1106                     command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) | 
|  | 1107                     if chunk == 0: | 
|  | 1108                         command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | 
|  | 1109                     for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): | 
|  | 1110                         transcripts2.append(transcript2) | 
|  | 1111                     command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | 
|  | 1112                     self.mySqlConnection.executeQuery(command) | 
|  | 1113 | 
|  | 1114                 # compare sets | 
|  | 1115                 toBeRemoved1 = [] | 
|  | 1116                 for index1, transcript1 in enumerate(transcripts1): | 
|  | 1117                     newTranscript1 = Transcript() | 
|  | 1118                     newTranscript1.copy(transcript1) | 
|  | 1119                     for transcript2 in transcripts2: | 
|  | 1120                         newTranscript1 = newTranscript1.getDifference(transcript2) | 
|  | 1121                         if newTranscript1 == None: | 
|  | 1122                             toBeRemoved1.append(index1) | 
|  | 1123                             break | 
|  | 1124                     transcripts1[index1] = newTranscript1 | 
|  | 1125 | 
|  | 1126                     # check if query transcript extends bounds of the chunk | 
|  | 1127                     if newTranscript1 != None and newTranscript1.getEnd() < end: | 
|  | 1128                         if self.splitDifference: | 
|  | 1129                             for exon in newTranscript1.getExons(): | 
|  | 1130                                 transcript = Transcript() | 
|  | 1131                                 transcript.copy(exon) | 
|  | 1132                                 self.writeTranscript(transcript) | 
|  | 1133                         else: | 
|  | 1134                             self.writeTranscript(newTranscript1) | 
|  | 1135                         toBeRemoved1.append(index1) | 
|  | 1136 | 
|  | 1137                 # update list of query transcripts | 
|  | 1138                 for index1 in reversed(toBeRemoved1): | 
|  | 1139                     del transcripts1[index1] | 
|  | 1140 | 
|  | 1141                 # check if the reference transcripts extends bounds of the chunk | 
|  | 1142                 toBeRemoved2 = [] | 
|  | 1143                 for index2, transcript2 in enumerate(transcripts2): | 
|  | 1144                     if transcript2.getEnd() + distance < end: | 
|  | 1145                         toBeRemoved2.append(index2) | 
|  | 1146                 for index2 in reversed(toBeRemoved2): | 
|  | 1147                     del transcripts2[index2] | 
|  | 1148 | 
|  | 1149                 progress.inc() | 
|  | 1150 | 
|  | 1151             for transcript1 in transcripts1: | 
|  | 1152                 if self.splitDifference: | 
|  | 1153                     for exon in transcript1.getExons(): | 
|  | 1154                         transcript = Transcript() | 
|  | 1155                         transcript.copy(exon) | 
|  | 1156                         self.writeTranscript(transcript) | 
|  | 1157                 else: | 
|  | 1158                     self.writeTranscript(transcript1) | 
|  | 1159             progress.done() | 
|  | 1160             self.getTables(self.QUERY)[chromosome1].remove() | 
|  | 1161             if chromosome1 in self.getTables(self.REFERENCE): | 
|  | 1162                 self.getTables(self.REFERENCE)[chromosome1].remove() | 
|  | 1163                 self.getTables(self.WORKING)[chromosome1].remove() | 
|  | 1164 | 
|  | 1165         self.flushData() | 
|  | 1166         if self.writer != None: | 
|  | 1167             self.writer.close() | 
|  | 1168             self.writer = None | 
|  | 1169 | 
|  | 1170         if self.verbosity > 0: | 
|  | 1171             print "query:     %d elements" % (self.nbTranscripts[self.QUERY]) | 
|  | 1172             print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) | 
|  | 1173             print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) | 
|  | 1174 | 
|  | 1175 | 
|  | 1176     def getOddsPerTranscript(self): | 
|  | 1177         """ | 
|  | 1178         Return overlap results | 
|  | 1179         @return a dict of data | 
|  | 1180         """ | 
|  | 1181         if not self.odds: | 
|  | 1182             raise Exception("Did not compute odds!") | 
|  | 1183         return self.overlapResults | 
|  | 1184 | 
|  | 1185 | 
|  | 1186     def getOdds(self): | 
|  | 1187         """ | 
|  | 1188         Return odds about the overlap | 
|  | 1189         @return a dict of data | 
|  | 1190         """ | 
|  | 1191         if not self.odds: | 
|  | 1192             raise Exception("Did not compute odds!") | 
|  | 1193         if self.oddResults != None: | 
|  | 1194             return self.oddResults | 
|  | 1195         self.oddResults = {} | 
|  | 1196         for name, value in self.overlapResults.iteritems(): | 
|  | 1197             self.oddResults[value] = self.oddResults.get(value, 0) + 1 | 
|  | 1198         return self.oddResults |