s_mart: SMART/Java/Python/structure/TranscriptListsComparator.py comparison

comparison SMART/Java/Python/structure/TranscriptListsComparator.py @ 38:2c0c0a89fad7

Uploaded

author	m-zytnicki
date	Thu, 02 May 2013 09:56:47 -0400
parents	769e306b7933
children

comparison

equal deleted inserted replaced

-:d22fadc825e3
+:2c0c0a89fad7
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import sys
+import random
+from SMART.Java.Python.misc import Utils
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.structure.TranscriptList import TranscriptList
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
+from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable
+from SMART.Java.Python.misc.Progress import Progress
+from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
+class TranscriptListsComparator(object):
+"""
+Compare two transcript lists, using a database for one of the list
+Uses one TranscriptContainer for query data,
+one TranscriptContainer exported to MySqlTranscriptTable for reference data,
+one MySqlTranscriptTable for transformed reference data
+@ivar inputTranscriptContainers: parsers to the list of query transcripts
+@type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>}
+@ivar writer:                    transcript list writer
+@type writer:                    class L{TranscriptListWriter<TranscriptListWriter>}
+@ivar mySqlConnection:           connection to a MySQL database (to compute the ovelapping efficiently)
+@type mySqlConnection:           class L{MySqlConnection<MySqlConnection>}
+@ivar introns:                   compare transcripts or exons only
+@type introns:                   list of 2 boolean
+@ivar starts:                    restrict the query transcripts to first nucleotides
+@type starts:                    list of 2 int or None
+@ivar fivePrimes:                extend a list of transcripts by their 5' end
+@type fivePrimes:                list of 2 int or None
+@ivar threePrimes:               extend a list of transcripts by their 3' end
+@type threePrimes:               list of 2 int or None
+@ivar minDistance:               min distance between two transcripts [default: 0]
+@type minDistance:               int
+@ivar maxDistance:               max distance between two transcripts [default: 0]
+@type maxDistance:               int
+@ivar minOverlap:                minimum number of overlapping nucleotides to declare an overlap
+@type minOverlap:                int
+@ivar pcOverlap:                 percentage of overlapping nucleotides to declare an overlap
+@type pcOverlap:                 int
+@ivar upstreams:                 consider distances with elements which are upstream of the transcripts
+@type upstreams:                 boolean
+@ivar downstreams:               consider distances with elements which are downstream of the transcripts
+@type downstreams:               boolean
+@ivar colinear:                  whether transcripts should overlap in the same direction
+@type colinear:                  boolean
+@ivar antisense:                 whether transcripts should overlap in the opposite direction
+@type antisense:                 boolean
+@ivar outputDistance:            output distance between query and reference instead of query transcript
+@type outputDistance:            boolean
+@ivar absolute:                  do not consider the strand while computing distance
+@type absolute:                  boolean
+@ivar strandedDistance:          return a line per strand while computing distances
+@type strandedDistance:          boolean
+@ivar QUERY:                     constant specifying the query objects
+@type QUERY:                     int
+@ivar REFERENCE:                 constant specifying the reference objects
+@type REFERENCE:                 int
+@ivar INPUTTYPES:                set of input types of data (query or reference) objects
+@type INPUTTYPES:                list of 2 int
+@ivar typeToString:              string representation of the previous types
+@type typeToString:              dict
+@ivar tableNames:                name of the transcript tables
+@type tableNames:                dict of strings
+@ivar nbTranscripts:             number of transcript in the query/reference set
+@type nbTranscripts:             list of 2 int or None
+@ivar nbNucleotides:             number of nucleotides in the query/reference set
+@type nbNucleotides:             list of 2 int or None
+@ivar transcriptsToBeStored:     transcripts that will be stored into database
+@type transcriptsToBeStored:     dict of class L{TranscriptList<TranscriptList>}
+@ivar multiple:                  in merge mode, aggregate multiple transcripts
+@type multiple:                  boolean
+@ivar normalization:             normalize each element by the number of mappings of this element
+@type normalization:             boolean
+@ivar invert:                    invert the current comparison
+@type invert:                    boolean
+@ivar splitDifference:           split into intervals when computing difference
+@type splitDifference:           boolean
+@ivar odds:                      whether odds about the comparison should be computed
+@type odds:                      boolean
+@ivar overlapResults:            count the number of overlaps
+@type overlapResults:            dictionary
+@ivar oddResults:                compute the number of times each transcript overlaps (or is merged with) another one
+@type oddResults:                dictionary
+@ivar outputContainer:           container of the output transcripts
+@type outputContainer:           class L{TranscriptContainer<TranscriptContainer>}
+@ivar logHandle:                 log handle
+@type logHandle:                 file
+@ivar verbosity:                 verbosity
+@type verbosity:                 int
+"""
+def __init__(self, logHandle = None, verbosity = 0):
+"""
+Constructor
+@param transcriptListParser2: parser to the list of reference transcripts
+@type  transcriptListParser2: class L{TranscriptListParser<TranscriptListParser>}
+@param logHandle:             log handle
+@type  logHandle:             file
+@param verbosity:             verbosity
+@type  verbosity:             int
+"""
+self.QUERY                     = 0
+self.REFERENCE                 = 1
+self.WORKING                   = 2
+self.INPUTTYPES                = (self.QUERY, self.REFERENCE)
+self.INPUTWORKINGTYPES         = (self.QUERY, self.REFERENCE, self.WORKING)
+self.typeToString              = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"}
+self.logHandle                 = logHandle
+self.verbosity                 = verbosity
+self.mySqlConnection           = MySqlConnection(self.verbosity-1)
+self.inputTranscriptContainers = [None, None]
+self.tableNames                = ["tmpQueryTable_%d" % (random.randint(0, 100000)), "tmpReferenceTable_%d" % (random.randint(0, 100000)), "tmpOutputTable_%d" % (random.randint(0, 100000)), "tmpWorkingTable_%d" % (random.randint(0, 100000))]
+self.mySqlTranscriptWriters    = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames]
+self.writer                    = None
+self.introns                   = [False, False]
+self.starts                    = [None, None]
+self.ends                      = [None, None]
+self.fivePrimes                = [None, None]
+self.threePrimes               = [None, None]
+self.minDistance               = None
+self.maxDistance               = 0
+self.minOverlap                = 1
+self.pcOverlap                 = None
+self.colinear                  = False
+self.antisense                 = False
+self.downstreams               = [False, False]
+self.upstreams                 = [False, False]
+self.outputDistance            = False
+self.absolute                  = False
+self.strandedDistance          = False
+self.nbTranscripts             = [None, None]
+self.nbNucleotides             = [None, None]
+self.normalization             = False
+self.included                  = False
+self.including                 = False
+self.invert                    = False
+self.notOverlapping            = False
+self.splitDifference           = False
+self.multiple                  = False
+self.odds                      = False
+self.overlapResults            = None
+self.oddResults                = None
+self.outputContainer           = None
+self.transcriptsToBeStored     = dict([(type, TranscriptList()) for type in self.INPUTWORKINGTYPES])
+self.nbPrinted                 = 0
+self.mySqlConnection.createDatabase()
+def __del__(self):
+"""
+Destructor
+Remove all temporary tables
+"""
+for type in self.INPUTWORKINGTYPES:
+self.mySqlTranscriptWriters[type].removeTables()
+self.mySqlConnection.deleteDatabase()
+def acceptIntrons(self, type, bool):
+"""
+Compare transcripts or exons only
+@param type: whether use query/reference data
+@type  type: int
+@param bool: include introns or not
+@type  bool: boolean
+"""
+self.introns[type] = bool
+def restrictToStart(self, type, size):
+"""
+Restrict a list of transcripts to first nucleotides
+@param type: whether use query/reference data
+@type  type: int
+@param size: the size of the transcript to be considered
+@type  size: int
+"""
+self.starts[type]  = size
+self.introns[type] = False
+def restrictToEnd(self, type, size):
+"""
+Restrict a list of transcripts to first nucleotides
+@param type: whether use query/reference data
+@type  type: int
+@param size: the size of the transcript to be considered
+@type  size: int
+"""
+self.ends[type]    = size
+self.introns[type] = False
+def extendFivePrime(self, type, size):
+"""
+Extend a list of transcripts by their 5' end
+@param type: whether use query/reference data
+@type  type: int
+@param size: size of the extension
+@type  size: int
+"""
+self.fivePrimes[type] = size
+def extendThreePrime(self, type, size):
+"""
+Extend the list of query transcripts by their 3' end
+@param type: whether use query/reference data
+@type  type: int
+@param size: size of the extension
+@type  size: int
+"""
+self.threePrimes[type] = size
+def setMinDistance(self, distance):
+"""
+Set the min distance between two transcripts
+@param distance: distance
+@type  distance: int
+"""
+self.minDistance = distance
+def setMaxDistance(self, distance):
+"""
+Set the max distance between two transcripts
+@param distance: distance
+@type  distance: int
+"""
+self.maxDistance = distance
+def setMinOverlap(self, overlap):
+"""
+Set the minimum number of nucleotides to declare an overlap
+@param overlap: minimum number of nucleotides
+@type  overlap: int
+"""
+self.minOverlap = overlap
+def setPcOverlap(self, overlap):
+"""
+Set the percentage of nucleotides to declare an overlap
+@param overlap: percentage of nucleotides
+@type  overlap: int
+"""
+self.pcOverlap = overlap
+def setUpstream(self, type, boolean):
+"""
+Consider transcripts which are upstream of some transcripts
+@param type:        whether use query/reference data
+@type  type:        int
+@param boolean: consider only these transcripts or not
+@type  boolean: boolean
+"""
+self.upstreams[type] = boolean
+def setDownstream(self, type, boolean):
+"""
+Consider transcripts which are downstream of some transcripts
+@param type:        whether use query/reference data
+@type  type:        int
+@param boolean: consider only these transcripts or not
+@type  boolean: boolean
+"""
+self.downstreams[type] = boolean
+def setOutputDistance(self, boolean):
+"""
+Output distance between query and reference instead of query transcript
+@param boolean: whether distance should be output
+@type  boolean: boolean
+"""
+self.outputDistance = boolean
+def setAbsolute(self, boolean):
+"""
+Do not consider strand when computing distance (thus, having only non-negative values)
+@param boolean: whether we should consider strands
+@type  boolean: boolean
+"""
+self.absolute = boolean
+def setStrandedDistance(self, boolean):
+"""
+Return two distance distributions, one per strand
+@param boolean: whether we should return 2 distance distance
+@type  boolean: boolean
+"""
+self.strandedDistance = boolean
+def getColinearOnly(self, boolean):
+"""
+Only consider transcripts that overlap in the same direction
+@param boolean: whether transcripts should overlap in the same direction
+@type  boolean: boolean
+"""
+self.colinear = boolean
+def getAntisenseOnly(self, boolean):
+"""
+Only consider transcripts that overlap in the opposite direction
+@param boolean: whether transcripts should overlap in the opposite direction
+@type  boolean: boolean
+"""
+self.antisense = boolean
+def setIncludedOnly(self, boolean):
+"""
+Keep the elements from first set which are included in the second set
+@param boolean: whether to keep included elements only
+@type  boolean: boolean
+"""
+self.included = boolean
+def setIncludingOnly(self, boolean):
+"""
+Keep the elements from second set which are included in the first set
+@param boolean: whether to keep included elements only
+@type  boolean: boolean
+"""
+self.including = boolean
+def setNormalization(self, boolean):
+"""
+Normalize the elements by the number of mappings in the genome
+@param boolean: whether normalize
+@type  boolean: boolean
+"""
+self.normalization = boolean
+def getInvert(self, boolean):
+"""
+Only consider transcripts that do not overlap
+@param boolean: whether invert the selection
+@type  boolean: boolean
+"""
+self.invert = boolean
+def includeNotOverlapping(self, boolean):
+"""
+Also output the elements which do not overlap
+@param boolean: whether output the elements which do not overlap
+@type  boolean: boolean
+"""
+self.notOverlapping = boolean
+def setSplitDifference(self, boolean):
+"""
+Split into intervals when computing difference
+@param boolean: whether to split
+@type  boolean: boolean
+"""
+self.splitDifference = boolean
+def aggregate(self, boolean):
+"""
+In merge mode, aggregate multiple transcripts
+@param boolean: aggregate multiple transcripts
+@type  boolean: boolean
+"""
+self.multiple = boolean
+def getTables(self, type):
+"""
+Get the SQL tables
+@param type: type of the table (query, reference, etc.)
+@type  type: int
+"""
+return self.mySqlTranscriptWriters[type].getTables()
+def computeOdds(self, boolean):
+"""
+Compute odds
+@param boolean: whether odds should be computed
+@type  boolean: boolean
+"""
+self.odds = boolean
+if self.odds:
+self.overlapResults = dict()
+def computeOddsPerTranscript(self, boolean):
+"""
+Compute odds for each transcript
+@param boolean: whether odds for each transcript should be computed
+@type  boolean: boolean
+"""
+self.odds = boolean
+if self.odds:
+self.overlapResults = dict()
+def removeTables(self):
+"""
+Remove the temporary MySQL tables
+"""
+for type in self.INPUTWORKINGTYPES:
+for chromosome in self.getTables(type):
+self.getTables(type)[chromosome].remove()
+def clearTables(self):
+"""
+Empty the content of the databases
+"""
+for type in self.INPUTWORKINGTYPES:
+if self.transcriptListParsers[type] != None:
+for chromosome in self.getTables(type):
+self.getTables(type)[chromosome].clear()
+def extendTranscript(self, type, transcript):
+"""
+Extend a transcript corresponding to the parameters of the class
+@param transcript: a transcript
+@type  transcript: class L{Transcript<Transcript>}
+@return: the possibly extended transcript
+"""
+extendedTranscript = Transcript()
+extendedTranscript.copy(transcript)
+if self.starts[type] != None:
+extendedTranscript.restrictStart(self.starts[type])
+if self.ends[type] != None:
+extendedTranscript.restrictEnd(self.ends[type])
+if self.fivePrimes[type] != None:
+extendedTranscript.extendStart(self.fivePrimes[type])
+if self.threePrimes[type] != None:
+extendedTranscript.extendEnd(self.threePrimes[type])
+return extendedTranscript
+def storeTranscript(self, type, transcript, now = True):
+"""
+Add a transcript to a MySQL database, or postpone the store
+@param type:       whether use query/reference table
+@type  type:       int
+@param transcript: a transcript
+@type  transcript: class L{Transcript<Transcript>}
+@param now:        whether transcript should be stored now (or stored can be postponed)
+@type  now:        bool
+"""
+self.mySqlTranscriptWriters[type].addTranscript(transcript)
+if type == self.REFERENCE:
+self.mySqlTranscriptWriters[self.WORKING].addTranscript(transcript)
+if now:
+self.mySqlTranscriptWriters[type].write()
+if type == self.REFERENCE:
+self.mySqlTranscriptWriters[self.WORKING].write()
+def writeTranscript(self, transcript):
+"""
+Write a transcript in the output file
+@param transcript: a transcript
+@type  transcript: class L{Transcript<Transcript>}
+"""
+if self.writer != None:
+self.writer.addTranscript(transcript)
+self.nbPrinted += 1
+def flushData(self, type = None):
+"""
+Store the remaining transcripts
+@param type: whether use query/reference table (None for all)
+@type  type: int or None
+"""
+if type == None:
+types = self.INPUTWORKINGTYPES
+else:
+types = [type]
+for type in types:
+self.mySqlTranscriptWriters[type].write()
+if self.writer != None:
+self.writer.write()
+def unstoreTranscript(self, type, transcript):
+"""
+Remove a transcript from a MySQL database
+@param type:       whether use query/reference table
+@type  type:       int
+@param transcript: a transcript
+@type  transcript: class L{Transcript<Transcript>}
+"""
+self.getTables(type)[transcript.getChromosome()].removeTranscript(transcript)
+if type == self.REFERENCE:
+self.getTables(self.WORKING)[transcript.getChromosome()].removeTranscript(transcript)
+def addIndexes(self, tables):
+"""
+Add useful indexes to the tables
+@param tables: which tables should be indexed
+@type  tables: list of int
+"""
+for type in tables:
+for chromosome in self.getTables(type):
+self.getTables(type)[chromosome].createIndex("iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["start"])
+self.getTables(type)[chromosome].exonsTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["transcriptId"])
+def storeTranscriptList(self, type, transcriptListParser, extension):
+"""
+Store a transcript list into database
+@param type:      whether use query/reference parser
+@type  type:      int
+@param parser:    a parser of transcript list
+@type  parser:    class L{TranscriptContainer<TranscriptContainer>}
+@param extension: extend (or not) the transcripts
+@type  extension: boolean
+"""
+progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % ("query" if type == self.QUERY else "reference"), self.verbosity-1)
+for transcript in transcriptListParser.getIterator():
+if extension:
+transcript = self.extendTranscript(type, transcript)
+self.mySqlTranscriptWriters[type].addTranscript(transcript)
+progress.inc()
+self.mySqlTranscriptWriters[type].write()
+progress.done()
+if type == self.REFERENCE:
+for chromosome in self.getTables(self.REFERENCE):
+self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1)
+self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome])
+def setInputTranscriptContainer(self, type, inputTranscriptContainer):
+"""
+Set an input transcript list container
+@param type:                     whether use query/reference parser
+@type  type:                     int
+@param inputTranscriptContainer: a container
+@type  inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>}
+"""
+self.inputTranscriptContainers[type] = inputTranscriptContainer
+self.nbTranscripts[type]             = self.inputTranscriptContainers[type].getNbTranscripts()
+self.nbNucleotides[type]             = self.inputTranscriptContainers[type].getNbNucleotides()
+def setOutputWriter(self, writer):
+"""
+Set an output transcript list writer
+@param writer: a writer
+@type  writer: class L{TranscriptListWriter<TranscriptListWriter>}
+"""
+self.writer = writer
+def compareTranscript(self, transcript1, transcript2, includeDistance = False):
+"""
+Compare two transcripts, using user defined parameters
+@param transcript1:     a transcript from the query set (already extended)
+@type  transcript1:     class L{Transcript<Transcript>}
+@param transcript2:     a transcript from the reference set (already extended)
+@type  transcript2:     class L{Transcript<Transcript>}
+@param includeDistance: take into account the distance too
+@type  includeDistance: boolean
+@return:                true, if they overlap
+"""
+extendedTranscript1 = Transcript()
+extendedTranscript1.copy(transcript1)
+if includeDistance:
+if self.maxDistance > 0:
+extendedTranscript1.extendStart(self.maxDistance)
+extendedTranscript1.extendEnd(self.maxDistance)
+minOverlap = self.minOverlap
+if self.pcOverlap != None:
+minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap)
+if not extendedTranscript1.overlapWith(transcript2, self.minOverlap):
+return False
+if (self.downstreams[self.QUERY]       and transcript2.getStart() > extendedTranscript1.getStart()) or \
+(self.upstreams[self.QUERY]       and transcript2.getEnd()   < extendedTranscript1.getEnd())   or \
+(self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \
+(self.upstreams[self.REFERENCE]   and extendedTranscript1.getEnd()   < transcript2.getEnd()):
+return False
+if (self.antisense and extendedTranscript1.getDirection() == transcript2.getDirection()) or (self.colinear and extendedTranscript1.getDirection() != transcript2.getDirection()):
+return False
+if self.included and not transcript2.include(extendedTranscript1):
+return False
+if self.including and not extendedTranscript1.include(transcript2):
+return False
+if self.introns[self.REFERENCE] and self.introns[self.QUERY]:
+if self.logHandle != None:
+self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2)))
+return True
+if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap):
+if self.logHandle != None:
+self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2)))
+return True
+return False
+def compareTranscriptToList(self, transcript1):
+"""
+Compare a transcript to the reference list of transcripts
+(Do not extend the transcripts, except for the distance)
+@param transcript1: a transcript (from the query set)
+@type  transcript1: class L{Transcript<Transcript>}
+@return:            the reference transcripts overlapping
+"""
+# no transcript in the reference table
+if transcript1.getChromosome() not in self.getTables(self.WORKING):
+return
+# retrieve the the transcripts that may overlap in the working tables
+clauses = []
+extendedTranscript1 = Transcript()
+extendedTranscript1.copy(transcript1)
+if self.maxDistance > 0:
+extendedTranscript1.extendStart(self.maxDistance)
+if self.maxDistance > 0:
+extendedTranscript1.extendEnd(self.maxDistance)
+command = "SELECT * FROM %s WHERE (" % (self.getTables(self.WORKING)[transcript1.getChromosome()].getName())
+for binPair in extendedTranscript1.getBins():
+clause = "bin "
+if binPair[0] == binPair[1]:
+clause += "= %i" % (binPair[0])
+else:
+clause += "BETWEEN %i AND %i" % (binPair[0], binPair[1])
+clauses.append(clause)
+command += " OR ".join(clauses)
+command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart())
+for index2, transcript2 in self.getTables(self.REFERENCE)[transcript1.getChromosome()].selectTranscripts(command):
+if self.compareTranscript(extendedTranscript1, transcript2):
+yield transcript2
+def compareTranscriptList(self):
+"""
+Compare a list of transcript to the reference one
+@return: the transcripts that overlap with the reference set
+"""
+distance      = 0
+nbClustersIn  = 0
+nbClustersOut = 0
+if self.maxDistance != None:
+distance = self.maxDistance
+self.addIndexes([self.QUERY, self.REFERENCE])
+# export the container into tables
+self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
+self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
+# looping
+for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
+# get range of transcripts
+command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
+query     = self.mySqlConnection.executeQuery(command)
+result    = query.getLine()
+first     = result[0]
+last      = result[1]
+nb        = result[2]
+transcripts1 = []
+toBeRemoved1 = []
+transcripts2 = []
+toBeRemoved2 = []
+overlapsWith = []
+nbOverlaps   = []
+nbChunks     = max(1, nb / 100)
+chunkSize    = (last - first) / nbChunks
+progress     = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
+for chunk in range(nbChunks + 1):
+# load transcripts
+start   = first + chunk * chunkSize
+end     = start + chunkSize - 1
+command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
+for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
+transcripts1.append(transcript1)
+overlapsWith.append([])
+nbOverlaps.append(0)
+nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
+command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
+self.mySqlConnection.executeQuery(command)
+if chromosome1 in self.getTables(self.REFERENCE):
+command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
+if chunk == 0:
+command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
+for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
+transcripts2.append(transcript2)
+command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
+self.mySqlConnection.executeQuery(command)
+# compare sets
+for index1, transcript1 in enumerate(transcripts1):
+overlappingNames = []
+nbElements1      = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
+for transcript2 in transcripts2:
+if self.compareTranscript(transcript1, transcript2, True):
+id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
+if id2 not in overlapsWith[index1]:
+overlapsWith[index1].append(id2)
+nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements")
+if self.odds:
+if transcript2.getName() not in self.overlapResults:
+self.overlapResults[transcript2.getName()] = 0
+self.overlapResults[transcript2.getName()] += nbElements1
+# check if query transcript extends bounds of the chunk
+if transcript1.getEnd() < end:
+if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
+if overlapsWith[index1]:
+transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
+transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
+elif self.notOverlapping:
+transcript1.setTagValue("nbOverlaps", "0")
+self.writeTranscript(transcript1)
+nbClustersOut += nbElements1
+toBeRemoved1.append(index1)
+# update list of query transcripts
+for index1 in reversed(toBeRemoved1):
+del transcripts1[index1]
+del overlapsWith[index1]
+del nbOverlaps[index1]
+toBeRemoved1 = []
+# check if the reference transcripts extends bounds of the chunk
+for index2, transcript2 in enumerate(transcripts2):
+if transcript2.getEnd() + distance < end:
+toBeRemoved2.append(index2)
+for index2 in reversed(toBeRemoved2):
+del transcripts2[index2]
+toBeRemoved2 = []
+progress.inc()
+for index1, transcript1 in enumerate(transcripts1):
+if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
+if overlapsWith[index1]:
+transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
+transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
+elif self.notOverlapping:
+transcript1.setTagValue("nbOverlaps", "0")
+self.writeTranscript(transcript1)
+nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
+progress.done()
+self.getTables(self.QUERY)[chromosome1].remove()
+if chromosome1 in self.getTables(self.REFERENCE):
+self.getTables(self.REFERENCE)[chromosome1].remove()
+self.getTables(self.WORKING)[chromosome1].remove()
+self.flushData()
+if self.writer != None:
+self.writer.close()
+self.writer = None
+if self.verbosity > 0:
+print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])
+print "query:     %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn)
+if self.nbTranscripts[self.QUERY] != 0:
+print "output:    %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100),
+if nbClustersOut != 0:
+print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100)
+def compareTranscriptListDistance(self):
+"""
+Compare a list of transcript to the reference one
+@return: the distance distributions in a hash
+"""
+nbDistances  = 0
+distances    = {}
+absDistances = {}
+strandedDistances = dict([(strand, {}) for strand in (1, -1)])
+# export the container into tables
+self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
+self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
+progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1)
+for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator():
+# get the distance
+transcript1    = self.extendTranscript(self.QUERY, transcript1)
+distance       = self.maxDistance + 1
+strand         = None
+closestElement = "None"
+for transcript2 in self.compareTranscriptToList(transcript1):
+thisStrand = transcript1.getDirection() * transcript2.getDirection()
+if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()):
+transcript2.reverse()
+if self.absolute:
+transcript2.setDirection(transcript1.getDirection())
+if transcript2.getDirection() == transcript1.getDirection():
+if self.starts[self.REFERENCE] != None:
+transcript2.restrictStart(self.starts[self.REFERENCE])
+if self.ends[self.REFERENCE] != None:
+transcript2.restrictEnd(self.ends[self.REFERENCE])
+thisDistance = transcript1.getRelativeDistance(transcript2)
+if (self.absolute):
+thisDistance = abs(thisDistance)
+if abs(thisDistance) < abs(distance):
+distance       = thisDistance
+strand         = thisStrand
+closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
+if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance):
+nbDistances                        += 1
+distances[distance]                 = distances.get(distance, 0) + 1
+absDistance                         = abs(distance)
+absDistances[absDistance]           = absDistances.get(absDistance, 0) + 1
+strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0)
+if distance not in strandedDistances[-strand]:
+strandedDistances[-strand][distance] = 0
+# write transcript
+if distance == self.maxDistance + 1:
+distance = "None"
+tmpTranscript = Transcript()
+tmpTranscript.copy(transcript1)
+tmpTranscript.setTagValue("distance", distance)
+tmpTranscript.setTagValue("closestElement", closestElement)
+self.writeTranscript(tmpTranscript)
+progress.inc()
+progress.done()
+self.flushData()
+if self.verbosity > 0:
+print "reference: %d sequences" % (self.nbTranscripts[self.REFERENCE])
+print "query:     %d sequences" % (self.nbTranscripts[self.QUERY])
+if nbDistances == 0:
+print "Nothing matches"
+else:
+print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances)
+print "for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100)
+if self.strandedDistance:
+return strandedDistances
+return distances
+def compareTranscriptListMerge(self):
+"""
+Merge the query list of transcript with itself
+@return: the merged transcripts in a transcript list database
+"""
+nbMerges  = 0
+for type in (self.QUERY, self.REFERENCE):
+self.storeTranscriptList(type, self.inputTranscriptContainers[type], True)
+self.flushData()
+# Loop on the chromosomes
+for chromosome in sorted(self.getTables(self.QUERY).keys()):
+if chromosome not in self.getTables(self.REFERENCE):
+continue
+# Get the size of the chromosome
+maxEnd   = 0
+nbChunks = 0
+for type in (self.QUERY, self.REFERENCE):
+command  = "SELECT MAX(end) from %s" % (self.getTables(type)[chromosome].getName())
+query    = self.mySqlConnection.executeQuery(command)
+maxEnd   = max(maxEnd, int(query.getLine()[0]))
+nbChunks = max(nbChunks, self.getTables(type)[chromosome].getNbElements())
+mergedTranscripts = {}
+transcripts = {self.QUERY: [], self.REFERENCE: []}
+progress    = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1)
+for i in range(nbChunks):
+rangeStart = int(i * (float(maxEnd) / nbChunks)) + 1
+rangeEnd   = int((i+1) * (float(maxEnd) / nbChunks))
+# Get all transcripts in query and reference from chunk
+for type in (self.QUERY, self.REFERENCE):
+correction = 0 if self.QUERY else self.maxDistance
+command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd + correction)
+for index, transcript in self.getTables(type)[chromosome].selectTranscripts(command):
+transcripts[type].append(transcript)
+# Merge elements between the two samples
+for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]):
+for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]):
+if referenceTranscript == None: continue
+if self.compareTranscript(queryTranscript, referenceTranscript, True):
+if queryTranscript.getDirection() != referenceTranscript.getDirection():
+referenceTranscript.setDirection(queryTranscript.getDirection())
+queryTranscript.merge(referenceTranscript, self.normalization)
+nbMerges += 1
+transcripts[self.REFERENCE][iReference] = None
+if not self.multiple:
+mergedTranscripts[iQuery] = 0
+# Remove transcripts from database
+for type in (self.QUERY, self.REFERENCE):
+correction = 0 if self.QUERY else self.maxDistance
+command    = "DELETE FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd - correction)
+query      = self.mySqlConnection.executeQuery(command)
+# Just in case, self-merge the elements in the query (beware of mergedTranscripts!)
+if (self.multiple):
+for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]):
+if queryTranscript1 == None: continue
+for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]):
+if iQuery2 <= iQuery1 or queryTranscript2 == None: continue
+minOverlap = self.minOverlap
+if self.pcOverlap != None:
+minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap)
+if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear):
+if queryTranscript1.getDirection() != queryTranscript2.getDirection():
+queryTranscript2.setDirection(queryTranscript1.getDirection())
+queryTranscript1.merge(queryTranscript2, self.normalization)
+transcripts[self.QUERY][iQuery2] = None
+nbMerges += 1
+if not self.multiple:
+mergedTranscripts[iQuery1] = 0
+# Update the sets of transcripts and write into database (also update mergedTranscripts)
+newTranscripts = {self.QUERY: [], self.REFERENCE: []}
+newMergedTranscripts = {}
+for type in (self.QUERY, self.REFERENCE):
+for i, transcript in enumerate(transcripts[type]):
+if transcript == None: continue
+correction = 0 if self.QUERY else self.maxDistance
+if transcript.getEnd() < rangeEnd - correction:
+if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)):
+self.writeTranscript(transcripts[type][i])
+else:
+if type == self.QUERY and i in mergedTranscripts:
+newMergedTranscripts[len(newTranscripts[type])] = 0
+newTranscripts[type].append(transcript)
+transcripts = newTranscripts
+mergedTranscripts = newMergedTranscripts
+progress.inc()
+progress.done()
+for type in (self.QUERY, self.REFERENCE):
+for i, transcript in enumerate(transcripts[type]):
+if transcripts == None: continue
+if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)):
+self.writeTranscript(transcripts[type][i])
+# Manage chromosomes with no corresponding data
+if self.multiple:
+for type in self.INPUTTYPES:
+for chromosome in self.getTables(type):
+if chromosome in self.getTables(1 - type):
+continue
+for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator():
+self.writeTranscript(transcript)
+self.flushData()
+if self.writer != None:
+self.writer.close()
+self.writer = None
+if self.verbosity > 0:
+print "query:    %d sequences" % (self.nbTranscripts[self.QUERY])
+print "# merges: %d" % (nbMerges)
+print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
+def compareTranscriptListSelfMerge(self):
+"""
+Merge the query list of transcript with itself
+@return: the merged transcripts in a transcript list database
+"""
+nbMerges  = 0
+distance  = self.maxDistance if self.maxDistance != None else 0
+self.addIndexes([self.QUERY])
+self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
+self.flushData()
+# looping
+for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
+transcripts2 = []
+# get range of transcripts
+progress = Progress(self.getTables(self.QUERY)[chromosome1].getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
+command  = "SELECT * FROM %s ORDER BY start" % (self.getTables(self.QUERY)[chromosome1].getName())
+for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
+# compare sets
+toBeRemoved = set()
+toBePrinted = set()
+for index2, transcript2 in enumerate(transcripts2):
+if self.compareTranscript(transcript1, transcript2, True):
+if transcript1.getDirection() != transcript2.getDirection():
+transcript2.setDirection(transcript1.getDirection())
+transcript1.merge(transcript2, self.normalization)
+toBeRemoved.add(index2)
+nbMerges += 1
+elif transcript2.getEnd() + distance < transcript1.getStart():
+toBePrinted.add(index2)
+transcripts2.append(transcript1)
+for index2 in sorted(toBePrinted):
+self.writeTranscript(transcripts2[index2])
+transcripts2 = [transcripts2[index2] for index2 in range(len(transcripts2)) if index2 not in (toBeRemoved | toBePrinted)]
+for transcript2 in transcripts2:
+self.writeTranscript(transcript2)
+progress.done()
+self.getTables(self.QUERY)[chromosome1].remove()
+self.flushData()
+if self.writer != None:
+self.writer.close()
+self.writer = None
+if self.verbosity > 0:
+print "query:    %d sequences" % (self.nbTranscripts[self.QUERY])
+print "# merges: %d" % (nbMerges)
+print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
+def getDifferenceTranscriptList(self):
+"""
+Get the elements of the first list which do not overlap the second list (at the nucleotide level)
+@return: the transcripts that overlap with the reference set
+"""
+distance = 0 if self.maxDistance == None else self.maxDistance
+self.addIndexes([self.QUERY, self.REFERENCE])
+# export the container into tables
+self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
+self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
+# looping
+for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
+# get range of transcripts
+command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
+query   = self.mySqlConnection.executeQuery(command)
+result  = query.getLine()
+first   = result[0]
+last    = result[1]
+nb      = result[2]
+transcripts1 = []
+transcripts2 = []
+nbChunks     = max(1, nb / 100)
+chunkSize    = (last - first) / nbChunks
+progress     = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
+for chunk in range(nbChunks + 1):
+# load transcripts
+start   = first + chunk * chunkSize
+end     = start + chunkSize - 1
+command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
+for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
+transcripts1.append(transcript1)
+command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
+self.mySqlConnection.executeQuery(command)
+if chromosome1 in self.getTables(self.REFERENCE):
+command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
+if chunk == 0:
+command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
+for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
+transcripts2.append(transcript2)
+command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
+self.mySqlConnection.executeQuery(command)
+# compare sets
+toBeRemoved1 = []
+for index1, transcript1 in enumerate(transcripts1):
+newTranscript1 = Transcript()
+newTranscript1.copy(transcript1)
+for transcript2 in transcripts2:
+newTranscript1 = newTranscript1.getDifference(transcript2)
+if newTranscript1 == None:
+toBeRemoved1.append(index1)
+break
+transcripts1[index1] = newTranscript1
+# check if query transcript extends bounds of the chunk
+if newTranscript1 != None and newTranscript1.getEnd() < end:
+if self.splitDifference:
+for exon in newTranscript1.getExons():
+transcript = Transcript()
+transcript.copy(exon)
+self.writeTranscript(transcript)
+else:
+self.writeTranscript(newTranscript1)
+toBeRemoved1.append(index1)
+# update list of query transcripts
+for index1 in reversed(toBeRemoved1):
+del transcripts1[index1]
+# check if the reference transcripts extends bounds of the chunk
+toBeRemoved2 = []
+for index2, transcript2 in enumerate(transcripts2):
+if transcript2.getEnd() + distance < end:
+toBeRemoved2.append(index2)
+for index2 in reversed(toBeRemoved2):
+del transcripts2[index2]
+progress.inc()
+for transcript1 in transcripts1:
+if self.splitDifference:
+for exon in transcript1.getExons():
+transcript = Transcript()
+transcript.copy(exon)
+self.writeTranscript(transcript)
+else:
+self.writeTranscript(transcript1)
+progress.done()
+self.getTables(self.QUERY)[chromosome1].remove()
+if chromosome1 in self.getTables(self.REFERENCE):
+self.getTables(self.REFERENCE)[chromosome1].remove()
+self.getTables(self.WORKING)[chromosome1].remove()
+self.flushData()
+if self.writer != None:
+self.writer.close()
+self.writer = None
+if self.verbosity > 0:
+print "query:     %d elements" % (self.nbTranscripts[self.QUERY])
+print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])
+print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
+def getOddsPerTranscript(self):
+"""
+Return overlap results
+@return a dict of data
+"""
+if not self.odds:
+raise Exception("Did not compute odds!")
+return self.overlapResults
+def getOdds(self):
+"""
+Return odds about the overlap
+@return a dict of data
+"""
+if not self.odds:
+raise Exception("Did not compute odds!")
+if self.oddResults != None:
+return self.oddResults
+self.oddResults = {}
+for name, value in self.overlapResults.iteritems():
+self.oddResults[value] = self.oddResults.get(value, 0) + 1
+return self.oddResults

Mercurial > repos > yufei-luo > s_mart

comparison SMART/Java/Python/structure/TranscriptListsComparator.py @ 38:2c0c0a89fad7