Mercurial > repos > yufei-luo > s_mart

#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import sys
import random
from SMART.Java.Python.misc import Utils
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.TranscriptList import TranscriptList
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable
from SMART.Java.Python.misc.Progress import Progress
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter


class TranscriptListsComparator(object):
    """
    Compare two transcript lists, using a database for one of the list
    Uses one TranscriptContainer for query data,
             one TranscriptContainer exported to MySqlTranscriptTable for reference data,
             one MySqlTranscriptTable for transformed reference data
    @ivar inputTranscriptContainers: parsers to the list of query transcripts
    @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>}
    @ivar writer:                    transcript list writer
    @type writer:                    class L{TranscriptListWriter<TranscriptListWriter>}
    @ivar mySqlConnection:           connection to a MySQL database (to compute the overlapping efficiently)
    @type mySqlConnection:           class L{MySqlConnection<MySqlConnection>}
    @ivar introns:                   compare transcripts or exons only
    @type introns:                   list of 2 boolean
    @ivar starts:                    restrict the query transcripts to first nucleotides
    @type starts:                    list of 2 int or None
    @ivar fivePrimes:                extend a list of transcripts by their 5' end
    @type fivePrimes:                list of 2 int or None
    @ivar threePrimes:               extend a list of transcripts by their 3' end
    @type threePrimes:               list of 2 int or None
    @ivar minDistance:               min distance between two transcripts [default: 0]
    @type minDistance:               int
    @ivar maxDistance:               max distance between two transcripts [default: 0]
    @type maxDistance:               int
    @ivar minOverlap:                minimum number of overlapping nucleotides to declare an overlap
    @type minOverlap:                int
    @ivar pcOverlap:                 percentage of overlapping nucleotides to declare an overlap
    @type pcOverlap:                 int
    @ivar upstreams:                 consider distances with elements which are upstream of the transcripts
    @type upstreams:                 boolean
    @ivar downstreams:               consider distances with elements which are downstream of the transcripts
    @type downstreams:               boolean
    @ivar colinear:                  whether transcripts should overlap in the same direction
    @type colinear:                  boolean
    @ivar antisense:                 whether transcripts should overlap in the opposite direction
    @type antisense:                 boolean
    @ivar outputDistance:            output distance between query and reference instead of query transcript
    @type outputDistance:            boolean
    @ivar absolute:                  do not consider the strand while computing distance
    @type absolute:                  boolean
    @ivar strandedDistance:          return a line per strand while computing distances
    @type strandedDistance:          boolean
    @ivar QUERY:                     constant specifying the query objects
    @type QUERY:                     int
    @ivar REFERENCE:                 constant specifying the reference objects
    @type REFERENCE:                 int
    @ivar INPUTTYPES:                set of input types of data (query or reference) objects
    @type INPUTTYPES:                list of 2 int
    @ivar typeToString:              string representation of the previous types
    @type typeToString:              dict
    @ivar tableNames:                name of the transcript tables
    @type tableNames:                dict of strings
    @ivar nbTranscripts:             number of transcript in the query/reference set
    @type nbTranscripts:             list of 2 int or None
    @ivar nbNucleotides:             number of nucleotides in the query/reference set
    @type nbNucleotides:             list of 2 int or None
    @ivar transcriptsToBeStored:     transcripts that will be stored into database
    @type transcriptsToBeStored:     dict of class L{TranscriptList<TranscriptList>}
    @ivar multiple:                  in merge mode, aggregate multiple transcripts
    @type multiple:                  boolean
    @ivar normalization:             normalize each element by the number of mappings of this element
    @type normalization:             boolean
    @ivar invert:                    invert the current comparison
    @type invert:                    boolean
    @ivar splitDifference:           split into intervals when computing difference
    @type splitDifference:           boolean
    @ivar odds:                      whether odds about the comparison should be computed
    @type odds:                      boolean
    @ivar overlapResults:            count the number of overlaps
    @type overlapResults:            dictionary
    @ivar oddResults:                compute the number of times each transcript overlaps (or is merged with) another one
    @type oddResults:                dictionary
    @ivar outputContainer:           container of the output transcripts
    @type outputContainer:           class L{TranscriptContainer<TranscriptContainer>}
    @ivar logHandle:                 log handle
    @type logHandle:                 file
    @ivar verbosity:                 verbosity
    @type verbosity:                 int
    """

    def __init__(self, logHandle = None, verbosity = 0):
        """
        Constructor
        Set every comparison option to its default value, build one MySQL
        transcript writer per temporary table name and create the database
        @param logHandle: log handle
        @type  logHandle: file
        @param verbosity: verbosity
        @type  verbosity: int
        """
        # identifiers of the data sets (also used as indexes of the per-set lists)
        self.QUERY, self.REFERENCE, self.WORKING = 0, 1, 2
        self.INPUTTYPES        = (self.QUERY, self.REFERENCE)
        self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING)
        self.typeToString      = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"}

        # logging and database access
        self.logHandle                 = logHandle
        self.verbosity                 = verbosity
        self.mySqlConnection           = MySqlConnection(self.verbosity-1)
        self.inputTranscriptContainers = [None, None]

        # temporary table names: a fixed prefix followed by a random number
        # NOTE(review): the name at index 2 (the value of self.WORKING) is built
        # from "tmpOutputTable", and the "tmpWorkingTable" name sits at index 3
        # -- confirm this is intended
        tablePrefixes               = ("tmpQueryTable", "tmpReferenceTable", "tmpOutputTable", "tmpWorkingTable")
        self.tableNames             = ["%s_%d" % (prefix, random.randint(0, 100000)) for prefix in tablePrefixes]
        self.mySqlTranscriptWriters = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames]
        self.writer                 = None

        # transformations applied to the query/reference transcripts
        self.introns     = [False, False]
        self.starts      = [None, None]
        self.ends        = [None, None]
        self.fivePrimes  = [None, None]
        self.threePrimes = [None, None]

        # overlap and distance criteria
        self.minDistance      = None
        self.maxDistance      = 0
        self.minOverlap       = 1
        self.pcOverlap        = None
        self.colinear         = False
        self.antisense        = False
        self.downstreams      = [False, False]
        self.upstreams        = [False, False]
        self.outputDistance   = False
        self.absolute         = False
        self.strandedDistance = False

        # sizes of the input sets
        self.nbTranscripts = [None, None]
        self.nbNucleotides = [None, None]

        # output options
        self.normalization   = False
        self.included        = False
        self.including       = False
        self.invert          = False
        self.notOverlapping  = False
        self.splitDifference = False
        self.multiple        = False

        # statistics and results
        self.odds            = False
        self.overlapResults  = None
        self.oddResults      = None
        self.outputContainer = None

        # one (initially empty) list of pending transcripts per data set
        self.transcriptsToBeStored = {}
        for inputType in self.INPUTWORKINGTYPES:
            self.transcriptsToBeStored[inputType] = TranscriptList()
        self.nbPrinted = 0

        self.mySqlConnection.createDatabase()


    def __del__(self):
        """
        Destructor
        Ask the writers of the query, reference and working sets to remove
        their tables, then delete the database
        """
        for inputType in self.INPUTWORKINGTYPES:
            writer = self.mySqlTranscriptWriters[inputType]
            writer.removeTables()
        self.mySqlConnection.deleteDatabase()

    def acceptIntrons(self, type, bool):
        """
        Choose whether introns are taken into account for one data set
        (whole transcripts are compared) or not (exons only)
        @param type: the data set concerned (query or reference)
        @type  type: int
        @param bool: whether introns are included
        @type  bool: boolean
        """
        self.introns[type] = bool


    def restrictToStart(self, type, size):
        """
        Record the size used to restrict the transcripts of one data set
        to their start; the introns flag of this data set is reset to False
        @param type: the data set concerned (query or reference)
        @type  type: int
        @param size: the size of the transcript to be considered
        @type  size: int
        """
        self.introns[type] = False
        self.starts[type]  = size


    def restrictToEnd(self, type, size):
        """
        Record the size used to restrict the transcripts of one data set
        to their end; the introns flag of this data set is reset to False
        @param type: the data set concerned (query or reference)
        @type  type: int
        @param size: the size of the transcript to be considered
        @type  size: int
        """
        self.introns[type] = False
        self.ends[type]    = size


    def extendFivePrime(self, type, size):
        """
        Record the size of the 5' extension applied to one data set
        @param type: the data set concerned (query or reference)
        @type  type: int
        @param size: size of the extension
        @type  size: int
        """
        self.fivePrimes[type] = size


    def extendThreePrime(self, type, size):
        """
        Record the size of the 3' extension applied to one data set
        @param type: the data set concerned (query or reference)
        @type  type: int
        @param size: size of the extension
        @type  size: int
        """
        self.threePrimes[type] = size


    def setMinDistance(self, distance):
        """
        Record the minimum distance between two transcripts
        @param distance: minimum distance
        @type  distance: int
        """
        self.minDistance = distance


    def setMaxDistance(self, distance):
        """
        Record the maximum distance between two transcripts
        @param distance: maximum distance
        @type  distance: int
        """
        self.maxDistance = distance


    def setMinOverlap(self, overlap):
        """
        Record the minimum number of overlapping nucleotides needed to declare an overlap
        @param overlap: minimum number of nucleotides
        @type  overlap: int
        """
        self.minOverlap = overlap


    def setPcOverlap(self, overlap):
        """
        Record the percentage of overlapping nucleotides needed to declare an overlap
        @param overlap: percentage of nucleotides
        @type  overlap: int
        """
        self.pcOverlap = overlap


    def setUpstream(self, type, boolean):
        """
        Set the upstream flag of one data set
        @param type:    the data set concerned (query or reference)
        @type  type:    int
        @param boolean: consider only the upstream transcripts or not
        @type  boolean: boolean
        """
        self.upstreams[type] = boolean


    def setDownstream(self, type, boolean):
        """
        Set the downstream flag of one data set
        @param type:    the data set concerned (query or reference)
        @type  type:    int
        @param boolean: consider only the downstream transcripts or not
        @type  boolean: boolean
        """
        self.downstreams[type] = boolean


    def setOutputDistance(self, boolean):
        """
        Choose whether the distance between query and reference is output
        instead of the query transcript
        @param boolean: whether distance should be output
        @type  boolean: boolean
        """
        self.outputDistance = boolean


    def setAbsolute(self, boolean):
        """
        Choose whether the strand is ignored when computing distances
        (thus, having only non-negative values)
        @param boolean: whether distances are absolute
        @type  boolean: boolean
        """
        self.absolute = boolean


    def setStrandedDistance(self, boolean):
        """
        Choose whether two distance distributions (one per strand) are returned
        @param boolean: whether distances are reported per strand
        @type  boolean: boolean
        """
        self.strandedDistance = boolean


    def getColinearOnly(self, boolean):
        """
        Set the colinear flag: only consider transcripts that overlap in the same direction
        (despite its name, this method stores the value and returns nothing)
        @param boolean: whether transcripts should overlap in the same direction
        @type  boolean: boolean
        """
        self.colinear = boolean


    def getAntisenseOnly(self, boolean):
        """
        Set the antisense flag: only consider transcripts that overlap in the opposite direction
        (despite its name, this method stores the value and returns nothing)
        @param boolean: whether transcripts should overlap in the opposite direction
        @type  boolean: boolean
        """
        self.antisense = boolean


    def setIncludedOnly(self, boolean):
        """
        Choose whether to keep only the elements of the first set which are
        included in the second set
        @param boolean: whether to keep included elements only
        @type  boolean: boolean
        """
        self.included = boolean


    def setIncludingOnly(self, boolean):
        """
        Choose whether to keep only the elements of the second set which are
        included in the first set
        @param boolean: whether to keep including elements only
        @type  boolean: boolean
        """
        self.including = boolean


    def setNormalization(self, boolean):
        """
        Choose whether the elements are normalized by their number of mappings in the genome
        @param boolean: whether to normalize
        @type  boolean: boolean
        """
        self.normalization = boolean


    def getInvert(self, boolean):
        """
        Set the invert flag: only consider transcripts that do not overlap
        (despite its name, this method stores the value and returns nothing)
        @param boolean: whether to invert the selection
        @type  boolean: boolean
        """
        self.invert = boolean


    def includeNotOverlapping(self, boolean):
        """
        Choose whether the elements which do not overlap are output too
        @param boolean: whether to output the elements which do not overlap
        @type  boolean: boolean
        """
        self.notOverlapping = boolean


    def setSplitDifference(self, boolean):
        """
        Choose whether the difference is split into intervals
        @param boolean: whether to split
        @type  boolean: boolean
        """
        self.splitDifference = boolean


    def aggregate(self, boolean):
        """
        Choose whether multiple transcripts are aggregated (merge mode)
        @param boolean: whether to aggregate multiple transcripts
        @type  boolean: boolean
        """
        self.multiple = boolean


    def getTables(self, type):
        """
        Get the SQL tables of one data set, as given by its transcript writer
        @param type: type of the table (query, reference, etc.)
        @type  type: int
        @return:     the tables returned by the writer of this data set
        """
        writer = self.mySqlTranscriptWriters[type]
        return writer.getTables()


    def computeOdds(self, boolean):
        """
        Switch the computation of odds on or off
        When switched on, the overlap results are reset to an empty dictionary
        @param boolean: whether odds should be computed
        @type  boolean: boolean
        """
        self.odds = boolean
        if boolean:
            self.overlapResults = {}


    def computeOddsPerTranscript(self, boolean):
        """
        Switch the computation of odds for each transcript on or off
        When switched on, the overlap results are reset to an empty dictionary
        @param boolean: whether odds for each transcript should be computed
        @type  boolean: boolean
        """
        # NOTE(review): this sets the same two attributes as computeOdds and
        # never touches self.oddResults -- confirm this is intended
        self.odds = boolean
        if boolean:
            self.overlapResults = {}


    def removeTables(self):
        """
        Remove the temporary MySQL tables of the query, reference and working sets
        """
        for inputType in self.INPUTWORKINGTYPES:
            tables = self.getTables(inputType)
            for chromosome in tables:
                tables[chromosome].remove()


    def clearTables(self):
        """
        Empty the content of the tables of the query, reference and working sets
        """
        # Every existing table is cleared, in the same way removeTables()
        # visits every table.  The former guard read
        # self.transcriptListParsers, an attribute which the constructor never
        # defines, so each call raised an AttributeError.
        for type in self.INPUTWORKINGTYPES:
            tables = self.getTables(type)
            for chromosome in tables:
                tables[chromosome].clear()


    def extendTranscript(self, type, transcript):
        """
        Build a copy of a transcript, restricted and/or extended according
        to the parameters recorded for its data set
        @param type:       the data set of the transcript (query or reference)
        @type  type:       int
        @param transcript: a transcript
        @type  transcript: class L{Transcript<Transcript>}
        @return:           the possibly modified copy (the input is not modified here)
        """
        extendedTranscript = Transcript()
        extendedTranscript.copy(transcript)
        # each operation is applied, in this order, only when a size was set
        operations = (
            (self.starts,      extendedTranscript.restrictStart),
            (self.ends,        extendedTranscript.restrictEnd),
            (self.fivePrimes,  extendedTranscript.extendStart),
            (self.threePrimes, extendedTranscript.extendEnd),
        )
        for sizes, operation in operations:
            size = sizes[type]
            if size is not None:
                operation(size)
        return extendedTranscript


    def storeTranscript(self, type, transcript, now = True):
        """
        Give a transcript to the MySQL writer of a data set, and write it
        immediately or postpone the write
        A reference transcript is also given to the writer of the working set
        @param type:       whether use query/reference table
        @type  type:       int
        @param transcript: a transcript
        @type  transcript: class L{Transcript<Transcript>}
        @param now:        whether the writers should write now (otherwise the write is postponed)
        @type  now:        bool
        """
        writers = [self.mySqlTranscriptWriters[type]]
        if type == self.REFERENCE:
            writers.append(self.mySqlTranscriptWriters[self.WORKING])
        for writer in writers:
            writer.addTranscript(transcript)
        if not now:
            return
        for writer in writers:
            writer.write()


    def writeTranscript(self, transcript):
        """
        Give a transcript to the output writer and count it
        Nothing is done when no output writer is set
        @param transcript: a transcript
        @type  transcript: class L{Transcript<Transcript>}
        """
        if self.writer is None:
            return
        self.writer.addTranscript(transcript)
        self.nbPrinted += 1


    def flushData(self, type = None):
        """
        Make the MySQL writers, then the output writer (if any), write their data
        @param type: the data set whose writer should write (None for query, reference and working)
        @type  type: int or None
        """
        selectedTypes = self.INPUTWORKINGTYPES if type is None else [type]
        for selectedType in selectedTypes:
            self.mySqlTranscriptWriters[selectedType].write()
        if self.writer is not None:
            self.writer.write()


    def unstoreTranscript(self, type, transcript):
        """
        Remove a transcript from the table of its chromosome
        A reference transcript is also removed from the working table
        @param type:       whether use query/reference table
        @type  type:       int
        @param transcript: a transcript
        @type  transcript: class L{Transcript<Transcript>}
        """
        chromosome = transcript.getChromosome()
        self.getTables(type)[chromosome].removeTranscript(transcript)
        if type == self.REFERENCE:
            self.getTables(self.WORKING)[chromosome].removeTranscript(transcript)


    def addIndexes(self, tables):
        """
        Add indexes to the tables of the given data sets: for each chromosome,
        one index on the "start" column of the transcript table and one index on
        the "transcriptId" column of its exons table
        Index names end with a random number
        @param tables: which tables should be indexed
        @type  tables: list of int
        """
        for type in tables:
            typeTables = self.getTables(type)
            for chromosome in typeTables:
                table = typeTables[chromosome]
                startIndexName = "iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000))
                table.createIndex(startIndexName, ["start"])
                exonIndexName = "iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000))
                table.exonsTable.createIndex(exonIndexName, ["transcriptId"])


    def storeTranscriptList(self, type, transcriptListParser, extension):
        """
        Store a transcript list into database
        For the reference set, a working table is then created for each
        chromosome as a copy of the reference table
        @param type:                 whether use query/reference parser
        @type  type:                 int
        @param transcriptListParser: a parser of transcript list
        @type  transcriptListParser: class L{TranscriptContainer<TranscriptContainer>}
        @param extension:            extend (or not) the transcripts
        @type  extension:            boolean
        """
        if type == self.QUERY:
            setName = "query"
        else:
            setName = "reference"
        progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % (setName), self.verbosity-1)
        writer   = self.mySqlTranscriptWriters[type]
        for transcript in transcriptListParser.getIterator():
            writer.addTranscript(self.extendTranscript(type, transcript) if extension else transcript)
            progress.inc()
        writer.write()
        progress.done()
        if type != self.REFERENCE:
            return
        # duplicate each reference table into a working table
        for chromosome in self.getTables(self.REFERENCE):
            workingTable = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1)
            self.getTables(self.WORKING)[chromosome] = workingTable
            workingTable.copy(self.getTables(self.REFERENCE)[chromosome])


    def setInputTranscriptContainer(self, type, inputTranscriptContainer):
        """
        Register the input container of one data set, and record its
        number of transcripts and its number of nucleotides
        @param type:                     whether use query/reference parser
        @type  type:                     int
        @param inputTranscriptContainer: a container
        @type  inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>}
        """
        self.inputTranscriptContainers[type] = inputTranscriptContainer
        self.nbTranscripts[type] = inputTranscriptContainer.getNbTranscripts()
        self.nbNucleotides[type] = inputTranscriptContainer.getNbNucleotides()


    def setOutputWriter(self, writer):
        """
        Register the writer which receives the output transcripts
        @param writer: a writer
        @type  writer: class L{TranscriptListWriter<TranscriptListWriter>}
        """
        self.writer = writer


    def compareTranscript(self, transcript1, transcript2, includeDistance = False):
        """
        Compare two transcripts, using user defined parameters
        The query transcript is copied (and the copy possibly extended by the
        maximum distance); the input transcripts are not modified here
        @param transcript1:     a transcript from the query set (already extended)
        @type  transcript1:     class L{Transcript<Transcript>}
        @param transcript2:     a transcript from the reference set (already extended)
        @type  transcript2:     class L{Transcript<Transcript>}
        @param includeDistance: take into account the distance too
        @type  includeDistance: boolean
        @return:                true, if they overlap
        """
        extendedTranscript1 = Transcript()
        extendedTranscript1.copy(transcript1)
        # a max distance of None is treated as "no distance"
        if includeDistance and self.maxDistance is not None and self.maxDistance > 0:
            extendedTranscript1.extendStart(self.maxDistance)
            extendedTranscript1.extendEnd(self.maxDistance)

        # effective threshold: the largest of the absolute minimum and of the
        # requested percentage of the (non-extended) query size
        minOverlap = self.minOverlap
        if self.pcOverlap is not None:
            minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap)
        # the effective threshold is used here too (formerly self.minOverlap),
        # otherwise the percentage is ignored when introns are accepted
        if not extendedTranscript1.overlapWith(transcript2, minOverlap):
            return False

        # relative position filters (upstream/downstream options)
        if (self.downstreams[self.QUERY]       and transcript2.getStart() > extendedTranscript1.getStart()) or \
             (self.upstreams[self.QUERY]       and transcript2.getEnd()   < extendedTranscript1.getEnd())   or \
             (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \
             (self.upstreams[self.REFERENCE]   and extendedTranscript1.getEnd()   < transcript2.getEnd()):
            return False

        # strand filters
        sameDirection = (extendedTranscript1.getDirection() == transcript2.getDirection())
        if (self.antisense and sameDirection) or (self.colinear and not sameDirection):
            return False

        # inclusion filters
        if self.included and not transcript2.include(extendedTranscript1):
            return False
        if self.including and not extendedTranscript1.include(transcript2):
            return False

        # introns accepted on both sides: the overlap found above is enough
        if self.introns[self.REFERENCE] and self.introns[self.QUERY]:
            if self.logHandle is not None:
                self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2)))
            return True
        # introns refused on both sides: exons must overlap
        if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap):
            if self.logHandle is not None:
                self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2)))
            return True
        # also reached when the introns options of the two sets differ
        return False


    def compareTranscriptToList(self, transcript1):
        """
        Compare a transcript to the reference list of transcripts
        (Do not extend the transcripts, except for the distance)
        This method is a generator
        @param transcript1: a transcript (from the query set)
        @type  transcript1: class L{Transcript<Transcript>}
        @return:            the reference transcripts overlapping (nothing is yielded
                            if there is no working table for the chromosome)
        """
        chromosome = transcript1.getChromosome()

        # no transcript in the reference table
        if chromosome not in self.getTables(self.WORKING):
            return

        # extend a copy of the query transcript by the maximum distance
        extendedTranscript1 = Transcript()
        extendedTranscript1.copy(transcript1)
        if self.maxDistance > 0:
            extendedTranscript1.extendStart(self.maxDistance)
            extendedTranscript1.extendEnd(self.maxDistance)

        # retrieve the transcripts that may overlap in the working tables
        binClauses = []
        for binPair in extendedTranscript1.getBins():
            if binPair[0] == binPair[1]:
                binClauses.append("bin = %i" % (binPair[0]))
            else:
                binClauses.append("bin BETWEEN %i AND %i" % (binPair[0], binPair[1]))
        workingTableName = self.getTables(self.WORKING)[chromosome].getName()
        command = "SELECT * FROM %s WHERE (%s) AND start <= %d AND end >= %d" % (workingTableName, " OR ".join(binClauses), extendedTranscript1.getEnd(), extendedTranscript1.getStart())

        # NOTE(review): the query names the working table but is run through
        # the reference table object -- confirm this is intended
        referenceTable = self.getTables(self.REFERENCE)[chromosome]
        for index2, transcript2 in referenceTable.selectTranscripts(command):
            if self.compareTranscript(extendedTranscript1, transcript2):
                yield transcript2


    def compareTranscriptList(self):
        """
        Compare a list of transcript to the reference one
        @return: the transcripts that overlap with the reference set
        """
        distance      = 0
        nbClustersIn  = 0
        nbClustersOut = 0
        if self.maxDistance != None:
            distance = self.maxDistance

        self.addIndexes([self.QUERY, self.REFERENCE])

        # export the container into tables
        self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
        self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)

        # looping
        for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
            # get range of transcripts
            command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
            query     = self.mySqlConnection.executeQuery(command)
            result    = query.getLine()
            first     = result[0]
            last      = result[1]
            nb        = result[2]

            transcripts1 = []
            toBeRemoved1 = []
            transcripts2 = []
            toBeRemoved2 = []
            overlapsWith = []
            nbOverlaps   = []
            nbChunks     = max(1, nb / 100)
            chunkSize    = (last - first) / nbChunks
            progress     = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
            for chunk in range(nbChunks + 1):

                # load transcripts
                start   = first + chunk * chunkSize
                end     = start + chunkSize - 1
                command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
                for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
                    transcripts1.append(transcript1)
                    overlapsWith.append([])
                    nbOverlaps.append(0)
                    nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
                command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
                self.mySqlConnection.executeQuery(command)
                if chromosome1 in self.getTables(self.REFERENCE):
                    command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
                    if chunk == 0:
                        command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
                    for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
                        transcripts2.append(transcript2)
                    command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
                    self.mySqlConnection.executeQuery(command)

                # compare sets
                for index1, transcript1 in enumerate(transcripts1):
                    overlappingNames = []
                    nbElements1      = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
                    for transcript2 in transcripts2:
                        if self.compareTranscript(transcript1, transcript2, True):
                            id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
                            if id2 not in overlapsWith[index1]:
                                overlapsWith[index1].append(id2)
                                nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements")
                                if self.odds:
                                    if transcript2.getName() not in self.overlapResults:
                                        self.overlapResults[transcript2.getName()] = 0
                                    self.overlapResults[transcript2.getName()] += nbElements1

                    # check if query transcript extends bounds of the chunk
                    if transcript1.getEnd() < end:
                        if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
                            if overlapsWith[index1]:
                                transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
                                transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
                            elif self.notOverlapping:
                                transcript1.setTagValue("nbOverlaps", "0")
                            self.writeTranscript(transcript1)
                            nbClustersOut += nbElements1
                        toBeRemoved1.append(index1)

                # update list of query transcripts
                for index1 in reversed(toBeRemoved1):
                    del transcripts1[index1]
                    del overlapsWith[index1]
                    del nbOverlaps[index1]
                toBeRemoved1 = []

                # check if the reference transcripts extends bounds of the chunk
                for index2, transcript2 in enumerate(transcripts2):
                    if transcript2.getEnd() + distance < end:
                        toBeRemoved2.append(index2)
                for index2 in reversed(toBeRemoved2):
                    del transcripts2[index2]
                toBeRemoved2 = []

                progress.inc()

            for index1, transcript1 in enumerate(transcripts1):
                if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
                    if overlapsWith[index1]:
                        transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
                        transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
                    elif self.notOverlapping:
                        transcript1.setTagValue("nbOverlaps", "0")
                    self.writeTranscript(transcript1)
                    nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
            progress.done()
            self.getTables(self.QUERY)[chromosome1].remove()
            if chromosome1 in self.getTables(self.REFERENCE):
                self.getTables(self.REFERENCE)[chromosome1].remove()
                self.getTables(self.WORKING)[chromosome1].remove()

        self.flushData()
        if self.writer != None:
            self.writer.close()
            self.writer = None

        if self.verbosity > 0:
            print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])
            print "query:     %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn)
            if self.nbTranscripts[self.QUERY] != 0:
                print "output:    %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100),
                if nbClustersOut != 0:
                    print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100)


    def compareTranscriptListDistance(self):
        """
        Compute the distance between each query transcript and its closest reference transcript
        Every query transcript is written with two extra tags, "distance" and
        "closestElement" (both set to "None" when no suitable reference
        transcript is found).
        @return: the distance distribution in a hash (distance -> count) or,
                 if self.strandedDistance is set, one such hash per relative
                 strand (1 and -1)
        """
        nbDistances       = 0
        distances         = {}
        absDistances      = {}
        strandedDistances = {1: {}, -1: {}}

        # export the container into tables
        self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
        self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)

        progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1)
        for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator():
            # get the distance
            transcript1    = self.extendTranscript(self.QUERY, transcript1)
            # maxDistance + 1 is the sentinel meaning "nothing found yet"
            distance       = self.maxDistance + 1
            strand         = None
            closestElement = "None"
            for transcript2 in self.compareTranscriptToList(transcript1):
                # the relative strand is computed before transcript2 is possibly reversed
                thisStrand = transcript1.getDirection() * transcript2.getDirection()
                if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()):
                    transcript2.reverse()
                if self.absolute:
                    transcript2.setDirection(transcript1.getDirection())
                if transcript2.getDirection() == transcript1.getDirection():
                    if self.starts[self.REFERENCE] is not None:
                        transcript2.restrictStart(self.starts[self.REFERENCE])
                    if self.ends[self.REFERENCE] is not None:
                        transcript2.restrictEnd(self.ends[self.REFERENCE])
                    thisDistance = transcript1.getRelativeDistance(transcript2)
                    if self.absolute:
                        thisDistance = abs(thisDistance)
                    if abs(thisDistance) < abs(distance):
                        distance       = thisDistance
                        strand         = thisStrand
                        closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
            if (distance <= self.maxDistance) and (self.minDistance is None or distance >= self.minDistance):
                nbDistances                        += 1
                distances[distance]                 = distances.get(distance, 0) + 1
                absDistance                         = abs(distance)
                absDistances[absDistance]           = absDistances.get(absDistance, 0) + 1
                # count the occurrence on the observed strand (the increment was missing)...
                strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0) + 1
                # ... and make sure that the other strand has an entry for this distance too
                if distance not in strandedDistances[-strand]:
                    strandedDistances[-strand][distance] = 0

            # write transcript
            if distance == self.maxDistance + 1:
                distance = "None"
            tmpTranscript = Transcript()
            tmpTranscript.copy(transcript1)
            tmpTranscript.setTagValue("distance", distance)
            tmpTranscript.setTagValue("closestElement", closestElement)
            self.writeTranscript(tmpTranscript)

            progress.inc()
        progress.done()

        self.flushData()

        if self.verbosity > 0:
            print("reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]))
            print("query:     %d sequences" % (self.nbTranscripts[self.QUERY]))
            if nbDistances == 0:
                print("Nothing matches")
            else:
                print("min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances))
                print("for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100))

        if self.strandedDistance:
            return strandedDistances
        return distances


    def compareTranscriptListMerge(self):
        """
        Merge the query list of transcripts with the reference list
        The two lists are stored into tables and read chromosome by chromosome,
        chunk by chunk.  A reference transcript which matches a query
        transcript (see compareTranscript) is merged into it.  If self.multiple
        is set, the query transcripts are also merged together, and every
        remaining transcript is written; otherwise, only the query transcripts
        which have been merged are written.
        Nothing is returned: the output goes through self.writeTranscript.
        """
        nbMerges = 0
        distance = 0 if self.maxDistance is None else self.maxDistance
        # The reference transcripts are considered up to 'distance' further than the query ones
        corrections = {self.QUERY: 0, self.REFERENCE: distance}

        for listType in (self.QUERY, self.REFERENCE):
            self.storeTranscriptList(listType, self.inputTranscriptContainers[listType], True)
        self.flushData()

        # Loop on the chromosomes
        for chromosome in sorted(self.getTables(self.QUERY).keys()):
            if chromosome not in self.getTables(self.REFERENCE):
                continue

            # Get the size of the chromosome
            maxEnd   = 0
            nbChunks = 0
            for listType in (self.QUERY, self.REFERENCE):
                command  = "SELECT MAX(end) from %s" % (self.getTables(listType)[chromosome].getName())
                query    = self.mySqlConnection.executeQuery(command)
                maxEnd   = max(maxEnd, int(query.getLine()[0]))
                nbChunks = max(nbChunks, self.getTables(listType)[chromosome].getNbElements())

            # Indices (in transcripts[self.QUERY]) of the query transcripts which have been merged
            mergedTranscripts = set()
            transcripts = {self.QUERY: [], self.REFERENCE: []}
            progress    = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1)
            for chunk in range(nbChunks):
                rangeEnd = int((chunk+1) * (float(maxEnd) / nbChunks))

                # Get all transcripts in query and reference from chunk
                for listType in (self.QUERY, self.REFERENCE):
                    command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(listType)[chromosome].getName(), rangeEnd + corrections[listType])
                    for index, transcript in self.getTables(listType)[chromosome].selectTranscripts(command):
                        transcripts[listType].append(transcript)

                # Merge elements between the two samples
                for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]):
                    for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]):
                        if referenceTranscript is None: continue
                        if self.compareTranscript(queryTranscript, referenceTranscript, True):
                            if queryTranscript.getDirection() != referenceTranscript.getDirection():
                                referenceTranscript.setDirection(queryTranscript.getDirection())
                            queryTranscript.merge(referenceTranscript, self.normalization)
                            nbMerges += 1
                            # A merged reference transcript is consumed
                            transcripts[self.REFERENCE][iReference] = None
                            if not self.multiple:
                                mergedTranscripts.add(iQuery)

                # Remove from the database exactly what has just been loaded
                # (same bound as the SELECT, otherwise rows would be loaded twice)
                for listType in (self.QUERY, self.REFERENCE):
                    command = "DELETE FROM %s WHERE start <= %d" % (self.getTables(listType)[chromosome].getName(), rangeEnd + corrections[listType])
                    self.mySqlConnection.executeQuery(command)

                # Just in case, self-merge the elements in the query
                # (mergedTranscripts is only used when self.multiple is not set: nothing to update here)
                if self.multiple:
                    for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]):
                        if queryTranscript1 is None: continue
                        for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]):
                            if iQuery2 <= iQuery1 or queryTranscript2 is None: continue
                            minOverlap = self.minOverlap
                            if self.pcOverlap is not None:
                                minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap)
                            if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear):
                                if queryTranscript1.getDirection() != queryTranscript2.getDirection():
                                    queryTranscript2.setDirection(queryTranscript1.getDirection())
                                queryTranscript1.merge(queryTranscript2, self.normalization)
                                transcripts[self.QUERY][iQuery2] = None
                                nbMerges += 1

                # Update the sets of transcripts and write into database (also update mergedTranscripts)
                newTranscripts = {self.QUERY: [], self.REFERENCE: []}
                newMergedTranscripts = set()
                for listType in (self.QUERY, self.REFERENCE):
                    for index, transcript in enumerate(transcripts[listType]):
                        if transcript is None: continue
                        if transcript.getEnd() < rangeEnd - corrections[listType]:
                            # Too far from the next chunks to be merged again: flush it
                            if self.multiple or ((listType == self.QUERY) and (index in mergedTranscripts)):
                                self.writeTranscript(transcript)
                        else:
                            # Keep it for the next chunk, and remember its new index if it has been merged
                            if listType == self.QUERY and index in mergedTranscripts:
                                newMergedTranscripts.add(len(newTranscripts[listType]))
                            newTranscripts[listType].append(transcript)
                transcripts = newTranscripts
                mergedTranscripts = newMergedTranscripts

                progress.inc()
            progress.done()

            # Write what is still in memory at the end of the chromosome
            for listType in (self.QUERY, self.REFERENCE):
                for index, transcript in enumerate(transcripts[listType]):
                    if transcript is None: continue
                    if self.multiple or ((listType == self.QUERY) and (index in mergedTranscripts)):
                        self.writeTranscript(transcript)

        # Manage chromosomes with no corresponding data
        # NOTE(review): the transcripts are read from the OUTPUT tables, not from
        # the tables of the list where the chromosome was found -- confirm that this is intended
        if self.multiple:
            for listType in self.INPUTTYPES:
                for chromosome in self.getTables(listType):
                    if chromosome in self.getTables(1 - listType):
                        continue
                    for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator():
                        self.writeTranscript(transcript)

        self.flushData()
        if self.writer is not None:
            self.writer.close()
            self.writer = None

        if self.verbosity > 0:
            print("query:    %d sequences" % (self.nbTranscripts[self.QUERY]))
            print("# merges: %d" % (nbMerges))
            # No percentage can be computed on an empty query list
            if self.nbTranscripts[self.QUERY] != 0:
                print("# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100))


    def compareTranscriptListSelfMerge(self):
        """
        Merge the query list of transcripts with itself
        The query transcripts are read chromosome by chromosome, ordered by
        start position.  Each transcript absorbs the transcripts read before it
        which match it (see compareTranscript); a transcript is written as soon
        as it ends more than self.maxDistance before the start of the current one.
        Nothing is returned: the output goes through self.writeTranscript.
        """
        nbMerges = 0
        distance = 0 if self.maxDistance is None else self.maxDistance

        self.addIndexes([self.QUERY])
        self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
        self.flushData()

        # looping
        for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
            table = self.getTables(self.QUERY)[chromosome1]
            # transcripts already read, which may still be merged with the next ones
            transcripts2 = []

            # get range of transcripts
            progress = Progress(table.getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
            command  = "SELECT * FROM %s ORDER BY start" % (table.getName())
            for index1, transcript1 in table.selectTranscripts(command):

                # compare sets
                toBeRemoved = set()
                toBePrinted = set()
                for index2, transcript2 in enumerate(transcripts2):
                    if self.compareTranscript(transcript1, transcript2, True):
                        if transcript1.getDirection() != transcript2.getDirection():
                            transcript2.setDirection(transcript1.getDirection())
                        # transcript2 is absorbed by transcript1
                        transcript1.merge(transcript2, self.normalization)
                        toBeRemoved.add(index2)
                        nbMerges += 1
                    elif transcript2.getEnd() + distance < transcript1.getStart():
                        # the input is sorted by start: transcript2 is too far behind to be merged later
                        toBePrinted.add(index2)

                for index2 in sorted(toBePrinted):
                    self.writeTranscript(transcripts2[index2])
                discarded    = toBeRemoved | toBePrinted
                transcripts2 = [kept for position, kept in enumerate(transcripts2) if position not in discarded]
                transcripts2.append(transcript1)

                # the progress bar was created but never updated
                progress.inc()

            for transcript2 in transcripts2:
                self.writeTranscript(transcript2)
            progress.done()
            table.remove()

        self.flushData()
        if self.writer is not None:
            self.writer.close()
            self.writer = None

        if self.verbosity > 0:
            print("query:    %d sequences" % (self.nbTranscripts[self.QUERY]))
            print("# merges: %d" % (nbMerges))
            # no percentage can be computed on an empty query list
            if self.nbTranscripts[self.QUERY] != 0:
                print("# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100))


    def getDifferenceTranscriptList(self):
        """
        Get the parts of the query transcripts which do not overlap the reference list (at the nucleotide level)
        The two lists are stored into tables and read chromosome by chromosome,
        chunk by chunk.  The reference transcripts are successively subtracted
        from each query transcript (Transcript.getDifference); what remains is
        written, one transcript per exon if self.splitDifference is set.
        Nothing is returned: the output goes through self.writeTranscript.
        """
        distance = 0 if self.maxDistance is None else self.maxDistance

        self.addIndexes([self.QUERY, self.REFERENCE])

        # export the container into tables
        self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
        self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)

        def writeDifference(difference):
            # write what remains of a query transcript, exon by exon if requested
            if self.splitDifference:
                for exon in difference.getExons():
                    transcript = Transcript()
                    transcript.copy(exon)
                    self.writeTranscript(transcript)
            else:
                self.writeTranscript(difference)

        # looping
        for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
            queryTable   = self.getTables(self.QUERY)[chromosome1]
            hasReference = chromosome1 in self.getTables(self.REFERENCE)

            # get range of transcripts
            command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (queryTable.getName())
            query   = self.mySqlConnection.executeQuery(command)
            result  = query.getLine()
            first   = result[0]
            last    = result[1]
            nb      = result[2]

            transcripts1 = []
            transcripts2 = []
            nbChunks     = max(1, nb // 100)
            chunkSize    = (last - first) // nbChunks
            progress     = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
            for chunk in range(nbChunks + 1):

                # load transcripts: the chunk spans from 'start' to 'end', both included
                start = first + chunk * chunkSize
                end   = start + chunkSize - 1
                if chunk == nbChunks:
                    # the integer division may leave a remainder: the last chunk goes up to the end
                    end = max(end, last)
                # BETWEEN includes both bounds: using 'end-1' here skipped the
                # transcripts starting exactly on the last position of the chunk
                command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (queryTable.getName(), start, end)
                for index1, transcript1 in queryTable.selectTranscripts(command):
                    transcripts1.append(transcript1)
                command = "DELETE FROM %s WHERE start <= %d" % (queryTable.getName(), end)
                self.mySqlConnection.executeQuery(command)
                if hasReference:
                    referenceTable = self.getTables(self.REFERENCE)[chromosome1]
                    command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (referenceTable.getName(), start-distance, end+distance)
                    if chunk == 0:
                        # the references located before the first query transcript are loaded too
                        command = "SELECT * FROM %s WHERE start <= %d" % (referenceTable.getName(), end + distance)
                    for index2, transcript2 in referenceTable.selectTranscripts(command):
                        transcripts2.append(transcript2)
                    command = "DELETE FROM %s WHERE start <= %d" % (referenceTable.getName(), end + distance)
                    self.mySqlConnection.executeQuery(command)

                # compare sets
                toBeRemoved1 = []
                for index1, transcript1 in enumerate(transcripts1):
                    newTranscript1 = Transcript()
                    newTranscript1.copy(transcript1)
                    for transcript2 in transcripts2:
                        newTranscript1 = newTranscript1.getDifference(transcript2)
                        if newTranscript1 is None:
                            # nothing remains of the query transcript
                            toBeRemoved1.append(index1)
                            break
                    transcripts1[index1] = newTranscript1

                    # check if query transcript extends bounds of the chunk
                    if newTranscript1 is not None and newTranscript1.getEnd() < end:
                        writeDifference(newTranscript1)
                        toBeRemoved1.append(index1)

                # update list of query transcripts
                for index1 in reversed(toBeRemoved1):
                    del transcripts1[index1]

                # check if the reference transcripts extends bounds of the chunk
                toBeRemoved2 = []
                for index2, transcript2 in enumerate(transcripts2):
                    if transcript2.getEnd() + distance < end:
                        toBeRemoved2.append(index2)
                for index2 in reversed(toBeRemoved2):
                    del transcripts2[index2]

                progress.inc()

            # write the query transcripts which are still in memory
            for transcript1 in transcripts1:
                writeDifference(transcript1)
            progress.done()
            queryTable.remove()
            if hasReference:
                self.getTables(self.REFERENCE)[chromosome1].remove()
                self.getTables(self.WORKING)[chromosome1].remove()

        self.flushData()
        if self.writer is not None:
            self.writer.close()
            self.writer = None

        if self.verbosity > 0:
            print("query:     %d elements" % (self.nbTranscripts[self.QUERY]))
            print("reference: %d elements" % (self.nbTranscripts[self.REFERENCE]))
            # no percentage can be computed on an empty query list
            if self.nbTranscripts[self.QUERY] != 0:
                print("# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100))


    def getOddsPerTranscript(self):
        """
        Give the overlap results collected while self.odds was set
        @return: the self.overlapResults hash
        @raise Exception: if self.odds is not set
        """
        if self.odds:
            return self.overlapResults
        raise Exception("Did not compute odds!")


    def getOdds(self):
        """
        Give the distribution of the values stored in self.overlapResults
        The distribution is computed on the first call, kept in
        self.oddResults, and given back as is on the next calls.
        @return: a hash: value found in self.overlapResults -> number of times it is found
        @raise Exception: if self.odds is not set
        """
        if not self.odds:
            raise Exception("Did not compute odds!")
        if self.oddResults is None:
            histogram = self.oddResults = {}
            for value in self.overlapResults.values():
                histogram[value] = histogram.get(value, 0) + 1
        return self.oddResults
author	m-zytnicki
date	Wed, 18 Nov 2015 10:59:50 -0500
parents	769e306b7933
children