# --- hgweb viewer residue (repository page metadata, not Python source) ---
# view commons/tools/PostAnalyzeTELib.py @ 31:0ab839023fe4
# Uploaded
# author m-zytnicki
# date Tue, 30 Apr 2013 14:33:21 -0400
# parents 94ab73e8a190
# children
# line wrap: on
# line source

#!/usr/bin/env python

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.utils.FileUtils import FileUtils
from commons.core.stat.Stat import Stat
from commons.core.seq.BioseqDB import BioseqDB
from commons.launcher.LaunchBlastclust import LaunchBlastclust
from commons.tools.AnnotationStats import AnnotationStats
import os

# Analysis-name labels (per-TE vs per-cluster).
# NOTE(review): unused in the visible code below — run() passes the
# literals "TE" and "Cluster" directly to AnnotationStats.
CONSENSUS = "TE"
CLUSTER = "Cluster"
# Logger namespace prefix and message format for LoggerFactory.
LOG_DEPTH = "repet.tools"
LOG_FORMAT = "%(message)s"

class PostAnalyzeTELib(object):
    """Post-analyze a TE (transposable element) library.

    Four analyses, selected by the 'analysis' number:
      1: clusterize the library fasta with blastclust, then compute
         per-cluster and global sequence-length statistics
      2: per-cluster classification statistics, parsed from consensus names
         in an existing cluster file
      3: annotation statistics per TE, delegated to AnnotationStats
         (reads the _path and _seq database tables)
      4: annotation statistics per cluster, delegated to AnnotationStats
    """
    
    def __init__(self, analysis = 1, fastaFileName = "", clusterFileName = "", pathTableName="", seqTableName="", genomeSize=0, configFileName = "", doClean = False, verbosity = 3):
        """Constructor.

        analysis: analysis number (1-4, see class docstring)
        fastaFileName: TE library fasta file (analysis 1)
        clusterFileName: cluster file, one cluster per line (analyses 2 and 4);
                         defaults to '<fasta basename>.tab' when empty
        pathTableName: name of the _path table with annotated TE copies
                       (analyses 3 and 4)
        seqTableName: name of the _seq table with TE reference sequences
                      (analyses 3 and 4)
        genomeSize: genome size (analyses 3 and 4)
        configFileName: accepted but never stored or used  # NOTE(review)
        doClean: clean temporary files
        verbosity: logging verbosity level
        """
        self._analysis = analysis
        self._fastaFileName = fastaFileName
        self._pathTableName = pathTableName
        self._seqTableName = seqTableName
        self._genomeSize = genomeSize
        # The BioseqDB wrapper around the fasta is only needed for analysis 1.
        if self._analysis == 1:
            self.setBioseqDB()
        # blastclust thresholds: % identity (0 => none) and % length coverage.
        self._identity = 0
        self._coverage = 80
        self._applyCovThresholdOnBothSeq = False
        # Derive all output file names from the cluster file / path table names.
        self.setClusterFileName(clusterFileName)
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self._doClean = doClean
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity, LOG_FORMAT)
        
    def setAttributesFromCmdLine(self):
        """Parse the command line and set the instance attributes accordingly."""
        description = "Tool to post-analyze a TE library : clusterize, give stats on cluster, on annotation,...\n"
        epilog = "\nExample 1: clustering (e.g. to detect redundancy)\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 1 -i TElib.fa -L 98 -S 95 -b\n"
        epilog += "Example 2: classification stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 2 -t TElib.tab\n"
        epilog += "Example 3: annotation stats per consensus\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 3 -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        epilog += "Example 4: annotation stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 4 -t TElib.tab -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        parser = RepetOptionParser(description = description, epilog = epilog)
        parser.add_option("-a", "--analysis",   dest = "analysis",          action = "store",       type = "int",    help = "analysis number [default: 1, 2, 3, 4]", default = 1)
        parser.add_option("-i", "--fasta",      dest = "fastaFileName",     action = "store",       type = "string", help = "fasta file name [for analysis 1] [format: fasta]", default = "")
        parser.add_option("-L", "--coverage",   dest = "coverage",          action = "store",       type = "int",    help = "length coverage threshold [for analysis 1] [format: %] [default: 80]", default = 80)
        parser.add_option("-S", "--identity",   dest = "identity",          action = "store",       type = "int",    help = "identity threshold [for analysis 1 with BlastClust] [format: %] [default: 0 => no threshold]", default = 0)
        parser.add_option("-b", "--both",       dest = "bothSeq",           action = "store_true",                   help = "require coverage on both neighbours [for analysis 1] [default: False]", default = False)
        parser.add_option("-t", "--cluster",    dest = "clusterFileName",   action = "store",       type = "string", help = "cluster file name [for analysis 2 and 4] [default: <input>.tab]", default = "")
        parser.add_option("-p", "--path",       dest = "pathTableName",     action = "store",       type = "string", help = "name of the table (_path) with the annotated TE copies [for analysis 3 and 4]", default = "")
        parser.add_option("-s", "--seq",        dest = "seqTableName",      action = "store",       type = "string", help = "name of the table (_seq) with the TE reference sequences [for analysis 3 and 4]", default = "")
        parser.add_option("-g", "--genome",     dest = "genomeSize",        action = "store",       type = "int",    help = "genome size [for analysis 3 and 4]", default = None)
        parser.add_option("-c", "--clean",      dest = "doClean",           action = "store_true",                   help = "clean temporary files [optional] [default: False]", default = False)
        parser.add_option("-v", "--verbosity",  dest = "verbosity",         action = "store",       type = "int",    help = "verbosity [optional] [default: 1]", default = 1)
        # parse_args() returns (options, args); only the options are used here.
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)
        
    def _setAttributesFromOptions(self, options):
        """Copy parsed option values into the instance attributes."""
        self.setAnalysis(options.analysis)
        # Fasta-related attributes are only relevant for the clustering analysis.
        if self._analysis == 1:
            self.setFastaFileName(options.fastaFileName)
            self.setBioseqDB()
            self.setIdentity(options.identity)
            self.setCoverage(options.coverage)
            self.setApplyCovThresholdOnBothSeq(options.bothSeq)
        self.setClusterFileName(options.clusterFileName)
        self.setPathTableName(options.pathTableName)
        self.setSeqTableName(options.seqTableName)
        # Output file names depend on the cluster file / path table names set above.
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self.setGenomeSize(options.genomeSize)
        self.setDoClean(options.doClean)
        self.setVerbosity(options.verbosity)

    def setAnalysis(self, analysis):
        self._analysis = analysis
        
    def setFastaFileName(self, fastaFileName):
        self._fastaFileName = fastaFileName
    
    def setBioseqDB(self):
        """Wrap the current fasta file in a BioseqDB (used for length stats)."""
        self._iBioseqDB = BioseqDB(name = self._fastaFileName)   
        
    def setIdentity(self, identity):
        self._identity = identity
        
    def setCoverage(self, coverage):
        self._coverage = coverage
        
    def setApplyCovThresholdOnBothSeq(self, bothSeq):
        self._applyCovThresholdOnBothSeq = bothSeq
    
    def setClusterFileName(self, clusterFileName):
        """Set the cluster file name, defaulting to '<fasta basename>.tab'."""
        if clusterFileName == "":
            self._clusterFileName = "%s.tab" % os.path.splitext(os.path.basename(self._fastaFileName))[0]
        else:
            self._clusterFileName = clusterFileName
    
    def setStatPerClusterFileName(self):
        """Derive the per-cluster and global stats file names (analysis 1)."""
        self._statPerClusterFileName = "%s.statsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]
        self._globalStatPerClusterFileName = "%s.globalStatsPerCluster.txt" % os.path.splitext(self._clusterFileName)[0]
        
    def setClassifStatPerClusterFileName(self):
        """Derive the classification-stats file name (analysis 2)."""
        self._classifStatPerClusterFileName = "%s.classifStatsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]
        
    def setAnnotationStatsPerTEFileName(self):
        """Derive the per-TE annotation-stats file names (analysis 3)."""
        self._annotStatsPerTEFileName = "%s.annotStatsPerTE.tab" % self._pathTableName
        self._globalAnnotStatsPerTEFileName = "%s.globalAnnotStatsPerTE.txt" % self._pathTableName
        
    def setAnnotationStatsPerClusterFileName(self):
        """Derive the per-cluster annotation-stats file name (analysis 4)."""
        self._annotStatsPerClusterFileName = "%s.annotStatsPerCluster.tab" % self._pathTableName
        
    def setPathTableName(self, pathTableName):
        self._pathTableName = pathTableName
    
    def setSeqTableName(self, seqTableName):
        self._seqTableName = seqTableName
        
    def setGenomeSize(self, genomeSize):
        self._genomeSize = genomeSize
                 
    def setDoClean(self, doClean):
        self._doClean = doClean
        
    def setVerbosity(self, verbosity):
        self._verbosity = verbosity
        
    def _checkOptions(self):
        """Raise (via _logAndRaise) if analysis 1 lacks a readable fasta file."""
        if (self._fastaFileName == "" or not FileUtils.isRessourceExists(self._fastaFileName)) and self._analysis == 1:
            self._logAndRaise("ERROR: Missing fasta file"  )
            
    def _logAndRaise(self, errorMsg):
        """Log the error message, then raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)
                    
    def run(self):
        """Run the selected analysis (see class docstring for the numbering)."""
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._checkOptions()
        self._log.info("START PostAnalyzeTELib analysis %d" % self._analysis)
        if self._analysis == 1:
            # Clusterize the TE library with blastclust, then compute stats.
            #TODO: option to choose clustering program (blastclust, MCL, uclust,...)
            self._log.debug("TE lib: %s" % self._fastaFileName)
            self._log.info("Launch blastclust...")
            iLaunchBlastclust = LaunchBlastclust(clean = self._doClean)
            iLaunchBlastclust.setTmpFileName(self._clusterFileName)
            iLaunchBlastclust.setIdentityThreshold(self._identity)
            # blastclust expects the coverage threshold as a fraction, not a %.
            iLaunchBlastclust.setCoverageThreshold(self._coverage/100.0)
            if self._applyCovThresholdOnBothSeq:
                iLaunchBlastclust.setBothSequences("T")
            else:
                iLaunchBlastclust.setBothSequences("F")
            iLaunchBlastclust.setVerbosityLevel(self._verbosity - 3)
            iLaunchBlastclust.launchBlastclust(self._fastaFileName)
            self._log.info("Compute stats...")
            self._giveStatsOnTEClusters()
        if self._analysis == 2:
            # Classification stats parsed from consensus names in the cluster file.
            #TODO add global stats (e.g.: nb of cluster without PotentialChimeric, nb of clusters with LTR...) ?
            self._log.debug("Consensus cluster file name: %s" % self._clusterFileName)
            self._log.info("Compute classification stats...")
            self._giveStatsOnConsensusClusters()
        if self._analysis == 3:
            # Per-TE annotation stats, computed from the _path/_seq DB tables.
            #TODO: in option, save stats in DB
            self._log.info("Compute annotation stats per TE...")
            iAnnotationStats = AnnotationStats(analysisName="TE", seqTableName=self._seqTableName, pathTableName=self._pathTableName,\
                                            genomeLength=self._genomeSize, statsFileName=self._annotStatsPerTEFileName, globalStatsFileName=self._globalAnnotStatsPerTEFileName, verbosity=self._verbosity-1)
            iAnnotationStats.run()
        if self._analysis == 4:
            # Per-cluster annotation stats; needs the cluster file as well.
            self._log.info("Compute annotation stats per cluster...")
            iAnnotationStats = AnnotationStats(analysisName="Cluster", clusterFileName=self._clusterFileName, seqTableName=self._seqTableName, pathTableName=self._pathTableName,\
                                            genomeLength=self._genomeSize, statsFileName=self._annotStatsPerClusterFileName, verbosity=self._verbosity-1)
            iAnnotationStats.run()
        self._log.info("END PostAnalyzeTELib analysis %d" % self._analysis)

    def _giveStatsOnConsensusClusters(self):
        """Write per-cluster classification stats (analysis 2).

        Each line of the cluster file lists the consensus names of one
        cluster. The part of each name before the first '_' is taken as the
        classification string; '-incomp', '-comp', '-chim' and 'noCat'
        markers are counted and stripped, and whatever remains after the
        last '-' is tallied as a classification label.
        NOTE(review): this assumes a specific consensus naming convention
        (classification encoded before the first '_') — confirm with the
        producer of the library.
        """
        with open(self._classifStatPerClusterFileName, "w") as f:
            f.write("cluster\tnoCat\tPotentialChimeric\tcomp\tincomp\tclassifs (nbTEs)\n")
            with open(self._clusterFileName) as fCluster:
                for clusterId, line in enumerate(fCluster):
                    dClassifNb = {}
                    nbIncomp =0
                    nbComp=0
                    nbChim=0
                    nbNoCat=0
                    lConsensus = line.split()
                    for consensus in lConsensus:
                        # Classification infos are encoded before the first '_'.
                        classifInfos = consensus.split("_")[0]
                        
                        # Strip the completeness/chimera/noCat markers, counting each.
                        # '-incomp' is tested before '-comp' so the latter never
                        # matches inside the former.
                        if "-incomp" in classifInfos:
                            nbIncomp += 1
                            classifInfos = classifInfos.replace("-incomp","")
                        if "-comp" in classifInfos:
                            nbComp += 1
                            classifInfos = classifInfos.replace("-comp","")
                        if "-chim" in classifInfos:
                            nbChim += 1
                            classifInfos = classifInfos.replace("-chim","")
                        if "noCat" in classifInfos:
                            nbNoCat += 1
                            classifInfos = classifInfos.replace("noCat","")
                            
                        # The remaining label after the last '-' is the classification.
                        classif = classifInfos.split("-")[-1]
                        if classif != "":
                            if dClassifNb.get(classif, None) is None:
                                dClassifNb[classif] = 0
                            dClassifNb[classif] +=1
                            
                    # Render each label with its occurrence count, e.g. "LTR (3)".
                    occurences= []
                    for classif, occs in dClassifNb.items():
                        occurences.append("%s (%d)" % (classif, occs))
                    
                    # Column order matches the header written above.
                    f.write("%d\t%d\t%d\t%d\t%d\t%s\n" % (clusterId+1, nbNoCat, nbChim\
                                        , nbComp, nbIncomp,"\t".join(occurences)))
                    
    def _giveStatsOnTEClusters(self):
        """Write per-cluster and global sequence stats (analysis 1).

        Reads the cluster file (one cluster per line, whitespace-separated
        sequence names) and writes:
          - <cluster>.statsPerCluster.tab: one line per cluster with
            sequence count and min/max/mean/median sequence lengths
            (lengths looked up in the fasta through the BioseqDB);
          - <cluster>.globalStatsPerCluster.txt: library-wide summary.
        """
        with open(self._clusterFileName) as fCluster:
            with open(self._statPerClusterFileName, 'w') as fStatPerCluster:
                fStatPerCluster.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n")
                # Read the first cluster ahead so minClusterSize starts from a
                # real cluster size instead of an arbitrary sentinel.
                line = fCluster.readline()
                clusterNb = 0
                clusterSeqList= line.split()
                minClusterSize = len(clusterSeqList)
                maxClusterSize = 0
                totalSeqNb = 0
                seqNbInBigClusters = 0
                # Histogram of cluster sizes; key 3 aggregates all sizes > 2.
                dClusterSize2ClusterNb = {1:0, 2:0, 3:0}
                while line:
                    clusterSeqList= line.split()
                    seqNb = len(clusterSeqList)
                    totalSeqNb += seqNb
                    if seqNb > 2:
                        seqNbInBigClusters += seqNb
                        dClusterSize2ClusterNb[3] += 1
                    else:
                        dClusterSize2ClusterNb[seqNb] += 1
                    if seqNb > maxClusterSize:
                        maxClusterSize = seqNb
                    if seqNb < minClusterSize:
                        minClusterSize = seqNb
                    line = fCluster.readline()
                    clusterNb += 1
                    clusterSeqLengths = self._iBioseqDB.getSeqLengthByListOfName(clusterSeqList)
                    iStatSeqLengths = Stat(clusterSeqLengths)
                    fStatPerCluster.write("%d\t%d\t%d\t%d\t%d\t%d\n" %(clusterNb, seqNb, min(clusterSeqLengths), max(clusterSeqLengths),  iStatSeqLengths.mean(),  iStatSeqLengths.median()))
        
        # Library-wide summary, including length stats over the whole fasta.
        with open(self._globalStatPerClusterFileName, 'w') as fG:
            fG.write("nb of clusters: %d\n" % clusterNb)
            fG.write("nb of clusters with 1 sequence: %d\n" % dClusterSize2ClusterNb[1])
            fG.write("nb of clusters with 2 sequences: %d\n" % dClusterSize2ClusterNb[2])
            fG.write("nb of clusters with >2 sequences: %d (%d sequences)\n" % (dClusterSize2ClusterNb[3], seqNbInBigClusters))
            fG.write("nb of sequences: %d\n" % totalSeqNb)
            fG.write("nb of sequences in the largest cluster: %d\n" % maxClusterSize)
            fG.write("nb of sequences in the smallest cluster: %d\n" % minClusterSize)
            lSeqSizes = self._iBioseqDB.getListOfSequencesLength()
            iStat = Stat(lSeqSizes)
            fG.write("size of the smallest sequence: %d\n" % min(lSeqSizes))
            fG.write("size of the largest sequence: %d\n" % max(lSeqSizes))
            fG.write("average sequences size: %d\n" % iStat.mean())
            fG.write("median sequences size: %d\n" % iStat.median())

if __name__ == "__main__":
    # Command-line entry point: build the tool, configure it from argv, run it.
    tool = PostAnalyzeTELib()
    tool.setAttributesFromCmdLine()
    tool.run()