diff commons/tools/PostAnalyzeTELib.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/tools/PostAnalyzeTELib.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,301 @@
+#!/usr/bin/env python
+
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+from commons.core.LoggerFactory import LoggerFactory
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.stat.Stat import Stat
+from commons.core.seq.BioseqDB import BioseqDB
+from commons.launcher.LaunchBlastclust import LaunchBlastclust
+from commons.tools.AnnotationStats import AnnotationStats
+import os
+
+CONSENSUS = "TE"
+CLUSTER = "Cluster"
+LOG_DEPTH = "repet.tools"
+LOG_FORMAT = "%(message)s"
+
+class PostAnalyzeTELib(object):
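+    """Post-analyze a TE library: clusterize the consensus with blastclust (analysis 1),
+    compute classification stats per cluster (analysis 2), annotation stats per TE
+    (analysis 3) or annotation stats per cluster (analysis 4)."""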
+    
+    def __init__(self, analysis=1, fastaFileName="", clusterFileName="", pathTableName="", seqTableName="", genomeSize=0, configFileName="", doClean=False, verbosity=3):
+        self._analysis = analysis
+        self._fastaFileName = fastaFileName
+        self._pathTableName = pathTableName
+        self._seqTableName = seqTableName
+        self._genomeSize = genomeSize
+        if self._analysis == 1:
+            self.setBioseqDB()
+        self._identity = 0
+        self._coverage = 80
+        self._applyCovThresholdOnBothSeq = False
+        self.setClusterFileName(clusterFileName)
+        self.setStatPerClusterFileName()
+        self.setClassifStatPerClusterFileName()
+        self.setAnnotationStatsPerTEFileName()
+        self.setAnnotationStatsPerClusterFileName()
+        self._doClean = doClean
+        self._verbosity = verbosity
+        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity, LOG_FORMAT)
+        
+    def setAttributesFromCmdLine(self):
+        description = "Tool to post-analyze a TE library: clusterize the consensus, compute stats per cluster and stats on the annotations.\n"
+        epilog = "\nExample 1: clustering (e.g. to detect redundancy)\n"
+        epilog += "\t$ python PostAnalyzeTELib.py -a 1 -i TElib.fa -L 98 -S 95 -b\n"
+        epilog += "Example 2: classification stats per cluster\n"
+        epilog += "\t$ python PostAnalyzeTELib.py -a 2 -t TElib.tab\n"
+        epilog += "Example 3: annotation stats per consensus\n"
+        epilog += "\t$ python PostAnalyzeTELib.py -a 3 -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
+        epilog += "Example 4: annotation stats per cluster\n"
+        epilog += "\t$ python PostAnalyzeTELib.py -a 4 -t TElib.tab -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
+        parser = RepetOptionParser(description = description, epilog = epilog)
+        parser.add_option("-a", "--analysis",   dest = "analysis",          action = "store",       type = "int",    help = "analysis number [default: 1, 2, 3, 4]", default = 1)
+        parser.add_option("-i", "--fasta",      dest = "fastaFileName",     action = "store",       type = "string", help = "fasta file name [for analysis 1] [format: fasta]", default = "")
+        parser.add_option("-L", "--coverage",   dest = "coverage",          action = "store",       type = "int",    help = "length coverage threshold [for analysis 1] [format: %] [default: 80]", default = 80)
+        parser.add_option("-S", "--identity",   dest = "identity",          action = "store",       type = "int",    help = "identity threshold [for analysis 1 with BlastClust] [format: %] [default: 0 => no threshold]", default = 0)
+        parser.add_option("-b", "--both",       dest = "bothSeq",           action = "store_true",                   help = "require coverage on both neighbours [for analysis 1] [default: False]", default = False)
+        parser.add_option("-t", "--cluster",    dest = "clusterFileName",   action = "store",       type = "string", help = "cluster file name [for analysis 2 and 4] [default: <input>.tab]", default = "")
+        parser.add_option("-p", "--path",       dest = "pathTableName",     action = "store",       type = "string", help = "name of the table (_path) with the annotated TE copies [for analysis 3 and 4]", default = "")
+        parser.add_option("-s", "--seq",        dest = "seqTableName",      action = "store",       type = "string", help = "name of the table (_seq) with the TE reference sequences [for analysis 3 and 4]", default = "")
+        parser.add_option("-g", "--genome",     dest = "genomeSize",        action = "store",       type = "int",    help = "genome size [for analysis 3 and 4]", default = None)
+        parser.add_option("-c", "--clean",      dest = "doClean",           action = "store_true",                   help = "clean temporary files [optional] [default: False]", default = False)
+        parser.add_option("-v", "--verbosity",  dest = "verbosity",         action = "store",       type = "int",    help = "verbosity [optional] [default: 1]", default = 1)
+        options = parser.parse_args()[0]
+        self._setAttributesFromOptions(options)
+        
+    def _setAttributesFromOptions(self, options):
+        self.setAnalysis(options.analysis)
+        if self._analysis == 1:
+            self.setFastaFileName(options.fastaFileName)
+            self.setBioseqDB()
+            self.setIdentity(options.identity)
+            self.setCoverage(options.coverage)
+            self.setApplyCovThresholdOnBothSeq(options.bothSeq)
+        self.setClusterFileName(options.clusterFileName)
+        self.setPathTableName(options.pathTableName)
+        self.setSeqTableName(options.seqTableName)
+        self.setStatPerClusterFileName()
+        self.setClassifStatPerClusterFileName()
+        self.setAnnotationStatsPerTEFileName()
+        self.setAnnotationStatsPerClusterFileName()
+        self.setGenomeSize(options.genomeSize)
+        self.setDoClean(options.doClean)
+        self.setVerbosity(options.verbosity)
+
+    def setAnalysis(self, analysis):
+        self._analysis = analysis
+        
+    def setFastaFileName(self, fastaFileName):
+        self._fastaFileName = fastaFileName
+    
+    def setBioseqDB(self):
+        self._iBioseqDB = BioseqDB(name = self._fastaFileName)   
+        
+    def setIdentity(self, identity):
+        self._identity = identity
+        
+    def setCoverage(self, coverage):
+        self._coverage = coverage
+        
+    def setApplyCovThresholdOnBothSeq(self, bothSeq):
+        self._applyCovThresholdOnBothSeq = bothSeq
+    
+    def setClusterFileName(self, clusterFileName):
+        if clusterFileName == "":
+            self._clusterFileName = "%s.tab" % os.path.splitext(os.path.basename(self._fastaFileName))[0]
+        else:
+            self._clusterFileName = clusterFileName
+    
+    def setStatPerClusterFileName(self):
+        self._statPerClusterFileName = "%s.statsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]
+        self._globalStatPerClusterFileName = "%s.globalStatsPerCluster.txt" % os.path.splitext(self._clusterFileName)[0]
+        
+    def setClassifStatPerClusterFileName(self):
+        self._classifStatPerClusterFileName = "%s.classifStatsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]
+        
+    def setAnnotationStatsPerTEFileName(self):
+        self._annotStatsPerTEFileName = "%s.annotStatsPerTE.tab" % self._pathTableName
+        self._globalAnnotStatsPerTEFileName = "%s.globalAnnotStatsPerTE.txt" % self._pathTableName
+        
+    def setAnnotationStatsPerClusterFileName(self):
+        self._annotStatsPerClusterFileName = "%s.annotStatsPerCluster.tab" % self._pathTableName
+        
+    def setPathTableName(self, pathTableName):
+        self._pathTableName = pathTableName
+    
+    def setSeqTableName(self, seqTableName):
+        self._seqTableName = seqTableName
+        
+    def setGenomeSize(self, genomeSize):
+        self._genomeSize = genomeSize
+                 
+    def setDoClean(self, doClean):
+        self._doClean = doClean
+        
+    def setVerbosity(self, verbosity):
+        self._verbosity = verbosity
+        
+    def _checkOptions(self):
+        if self._analysis == 1 and (self._fastaFileName == "" or not FileUtils.isRessourceExists(self._fastaFileName)):
+            self._logAndRaise("ERROR: Missing fasta file")
+            
+    def _logAndRaise(self, errorMsg):
+        self._log.error(errorMsg)
+        raise Exception(errorMsg)
+                    
+    def run(self):
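+        """Run the analysis selected by '-a/--analysis': 1 = clustering of the TE library
+        with blastclust, 2 = classification stats per cluster, 3 = annotation stats per TE,
+        4 = annotation stats per cluster."""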
+        LoggerFactory.setLevel(self._log, self._verbosity)
+        self._checkOptions()
+        self._log.info("START PostAnalyzeTELib analysis %d" % self._analysis)
+        if self._analysis == 1:
+            #TODO: option to choose clustering program (blastclust, MCL, uclust,...)
+            self._log.debug("TE lib: %s" % self._fastaFileName)
+            self._log.info("Launch blastclust...")
+            iLaunchBlastclust = LaunchBlastclust(clean = self._doClean)
+            iLaunchBlastclust.setTmpFileName(self._clusterFileName)
+            iLaunchBlastclust.setIdentityThreshold(self._identity)
+            iLaunchBlastclust.setCoverageThreshold(self._coverage/100.0)
+            if self._applyCovThresholdOnBothSeq:
+                iLaunchBlastclust.setBothSequences("T")
+            else:
+                iLaunchBlastclust.setBothSequences("F")
+            iLaunchBlastclust.setVerbosityLevel(self._verbosity - 3)
+            iLaunchBlastclust.launchBlastclust(self._fastaFileName)
+            self._log.info("Compute stats...")
+            self._giveStatsOnTEClusters()
+        if self._analysis == 2:
+            #TODO: add global stats (e.g. nb of clusters without PotentialChimeric, nb of clusters with LTR, ...)?
+            self._log.debug("Consensus cluster file name: %s" % self._clusterFileName)
+            self._log.info("Compute classification stats...")
+            self._giveStatsOnConsensusClusters()
+        if self._analysis == 3:
+            #TODO: in option, save stats in DB
+            self._log.info("Compute annotation stats per TE...")
+            iAnnotationStats = AnnotationStats(analysisName="TE", seqTableName=self._seqTableName, pathTableName=self._pathTableName,\
+                                            genomeLength=self._genomeSize, statsFileName=self._annotStatsPerTEFileName, globalStatsFileName=self._globalAnnotStatsPerTEFileName, verbosity=self._verbosity-1)
+            iAnnotationStats.run()
+        if self._analysis == 4:
+            self._log.info("Compute annotation stats per cluster...")
+            iAnnotationStats = AnnotationStats(analysisName="Cluster", clusterFileName=self._clusterFileName, seqTableName=self._seqTableName, pathTableName=self._pathTableName,\
+                                            genomeLength=self._genomeSize, statsFileName=self._annotStatsPerClusterFileName, verbosity=self._verbosity-1)
+            iAnnotationStats.run()
+        self._log.info("END PostAnalyzeTELib analysis %d" % self._analysis)
+
+    def _giveStatsOnConsensusClusters(self):
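+        """For each cluster (one line of the cluster file, consensus names separated by
+        whitespace), count the noCat, chimeric ('-chim'), complete ('-comp') and incomplete
+        ('-incomp') consensus, plus the number of consensus per classification (parsed from
+        the part of the name before the first '_'), and write one row per cluster."""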
+        with open(self._classifStatPerClusterFileName, "w") as f:
+            f.write("cluster\tnoCat\tPotentialChimeric\tcomp\tincomp\tclassifs (nbTEs)\n")
+            with open(self._clusterFileName) as fCluster:
+                for clusterId, line in enumerate(fCluster):
+                    dClassifNb = {}
+                    nbIncomp = 0
+                    nbComp = 0
+                    nbChim = 0
+                    nbNoCat = 0
+                    lConsensus = line.split()
+                    for consensus in lConsensus:
+                        classifInfos = consensus.split("_")[0]
+                        
+                        if "-incomp" in classifInfos:
+                            nbIncomp += 1
+                            classifInfos = classifInfos.replace("-incomp","")
+                        if "-comp" in classifInfos:
+                            nbComp += 1
+                            classifInfos = classifInfos.replace("-comp","")
+                        if "-chim" in classifInfos:
+                            nbChim += 1
+                            classifInfos = classifInfos.replace("-chim","")
+                        if "noCat" in classifInfos:
+                            nbNoCat += 1
+                            classifInfos = classifInfos.replace("noCat","")
+                            
+                        classif = classifInfos.split("-")[-1]
+                        if classif != "":
+                            dClassifNb[classif] = dClassifNb.get(classif, 0) + 1
+                            
+                    occurrences = []
+                    for classif, occs in dClassifNb.items():
+                        occurrences.append("%s (%d)" % (classif, occs))
+
+                    f.write("%d\t%d\t%d\t%d\t%d\t%s\n" % (clusterId + 1, nbNoCat, nbChim, nbComp, nbIncomp, "\t".join(occurrences)))
+                    
+    def _giveStatsOnTEClusters(self):
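+        """Write per-cluster statistics (number of sequences, min, max, mean and median
+        sequence length, lengths taken from the input fasta) and global statistics on the
+        clustering (number of clusters by size, total number of sequences, sequence length
+        summary)."""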
+        with open(self._clusterFileName) as fCluster:
+            with open(self._statPerClusterFileName, 'w') as fStatPerCluster:
+                fStatPerCluster.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n")
+                line = fCluster.readline()
+                clusterNb = 0
+                clusterSeqList = line.split()
+                minClusterSize = len(clusterSeqList)
+                maxClusterSize = 0
+                totalSeqNb = 0
+                seqNbInBigClusters = 0
+                dClusterSize2ClusterNb = {1:0, 2:0, 3:0}
+                while line:
+                    clusterSeqList = line.split()
+                    seqNb = len(clusterSeqList)
+                    totalSeqNb += seqNb
+                    if seqNb > 2:
+                        seqNbInBigClusters += seqNb
+                        dClusterSize2ClusterNb[3] += 1
+                    else:
+                        dClusterSize2ClusterNb[seqNb] += 1
+                    if seqNb > maxClusterSize:
+                        maxClusterSize = seqNb
+                    if seqNb < minClusterSize:
+                        minClusterSize = seqNb
+                    line = fCluster.readline()
+                    clusterNb += 1
+                    clusterSeqLengths = self._iBioseqDB.getSeqLengthByListOfName(clusterSeqList)
+                    iStatSeqLengths = Stat(clusterSeqLengths)
+                    fStatPerCluster.write("%d\t%d\t%d\t%d\t%d\t%d\n" %(clusterNb, seqNb, min(clusterSeqLengths), max(clusterSeqLengths),  iStatSeqLengths.mean(),  iStatSeqLengths.median()))
+        
+        with open(self._globalStatPerClusterFileName, 'w') as fG:
+            fG.write("nb of clusters: %d\n" % clusterNb)
+            fG.write("nb of clusters with 1 sequence: %d\n" % dClusterSize2ClusterNb[1])
+            fG.write("nb of clusters with 2 sequences: %d\n" % dClusterSize2ClusterNb[2])
+            fG.write("nb of clusters with >2 sequences: %d (%d sequences)\n" % (dClusterSize2ClusterNb[3], seqNbInBigClusters))
+            fG.write("nb of sequences: %d\n" % totalSeqNb)
+            fG.write("nb of sequences in the largest cluster: %d\n" % maxClusterSize)
+            fG.write("nb of sequences in the smallest cluster: %d\n" % minClusterSize)
+            lSeqSizes = self._iBioseqDB.getListOfSequencesLength()
+            iStat = Stat(lSeqSizes)
+            fG.write("size of the smallest sequence: %d\n" % min(lSeqSizes))
+            fG.write("size of the largest sequence: %d\n" % max(lSeqSizes))
+            fG.write("average sequences size: %d\n" % iStat.mean())
+            fG.write("median sequences size: %d\n" % iStat.median())
+
+if __name__ == "__main__":
+    iLaunch = PostAnalyzeTELib()
+    iLaunch.setAttributesFromCmdLine()
+    iLaunch.run()
\ No newline at end of file