Mercurial > repos > yufei-luo > s_mart
diff commons/tools/PostAnalyzeTELib.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/PostAnalyzeTELib.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,301 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.utils.FileUtils import FileUtils
from commons.core.stat.Stat import Stat
from commons.core.seq.BioseqDB import BioseqDB
from commons.launcher.LaunchBlastclust import LaunchBlastclust
from commons.tools.AnnotationStats import AnnotationStats
import os

# Labels identifying the two flavours of annotation statistics
# (per TE consensus vs. per cluster of consensus); passed to AnnotationStats.
CONSENSUS = "TE"
CLUSTER = "Cluster"
LOG_DEPTH = "repet.tools"
LOG_FORMAT = "%(message)s"


class PostAnalyzeTELib(object):
    """Post-analyze a TE (transposable element) library.

    Four analyses are available, selected by the ``analysis`` number:
      1. clusterize the library with BlastClust (e.g. to detect redundancy)
         and compute per-cluster size statistics;
      2. compute classification statistics per cluster from a cluster file;
      3. compute annotation statistics per TE consensus from a _path table;
      4. compute annotation statistics per cluster (cluster file + tables).
    """

    def __init__(self, analysis=1, fastaFileName="", clusterFileName="", pathTableName="", seqTableName="", genomeSize=0, configFileName="", doClean=False, verbosity=3):
        self._analysis = analysis
        self._fastaFileName = fastaFileName
        self._pathTableName = pathTableName
        self._seqTableName = seqTableName
        self._genomeSize = genomeSize
        if self._analysis == 1:
            # Clustering needs random access to the sequences of the library.
            self.setBioseqDB()
        # BlastClust thresholds (percents); 0 identity means "no threshold".
        self._identity = 0
        self._coverage = 80
        self._applyCovThresholdOnBothSeq = False
        self.setClusterFileName(clusterFileName)
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self._doClean = doClean
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity, LOG_FORMAT)

    def setAttributesFromCmdLine(self):
        """Parse the command line and configure the instance accordingly."""
        description = "Tool to post-analyze a TE library : clusterize, give stats on cluster, on annotation,...\n"
        epilog = "\nExample 1: clustering (e.g. to detect redundancy)\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 1 -i TElib.fa -L 98 -S 95 -b\n"
        epilog += "Example 2: classification stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 2 -t TElib.tab\n"
        epilog += "Example 3: annotation stats per consensus\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 3 -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        epilog += "Example 4: annotation stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 4 -t TElib.tab -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        parser = RepetOptionParser(description=description, epilog=epilog)
        parser.add_option("-a", "--analysis", dest="analysis", action="store", type="int", help="analysis number [default: 1, 2, 3, 4]", default=1)
        parser.add_option("-i", "--fasta", dest="fastaFileName", action="store", type="string", help="fasta file name [for analysis 1] [format: fasta]", default="")
        parser.add_option("-L", "--coverage", dest="coverage", action="store", type="int", help="length coverage threshold [for analysis 1] [format: %] [default: 80]", default=80)
        parser.add_option("-S", "--identity", dest="identity", action="store", type="int", help="identity threshold [for analysis 1 with BlastClust] [format: %] [default: 0 => no threshold]", default=0)
        parser.add_option("-b", "--both", dest="bothSeq", action="store_true", help="require coverage on both neighbours [for analysis 1] [default: False]", default=False)
        parser.add_option("-t", "--cluster", dest="clusterFileName", action="store", type="string", help="cluster file name [for analysis 2 and 4] [default: <input>.tab]", default="")
        parser.add_option("-p", "--path", dest="pathTableName", action="store", type="string", help="name of the table (_path) with the annotated TE copies [for analysis 3 and 4]", default="")
        parser.add_option("-s", "--seq", dest="seqTableName", action="store", type="string", help="name of the table (_seq) with the TE reference sequences [for analysis 3 and 4]", default="")
        parser.add_option("-g", "--genome", dest="genomeSize", action="store", type="int", help="genome size [for analysis 3 and 4]", default=None)
        parser.add_option("-c", "--clean", dest="doClean", action="store_true", help="clean temporary files [optional] [default: False]", default=False)
        parser.add_option("-v", "--verbosity", dest="verbosity", action="store", type="int", help="verbosity [optional] [default: 1]", default=1)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        # The output file names derive from the cluster/path names, so the
        # setters below must run after those names have been set.
        self.setAnalysis(options.analysis)
        if self._analysis == 1:
            self.setFastaFileName(options.fastaFileName)
            self.setBioseqDB()
            self.setIdentity(options.identity)
            self.setCoverage(options.coverage)
            self.setApplyCovThresholdOnBothSeq(options.bothSeq)
        self.setClusterFileName(options.clusterFileName)
        self.setPathTableName(options.pathTableName)
        self.setSeqTableName(options.seqTableName)
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self.setGenomeSize(options.genomeSize)
        self.setDoClean(options.doClean)
        self.setVerbosity(options.verbosity)

    def setAnalysis(self, analysis):
        self._analysis = analysis

    def setFastaFileName(self, fastaFileName):
        self._fastaFileName = fastaFileName

    def setBioseqDB(self):
        self._iBioseqDB = BioseqDB(name=self._fastaFileName)

    def setIdentity(self, identity):
        self._identity = identity

    def setCoverage(self, coverage):
        self._coverage = coverage

    def setApplyCovThresholdOnBothSeq(self, bothSeq):
        self._applyCovThresholdOnBothSeq = bothSeq

    def setClusterFileName(self, clusterFileName):
        # Default: "<fasta basename without extension>.tab" in the cwd.
        if clusterFileName == "":
            self._clusterFileName = "%s.tab" % os.path.splitext(os.path.basename(self._fastaFileName))[0]
        else:
            self._clusterFileName = clusterFileName

    def setStatPerClusterFileName(self):
        self._statPerClusterFileName = "%s.statsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]
        self._globalStatPerClusterFileName = "%s.globalStatsPerCluster.txt" % os.path.splitext(self._clusterFileName)[0]

    def setClassifStatPerClusterFileName(self):
        self._classifStatPerClusterFileName = "%s.classifStatsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]

    def setAnnotationStatsPerTEFileName(self):
        self._annotStatsPerTEFileName = "%s.annotStatsPerTE.tab" % self._pathTableName
        self._globalAnnotStatsPerTEFileName = "%s.globalAnnotStatsPerTE.txt" % self._pathTableName

    def setAnnotationStatsPerClusterFileName(self):
        self._annotStatsPerClusterFileName = "%s.annotStatsPerCluster.tab" % self._pathTableName

    def setPathTableName(self, pathTableName):
        self._pathTableName = pathTableName

    def setSeqTableName(self, seqTableName):
        self._seqTableName = seqTableName

    def setGenomeSize(self, genomeSize):
        self._genomeSize = genomeSize

    def setDoClean(self, doClean):
        self._doClean = doClean

    def setVerbosity(self, verbosity):
        self._verbosity = verbosity

    def _checkOptions(self):
        """Raise (via _logAndRaise) when a required input is missing."""
        if (self._fastaFileName == "" or not FileUtils.isRessourceExists(self._fastaFileName)) and self._analysis == 1:
            self._logAndRaise("ERROR: Missing fasta file")

    def _logAndRaise(self, errorMsg):
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def run(self):
        """Run the analysis selected by self._analysis (see class docstring)."""
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._checkOptions()
        self._log.info("START PostAnalyzeTELib analysis %d" % self._analysis)
        if self._analysis == 1:
            # TODO: option to choose clustering program (blastclust, MCL, uclust,...)
            self._log.debug("TE lib: %s" % self._fastaFileName)
            self._log.info("Launch blastclust...")
            iLaunchBlastclust = LaunchBlastclust(clean=self._doClean)
            iLaunchBlastclust.setTmpFileName(self._clusterFileName)
            iLaunchBlastclust.setIdentityThreshold(self._identity)
            # BlastClust expects the coverage as a fraction, not a percent.
            iLaunchBlastclust.setCoverageThreshold(self._coverage / 100.0)
            if self._applyCovThresholdOnBothSeq:
                iLaunchBlastclust.setBothSequences("T")
            else:
                iLaunchBlastclust.setBothSequences("F")
            iLaunchBlastclust.setVerbosityLevel(self._verbosity - 3)
            iLaunchBlastclust.launchBlastclust(self._fastaFileName)
            self._log.info("Compute stats...")
            self._giveStatsOnTEClusters()
        if self._analysis == 2:
            # TODO: add global stats (e.g.: nb of cluster without PotentialChimeric, nb of clusters with LTR...) ?
            self._log.debug("Consensus cluster file name: %s" % self._clusterFileName)
            self._log.info("Compute classification stats...")
            self._giveStatsOnConsensusClusters()
        if self._analysis == 3:
            # TODO: in option, save stats in DB
            self._log.info("Compute annotation stats per TE...")
            # Use the module-level label constants instead of repeating the literals.
            iAnnotationStats = AnnotationStats(analysisName=CONSENSUS, seqTableName=self._seqTableName, pathTableName=self._pathTableName,
                                               genomeLength=self._genomeSize, statsFileName=self._annotStatsPerTEFileName,
                                               globalStatsFileName=self._globalAnnotStatsPerTEFileName, verbosity=self._verbosity - 1)
            iAnnotationStats.run()
        if self._analysis == 4:
            self._log.info("Compute annotation stats per cluster...")
            iAnnotationStats = AnnotationStats(analysisName=CLUSTER, clusterFileName=self._clusterFileName, seqTableName=self._seqTableName,
                                               pathTableName=self._pathTableName, genomeLength=self._genomeSize,
                                               statsFileName=self._annotStatsPerClusterFileName, verbosity=self._verbosity - 1)
            iAnnotationStats.run()
        self._log.info("END PostAnalyzeTELib analysis %d" % self._analysis)

    def _giveStatsOnConsensusClusters(self):
        """Write one row per cluster with classification counts.

        Each line of the cluster file lists whitespace-separated consensus
        names; the classification info is the part of the name before the
        first underscore (e.g. "DTX-incomp_...").
        """
        with open(self._classifStatPerClusterFileName, "w") as f:
            f.write("cluster\tnoCat\tPotentialChimeric\tcomp\tincomp\tclassifs (nbTEs)\n")
            with open(self._clusterFileName) as fCluster:
                for clusterId, line in enumerate(fCluster):
                    dClassifNb = {}
                    nbIncomp = 0
                    nbComp = 0
                    nbChim = 0
                    nbNoCat = 0
                    for consensus in line.split():
                        classifInfos = consensus.split("_")[0]
                        # Strip each status marker and count its occurrence.
                        if "-incomp" in classifInfos:
                            nbIncomp += 1
                            classifInfos = classifInfos.replace("-incomp", "")
                        if "-comp" in classifInfos:
                            nbComp += 1
                            classifInfos = classifInfos.replace("-comp", "")
                        if "-chim" in classifInfos:
                            nbChim += 1
                            classifInfos = classifInfos.replace("-chim", "")
                        if "noCat" in classifInfos:
                            nbNoCat += 1
                            classifInfos = classifInfos.replace("noCat", "")
                        # The classification code is the last dash-separated token.
                        classif = classifInfos.split("-")[-1]
                        if classif != "":
                            dClassifNb[classif] = dClassifNb.get(classif, 0) + 1
                    occurrences = ["%s (%d)" % (classif, nb) for classif, nb in dClassifNb.items()]
                    f.write("%d\t%d\t%d\t%d\t%d\t%s\n" % (clusterId + 1, nbNoCat, nbChim, nbComp, nbIncomp, "\t".join(occurrences)))

    def _giveStatsOnTEClusters(self):
        """Write per-cluster sequence-length stats and a global summary.

        Sequence lengths are taken from self._iBioseqDB, so the cluster file
        must reference sequences present in the input fasta.
        """
        clusterNb = 0
        totalSeqNb = 0
        seqNbInBigClusters = 0
        maxClusterSize = 0
        minClusterSize = None  # set on the first cluster seen
        # Keys 1 and 2 count clusters of that exact size; key 3 counts clusters with >2 sequences.
        dClusterSize2ClusterNb = {1: 0, 2: 0, 3: 0}
        with open(self._clusterFileName) as fCluster:
            with open(self._statPerClusterFileName, "w") as fStatPerCluster:
                fStatPerCluster.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n")
                for line in fCluster:
                    clusterSeqList = line.split()
                    seqNb = len(clusterSeqList)
                    clusterNb += 1
                    totalSeqNb += seqNb
                    if seqNb > 2:
                        seqNbInBigClusters += seqNb
                        dClusterSize2ClusterNb[3] += 1
                    else:
                        dClusterSize2ClusterNb[seqNb] += 1
                    if seqNb > maxClusterSize:
                        maxClusterSize = seqNb
                    if minClusterSize is None or seqNb < minClusterSize:
                        minClusterSize = seqNb
                    clusterSeqLengths = self._iBioseqDB.getSeqLengthByListOfName(clusterSeqList)
                    iStatSeqLengths = Stat(clusterSeqLengths)
                    fStatPerCluster.write("%d\t%d\t%d\t%d\t%d\t%d\n" % (clusterNb, seqNb, min(clusterSeqLengths), max(clusterSeqLengths), iStatSeqLengths.mean(), iStatSeqLengths.median()))
        if minClusterSize is None:
            # Empty cluster file: keep the historical value of 0.
            minClusterSize = 0
        with open(self._globalStatPerClusterFileName, "w") as fG:
            fG.write("nb of clusters: %d\n" % clusterNb)
            fG.write("nb of clusters with 1 sequence: %d\n" % dClusterSize2ClusterNb[1])
            fG.write("nb of clusters with 2 sequences: %d\n" % dClusterSize2ClusterNb[2])
            fG.write("nb of clusters with >2 sequences: %d (%d sequences)\n" % (dClusterSize2ClusterNb[3], seqNbInBigClusters))
            fG.write("nb of sequences: %d\n" % totalSeqNb)
            fG.write("nb of sequences in the largest cluster: %d\n" % maxClusterSize)
            fG.write("nb of sequences in the smallest cluster: %d\n" % minClusterSize)
            lSeqSizes = self._iBioseqDB.getListOfSequencesLength()
            iStat = Stat(lSeqSizes)
            fG.write("size of the smallest sequence: %d\n" % min(lSeqSizes))
            fG.write("size of the largest sequence: %d\n" % max(lSeqSizes))
            fG.write("average sequences size: %d\n" % iStat.mean())
            fG.write("median sequences size: %d\n" % iStat.median())


if __name__ == "__main__":
    iLaunch = PostAnalyzeTELib()
    iLaunch.setAttributesFromCmdLine()
    iLaunch.run()