| 18 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 # Copyright INRA (Institut National de la Recherche Agronomique) | 
|  | 4 # http://www.inra.fr | 
|  | 5 # http://urgi.versailles.inra.fr | 
|  | 6 # | 
|  | 7 # This software is governed by the CeCILL license under French law and | 
|  | 8 # abiding by the rules of distribution of free software.  You can  use, | 
|  | 9 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 10 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 11 # "http://www.cecill.info". | 
|  | 12 # | 
|  | 13 # As a counterpart to the access to the source code and  rights to copy, | 
|  | 14 # modify and redistribute granted by the license, users are provided only | 
|  | 15 # with a limited warranty  and the software's author,  the holder of the | 
|  | 16 # economic rights,  and the successive licensors  have only  limited | 
|  | 17 # liability. | 
|  | 18 # | 
|  | 19 # In this respect, the user's attention is drawn to the risks associated | 
|  | 20 # with loading,  using,  modifying and/or developing or reproducing the | 
|  | 21 # software by the user in light of its specific status of free software, | 
|  | 22 # that may mean  that it is complicated to manipulate,  and  that  also | 
|  | 23 # therefore means  that it is reserved for developers  and  experienced | 
|  | 24 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 25 # encouraged to load and test the software's suitability as regards their | 
|  | 26 # requirements in conditions enabling the security of their systems and/or | 
|  | 27 # data to be ensured and,  more generally, to use and operate it in the | 
|  | 28 # same conditions as regards security. | 
|  | 29 # | 
|  | 30 # The fact that you are presently reading this means that you have had | 
|  | 31 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 32 | 
|  | 33 from commons.core.LoggerFactory import LoggerFactory | 
|  | 34 from commons.core.utils.RepetOptionParser import RepetOptionParser | 
|  | 35 from commons.core.utils.FileUtils import FileUtils | 
|  | 36 from commons.core.stat.Stat import Stat | 
|  | 37 from commons.core.seq.BioseqDB import BioseqDB | 
|  | 38 from commons.launcher.LaunchBlastclust import LaunchBlastclust | 
|  | 39 from commons.tools.AnnotationStats import AnnotationStats | 
|  | 40 import os | 
|  | 41 | 
|  | 42 CONSENSUS = "TE" | 
|  | 43 CLUSTER = "Cluster" | 
|  | 44 LOG_DEPTH = "repet.tools" | 
|  | 45 LOG_FORMAT = "%(message)s" | 
|  | 46 | 
|  | 47 class PostAnalyzeTELib(object): | 
|  | 48 | 
|  | 49     def __init__(self, analysis = 1, fastaFileName = "", clusterFileName = "", pathTableName="", seqTableName="", genomeSize=0, configFileName = "", doClean = False, verbosity = 3): | 
|  | 50         self._analysis = analysis | 
|  | 51         self._fastaFileName = fastaFileName | 
|  | 52         self._pathTableName = pathTableName | 
|  | 53         self._seqTableName = seqTableName | 
|  | 54         self._genomeSize = genomeSize | 
|  | 55         if self._analysis == 1: | 
|  | 56             self.setBioseqDB() | 
|  | 57         self._identity = 0 | 
|  | 58         self._coverage = 80 | 
|  | 59         self._applyCovThresholdOnBothSeq = False | 
|  | 60         self.setClusterFileName(clusterFileName) | 
|  | 61         self.setStatPerClusterFileName() | 
|  | 62         self.setClassifStatPerClusterFileName() | 
|  | 63         self.setAnnotationStatsPerTEFileName() | 
|  | 64         self.setAnnotationStatsPerClusterFileName() | 
|  | 65         self._doClean = doClean | 
|  | 66         self._verbosity = verbosity | 
|  | 67         self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity, LOG_FORMAT) | 
|  | 68 | 
|  | 69     def setAttributesFromCmdLine(self): | 
|  | 70         description = "Tool to post-analyze a TE library : clusterize, give stats on cluster, on annotation,...\n" | 
|  | 71         epilog = "\nExample 1: clustering (e.g. to detect redundancy)\n" | 
|  | 72         epilog += "\t$ python PostAnalyzeTELib.py -a 1 -i TElib.fa -L 98 -S 95 -b\n" | 
|  | 73         epilog += "Example 2: classification stats per cluster\n" | 
|  | 74         epilog += "\t$ python PostAnalyzeTELib.py -a 2 -t TElib.tab\n" | 
|  | 75         epilog += "Example 3: annotation stats per consensus\n" | 
|  | 76         epilog += "\t$ python PostAnalyzeTELib.py -a 3 -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n" | 
|  | 77         epilog += "Example 4: annotation stats per cluster\n" | 
|  | 78         epilog += "\t$ python PostAnalyzeTELib.py -a 4 -t TElib.tab -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n" | 
|  | 79         parser = RepetOptionParser(description = description, epilog = epilog) | 
|  | 80         parser.add_option("-a", "--analysis",   dest = "analysis",          action = "store",       type = "int",    help = "analysis number [default: 1, 2, 3, 4]", default = 1) | 
|  | 81         parser.add_option("-i", "--fasta",      dest = "fastaFileName",     action = "store",       type = "string", help = "fasta file name [for analysis 1] [format: fasta]", default = "") | 
|  | 82         parser.add_option("-L", "--coverage",   dest = "coverage",          action = "store",       type = "int",    help = "length coverage threshold [for analysis 1] [format: %] [default: 80]", default = 80) | 
|  | 83         parser.add_option("-S", "--identity",   dest = "identity",          action = "store",       type = "int",    help = "identity threshold [for analysis 1 with BlastClust] [format: %] [default: 0 => no threshold]", default = 0) | 
|  | 84         parser.add_option("-b", "--both",       dest = "bothSeq",           action = "store_true",                   help = "require coverage on both neighbours [for analysis 1] [default: False]", default = False) | 
|  | 85         parser.add_option("-t", "--cluster",    dest = "clusterFileName",   action = "store",       type = "string", help = "cluster file name [for analysis 2 and 4] [default: <input>.tab]", default = "") | 
|  | 86         parser.add_option("-p", "--path",       dest = "pathTableName",     action = "store",       type = "string", help = "name of the table (_path) with the annotated TE copies [for analysis 3 and 4]", default = "") | 
|  | 87         parser.add_option("-s", "--seq",        dest = "seqTableName",      action = "store",       type = "string", help = "name of the table (_seq) with the TE reference sequences [for analysis 3 and 4]", default = "") | 
|  | 88         parser.add_option("-g", "--genome",     dest = "genomeSize",        action = "store",       type = "int",    help = "genome size [for analysis 3 and 4]", default = None) | 
|  | 89         parser.add_option("-c", "--clean",      dest = "doClean",           action = "store_true",                   help = "clean temporary files [optional] [default: False]", default = False) | 
|  | 90         parser.add_option("-v", "--verbosity",  dest = "verbosity",         action = "store",       type = "int",    help = "verbosity [optional] [default: 1]", default = 1) | 
|  | 91         options = parser.parse_args()[0] | 
|  | 92         self._setAttributesFromOptions(options) | 
|  | 93 | 
|  | 94     def _setAttributesFromOptions(self, options): | 
|  | 95         self.setAnalysis(options.analysis) | 
|  | 96         if self._analysis == 1: | 
|  | 97             self.setFastaFileName(options.fastaFileName) | 
|  | 98             self.setBioseqDB() | 
|  | 99             self.setIdentity(options.identity) | 
|  | 100             self.setCoverage(options.coverage) | 
|  | 101             self.setApplyCovThresholdOnBothSeq(options.bothSeq) | 
|  | 102         self.setClusterFileName(options.clusterFileName) | 
|  | 103         self.setPathTableName(options.pathTableName) | 
|  | 104         self.setSeqTableName(options.seqTableName) | 
|  | 105         self.setStatPerClusterFileName() | 
|  | 106         self.setClassifStatPerClusterFileName() | 
|  | 107         self.setAnnotationStatsPerTEFileName() | 
|  | 108         self.setAnnotationStatsPerClusterFileName() | 
|  | 109         self.setGenomeSize(options.genomeSize) | 
|  | 110         self.setDoClean(options.doClean) | 
|  | 111         self.setVerbosity(options.verbosity) | 
|  | 112 | 
|  | 113     def setAnalysis(self, analysis): | 
|  | 114         self._analysis = analysis | 
|  | 115 | 
|  | 116     def setFastaFileName(self, fastaFileName): | 
|  | 117         self._fastaFileName = fastaFileName | 
|  | 118 | 
|  | 119     def setBioseqDB(self): | 
|  | 120         self._iBioseqDB = BioseqDB(name = self._fastaFileName) | 
|  | 121 | 
|  | 122     def setIdentity(self, identity): | 
|  | 123         self._identity = identity | 
|  | 124 | 
|  | 125     def setCoverage(self, coverage): | 
|  | 126         self._coverage = coverage | 
|  | 127 | 
|  | 128     def setApplyCovThresholdOnBothSeq(self, bothSeq): | 
|  | 129         self._applyCovThresholdOnBothSeq = bothSeq | 
|  | 130 | 
|  | 131     def setClusterFileName(self, clusterFileName): | 
|  | 132         if clusterFileName == "": | 
|  | 133             self._clusterFileName = "%s.tab" % os.path.splitext(os.path.basename(self._fastaFileName))[0] | 
|  | 134         else: | 
|  | 135             self._clusterFileName = clusterFileName | 
|  | 136 | 
|  | 137     def setStatPerClusterFileName(self): | 
|  | 138         self._statPerClusterFileName = "%s.statsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0] | 
|  | 139         self._globalStatPerClusterFileName = "%s.globalStatsPerCluster.txt" % os.path.splitext(self._clusterFileName)[0] | 
|  | 140 | 
|  | 141     def setClassifStatPerClusterFileName(self): | 
|  | 142         self._classifStatPerClusterFileName = "%s.classifStatsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0] | 
|  | 143 | 
|  | 144     def setAnnotationStatsPerTEFileName(self): | 
|  | 145         self._annotStatsPerTEFileName = "%s.annotStatsPerTE.tab" % self._pathTableName | 
|  | 146         self._globalAnnotStatsPerTEFileName = "%s.globalAnnotStatsPerTE.txt" % self._pathTableName | 
|  | 147 | 
|  | 148     def setAnnotationStatsPerClusterFileName(self): | 
|  | 149         self._annotStatsPerClusterFileName = "%s.annotStatsPerCluster.tab" % self._pathTableName | 
|  | 150 | 
|  | 151     def setPathTableName(self, pathTableName): | 
|  | 152         self._pathTableName = pathTableName | 
|  | 153 | 
|  | 154     def setSeqTableName(self, seqTableName): | 
|  | 155         self._seqTableName = seqTableName | 
|  | 156 | 
|  | 157     def setGenomeSize(self, genomeSize): | 
|  | 158         self._genomeSize = genomeSize | 
|  | 159 | 
|  | 160     def setDoClean(self, doClean): | 
|  | 161         self._doClean = doClean | 
|  | 162 | 
|  | 163     def setVerbosity(self, verbosity): | 
|  | 164         self._verbosity = verbosity | 
|  | 165 | 
|  | 166     def _checkOptions(self): | 
|  | 167         if (self._fastaFileName == "" or not FileUtils.isRessourceExists(self._fastaFileName)) and self._analysis == 1: | 
|  | 168             self._logAndRaise("ERROR: Missing fasta file"  ) | 
|  | 169 | 
|  | 170     def _logAndRaise(self, errorMsg): | 
|  | 171         self._log.error(errorMsg) | 
|  | 172         raise Exception(errorMsg) | 
|  | 173 | 
|  | 174     def run(self): | 
|  | 175         LoggerFactory.setLevel(self._log, self._verbosity) | 
|  | 176         self._checkOptions() | 
|  | 177         self._log.info("START PostAnalyzeTELib analysis %d" % self._analysis) | 
|  | 178         if self._analysis == 1: | 
|  | 179             #TODO: option to choose clustering program (blastclust, MCL, uclust,...) | 
|  | 180             self._log.debug("TE lib: %s" % self._fastaFileName) | 
|  | 181             self._log.info("Launch blastclust...") | 
|  | 182             iLaunchBlastclust = LaunchBlastclust(clean = self._doClean) | 
|  | 183             iLaunchBlastclust.setTmpFileName(self._clusterFileName) | 
|  | 184             iLaunchBlastclust.setIdentityThreshold(self._identity) | 
|  | 185             iLaunchBlastclust.setCoverageThreshold(self._coverage/100.0) | 
|  | 186             if self._applyCovThresholdOnBothSeq: | 
|  | 187                 iLaunchBlastclust.setBothSequences("T") | 
|  | 188             else: | 
|  | 189                 iLaunchBlastclust.setBothSequences("F") | 
|  | 190             iLaunchBlastclust.setVerbosityLevel(self._verbosity - 3) | 
|  | 191             iLaunchBlastclust.launchBlastclust(self._fastaFileName) | 
|  | 192             self._log.info("Compute stats...") | 
|  | 193             self._giveStatsOnTEClusters() | 
|  | 194         if self._analysis == 2: | 
|  | 195             #TODO add global stats (e.g.: nb of cluster without PotentialChimeric, nb of clusters with LTR...) ? | 
|  | 196             self._log.debug("Consensus cluster file name: %s" % self._clusterFileName) | 
|  | 197             self._log.info("Compute classification stats...") | 
|  | 198             self._giveStatsOnConsensusClusters() | 
|  | 199         if self._analysis == 3: | 
|  | 200             #TODO: in option, save stats in DB | 
|  | 201             self._log.info("Compute annotation stats per TE...") | 
|  | 202             iAnnotationStats = AnnotationStats(analysisName="TE", seqTableName=self._seqTableName, pathTableName=self._pathTableName,\ | 
|  | 203                                             genomeLength=self._genomeSize, statsFileName=self._annotStatsPerTEFileName, globalStatsFileName=self._globalAnnotStatsPerTEFileName, verbosity=self._verbosity-1) | 
|  | 204             iAnnotationStats.run() | 
|  | 205         if self._analysis == 4: | 
|  | 206             self._log.info("Compute annotation stats per cluster...") | 
|  | 207             iAnnotationStats = AnnotationStats(analysisName="Cluster", clusterFileName=self._clusterFileName, seqTableName=self._seqTableName, pathTableName=self._pathTableName,\ | 
|  | 208                                             genomeLength=self._genomeSize, statsFileName=self._annotStatsPerClusterFileName, verbosity=self._verbosity-1) | 
|  | 209             iAnnotationStats.run() | 
|  | 210         self._log.info("END PostAnalyzeTELib analysis %d" % self._analysis) | 
|  | 211 | 
|  | 212     def _giveStatsOnConsensusClusters(self): | 
|  | 213         with open(self._classifStatPerClusterFileName, "w") as f: | 
|  | 214             f.write("cluster\tnoCat\tPotentialChimeric\tcomp\tincomp\tclassifs (nbTEs)\n") | 
|  | 215             with open(self._clusterFileName) as fCluster: | 
|  | 216                 for clusterId, line in enumerate(fCluster): | 
|  | 217                     dClassifNb = {} | 
|  | 218                     nbIncomp =0 | 
|  | 219                     nbComp=0 | 
|  | 220                     nbChim=0 | 
|  | 221                     nbNoCat=0 | 
|  | 222                     lConsensus = line.split() | 
|  | 223                     for consensus in lConsensus: | 
|  | 224                         classifInfos = consensus.split("_")[0] | 
|  | 225 | 
|  | 226                         if "-incomp" in classifInfos: | 
|  | 227                             nbIncomp += 1 | 
|  | 228                             classifInfos = classifInfos.replace("-incomp","") | 
|  | 229                         if "-comp" in classifInfos: | 
|  | 230                             nbComp += 1 | 
|  | 231                             classifInfos = classifInfos.replace("-comp","") | 
|  | 232                         if "-chim" in classifInfos: | 
|  | 233                             nbChim += 1 | 
|  | 234                             classifInfos = classifInfos.replace("-chim","") | 
|  | 235                         if "noCat" in classifInfos: | 
|  | 236                             nbNoCat += 1 | 
|  | 237                             classifInfos = classifInfos.replace("noCat","") | 
|  | 238 | 
|  | 239                         classif = classifInfos.split("-")[-1] | 
|  | 240                         if classif != "": | 
|  | 241                             if dClassifNb.get(classif, None) is None: | 
|  | 242                                 dClassifNb[classif] = 0 | 
|  | 243                             dClassifNb[classif] +=1 | 
|  | 244 | 
|  | 245                     occurences= [] | 
|  | 246                     for classif, occs in dClassifNb.items(): | 
|  | 247                         occurences.append("%s (%d)" % (classif, occs)) | 
|  | 248 | 
|  | 249                     f.write("%d\t%d\t%d\t%d\t%d\t%s\n" % (clusterId+1, nbNoCat, nbChim\ | 
|  | 250                                         , nbComp, nbIncomp,"\t".join(occurences))) | 
|  | 251 | 
|  | 252     def _giveStatsOnTEClusters(self): | 
|  | 253         with open(self._clusterFileName) as fCluster: | 
|  | 254             with open(self._statPerClusterFileName, 'w') as fStatPerCluster: | 
|  | 255                 fStatPerCluster.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n") | 
|  | 256                 line = fCluster.readline() | 
|  | 257                 clusterNb = 0 | 
|  | 258                 clusterSeqList= line.split() | 
|  | 259                 minClusterSize = len(clusterSeqList) | 
|  | 260                 maxClusterSize = 0 | 
|  | 261                 totalSeqNb = 0 | 
|  | 262                 seqNbInBigClusters = 0 | 
|  | 263                 dClusterSize2ClusterNb = {1:0, 2:0, 3:0} | 
|  | 264                 while line: | 
|  | 265                     clusterSeqList= line.split() | 
|  | 266                     seqNb = len(clusterSeqList) | 
|  | 267                     totalSeqNb += seqNb | 
|  | 268                     if seqNb > 2: | 
|  | 269                         seqNbInBigClusters += seqNb | 
|  | 270                         dClusterSize2ClusterNb[3] += 1 | 
|  | 271                     else: | 
|  | 272                         dClusterSize2ClusterNb[seqNb] += 1 | 
|  | 273                     if seqNb > maxClusterSize: | 
|  | 274                         maxClusterSize = seqNb | 
|  | 275                     if seqNb < minClusterSize: | 
|  | 276                         minClusterSize = seqNb | 
|  | 277                     line = fCluster.readline() | 
|  | 278                     clusterNb += 1 | 
|  | 279                     clusterSeqLengths = self._iBioseqDB.getSeqLengthByListOfName(clusterSeqList) | 
|  | 280                     iStatSeqLengths = Stat(clusterSeqLengths) | 
|  | 281                     fStatPerCluster.write("%d\t%d\t%d\t%d\t%d\t%d\n" %(clusterNb, seqNb, min(clusterSeqLengths), max(clusterSeqLengths),  iStatSeqLengths.mean(),  iStatSeqLengths.median())) | 
|  | 282 | 
|  | 283         with open(self._globalStatPerClusterFileName, 'w') as fG: | 
|  | 284             fG.write("nb of clusters: %d\n" % clusterNb) | 
|  | 285             fG.write("nb of clusters with 1 sequence: %d\n" % dClusterSize2ClusterNb[1]) | 
|  | 286             fG.write("nb of clusters with 2 sequences: %d\n" % dClusterSize2ClusterNb[2]) | 
|  | 287             fG.write("nb of clusters with >2 sequences: %d (%d sequences)\n" % (dClusterSize2ClusterNb[3], seqNbInBigClusters)) | 
|  | 288             fG.write("nb of sequences: %d\n" % totalSeqNb) | 
|  | 289             fG.write("nb of sequences in the largest cluster: %d\n" % maxClusterSize) | 
|  | 290             fG.write("nb of sequences in the smallest cluster: %d\n" % minClusterSize) | 
|  | 291             lSeqSizes = self._iBioseqDB.getListOfSequencesLength() | 
|  | 292             iStat = Stat(lSeqSizes) | 
|  | 293             fG.write("size of the smallest sequence: %d\n" % min(lSeqSizes)) | 
|  | 294             fG.write("size of the largest sequence: %d\n" % max(lSeqSizes)) | 
|  | 295             fG.write("average sequences size: %d\n" % iStat.mean()) | 
|  | 296             fG.write("median sequences size: %d\n" % iStat.median()) | 
|  | 297 | 
|  | 298 if __name__ == "__main__": | 
|  | 299     iLaunch = PostAnalyzeTELib() | 
|  | 300     iLaunch.setAttributesFromCmdLine() | 
|  | 301     iLaunch.run() |