Mercurial > repos > yufei-luo > s_mart
diff commons/tools/GetSpecificTELibAccordingToAnnotation.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/GetSpecificTELibAccordingToAnnotation.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,178 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
import os
import sys
from commons.core.sql.DbMySql import DbMySql
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.utils.FileUtils import FileUtils
from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
from commons.core.LoggerFactory import LoggerFactory

LOG_DEPTH = "repet.tools"
LOG_FORMAT = "%(message)s"
#TODO: use configuration file

## Get 3 annotation files, using output from TEannot:
#- consensus with one or more full length copy,
#- consensus with one or more full length fragment,
#- consensus without copy

class GetSpecificTELibAccordingToAnnotation(object):
    """Split a GiveInfoTEannot "statsPerTE" file into three sub-files and
    export one FASTA library per sub-file.

    A row of the input file is copied to a sub-file when the integer found in
    the matching column is strictly positive:

    - column "fullLgthCopies" -> <input basename>_FullLengthCopy.txt
    - column "copies"         -> <input basename>_OneCopyAndMore.txt
    - column "fullLgthFrags"  -> <input basename>_FullLengthFrag.txt

    For each sub-file, the first token of every row is collected and handed
    to TableSeqAdaptator.saveAccessionsListInFastaFile() together with a
    "<sub-file basename>.fa" output name.
    """

    # (column read in the input file, suffix of the sub-file it feeds).
    # The order is the order in which sub-files and FASTA files are produced.
    _CATEGORIES = (
        ("fullLgthCopies", "FullLengthCopy"),
        ("copies", "OneCopyAndMore"),
        ("fullLgthFrags", "FullLengthFrag"),
    )

    def __init__(self, inInfoFileName = "", tableName = "", verbose = 0):
        """Store the parameters and create the logger.

        @param inInfoFileName: path of the GiveInfoTEannot output file
        @param tableName: name of the sequence table holding the TEs
        @param verbose: verbosity level given to the logger
        """
        self._inInfoFileName = inInfoFileName
        self._tableName = tableName
        self._verbose = verbose
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbose, LOG_FORMAT)

    def setAttributesFromCmdLine(self):
        """Parse the command line (-i, -t, -v) and store the values on self."""
        desc = "Splits a GiveInfoTEannot \"statsPerTE.txt\" file in 3 subfiles containing consensus which have at least one copy, one full length fragment or one full length copy. "
        desc += "A TEs library is built according to each category. Connection to the database parameters are retrieved from the environment"

        examples = "\nExample : with a project called \"MyTEannotAnalysis\":\n"
        examples += "\t$ python GetSpecificTELibAccordingToAnnotation.py -i MyTEannotAnalysis_chr_allTEs_nr_noSSR_join_path_statsPerTE.txt -t MyTEannotAnalysis_refTEs_seq"
        examples += "\n\t"
        examples += "\n\n"

        parser = RepetOptionParser(description = desc, epilog = examples)
        parser.add_option("-i", "--file", dest = "inInfoFileName", action = "store", type = "string", help = "input file (mandatory) = output file from GiveInfoTEannot.py (e.g. <project_name>_chr_allTEs_nr_noSSR_join_path_statsPerTE.txt)", default = "")
        parser.add_option("-t", "--table", dest = "tableName", action = "store", type = "string", help = "table name of TEs sequences (mandatory, seq format, e.g. <project_name>_refTEs_seq)", default = "")
        parser.add_option("-v", "--verbose", dest = "verbose", action = "store", type = "int", help = "verbosity level (default=0, else 1)", default = 0)
        # Positional arguments are not used by this tool.
        options, _ = parser.parse_args()
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy the parsed option values onto the instance."""
        self.setInInfoFileName(options.inInfoFileName)
        self.setTableName(options.tableName)
        self.setVerbose(options.verbose)

    def setTableName(self, tableName):
        """Set the name of the TE sequence table."""
        self._tableName = tableName

    def setInInfoFileName(self, inInfoFileName):
        """Set the path of the GiveInfoTEannot output file to split."""
        self._inInfoFileName = inInfoFileName

    def setVerbose(self, verbose):
        """Set the verbosity level (applied to the logger by run())."""
        self._verbose = verbose

    def checkOptions(self):
        """Validate the input file and the sequence table.

        The -i option is checked first, then the -t option.

        @raise Exception: if an option is empty, if the input file does not
            exist, or if the table does not exist in the database
        """
        if self._inInfoFileName == "":
            self._logAndRaise("ERROR: No specified -i option!")
        # The attribute holding the input path is _inInfoFileName; the
        # previously used _inClassifFileName is never defined on this class.
        if not FileUtils.isRessourceExists(self._inInfoFileName):
            self._logAndRaise("ERROR: Input GiveInfoTEannot.txt output file does not exist!")

        if self._tableName == "":
            self._logAndRaise("ERROR: No specified -t option!")
        iDb = DbMySql()
        try:
            tableExists = iDb.doesTableExist(self._tableName)
        finally:
            # Release the connection before raising, not only on success.
            iDb.close()
        if not tableExists:
            self._logAndRaise("ERROR: table does not exist!")

    def _logAndRaise(self, errorMsg):
        """Log errorMsg at error level, then raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def writeFastaFileFromGiveInfoTEAnnot(self, fileName):
        """Export the consensus listed in fileName to "<basename>.fa".

        The first line of fileName is treated as a header and skipped; the
        first whitespace-separated token of every other non-blank line is
        used as an accession.

        @param fileName: path of a sub-file written by run()
        """
        with open(fileName, "r") as fFileHandler:
            next(fFileHandler, None)  # skip the header line, if any
            # Blank lines carry no accession and would break split()[0].
            lConsensusName = [line.split()[0] for line in fFileHandler if line.strip()]

        outPutFileName = "%s.fa" % os.path.splitext(fileName)[0]
        iDb = DbMySql()
        try:
            iTSA = TableSeqAdaptator(iDb, self._tableName)
            # presumably writes the sequences of these accessions as FASTA;
            # see TableSeqAdaptator for the exact behaviour
            iTSA.saveAccessionsListInFastaFile(lConsensusName, outPutFileName)
        finally:
            iDb.close()

    def _splitRows(self, inFileFh, headerLine, lHeaders, lOutputs):
        """Copy each remaining row of inFileFh to every sub-file it belongs to.

        @param inFileFh: input file handle, positioned after the header line
        @param headerLine: raw header line, written first in each sub-file
        @param lHeaders: column names, in file order
        @param lOutputs: list of (column name, sub-file name) pairs
        """
        lHandlers = []
        try:
            for column, fileName in lOutputs:
                handler = open(fileName, "w")
                lHandlers.append((column, handler))
                handler.write(headerLine)

            for line in inFileFh:
                lTokens = line.split()
                if not lTokens:
                    continue  # ignore blank lines
                # zip() stops at the shortest list, so surplus tokens on a
                # row are ignored instead of overflowing the header list.
                dTokens = dict(zip(lHeaders, lTokens))
                for column, handler in lHandlers:
                    if int(dTokens[column]) > 0:
                        handler.write(line)
        finally:
            for _, handler in lHandlers:
                handler.close()

    def run(self):
        """Write the three sub-files, then one FASTA file per sub-file.

        Sub-files are created in the current working directory. Options are
        not validated here; call checkOptions() beforehand if validation is
        wanted.

        @return: 0 on success
        @raise Exception: if a required column is missing from the header
        """
        LoggerFactory.setLevel(self._log, self._verbose)
        self._log.info("START GetSpecificTELibAccordingToAnnotation\n input info file: %s" % self._inInfoFileName)

        baseName = os.path.splitext(os.path.basename(self._inInfoFileName))[0]
        lOutputs = [(column, "%s_%s.txt" % (baseName, suffix)) for column, suffix in self._CATEGORIES]

        # The input is opened and validated before any output is created, so
        # a missing or malformed input does not leave empty sub-files behind.
        with open(self._inInfoFileName, "r") as inFileFh:
            headerLine = next(inFileFh, "")
            lHeaders = headerLine.split()
            # All three columns are read below, so all three are required.
            if [column for column, _ in lOutputs if column not in lHeaders]:
                self._logAndRaise("ERROR: No headers in %s!" % self._inInfoFileName )
            self._splitRows(inFileFh, headerLine, lHeaders, lOutputs)

        for _, outInfoFileName in lOutputs:
            self.writeFastaFileFromGiveInfoTEAnnot(outInfoFileName)

        self._log.info("END GetSpecificTELibAccordingToAnnotation\n" )

        return 0

if __name__ == '__main__':
    iGetTELib = GetSpecificTELibAccordingToAnnotation()
    iGetTELib.setAttributesFromCmdLine()
    iGetTELib.run()