Mercurial > repos > vmarcon > repet_teannot
diff TEannot_lite.py @ 0:b126ea31824f draft default tip
1st Uploaded
| author | vmarcon |
|---|---|
| date | Mon, 06 Feb 2017 13:37:49 -0500 |
| parents | |
| children |
line wrap: on
line diff
#!/usr/bin/env python
# Reconstructed from the changeset diff of b/TEannot_lite.py
# (new file, hunk "@@ -0,0 +1,245 @@").
"""Light-weight launcher for the REPET TEannot pipeline.

The script writes a TEannot configuration file from the template shipped in
$REPET_PATH/config, runs the TEannot.py steps one after the other, merges the
per-sequence GFF3 files, exports masked fasta files and, optionally, a
statistics file.  Database/job tables of the project are dropped at the end.
"""

import os
import sys
import time
import glob
import shutil
import subprocess
import ConfigParser
import re

# --- Environment sanity checks (run at import time, before the REPET
# --- package can be imported).
if "REPET_PATH" not in os.environ:
    print("ERROR: no environment variable REPET_PATH")
    sys.exit(1)

_DB_ENV_VARS = ("REPET_DB", "REPET_HOST", "REPET_PORT", "REPET_USER", "REPET_PW")
if any(varName not in os.environ for varName in _DB_ENV_VARS):
    print("ERROR: there is at least one environment database variable missing : REPET_DB, REPET_PORT, REPET_HOST, REPET_USER or REPET_PW")
    sys.exit(1)

if "REPET_JOB_MANAGER" not in os.environ:
    print("ERROR: no environment variable REPET_JOB_MANAGER")
    sys.exit(1)


# Make the REPET executables reachable.  Compare whole PATH entries: a plain
# substring test would wrongly accept e.g. "<REPET_PATH>/bin2".
_repetBinDir = "%s/bin" % os.environ["REPET_PATH"]
if _repetBinDir not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] = "%s/bin:%s" % (os.environ["REPET_PATH"], os.environ["PATH"])

# Make the REPET python package importable here and in child processes.
sys.path.append(os.environ["REPET_PATH"])
if "PYTHONPATH" not in os.environ:
    os.environ["PYTHONPATH"] = os.environ["REPET_PATH"]
else:
    os.environ["PYTHONPATH"] = "%s:%s" % (os.environ["REPET_PATH"], os.environ["PYTHONPATH"])


from commons.core.LoggerFactory import LoggerFactory
from commons.core.checker.RepetException import RepetException
from commons.core.utils.FileUtils import FileUtils
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.seq.FastaUtils import * #FastaUtils
from commons.core.sql.DbFactory import DbFactory

LOG_DEPTH = "TEannot.pipeline"


class TEannot_lite(object):
    """Write a TEannot configuration and run the whole TEannot pipeline."""

    def __init__(self, configFileName = "", fastaFileName = "", libraryFileName = "", verbosity = 0):
        """Store the inputs and derive a time-stamped project name.

        @param configFileName: name of the configuration file to write
        @param fastaFileName: input fasta file (stored as an absolute path)
        @param libraryFileName: input library fasta file (stored as an absolute path)
        @param verbosity: verbosity level given to the logger and to the tools
        """
        self._configFileName = configFileName
        self._fastaFileName = os.path.abspath(fastaFileName)
        self._libraryFileName = os.path.abspath(libraryFileName)
        # The project name is the launch time stamp (YYYYmmddHHMMSS).
        self._projectName = time.strftime("%Y%m%d%H%M%S")
        self._outputGff = ""
        self._classif = ""
        self._statsFile = ""
        self._outputMasked = ""
        self._tmp_dir = os.environ.get("REPET_TMP_DIR", "")
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)

    def setAttributesFromCommandLine(self):
        """Parse the command line and set the attributes from its options."""
        description = "This script is a ligth version of TEannot. It writes configuration file and launches TEannot."
        epilog = "Example: TEannot_lite.py -i fastaFileName -l fastaLibraryFileName \n"
        version = "1.1"
        parser = RepetOptionParser(description = description, epilog = epilog, version = version)
        parser.add_option("-i", "--fasta", dest = "fastaFileName", action = "store", type = "string", help ="Input fasta file name ", default = "")
        parser.add_option("-l", "--lib", dest = "libraryFileName", action = "store", type = "string", help ="Input fasta library file name ", default = "")
        parser.add_option("-c", "--withClassif", dest = "withClassif", action = "store", type = "string", metavar="CLASSIFFILE", help ="[optional] To add classification informations in GFF3 file, please put classif file from TEdenovo step. ", default = "")
        parser.add_option("-s", "--stats", dest="withStats", action="store_true", help = " Get statistical file in output.", default = False)
        parser.add_option("-o", "--output", dest="outputLabel", action = "store", type = "string", help = " [optional] Label for GFF3 output file", default = "")
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", metavar="2", help = "Verbosity [optional] [default: 2]", default = 2)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    @staticmethod
    def _getFastaBaseName(fastaFileName):
        """Return the file name of fastaFileName without directory and last extension.

        Both '/' and '\\' are treated as directory separators.  A name without
        any extension is returned unchanged instead of raising.
        """
        fileName = re.split(r"[/\\]", fastaFileName)[-1]
        return os.path.splitext(fileName)[0]

    def _setAttributesFromOptions(self, options):
        """Validate the parsed options and compute every output file name.

        Exits with status 1 when the input fasta or the library is missing.
        All output paths are made absolute here, i.e. relative to the
        directory the script was launched from (run() changes directory later).
        """
        self.setConfigFileName("")
        if options.fastaFileName == "":
            print("ERROR : You have to enter an input fasta file")
            print("Example: TEannot_lite.py -i fastaFileName -l fastaLibraryFileName \n")
            print("More option : TEannot_lite.py --help ")
            sys.exit(1)
        self._fastaFileName = os.path.abspath(options.fastaFileName)

        if options.libraryFileName == "":
            print("ERROR : You have to enter an input libary fasta file")
            print("Example: TEannot_lite.py -i fastaFileName -l fastaLibraryFileName \n")
            print("More option : TEannot_lite.py --help ")
            sys.exit(1)
        self._libraryFileName = os.path.abspath(options.libraryFileName)

        outputLabel = options.outputLabel
        if outputLabel == "":
            # Default label: the fasta base name, placed in the launch directory.
            outputLabel = os.path.abspath(self._getFastaBaseName(options.fastaFileName))

        # Output names are suffixed with the launch date (YYYYmmdd).
        dateStamp = self._projectName[:8]
        self._outputGff = os.path.abspath("%s-%s.gff3" % (outputLabel, dateStamp))
        self._outputMasked = os.path.abspath("%s-%s.mask" % (outputLabel, dateStamp))

        if options.withClassif != "":
            self._classif = os.path.abspath(options.withClassif)
        if options.withStats:
            self._statsFile = os.path.abspath("%s-%s-TEstats.txt" % (outputLabel, dateStamp))
        self._verbosity = options.verbosity

    def setConfigFileName(self, configFileName):
        """Set the configuration file name; an empty name selects a project-specific default."""
        self._configFileName = configFileName
        if not self._configFileName:
            self._configFileName = "TEannot_Galaxy_config_%s" % self._projectName

    def setAttributesFromConfigFile(self, configFileName):
        """Parse configFileName with ConfigParser.

        The parsed values are not stored; the call only fails loudly when the
        file is missing or cannot be parsed.
        """
        config = ConfigParser.ConfigParser()
        with open(configFileName) as configFile:
            config.readfp(configFile)

    def _logAndRaise(self, errorMsg):
        """Log errorMsg as an error and raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def _writeConfigFile(self):
        """Write the project configuration from the $REPET_PATH/config/TEannot.cfg template.

        Placeholders of the template are replaced by the values taken from the
        environment and from this instance.  The replacements are literal, so
        values containing characters such as '|', '&' or quotes are safe.

        Raises an Exception when the configuration file already exists.
        """
        if FileUtils.isRessourceExists(self._configFileName):
            self._logAndRaise("Configuration file '%s' already exists. Won't be overwritten." % self._configFileName)

        # (text to find, replacement, replace every occurrence on the line?)
        lSubstitutions = [
            ("repet_host: <your_MySQL_host>", "repet_host: %s" % os.environ["REPET_HOST"], False),
            ("repet_user: <your_MySQL_login>", "repet_user: %s" % os.environ["REPET_USER"], False),
            ("repet_pw: <your_MySQL_password>", "repet_pw: %s" % os.environ["REPET_PW"], False),
            ("repet_db: <your_MySQL_db>", "repet_db: %s" % os.environ["REPET_DB"], False),
            ("repet_port: 3306", "repet_port: %s" % os.environ["REPET_PORT"], False),
            ("repet_job_manager: SGE", "repet_job_manager: %s" % os.environ["REPET_JOB_MANAGER"], False),
            ("project_name: <your_project_name>", "project_name: %s" % self._projectName, False),
            ("project_dir: <absolute_path_to_your_project_directory>", "project_dir: %s" % os.getcwd(), False),
            ("do_join: yes", "do_join: no", False),
            ("add_SSRs: no", "add_SSRs: yes", False),
            ("gff3_compulsory_match_part: no", "gff3_compulsory_match_part: yes", False),
            ("BLR_sensitivity: 3", "BLR_sensitivity: 2", False),
            ("tmpDir:", "tmpDir: %s" % self._tmp_dir, True),
        ]
        if self._classif != "":
            lSubstitutions.append(("gff3_with_classif_info: no", "gff3_with_classif_info: yes", False))
            lSubstitutions.append(("classif_table_name: <name_of_TEs_table>", "classif_table_name: %s_consensus_classif" % self._projectName, False))

        templateFileName = "%s/config/TEannot.cfg" % os.environ.get("REPET_PATH")
        with open(templateFileName) as templateFile:
            lLines = templateFile.readlines()

        with open(self._configFileName, "w") as configFile:
            for line in lLines:
                # Substitutions are applied in order, line by line.
                for oldText, newText, replaceAll in lSubstitutions:
                    if replaceAll:
                        line = line.replace(oldText, newText)
                    else:
                        line = line.replace(oldText, newText, 1)
                configFile.write(line)

        # Fail early if the resulting file cannot be parsed.
        self.setAttributesFromConfigFile(self._configFileName)

    def _mergeOutputGff(self):
        """Concatenate the GFF3 files of '<project>_GFF3chr/' into the output GFF3.

        Every line containing a '#' is dropped, a single '##gff-version 3'
        header is written, and '<project>_REPET_TEs' is renamed 'REPET_TEs'.
        Input files are merged in sorted order so the output is reproducible.
        """
        directory = "%s_GFF3chr/" % self._projectName
        projectSource = "%s_REPET_TEs" % self._projectName
        with open(self._outputGff, "w") as outFile:
            outFile.write("##gff-version 3\n")
            for gffFileName in sorted(glob.glob("%s*.gff3" % directory)):
                with open(gffFileName) as inFile:
                    for line in inFile:
                        if "#" in line:
                            continue
                        if not line.endswith("\n"):
                            # Keep records of consecutive files on separate lines.
                            line += "\n"
                        outFile.write(line.replace(projectSource, "REPET_TEs"))

    def _runCommand(self, lArgs):
        """Run one external command given as an argument list; return its exit code.

        Returns 127 when the program cannot be started at all.
        """
        try:
            return subprocess.call(lArgs)
        except OSError as e:
            print("ERROR: cannot execute '%s': %s" % (lArgs[0], e))
            return 127

    def _runCommands(self, lCmds):
        """Run the commands in order; on the first failure clean the tables and exit(1)."""
        for lArgs in lCmds:
            returnValue = self._runCommand(lArgs)
            if returnValue != 0:
                print("ERROR: command '%s' returned %i" % (" ".join(lArgs), returnValue))
                self._cleanTables()
                sys.exit(1)

    def _launchTEannot(self):
        """Run the TEannot.py steps of the pipeline, in a fixed order."""
        print("START time: %s" % time.strftime("%Y-%m-%d %H:%M:%S"))

        lStepOptions = [
            "-S 1",
            "-S 2 -a BLR",
            "-S 2 -a RM",
            "-S 2 -a CEN",
            "-S 2 -a BLR -r",
            "-S 2 -a RM -r",
            "-S 2 -a CEN -r",
            "-S 4 -s TRF",
            "-S 4 -s RMSSR",
            "-S 4 -s Mreps",
            "-S 5",
            "-S 3 -c BLR+RM+CEN",
            "-S 7",
        ]
        lBase = ["TEannot.py", "-P", self._projectName, "-C", self._configFileName]
        lVerbosity = ["-v", "%i" % self._verbosity]
        lCmds = [lBase + stepOptions.split() + lVerbosity for stepOptions in lStepOptions]
        lCmds.append(lBase + ["-S", "8"] + lVerbosity + ["-o", "GFF3"])

        # The classification table is loaded before any step is run.
        if self._classif != "":
            self._setClassifTable()

        self._runCommands(lCmds)

        print("END time: %s" % time.strftime("%Y-%m-%d %H:%M:%S"))

    def _maskFasta(self):
        """Export the annotation tables and write the masked fasta files.

        The input fasta is masked with the exported 'path' table, then the
        result is masked again with the exported SSR 'set' table into
        '<outputMasked>_SSRmask.fa'.  The exported '_tmp.path' / '_tmp.set'
        files are left in place.
        """
        pathFile = self._outputMasked + "_tmp.path"
        setFile = self._outputMasked + "_tmp.set"
        verbosity = str(self._verbosity)
        lCmds = [
            ["srptExportTable.py", "-i", "%s_chr_allTEs_nr_noSSR_path" % self._projectName,
             "-C", self._configFileName, "-o", pathFile, "-v", verbosity],
            ["MaskSeqFromCoord.py", "-i", self._fastaFileName, "-m", pathFile,
             "-f", "path", "-X", "-o", self._outputMasked, "-v", verbosity],
            ["srptExportTable.py", "-i", "%s_chr_allSSRs_set" % self._projectName,
             "-C", self._configFileName, "-o", setFile, "-v", verbosity],
            ["MaskSeqFromCoord.py", "-i", self._outputMasked, "-m", setFile,
             "-f", "set", "-X", "-o", "%s_SSRmask.fa" % self._outputMasked, "-v", verbosity],
        ]
        self._runCommands(lCmds)

    def _createStatsFile(self):
        """Run PostAnalyzeTELib.py and move its per-TE statistics to the stats file.

        Best effort: a failure is reported but does not stop the run, so the
        final table clean-up still happens.
        """
        with open(self._fastaFileName) as fastaFile:
            fastaLength = FastaUtils.dbCumLength(fastaFile)

        pathTableName = "%s_chr_allTEs_nr_noSSR_path" % self._projectName
        self._runCommand(["PostAnalyzeTELib.py", "-a", "3", "-g", str(fastaLength),
                          "-p", pathTableName, "-s", "%s_refTEs_seq" % self._projectName])

        producedFile = "%s.globalAnnotStatsPerTE.txt" % pathTableName
        if os.path.exists(producedFile):
            shutil.move(producedFile, self._statsFile)
        else:
            print("ERROR: statistics file '%s' was not produced" % producedFile)

    def _setClassifTable(self):
        """Load the classif file into the '<project>_consensus_classif' table."""
        iDb = DbFactory.createInstance()
        iDb.createTable("%s_consensus_classif" % self._projectName, "classif", self._classif, True)
        iDb.close()

    def _launchListAndDropTables(self):
        """Run ListAndDropTables.py with the project name as the '-d' argument."""
        self._runCommand(["ListAndDropTables.py", "-C", self._configFileName, "-d", self._projectName])

    def _cleanJobsTable(self):
        """Delete from the 'jobs' table the rows whose groupid starts with the project name."""
        db = DbFactory.createInstance( configFileName = self._configFileName )
        sql_cmd = "DELETE FROM jobs WHERE groupid like '%s%%';" % self._projectName
        db.execute( sql_cmd )
        db.close()

    def _cleanTables(self):
        """Drop the project tables, then purge the project jobs."""
        self._launchListAndDropTables()
        self._cleanJobsTable()

    def run(self):
        """Create the project directory and run the whole pipeline inside it."""
        # Create the project directory and work from there.
        os.mkdir(self._projectName)
        os.chdir(self._projectName)
        self._writeConfigFile()
        projectDir = os.getcwd()
        # Inputs are symlinked under the names derived from the project name.
        os.symlink(self._fastaFileName, "%s/%s.fa" % (projectDir, self._projectName))
        os.symlink(self._libraryFileName, "%s/%s_refTEs.fa" % (projectDir, self._projectName))
        self._launchTEannot()
        self._mergeOutputGff()
        self._maskFasta()
        if self._statsFile:
            self._createStatsFile()
        self._cleanTables()


if __name__ == '__main__':
    iTEannot = TEannot_lite()
    iTEannot.setAttributesFromCommandLine()
    iTEannot.run()
