Mercurial > repos > vmarcon > repet_tedenovo
diff TEdenovo_lite.py @ 0:baea09e6722b draft default tip
1st Uploaded
author | vmarcon |
---|---|
date | Mon, 06 Feb 2017 13:31:53 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEdenovo_lite.py Mon Feb 06 13:31:53 2017 -0500 @@ -0,0 +1,406 @@ +#!/usr/bin/env python + + +import os +import sys +import time +import glob +import shutil +import ConfigParser +from commons.core.seq.FastaUtils import * +import operator +import re + + + +if not "REPET_PATH" in os.environ.keys(): + print "ERROR: no environment variable REPET_PATH" + sys.exit(1) + +if (not "REPET_DB" in os.environ.keys()) or (not "REPET_HOST" in os.environ.keys()) or (not "REPET_PORT" in os.environ.keys()) or (not "REPET_USER" in os.environ.keys()) or (not "REPET_PW" in os.environ.keys()): + print "ERROR: there is at least one environment database variable missing : REPET_DB, REPET_PORT, REPET_HOST, REPET_USER or REPET_PW" + sys.exit(1) + +if not "REPET_JOB_MANAGER" in os.environ.keys(): + print "ERROR: no environment variable REPET_JOB_MANAGER" + sys.exit(1) + + +if not "%s/bin" % os.environ["REPET_PATH"] in os.environ["PATH"]: + os.environ["PATH"] = "%s/bin:%s" % (os.environ["REPET_PATH"], os.environ["PATH"]) + +sys.path.append(os.environ["REPET_PATH"]) +if not "PYTHONPATH" in os.environ.keys(): + os.environ["PYTHONPATH"] = os.environ["REPET_PATH"] +else: + os.environ["PYTHONPATH"] = "%s:%s" % (os.environ["REPET_PATH"], os.environ["PYTHONPATH"]) + +from commons.core.LoggerFactory import LoggerFactory +from commons.core.checker.RepetException import RepetException +from commons.core.utils.FileUtils import FileUtils +from commons.core.utils.RepetOptionParser import RepetOptionParser +from commons.core.seq.FastaUtils import FastaUtils +from commons.core.sql.DbFactory import DbFactory +from itertools import islice + +LOG_DEPTH = "TEdenovo.pipeline" + +class TEdenovo_lite(object): + + def __init__(self, configFileName = "", fastaFileName = "", verbosity = 0): + self._configFileName = configFileName + self._fastaFileName = os.path.abspath(fastaFileName) + self._projectName = time.strftime("%Y%m%d%H%M%S") + self._limitSeqSize = 200000000 + + if "REPET_NUCL_BANK" in os.environ.keys(): + if os.path.exists(os.environ["REPET_NUCL_BANK"]): + self._nucl_bank = os.environ["REPET_NUCL_BANK"] + else : + print "ERROR : the nucleotides bank configured doesn't exist. Please correct it in the REPET_NUCL_BANK variable" + sys.exit(1) + else : + self._nucl_bank = "" + if "REPET_PROT_BANK" in os.environ.keys(): + if os.path.exists(os.environ["REPET_PROT_BANK"]): + self._prot_bank = os.environ["REPET_PROT_BANK"] + else : + print "ERROR : the proteins bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable" + sys.exit(1) + else : + self._prot_bank = "" + if "REPET_HMM_PROFILES" in os.environ.keys(): + if os.path.exists(os.environ["REPET_HMM_PROFILES"]): + self._HMM_profiles = os.environ["REPET_HMM_PROFILES"] + else : + print "ERROR : the hmm profiles bank configured doesn't exist. Please correct it in the REPET_HMM_PROFILES variable" + sys.exit(1) + else : + self._HMM_profiles = "" + if "REPET_RDNA_BANK" in os.environ.keys(): + if os.path.exists(os.environ["REPET_RDNA_BANK"]): + self._rdna_bank = os.environ["REPET_RDNA_BANK"] + else : + print "ERROR : the rDNA bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable" + sys.exit(1) + else : + self._rdna_bank = "" + if self._nucl_bank == "" and self._prot_bank == "" and self._HMM_profiles == "" and self._rdna_bank == "" : + print "WARNING : No bank are configured ... To set banks please add REPET_NUCL_BANK, REPET_PROT_BANK, REPET_HMM_PROFILES and/or REPET_RDNA_BANK in your environment" + if "REPET_TMP_DIR" in os.environ.keys(): + self._tmp_dir = os.environ["REPET_TMP_DIR"] + else : + self._tmp_dir = "" + self._outputFasta = "" + self._classif = False + self._outputClassif = "" + self._outputStats = "" + self._verbosity = verbosity + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) + + def setAttributesFromCommandLine(self): + description = "This script is a ligth version of TEdenovo. It writes configuration file and launches TEdenovo." + epilog = "Example: TEdenovo_lite.py -i fastaFileName \n" + version = "2.0" + parser = RepetOptionParser(description = description, epilog = epilog, version = version) + parser.add_option("-i", "--fasta", dest = "fastaFileName" , action = "store" , type = "string", help ="input fasta file name ", default = "") + parser.add_option("-c", "--withClassif", dest="withClassif", action="store_true", help = " Get classification files in output.", default = False) + parser.add_option("-o", "--output", dest="outputLabel" , action = "store", type = "string", help = "[optional] Prefix label for output file(s).", default = "") + parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "Verbosity [optional] [default: 2]", default = 2) + options = parser.parse_args()[0] + self._setAttributesFromOptions(options) + + def _setAttributesFromOptions(self, options): + self.setConfigFileName("") + if options.fastaFileName=="": + print "ERROR : You have to enter an input fasta file" + print "Example: TEdenovo_lite.py -i fastaFileName \n" + print "More option : TEdenovo_lite.py --help " + exit(1) + else : + self._fastaFileName = os.path.abspath(options.fastaFileName) + if options.outputLabel=="": + fastaBaseName=os.path.abspath(re.search(r'([^\/\\]*)\.[fa|fasta|fsa|fas]',options.fastaFileName).groups()[0]) + options.outputLabel=fastaBaseName + self._outputFasta = os.path.abspath(options.outputLabel+"-%s-denovoLibTEs_filtered.fa"%self._projectName[:8]) + self._outputStats = os.path.abspath(options.outputLabel+"-%s-classif_stats.txt"%self._projectName[:8]) + self._verbosity = options.verbosity + if options.withClassif : + self._classif=True + self._outputClassif = os.path.abspath(options.outputLabel+'-%s.classif'%self._projectName[:8]) + + def setConfigFileName(self, configFileName): + self._configFileName = configFileName + if not self._configFileName: + self._configFileName = "TEdenovo_Galaxy_config_%s" % self._projectName + + def setAttributesFromConfigFile(self, configFileName): + config = ConfigParser.ConfigParser() + config.readfp( open(configFileName) ) + + + def _writeConfigFile(self): + if FileUtils.isRessourceExists(self._configFileName): + self._logAndRaise("Configuration file '%s' already exists. Won't be overwritten.") + + shutil.copy("%s/config/TEdenovo.cfg" % os.environ.get("REPET_PATH"), self._configFileName) + self.setAttributesFromConfigFile(self._configFileName) + + os.system("sed -i 's|repet_host: <your_MySQL_host>|repet_host: %s|' %s" % (os.environ["REPET_HOST"], self._configFileName)) + os.system("sed -i 's|repet_user: <your_MySQL_login>|repet_user: %s|' %s" % (os.environ["REPET_USER"], self._configFileName)) + os.system("sed -i 's|repet_pw: <your_MySQL_password>|repet_pw: %s|' %s" % (os.environ["REPET_PW"], self._configFileName)) + os.system("sed -i 's|repet_db: <your_MySQL_db>|repet_db: %s|' %s" % (os.environ["REPET_DB"], self._configFileName)) + os.system("sed -i 's|repet_port: 3306|repet_port: %s|' %s" % (os.environ["REPET_PORT"], self._configFileName)) + os.system("sed -i 's|repet_job_manager: SGE|repet_job_manager: %s|' %s" % (os.environ["REPET_JOB_MANAGER"], self._configFileName)) + os.system("sed -i 's|project_name: <your_project_name>|project_name: %s|' %s" % (self._projectName, self._configFileName)) + os.system("sed -i 's|project_dir: <absolute_path_to_your_project_directory>|project_dir: %s|' %s" % (os.getcwd().replace("/", "\/"), self._configFileName)) + os.system("sed -i 's|tmpDir:|tmpDir: %s|g' %s" % (self._tmp_dir, self._configFileName)) + + if self._nucl_bank != "" and self._nucl_bank != None: + os.system("sed -i 's|TE_BLRn: no|TE_BLRn: yes|' %s" % self._configFileName) + os.system("sed -i 's|TE_BLRtx: no|TE_BLRtx: yes|' %s" % self._configFileName) + os.system("sed -i 's|TE_nucl_bank: <bank_of_TE_nucleotide_sequences_such_as_Repbase>|TE_nucl_bank: %s|' %s" % (os.path.basename(self._nucl_bank), self._configFileName)) + + if self._prot_bank != "" and self._prot_bank != None: + os.system("sed -i 's|TE_BLRx: no|TE_BLRx: yes|' %s" % self._configFileName) + os.system("sed -i 's|TE_prot_bank: <bank_of_TE_amino-acid_sequences_such_as_Repbase>|TE_prot_bank: %s|' %s" % (os.path.basename(self._prot_bank), self._configFileName)) + + if self._HMM_profiles != "" and self._HMM_profiles != None: + os.system("sed -i 's|TE_HMMER: no|TE_HMMER: yes|' %s" % self._configFileName) + os.system("sed -i 's|TE_HMM_profiles: <bank_of_HMM_profiles>|TE_HMM_profiles: %s|' %s" % (os.path.basename(self._HMM_profiles),self._configFileName)) + + if self._rdna_bank != "" and self._rdna_bank != None: + os.system("sed -i 's|rDNA_BLRn: no|rDNA_BLRn: yes|' %s" % self._configFileName) + os.system("sed -i 's|rDNA_bank: <bank_of_rDNA_sequences_from_eukaryota>|rDNA_bank: %s|' %s" % (os.path.basename(self._rdna_bank),self._configFileName)) + + os.system("sed -i 's|filter_host_gene: no|filter_host_gene: yes|' %s" % (self._configFileName)) + + + def removeNstretches(self,maxNstretchesSize=11,minContigsize=10000): + if self._verbosity > 0: + print "Removing Nstretches longer than %d pb and removing conting shorter than %d pb"%(maxNstretchesSize,minContigsize) + t0=time.time() + Nstretches=FastaUtils.getNstretchesRangesList(self._fastaFileName,maxNstretchesSize) + t1=time.time() + refBSDB = BioseqDB(self._fastaFileName) + t3=time.time() + debut=1 + t2=time.time() + if len(Nstretches)>0: + currentchrom=Nstretches[0].seqname + refBS=refBSDB.fetch(currentchrom) + t3=time.time() + newBSDB = BioseqDB() + i=0 + seqInNstretches = [] + for Nstretch in Nstretches : + i+=1 + tmpBS="" + if Nstretch.seqname not in seqInNstretches : + seqInNstretches.append(Nstretch.seqname) + if currentchrom==Nstretch.seqname: + fin=Nstretch.start-1 + size=fin-debut+1 + if size>minContigsize : + tmpBS=refBS.subseq(debut,fin) + newBSDB.add(tmpBS) + + debut=Nstretch.end+1 + + else : + fin = refBSDB.getSeqLength(currentchrom) + size=fin-debut+1 + if size>minContigsize : + tmpBS=refBS.subseq(debut,fin) + newBSDB.add(tmpBS) + currentchrom=Nstretch.seqname + refBS=refBSDB.fetch(currentchrom) + debut=1 + fin==Nstretch.start + size=fin-debut+1 + if size>minContigsize : + tmpBS=refBS.subseq(debut,fin) + newBSDB.add(tmpBS) + debut=Nstretch.end+1 + + if len(Nstretches)>0: + fin = refBSDB.getSeqLength(currentchrom) + size=fin-debut+1 + if size>minContigsize : + tmpBS=refBS.subseq(debut,fin) + newBSDB.add(tmpBS) + + for refName in refBSDB.getHeaderList() : + if refName not in seqInNstretches: + debut=1 + fin=refBSDB.getSeqLength(refName) + size=fin-debut+1 + if size>minContigsize : + refBS=refBSDB.fetch(refName) + tmpBS=refBS.subseq(debut,fin) + newBSDB.add(tmpBS) + + + t5b=time.time() + if self._verbosity >= 2: + print "%s contigs selected from %s scaffolds"%(newBSDB.getSize(),refBSDB.getSize()) + #newBSDB.sortByLength(reverse=True) + + return newBSDB + + + +#TODO refactoring about min size of genome for preprocess + def selectContigs4givenSize(self,BSDB,limit=200000000): + if self._verbosity > 0: + print "Selecting contigs to reach %s pb "%limit + contigsHeadersAndLength=zip(BSDB.getHeaderList(),BSDB.getListOfSequencesLength()) + size=0 + size_small=0 + size_big=500000000 + lselectedContigs=[] + + for seq in BSDB.db : + size+=seq.getLength() + if size<limit: + lselectedContigs.append(seq) + size_small=size + else : + size_big=size + break + + if size_big-limit<limit-size_small : + lselectedContigs.append(seq) + + if self._verbosity > 0: + print "%s contigs selected to reach %s pb (%s contigs initially) "%(len(lselectedContigs),limit,len(contigsHeadersAndLength)) + + selectedContigsBSDB=BioseqDB() + selectedContigsBSDB.setData(lselectedContigs) + return selectedContigsBSDB + + def writeFastaInput(self,BSDB,outFileName=''): + if self._verbosity > 0: + print "Writing fasta file" + + if not outFileName: + outFileName = self._projectName + ".fastaExtract" + + BSDB.save(outFileName) + if self._verbosity > 0: + print '%d sequences saved.'%BSDB.getSize() + + return outFileName + + def correctHeader(self,BSDB): + if self._verbosity > 0: + print "Correcting fasta headers" + replacedSeqNb=0 + for header in BSDB.getHeaderList() : + p = re.compile('[^a-zA-Z0-9_:\.\-]', re.IGNORECASE) + if p.search(header): + sub=list(set(p.findall(header))) + correctedHeader=header + for s in sub : + correctedHeader=correctedHeader.replace(s,'_') + if self._verbosity>2: + print "Correct Header : '%s' replaced by '%s'"%(header,correctedHeader) + BSDB.fetch(header).setHeader(correctedHeader) + replacedSeqNb+=1 + if self._verbosity > 0: + print '%s sequence headers corrected'%replacedSeqNb + return BSDB + + + def _launchTEdenovo(self): + print "START time: %s" % time.strftime("%Y-%m-%d %H:%M:%S") + lCmds = [] + lCmds.append( "TEdenovo.py -P %s -C %s -S 1 -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 2 -s Blaster -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Grouper -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Recon -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Piler -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Grouper -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Recon -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Piler -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 5 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 6 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + lCmds.append( "TEdenovo.py -P %s -C %s -S 7 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) + + for cmd in lCmds: + returnValue = os.system(cmd) + if returnValue != 0: + print "ERROR: command '%s' returned %i" % (cmd, returnValue) + self._cleanTables() + sys.exit(1) + + print "END time: %s" % time.strftime("%Y-%m-%d %H:%M:%S") + outFastaFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*_denovoLibTEs_filtered.fa"%self._projectName) + shutil.copy(outFastaFile[0], self._outputFasta) + outStatsFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*.classif_stats.txt"%self._projectName) + shutil.copy(outStatsFile[0], self._outputStats) + if self._classif: + outClassifFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif/classifConsensus/*_withoutRedundancy_negStrandReversed_WickerH.classif"%self._projectName) + shutil.copy(outClassifFile[0], self._outputClassif) + self._renameTE() + + def _renameTE(self): + name=re.search(r'([^\/\\]*)-\d{8}-denovoLibTEs_filtered\.[fa|fasta|fsa|fas]',self._outputFasta).groups()[0] + os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputFasta)) + if self._classif: + os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputClassif)) + + def preprocessFastaFile(self): + inFileHandler = open(self._fastaFileName, "r") + cumulLength = FastaUtils.dbCumLength(inFileHandler) + inFileHandler.close() + if cumulLength >= self._limitSeqSize: + print "Preprocess lauched" + allContigsBSDB=self.removeNstretches() + selectedContigsBSDB=self.selectContigs4givenSize(allContigsBSDB) + self.correctHeader(selectedContigsBSDB) + fastaFile=self.writeFastaInput(selectedContigsBSDB) + print "Preprocess finished" + else: + fastaFile=self._fastaFileName + print "No preprocess : the genome size %s lower than %s Mbp" % (cumulLength, self._limitSeqSize/1000000) + os.symlink(fastaFile,"%s/%s.fa" %(os.getcwd(),self._projectName)) #creer repertoire projet + + def _launchListAndDropTables(self): + cmd = "ListAndDropTables.py" + cmd += " -C %s" % self._configFileName + cmd += " -d '%s'" % self._projectName + os.system(cmd) + + def _cleanJobsTable(self): + db = DbFactory.createInstance( configFileName = self._configFileName ) + sql_cmd="DELETE FROM jobs WHERE groupid like '%s%%';"%self._projectName + db.execute( sql_cmd ) + db.close() + + def _cleanTables(self): + self._launchListAndDropTables() + self. _cleanJobsTable() + + + def run(self): + os.mkdir(self._projectName) + os.chdir(self._projectName) + self._writeConfigFile() + self.preprocessFastaFile() + if self._nucl_bank != "" and self._nucl_bank != None: + os.symlink(self._nucl_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._nucl_bank))) + if self._prot_bank != "" and self._prot_bank != None: + os.symlink(self._prot_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._prot_bank))) + if self._HMM_profiles != "" and self._HMM_profiles != None: + os.symlink(self._HMM_profiles,"%s/%s" %(os.getcwd(),os.path.basename(self._HMM_profiles))) + if self._rdna_bank != "" and self._rdna_bank != None: + os.symlink(self._rdna_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._rdna_bank))) + + self._launchTEdenovo() + self._cleanTables() + +if __name__ == '__main__': + iTEdenovo = TEdenovo_lite() + iTEdenovo.setAttributesFromCommandLine() + iTEdenovo.run()