Mercurial > repos > vmarcon > repet_tedenovo
view TEdenovo_lite.py @ 0:baea09e6722b draft default tip
1st Uploaded
author | vmarcon |
---|---|
date | Mon, 06 Feb 2017 13:31:53 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python import os import sys import time import glob import shutil import ConfigParser from commons.core.seq.FastaUtils import * import operator import re if not "REPET_PATH" in os.environ.keys(): print "ERROR: no environment variable REPET_PATH" sys.exit(1) if (not "REPET_DB" in os.environ.keys()) or (not "REPET_HOST" in os.environ.keys()) or (not "REPET_PORT" in os.environ.keys()) or (not "REPET_USER" in os.environ.keys()) or (not "REPET_PW" in os.environ.keys()): print "ERROR: there is at least one environment database variable missing : REPET_DB, REPET_PORT, REPET_HOST, REPET_USER or REPET_PW" sys.exit(1) if not "REPET_JOB_MANAGER" in os.environ.keys(): print "ERROR: no environment variable REPET_JOB_MANAGER" sys.exit(1) if not "%s/bin" % os.environ["REPET_PATH"] in os.environ["PATH"]: os.environ["PATH"] = "%s/bin:%s" % (os.environ["REPET_PATH"], os.environ["PATH"]) sys.path.append(os.environ["REPET_PATH"]) if not "PYTHONPATH" in os.environ.keys(): os.environ["PYTHONPATH"] = os.environ["REPET_PATH"] else: os.environ["PYTHONPATH"] = "%s:%s" % (os.environ["REPET_PATH"], os.environ["PYTHONPATH"]) from commons.core.LoggerFactory import LoggerFactory from commons.core.checker.RepetException import RepetException from commons.core.utils.FileUtils import FileUtils from commons.core.utils.RepetOptionParser import RepetOptionParser from commons.core.seq.FastaUtils import FastaUtils from commons.core.sql.DbFactory import DbFactory from itertools import islice LOG_DEPTH = "TEdenovo.pipeline" class TEdenovo_lite(object): def __init__(self, configFileName = "", fastaFileName = "", verbosity = 0): self._configFileName = configFileName self._fastaFileName = os.path.abspath(fastaFileName) self._projectName = time.strftime("%Y%m%d%H%M%S") self._limitSeqSize = 200000000 if "REPET_NUCL_BANK" in os.environ.keys(): if os.path.exists(os.environ["REPET_NUCL_BANK"]): self._nucl_bank = os.environ["REPET_NUCL_BANK"] else : print "ERROR : the nucleotides bank configured doesn't exist. Please correct it in the REPET_NUCL_BANK variable" sys.exit(1) else : self._nucl_bank = "" if "REPET_PROT_BANK" in os.environ.keys(): if os.path.exists(os.environ["REPET_PROT_BANK"]): self._prot_bank = os.environ["REPET_PROT_BANK"] else : print "ERROR : the proteins bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable" sys.exit(1) else : self._prot_bank = "" if "REPET_HMM_PROFILES" in os.environ.keys(): if os.path.exists(os.environ["REPET_HMM_PROFILES"]): self._HMM_profiles = os.environ["REPET_HMM_PROFILES"] else : print "ERROR : the hmm profiles bank configured doesn't exist. Please correct it in the REPET_HMM_PROFILES variable" sys.exit(1) else : self._HMM_profiles = "" if "REPET_RDNA_BANK" in os.environ.keys(): if os.path.exists(os.environ["REPET_RDNA_BANK"]): self._rdna_bank = os.environ["REPET_RDNA_BANK"] else : print "ERROR : the rDNA bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable" sys.exit(1) else : self._rdna_bank = "" if self._nucl_bank == "" and self._prot_bank == "" and self._HMM_profiles == "" and self._rdna_bank == "" : print "WARNING : No bank are configured ... To set banks please add REPET_NUCL_BANK, REPET_PROT_BANK, REPET_HMM_PROFILES and/or REPET_RDNA_BANK in your environment" if "REPET_TMP_DIR" in os.environ.keys(): self._tmp_dir = os.environ["REPET_TMP_DIR"] else : self._tmp_dir = "" self._outputFasta = "" self._classif = False self._outputClassif = "" self._outputStats = "" self._verbosity = verbosity self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) def setAttributesFromCommandLine(self): description = "This script is a ligth version of TEdenovo. It writes configuration file and launches TEdenovo." epilog = "Example: TEdenovo_lite.py -i fastaFileName \n" version = "2.0" parser = RepetOptionParser(description = description, epilog = epilog, version = version) parser.add_option("-i", "--fasta", dest = "fastaFileName" , action = "store" , type = "string", help ="input fasta file name ", default = "") parser.add_option("-c", "--withClassif", dest="withClassif", action="store_true", help = " Get classification files in output.", default = False) parser.add_option("-o", "--output", dest="outputLabel" , action = "store", type = "string", help = "[optional] Prefix label for output file(s).", default = "") parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "Verbosity [optional] [default: 2]", default = 2) options = parser.parse_args()[0] self._setAttributesFromOptions(options) def _setAttributesFromOptions(self, options): self.setConfigFileName("") if options.fastaFileName=="": print "ERROR : You have to enter an input fasta file" print "Example: TEdenovo_lite.py -i fastaFileName \n" print "More option : TEdenovo_lite.py --help " exit(1) else : self._fastaFileName = os.path.abspath(options.fastaFileName) if options.outputLabel=="": fastaBaseName=os.path.abspath(re.search(r'([^\/\\]*)\.[fa|fasta|fsa|fas]',options.fastaFileName).groups()[0]) options.outputLabel=fastaBaseName self._outputFasta = os.path.abspath(options.outputLabel+"-%s-denovoLibTEs_filtered.fa"%self._projectName[:8]) self._outputStats = os.path.abspath(options.outputLabel+"-%s-classif_stats.txt"%self._projectName[:8]) self._verbosity = options.verbosity if options.withClassif : self._classif=True self._outputClassif = os.path.abspath(options.outputLabel+'-%s.classif'%self._projectName[:8]) def setConfigFileName(self, configFileName): self._configFileName = configFileName if not self._configFileName: self._configFileName = "TEdenovo_Galaxy_config_%s" % self._projectName def setAttributesFromConfigFile(self, configFileName): config = ConfigParser.ConfigParser() config.readfp( open(configFileName) ) def _writeConfigFile(self): if FileUtils.isRessourceExists(self._configFileName): self._logAndRaise("Configuration file '%s' already exists. Won't be overwritten.") shutil.copy("%s/config/TEdenovo.cfg" % os.environ.get("REPET_PATH"), self._configFileName) self.setAttributesFromConfigFile(self._configFileName) os.system("sed -i 's|repet_host: <your_MySQL_host>|repet_host: %s|' %s" % (os.environ["REPET_HOST"], self._configFileName)) os.system("sed -i 's|repet_user: <your_MySQL_login>|repet_user: %s|' %s" % (os.environ["REPET_USER"], self._configFileName)) os.system("sed -i 's|repet_pw: <your_MySQL_password>|repet_pw: %s|' %s" % (os.environ["REPET_PW"], self._configFileName)) os.system("sed -i 's|repet_db: <your_MySQL_db>|repet_db: %s|' %s" % (os.environ["REPET_DB"], self._configFileName)) os.system("sed -i 's|repet_port: 3306|repet_port: %s|' %s" % (os.environ["REPET_PORT"], self._configFileName)) os.system("sed -i 's|repet_job_manager: SGE|repet_job_manager: %s|' %s" % (os.environ["REPET_JOB_MANAGER"], self._configFileName)) os.system("sed -i 's|project_name: <your_project_name>|project_name: %s|' %s" % (self._projectName, self._configFileName)) os.system("sed -i 's|project_dir: <absolute_path_to_your_project_directory>|project_dir: %s|' %s" % (os.getcwd().replace("/", "\/"), self._configFileName)) os.system("sed -i 's|tmpDir:|tmpDir: %s|g' %s" % (self._tmp_dir, self._configFileName)) if self._nucl_bank != "" and self._nucl_bank != None: os.system("sed -i 's|TE_BLRn: no|TE_BLRn: yes|' %s" % self._configFileName) os.system("sed -i 's|TE_BLRtx: no|TE_BLRtx: yes|' %s" % self._configFileName) os.system("sed -i 's|TE_nucl_bank: <bank_of_TE_nucleotide_sequences_such_as_Repbase>|TE_nucl_bank: %s|' %s" % (os.path.basename(self._nucl_bank), self._configFileName)) if self._prot_bank != "" and self._prot_bank != None: os.system("sed -i 's|TE_BLRx: no|TE_BLRx: yes|' %s" % self._configFileName) os.system("sed -i 's|TE_prot_bank: <bank_of_TE_amino-acid_sequences_such_as_Repbase>|TE_prot_bank: %s|' %s" % (os.path.basename(self._prot_bank), self._configFileName)) if self._HMM_profiles != "" and self._HMM_profiles != None: os.system("sed -i 's|TE_HMMER: no|TE_HMMER: yes|' %s" % self._configFileName) os.system("sed -i 's|TE_HMM_profiles: <bank_of_HMM_profiles>|TE_HMM_profiles: %s|' %s" % (os.path.basename(self._HMM_profiles),self._configFileName)) if self._rdna_bank != "" and self._rdna_bank != None: os.system("sed -i 's|rDNA_BLRn: no|rDNA_BLRn: yes|' %s" % self._configFileName) os.system("sed -i 's|rDNA_bank: <bank_of_rDNA_sequences_from_eukaryota>|rDNA_bank: %s|' %s" % (os.path.basename(self._rdna_bank),self._configFileName)) os.system("sed -i 's|filter_host_gene: no|filter_host_gene: yes|' %s" % (self._configFileName)) def removeNstretches(self,maxNstretchesSize=11,minContigsize=10000): if self._verbosity > 0: print "Removing Nstretches longer than %d pb and removing conting shorter than %d pb"%(maxNstretchesSize,minContigsize) t0=time.time() Nstretches=FastaUtils.getNstretchesRangesList(self._fastaFileName,maxNstretchesSize) t1=time.time() refBSDB = BioseqDB(self._fastaFileName) t3=time.time() debut=1 t2=time.time() if len(Nstretches)>0: currentchrom=Nstretches[0].seqname refBS=refBSDB.fetch(currentchrom) t3=time.time() newBSDB = BioseqDB() i=0 seqInNstretches = [] for Nstretch in Nstretches : i+=1 tmpBS="" if Nstretch.seqname not in seqInNstretches : seqInNstretches.append(Nstretch.seqname) if currentchrom==Nstretch.seqname: fin=Nstretch.start-1 size=fin-debut+1 if size>minContigsize : tmpBS=refBS.subseq(debut,fin) newBSDB.add(tmpBS) debut=Nstretch.end+1 else : fin = refBSDB.getSeqLength(currentchrom) size=fin-debut+1 if size>minContigsize : tmpBS=refBS.subseq(debut,fin) newBSDB.add(tmpBS) currentchrom=Nstretch.seqname refBS=refBSDB.fetch(currentchrom) debut=1 fin==Nstretch.start size=fin-debut+1 if size>minContigsize : tmpBS=refBS.subseq(debut,fin) newBSDB.add(tmpBS) debut=Nstretch.end+1 if len(Nstretches)>0: fin = refBSDB.getSeqLength(currentchrom) size=fin-debut+1 if size>minContigsize : tmpBS=refBS.subseq(debut,fin) newBSDB.add(tmpBS) for refName in refBSDB.getHeaderList() : if refName not in seqInNstretches: debut=1 fin=refBSDB.getSeqLength(refName) size=fin-debut+1 if size>minContigsize : refBS=refBSDB.fetch(refName) tmpBS=refBS.subseq(debut,fin) newBSDB.add(tmpBS) t5b=time.time() if self._verbosity >= 2: print "%s contigs selected from %s scaffolds"%(newBSDB.getSize(),refBSDB.getSize()) #newBSDB.sortByLength(reverse=True) return newBSDB #TODO refactoring about min size of genome for preprocess def selectContigs4givenSize(self,BSDB,limit=200000000): if self._verbosity > 0: print "Selecting contigs to reach %s pb "%limit contigsHeadersAndLength=zip(BSDB.getHeaderList(),BSDB.getListOfSequencesLength()) size=0 size_small=0 size_big=500000000 lselectedContigs=[] for seq in BSDB.db : size+=seq.getLength() if size<limit: lselectedContigs.append(seq) size_small=size else : size_big=size break if size_big-limit<limit-size_small : lselectedContigs.append(seq) if self._verbosity > 0: print "%s contigs selected to reach %s pb (%s contigs initially) "%(len(lselectedContigs),limit,len(contigsHeadersAndLength)) selectedContigsBSDB=BioseqDB() selectedContigsBSDB.setData(lselectedContigs) return selectedContigsBSDB def writeFastaInput(self,BSDB,outFileName=''): if self._verbosity > 0: print "Writing fasta file" if not outFileName: outFileName = self._projectName + ".fastaExtract" BSDB.save(outFileName) if self._verbosity > 0: print '%d sequences saved.'%BSDB.getSize() return outFileName def correctHeader(self,BSDB): if self._verbosity > 0: print "Correcting fasta headers" replacedSeqNb=0 for header in BSDB.getHeaderList() : p = re.compile('[^a-zA-Z0-9_:\.\-]', re.IGNORECASE) if p.search(header): sub=list(set(p.findall(header))) correctedHeader=header for s in sub : correctedHeader=correctedHeader.replace(s,'_') if self._verbosity>2: print "Correct Header : '%s' replaced by '%s'"%(header,correctedHeader) BSDB.fetch(header).setHeader(correctedHeader) replacedSeqNb+=1 if self._verbosity > 0: print '%s sequence headers corrected'%replacedSeqNb return BSDB def _launchTEdenovo(self): print "START time: %s" % time.strftime("%Y-%m-%d %H:%M:%S") lCmds = [] lCmds.append( "TEdenovo.py -P %s -C %s -S 1 -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 2 -s Blaster -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Grouper -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Recon -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Piler -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Grouper -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Recon -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Piler -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 5 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 6 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) lCmds.append( "TEdenovo.py -P %s -C %s -S 7 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) for cmd in lCmds: returnValue = os.system(cmd) if returnValue != 0: print "ERROR: command '%s' returned %i" % (cmd, returnValue) self._cleanTables() sys.exit(1) print "END time: %s" % time.strftime("%Y-%m-%d %H:%M:%S") outFastaFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*_denovoLibTEs_filtered.fa"%self._projectName) shutil.copy(outFastaFile[0], self._outputFasta) outStatsFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*.classif_stats.txt"%self._projectName) shutil.copy(outStatsFile[0], self._outputStats) if self._classif: outClassifFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif/classifConsensus/*_withoutRedundancy_negStrandReversed_WickerH.classif"%self._projectName) shutil.copy(outClassifFile[0], self._outputClassif) self._renameTE() def _renameTE(self): name=re.search(r'([^\/\\]*)-\d{8}-denovoLibTEs_filtered\.[fa|fasta|fsa|fas]',self._outputFasta).groups()[0] os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputFasta)) if self._classif: os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputClassif)) def preprocessFastaFile(self): inFileHandler = open(self._fastaFileName, "r") cumulLength = FastaUtils.dbCumLength(inFileHandler) inFileHandler.close() if cumulLength >= self._limitSeqSize: print "Preprocess lauched" allContigsBSDB=self.removeNstretches() selectedContigsBSDB=self.selectContigs4givenSize(allContigsBSDB) self.correctHeader(selectedContigsBSDB) fastaFile=self.writeFastaInput(selectedContigsBSDB) print "Preprocess finished" else: fastaFile=self._fastaFileName print "No preprocess : the genome size %s lower than %s Mbp" % (cumulLength, self._limitSeqSize/1000000) os.symlink(fastaFile,"%s/%s.fa" %(os.getcwd(),self._projectName)) #creer repertoire projet def _launchListAndDropTables(self): cmd = "ListAndDropTables.py" cmd += " -C %s" % self._configFileName cmd += " -d '%s'" % self._projectName os.system(cmd) def _cleanJobsTable(self): db = DbFactory.createInstance( configFileName = self._configFileName ) sql_cmd="DELETE FROM jobs WHERE groupid like '%s%%';"%self._projectName db.execute( sql_cmd ) db.close() def _cleanTables(self): self._launchListAndDropTables() self. _cleanJobsTable() def run(self): os.mkdir(self._projectName) os.chdir(self._projectName) self._writeConfigFile() self.preprocessFastaFile() if self._nucl_bank != "" and self._nucl_bank != None: os.symlink(self._nucl_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._nucl_bank))) if self._prot_bank != "" and self._prot_bank != None: os.symlink(self._prot_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._prot_bank))) if self._HMM_profiles != "" and self._HMM_profiles != None: os.symlink(self._HMM_profiles,"%s/%s" %(os.getcwd(),os.path.basename(self._HMM_profiles))) if self._rdna_bank != "" and self._rdna_bank != None: os.symlink(self._rdna_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._rdna_bank))) self._launchTEdenovo() self._cleanTables() if __name__ == '__main__': iTEdenovo = TEdenovo_lite() iTEdenovo.setAttributesFromCommandLine() iTEdenovo.run()