diff TEdenovo_lite.py @ 0:baea09e6722b draft default tip

1st Uploaded
author vmarcon
date Mon, 06 Feb 2017 13:31:53 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TEdenovo_lite.py	Mon Feb 06 13:31:53 2017 -0500
@@ -0,0 +1,406 @@
+#!/usr/bin/env python
+
+
+import os
+import sys
+import time
+import glob
+import shutil
+import ConfigParser
+from commons.core.seq.FastaUtils import *
+import operator
+import re
+
+
+
+if not "REPET_PATH" in os.environ.keys():
+    print "ERROR: no environment variable REPET_PATH"
+    sys.exit(1)
+
+if (not "REPET_DB" in os.environ.keys()) or (not "REPET_HOST" in os.environ.keys()) or (not "REPET_PORT" in os.environ.keys()) or (not "REPET_USER" in os.environ.keys()) or (not "REPET_PW" in os.environ.keys()):
+    print "ERROR: there is at least one environment database variable missing : REPET_DB, REPET_PORT, REPET_HOST, REPET_USER or REPET_PW"
+    sys.exit(1)
+
+if not "REPET_JOB_MANAGER" in os.environ.keys():
+    print "ERROR: no environment variable REPET_JOB_MANAGER"
+    sys.exit(1)
+
+
+if not "%s/bin" % os.environ["REPET_PATH"] in os.environ["PATH"]: 
+    os.environ["PATH"] = "%s/bin:%s" % (os.environ["REPET_PATH"], os.environ["PATH"])
+    
+sys.path.append(os.environ["REPET_PATH"])
+if not "PYTHONPATH" in os.environ.keys():
+    os.environ["PYTHONPATH"] = os.environ["REPET_PATH"]
+else:
+    os.environ["PYTHONPATH"] = "%s:%s" % (os.environ["REPET_PATH"], os.environ["PYTHONPATH"])
+
+from commons.core.LoggerFactory import LoggerFactory
+from commons.core.checker.RepetException import RepetException
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from commons.core.seq.FastaUtils import FastaUtils
+from commons.core.sql.DbFactory import DbFactory
+from itertools import islice
+
+LOG_DEPTH = "TEdenovo.pipeline"
+
+class TEdenovo_lite(object):
+
+    def __init__(self, configFileName = "", fastaFileName = "", verbosity = 0):
+        self._configFileName = configFileName
+        self._fastaFileName = os.path.abspath(fastaFileName)
+        self._projectName = time.strftime("%Y%m%d%H%M%S")
+        self._limitSeqSize = 200000000
+
+	if "REPET_NUCL_BANK" in os.environ.keys():
+	    if os.path.exists(os.environ["REPET_NUCL_BANK"]):
+        	self._nucl_bank = os.environ["REPET_NUCL_BANK"]
+	    else : 
+		print "ERROR : the nucleotides bank configured doesn't exist. Please correct it in the REPET_NUCL_BANK variable"
+                sys.exit(1)
+	else : 
+	    self._nucl_bank = ""
+	if "REPET_PROT_BANK" in os.environ.keys():
+            if os.path.exists(os.environ["REPET_PROT_BANK"]):
+		self._prot_bank = os.environ["REPET_PROT_BANK"]
+	    else :
+                print "ERROR : the proteins bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable"
+                sys.exit(1)
+	else :
+	     self._prot_bank = ""
+	if "REPET_HMM_PROFILES" in os.environ.keys():
+            if os.path.exists(os.environ["REPET_HMM_PROFILES"]):
+        	self._HMM_profiles = os.environ["REPET_HMM_PROFILES"]
+            else :
+                print "ERROR : the hmm profiles bank configured doesn't exist. Please correct it in the REPET_HMM_PROFILES variable"
+                sys.exit(1)
+	else : 
+	    self._HMM_profiles = ""
+	if "REPET_RDNA_BANK" in os.environ.keys():
+            if os.path.exists(os.environ["REPET_RDNA_BANK"]):
+        	self._rdna_bank = os.environ["REPET_RDNA_BANK"]
+            else :
+                print "ERROR : the rDNA bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable"
+                sys.exit(1)
+	else :
+            self._rdna_bank = ""
+	if self._nucl_bank == "" and self._prot_bank == "" and self._HMM_profiles == "" and self._rdna_bank == ""  : 
+	    print "WARNING : No bank are configured ... To set banks please add REPET_NUCL_BANK, REPET_PROT_BANK, REPET_HMM_PROFILES and/or REPET_RDNA_BANK in your environment"
+	if "REPET_TMP_DIR" in os.environ.keys():
+	    self._tmp_dir = os.environ["REPET_TMP_DIR"]
+	else : 
+	    self._tmp_dir = ""
+        self._outputFasta = ""
+        self._classif = False
+        self._outputClassif = ""
+	self._outputStats = ""
+        self._verbosity = verbosity
+        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
+
+    def setAttributesFromCommandLine(self):
+        description = "This script is a ligth version of TEdenovo. It writes configuration file and launches TEdenovo."
+        epilog = "Example: TEdenovo_lite.py -i fastaFileName \n"
+        version = "2.0"
+        parser = RepetOptionParser(description = description, epilog = epilog, version = version)
+        parser.add_option("-i", "--fasta",           dest = "fastaFileName" , action = "store" , type = "string", help ="input fasta file name ", default = "")
+        parser.add_option("-c", "--withClassif",      dest="withClassif",	action="store_true",	help = " Get classification files in output.", default = False)
+        parser.add_option("-o", "--output",      dest="outputLabel"      , action = "store", type = "string", help = "[optional] Prefix label for output file(s).", default = "")
+        parser.add_option("-v", "--verbosity",       dest = "verbosity",      action = "store", type = "int",    help = "Verbosity [optional] [default: 2]", default = 2)
+        options = parser.parse_args()[0]
+        self._setAttributesFromOptions(options)
+        
+    def _setAttributesFromOptions(self, options):
+        self.setConfigFileName("")
+        if options.fastaFileName=="":
+            print "ERROR : You have to enter an input fasta file"
+            print "Example: TEdenovo_lite.py -i fastaFileName \n"
+            print "More option : TEdenovo_lite.py --help "
+            exit(1)
+        else : 
+            self._fastaFileName = os.path.abspath(options.fastaFileName)
+        if options.outputLabel=="":
+            fastaBaseName=os.path.abspath(re.search(r'([^\/\\]*)\.[fa|fasta|fsa|fas]',options.fastaFileName).groups()[0])
+            options.outputLabel=fastaBaseName
+        self._outputFasta = os.path.abspath(options.outputLabel+"-%s-denovoLibTEs_filtered.fa"%self._projectName[:8])
+	self._outputStats = os.path.abspath(options.outputLabel+"-%s-classif_stats.txt"%self._projectName[:8])
+        self._verbosity = options.verbosity
+        if options.withClassif : 
+            self._classif=True
+            self._outputClassif = os.path.abspath(options.outputLabel+'-%s.classif'%self._projectName[:8])
+
+    def setConfigFileName(self, configFileName):
+        self._configFileName = configFileName
+        if not self._configFileName:
+            self._configFileName = "TEdenovo_Galaxy_config_%s" % self._projectName
+
+    def setAttributesFromConfigFile(self, configFileName):
+        config = ConfigParser.ConfigParser()
+        config.readfp( open(configFileName) )
+
+
+    def _writeConfigFile(self):
+        if FileUtils.isRessourceExists(self._configFileName):
+            self._logAndRaise("Configuration file '%s' already exists. Won't be overwritten.")
+
+        shutil.copy("%s/config/TEdenovo.cfg" % os.environ.get("REPET_PATH"), self._configFileName)
+        self.setAttributesFromConfigFile(self._configFileName)
+
+        os.system("sed -i 's|repet_host: <your_MySQL_host>|repet_host: %s|' %s" % (os.environ["REPET_HOST"], self._configFileName))
+        os.system("sed -i 's|repet_user: <your_MySQL_login>|repet_user: %s|' %s" % (os.environ["REPET_USER"], self._configFileName))
+        os.system("sed -i 's|repet_pw: <your_MySQL_password>|repet_pw: %s|' %s" % (os.environ["REPET_PW"], self._configFileName))
+        os.system("sed -i 's|repet_db: <your_MySQL_db>|repet_db: %s|' %s" % (os.environ["REPET_DB"], self._configFileName))
+        os.system("sed -i 's|repet_port: 3306|repet_port: %s|' %s" % (os.environ["REPET_PORT"], self._configFileName))
+        os.system("sed -i 's|repet_job_manager: SGE|repet_job_manager: %s|' %s" % (os.environ["REPET_JOB_MANAGER"], self._configFileName))
+        os.system("sed -i 's|project_name: <your_project_name>|project_name: %s|' %s" % (self._projectName, self._configFileName))
+        os.system("sed -i 's|project_dir: <absolute_path_to_your_project_directory>|project_dir: %s|' %s" % (os.getcwd().replace("/", "\/"), self._configFileName))
+        os.system("sed -i 's|tmpDir:|tmpDir: %s|g' %s" % (self._tmp_dir, self._configFileName))
+
+        if  self._nucl_bank != "" and self._nucl_bank != None:
+            os.system("sed -i 's|TE_BLRn: no|TE_BLRn: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|TE_BLRtx: no|TE_BLRtx: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|TE_nucl_bank: <bank_of_TE_nucleotide_sequences_such_as_Repbase>|TE_nucl_bank: %s|' %s" % (os.path.basename(self._nucl_bank), self._configFileName))
+
+        if  self._prot_bank != "" and self._prot_bank != None:
+            os.system("sed -i 's|TE_BLRx: no|TE_BLRx: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|TE_prot_bank: <bank_of_TE_amino-acid_sequences_such_as_Repbase>|TE_prot_bank: %s|' %s" % (os.path.basename(self._prot_bank), self._configFileName))
+
+        if  self._HMM_profiles != "" and self._HMM_profiles != None:
+            os.system("sed -i 's|TE_HMMER: no|TE_HMMER: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|TE_HMM_profiles: <bank_of_HMM_profiles>|TE_HMM_profiles: %s|' %s" % (os.path.basename(self._HMM_profiles),self._configFileName))
+
+        if  self._rdna_bank != "" and self._rdna_bank != None:
+            os.system("sed -i 's|rDNA_BLRn: no|rDNA_BLRn: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|rDNA_bank: <bank_of_rDNA_sequences_from_eukaryota>|rDNA_bank: %s|' %s" % (os.path.basename(self._rdna_bank),self._configFileName))
+
+        os.system("sed -i 's|filter_host_gene: no|filter_host_gene: yes|' %s" % (self._configFileName))
+
+
+    def removeNstretches(self,maxNstretchesSize=11,minContigsize=10000):
+	if self._verbosity > 0:
+            print "Removing Nstretches longer than %d pb and removing conting shorter than %d pb"%(maxNstretchesSize,minContigsize)
+        t0=time.time()
+        Nstretches=FastaUtils.getNstretchesRangesList(self._fastaFileName,maxNstretchesSize)
+        t1=time.time()
+        refBSDB = BioseqDB(self._fastaFileName)
+        t3=time.time()
+        debut=1
+        t2=time.time()
+        if len(Nstretches)>0:
+            currentchrom=Nstretches[0].seqname
+            refBS=refBSDB.fetch(currentchrom)
+        t3=time.time()
+        newBSDB = BioseqDB()
+        i=0
+        seqInNstretches = []
+        for Nstretch in Nstretches :
+            i+=1
+            tmpBS=""
+            if Nstretch.seqname not in seqInNstretches :
+                seqInNstretches.append(Nstretch.seqname)
+            if currentchrom==Nstretch.seqname:
+                    fin=Nstretch.start-1
+                    size=fin-debut+1
+                    if size>minContigsize :
+                            tmpBS=refBS.subseq(debut,fin)
+                            newBSDB.add(tmpBS)
+
+                    debut=Nstretch.end+1
+
+            else :
+                    fin = refBSDB.getSeqLength(currentchrom)
+                    size=fin-debut+1
+                    if size>minContigsize :
+                        tmpBS=refBS.subseq(debut,fin)
+                        newBSDB.add(tmpBS)
+                    currentchrom=Nstretch.seqname
+                    refBS=refBSDB.fetch(currentchrom)
+                    debut=1
+                    fin==Nstretch.start
+                    size=fin-debut+1
+                    if size>minContigsize :
+                        tmpBS=refBS.subseq(debut,fin)
+                        newBSDB.add(tmpBS)
+                    debut=Nstretch.end+1
+
+        if len(Nstretches)>0:
+            fin = refBSDB.getSeqLength(currentchrom)
+            size=fin-debut+1
+            if size>minContigsize :
+                tmpBS=refBS.subseq(debut,fin)
+                newBSDB.add(tmpBS)
+
+        for refName in refBSDB.getHeaderList() :
+            if refName not in seqInNstretches:
+                debut=1
+                fin=refBSDB.getSeqLength(refName)
+                size=fin-debut+1
+                if size>minContigsize :
+                    refBS=refBSDB.fetch(refName)
+                    tmpBS=refBS.subseq(debut,fin)
+                    newBSDB.add(tmpBS)
+
+
+        t5b=time.time()
+        if self._verbosity >= 2:
+            print "%s contigs selected from %s scaffolds"%(newBSDB.getSize(),refBSDB.getSize())
+        #newBSDB.sortByLength(reverse=True)
+
+        return newBSDB
+
+
+
+#TODO refactoring about min size of genome for preprocess
+    def selectContigs4givenSize(self,BSDB,limit=200000000):
+        if self._verbosity > 0:
+            print "Selecting contigs to reach %s pb "%limit
+        contigsHeadersAndLength=zip(BSDB.getHeaderList(),BSDB.getListOfSequencesLength())
+        size=0
+        size_small=0
+        size_big=500000000
+        lselectedContigs=[]	
+
+        for seq in BSDB.db : 
+            size+=seq.getLength()
+            if size<limit:
+                lselectedContigs.append(seq)
+                size_small=size
+            else :
+                size_big=size
+                break
+    
+        if size_big-limit<limit-size_small :
+            lselectedContigs.append(seq)
+        
+        if self._verbosity > 0:
+            print "%s contigs selected to reach %s pb (%s contigs initially) "%(len(lselectedContigs),limit,len(contigsHeadersAndLength))
+        
+        selectedContigsBSDB=BioseqDB()
+        selectedContigsBSDB.setData(lselectedContigs)
+        return selectedContigsBSDB
+
+    def writeFastaInput(self,BSDB,outFileName=''):
+        if self._verbosity > 0:
+            print "Writing fasta file"
+
+        if not outFileName:
+            outFileName = self._projectName + ".fastaExtract"
+        
+        BSDB.save(outFileName)
+        if self._verbosity > 0:
+            print '%d sequences saved.'%BSDB.getSize() 
+        
+        return outFileName
+
+    def correctHeader(self,BSDB):
+        if self._verbosity > 0:
+            print "Correcting fasta headers"
+        replacedSeqNb=0
+        for header in BSDB.getHeaderList() :
+            p = re.compile('[^a-zA-Z0-9_:\.\-]', re.IGNORECASE)
+            if p.search(header):
+                sub=list(set(p.findall(header)))
+                correctedHeader=header
+                for s in sub : 
+                    correctedHeader=correctedHeader.replace(s,'_')
+                if self._verbosity>2:
+                    print "Correct Header : '%s' replaced by '%s'"%(header,correctedHeader)
+                BSDB.fetch(header).setHeader(correctedHeader)
+                replacedSeqNb+=1
+        if self._verbosity > 0:
+            print '%s sequence headers corrected'%replacedSeqNb
+        return BSDB
+
+
+    def _launchTEdenovo(self):
+        print "START time: %s" % time.strftime("%Y-%m-%d %H:%M:%S")
+        lCmds = []
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 1 -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 2 -s Blaster -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Grouper -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Recon -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Piler -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Grouper -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Recon -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Piler -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 5 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 6 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+        lCmds.append( "TEdenovo.py -P %s -C %s -S 7 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
+
+        for cmd in lCmds:
+            returnValue = os.system(cmd)
+            if returnValue != 0:
+                print "ERROR: command '%s' returned %i" % (cmd, returnValue)
+                self._cleanTables()
+                sys.exit(1)
+
+        print "END time: %s" % time.strftime("%Y-%m-%d %H:%M:%S")
+        outFastaFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*_denovoLibTEs_filtered.fa"%self._projectName) 
+        shutil.copy(outFastaFile[0], self._outputFasta)
+	outStatsFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*.classif_stats.txt"%self._projectName)
+        shutil.copy(outStatsFile[0], self._outputStats)
+        if self._classif:
+            outClassifFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif/classifConsensus/*_withoutRedundancy_negStrandReversed_WickerH.classif"%self._projectName)
+            shutil.copy(outClassifFile[0], self._outputClassif)
+        self._renameTE()
+
+    def _renameTE(self):
+        name=re.search(r'([^\/\\]*)-\d{8}-denovoLibTEs_filtered\.[fa|fasta|fsa|fas]',self._outputFasta).groups()[0]
+        os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputFasta))
+        if self._classif:
+            os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputClassif))	
+
+    def preprocessFastaFile(self):
+        inFileHandler = open(self._fastaFileName, "r")
+        cumulLength = FastaUtils.dbCumLength(inFileHandler)
+        inFileHandler.close()
+        if cumulLength >= self._limitSeqSize:
+            print "Preprocess lauched"
+            allContigsBSDB=self.removeNstretches()
+            selectedContigsBSDB=self.selectContigs4givenSize(allContigsBSDB)
+            self.correctHeader(selectedContigsBSDB)
+            fastaFile=self.writeFastaInput(selectedContigsBSDB)
+            print "Preprocess finished"
+        else:
+            fastaFile=self._fastaFileName
+            print "No preprocess : the genome size %s lower than %s Mbp" % (cumulLength, self._limitSeqSize/1000000)
+        os.symlink(fastaFile,"%s/%s.fa" %(os.getcwd(),self._projectName)) #creer repertoire projet
+
+    def _launchListAndDropTables(self):	
+        cmd = "ListAndDropTables.py"
+        cmd += " -C %s" % self._configFileName
+        cmd += " -d '%s'" % self._projectName
+        os.system(cmd)
+
+    def _cleanJobsTable(self):
+        db = DbFactory.createInstance( configFileName = self._configFileName )
+        sql_cmd="DELETE FROM jobs WHERE groupid like '%s%%';"%self._projectName
+        db.execute( sql_cmd )
+        db.close()
+
+    def _cleanTables(self):
+        self._launchListAndDropTables()
+        self. _cleanJobsTable()
+
+
+    def run(self):
+        os.mkdir(self._projectName)
+        os.chdir(self._projectName)
+        self._writeConfigFile()
+        self.preprocessFastaFile()
+        if  self._nucl_bank != "" and self._nucl_bank != None:
+            os.symlink(self._nucl_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._nucl_bank)))
+        if  self._prot_bank != "" and self._prot_bank != None:
+            os.symlink(self._prot_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._prot_bank)))
+        if  self._HMM_profiles != "" and self._HMM_profiles != None:
+            os.symlink(self._HMM_profiles,"%s/%s" %(os.getcwd(),os.path.basename(self._HMM_profiles)))
+        if  self._rdna_bank != "" and self._rdna_bank != None:
+            os.symlink(self._rdna_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._rdna_bank)))
+
+        self._launchTEdenovo()
+        self._cleanTables()
+
+if __name__ == '__main__':
+    iTEdenovo = TEdenovo_lite()
+    iTEdenovo.setAttributesFromCommandLine()
+    iTEdenovo.run()