view TEdenovo_lite.py @ 0:baea09e6722b draft default tip

1st Uploaded
author vmarcon
date Mon, 06 Feb 2017 13:31:53 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python


import os
import sys
import time
import glob
import shutil
import ConfigParser
from commons.core.seq.FastaUtils import *
import operator
import re



if not "REPET_PATH" in os.environ.keys():
    print "ERROR: no environment variable REPET_PATH"
    sys.exit(1)

if (not "REPET_DB" in os.environ.keys()) or (not "REPET_HOST" in os.environ.keys()) or (not "REPET_PORT" in os.environ.keys()) or (not "REPET_USER" in os.environ.keys()) or (not "REPET_PW" in os.environ.keys()):
    print "ERROR: there is at least one environment database variable missing : REPET_DB, REPET_PORT, REPET_HOST, REPET_USER or REPET_PW"
    sys.exit(1)

if not "REPET_JOB_MANAGER" in os.environ.keys():
    print "ERROR: no environment variable REPET_JOB_MANAGER"
    sys.exit(1)


if not "%s/bin" % os.environ["REPET_PATH"] in os.environ["PATH"]: 
    os.environ["PATH"] = "%s/bin:%s" % (os.environ["REPET_PATH"], os.environ["PATH"])
    
sys.path.append(os.environ["REPET_PATH"])
if not "PYTHONPATH" in os.environ.keys():
    os.environ["PYTHONPATH"] = os.environ["REPET_PATH"]
else:
    os.environ["PYTHONPATH"] = "%s:%s" % (os.environ["REPET_PATH"], os.environ["PYTHONPATH"])

from commons.core.LoggerFactory import LoggerFactory
from commons.core.checker.RepetException import RepetException
from commons.core.utils.FileUtils import FileUtils
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.seq.FastaUtils import FastaUtils
from commons.core.sql.DbFactory import DbFactory
from itertools import islice

LOG_DEPTH = "TEdenovo.pipeline"

class TEdenovo_lite(object):

    def __init__(self, configFileName = "", fastaFileName = "", verbosity = 0):
        self._configFileName = configFileName
        self._fastaFileName = os.path.abspath(fastaFileName)
        self._projectName = time.strftime("%Y%m%d%H%M%S")
        self._limitSeqSize = 200000000

	if "REPET_NUCL_BANK" in os.environ.keys():
	    if os.path.exists(os.environ["REPET_NUCL_BANK"]):
        	self._nucl_bank = os.environ["REPET_NUCL_BANK"]
	    else : 
		print "ERROR : the nucleotides bank configured doesn't exist. Please correct it in the REPET_NUCL_BANK variable"
                sys.exit(1)
	else : 
	    self._nucl_bank = ""
	if "REPET_PROT_BANK" in os.environ.keys():
            if os.path.exists(os.environ["REPET_PROT_BANK"]):
		self._prot_bank = os.environ["REPET_PROT_BANK"]
	    else :
                print "ERROR : the proteins bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable"
                sys.exit(1)
	else :
	     self._prot_bank = ""
	if "REPET_HMM_PROFILES" in os.environ.keys():
            if os.path.exists(os.environ["REPET_HMM_PROFILES"]):
        	self._HMM_profiles = os.environ["REPET_HMM_PROFILES"]
            else :
                print "ERROR : the hmm profiles bank configured doesn't exist. Please correct it in the REPET_HMM_PROFILES variable"
                sys.exit(1)
	else : 
	    self._HMM_profiles = ""
	if "REPET_RDNA_BANK" in os.environ.keys():
            if os.path.exists(os.environ["REPET_RDNA_BANK"]):
        	self._rdna_bank = os.environ["REPET_RDNA_BANK"]
            else :
                print "ERROR : the rDNA bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable"
                sys.exit(1)
	else :
            self._rdna_bank = ""
	if self._nucl_bank == "" and self._prot_bank == "" and self._HMM_profiles == "" and self._rdna_bank == ""  : 
	    print "WARNING : No bank are configured ... To set banks please add REPET_NUCL_BANK, REPET_PROT_BANK, REPET_HMM_PROFILES and/or REPET_RDNA_BANK in your environment"
	if "REPET_TMP_DIR" in os.environ.keys():
	    self._tmp_dir = os.environ["REPET_TMP_DIR"]
	else : 
	    self._tmp_dir = ""
        self._outputFasta = ""
        self._classif = False
        self._outputClassif = ""
	self._outputStats = ""
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)

    def setAttributesFromCommandLine(self):
        description = "This script is a ligth version of TEdenovo. It writes configuration file and launches TEdenovo."
        epilog = "Example: TEdenovo_lite.py -i fastaFileName \n"
        version = "2.0"
        parser = RepetOptionParser(description = description, epilog = epilog, version = version)
        parser.add_option("-i", "--fasta",           dest = "fastaFileName" , action = "store" , type = "string", help ="input fasta file name ", default = "")
        parser.add_option("-c", "--withClassif",      dest="withClassif",	action="store_true",	help = " Get classification files in output.", default = False)
        parser.add_option("-o", "--output",      dest="outputLabel"      , action = "store", type = "string", help = "[optional] Prefix label for output file(s).", default = "")
        parser.add_option("-v", "--verbosity",       dest = "verbosity",      action = "store", type = "int",    help = "Verbosity [optional] [default: 2]", default = 2)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)
        
    def _setAttributesFromOptions(self, options):
        self.setConfigFileName("")
        if options.fastaFileName=="":
            print "ERROR : You have to enter an input fasta file"
            print "Example: TEdenovo_lite.py -i fastaFileName \n"
            print "More option : TEdenovo_lite.py --help "
            exit(1)
        else : 
            self._fastaFileName = os.path.abspath(options.fastaFileName)
        if options.outputLabel=="":
            fastaBaseName=os.path.abspath(re.search(r'([^\/\\]*)\.[fa|fasta|fsa|fas]',options.fastaFileName).groups()[0])
            options.outputLabel=fastaBaseName
        self._outputFasta = os.path.abspath(options.outputLabel+"-%s-denovoLibTEs_filtered.fa"%self._projectName[:8])
	self._outputStats = os.path.abspath(options.outputLabel+"-%s-classif_stats.txt"%self._projectName[:8])
        self._verbosity = options.verbosity
        if options.withClassif : 
            self._classif=True
            self._outputClassif = os.path.abspath(options.outputLabel+'-%s.classif'%self._projectName[:8])

    def setConfigFileName(self, configFileName):
        self._configFileName = configFileName
        if not self._configFileName:
            self._configFileName = "TEdenovo_Galaxy_config_%s" % self._projectName

    def setAttributesFromConfigFile(self, configFileName):
        config = ConfigParser.ConfigParser()
        config.readfp( open(configFileName) )


    def _writeConfigFile(self):
        if FileUtils.isRessourceExists(self._configFileName):
            self._logAndRaise("Configuration file '%s' already exists. Won't be overwritten.")

        shutil.copy("%s/config/TEdenovo.cfg" % os.environ.get("REPET_PATH"), self._configFileName)
        self.setAttributesFromConfigFile(self._configFileName)

        os.system("sed -i 's|repet_host: <your_MySQL_host>|repet_host: %s|' %s" % (os.environ["REPET_HOST"], self._configFileName))
        os.system("sed -i 's|repet_user: <your_MySQL_login>|repet_user: %s|' %s" % (os.environ["REPET_USER"], self._configFileName))
        os.system("sed -i 's|repet_pw: <your_MySQL_password>|repet_pw: %s|' %s" % (os.environ["REPET_PW"], self._configFileName))
        os.system("sed -i 's|repet_db: <your_MySQL_db>|repet_db: %s|' %s" % (os.environ["REPET_DB"], self._configFileName))
        os.system("sed -i 's|repet_port: 3306|repet_port: %s|' %s" % (os.environ["REPET_PORT"], self._configFileName))
        os.system("sed -i 's|repet_job_manager: SGE|repet_job_manager: %s|' %s" % (os.environ["REPET_JOB_MANAGER"], self._configFileName))
        os.system("sed -i 's|project_name: <your_project_name>|project_name: %s|' %s" % (self._projectName, self._configFileName))
        os.system("sed -i 's|project_dir: <absolute_path_to_your_project_directory>|project_dir: %s|' %s" % (os.getcwd().replace("/", "\/"), self._configFileName))
        os.system("sed -i 's|tmpDir:|tmpDir: %s|g' %s" % (self._tmp_dir, self._configFileName))

        if  self._nucl_bank != "" and self._nucl_bank != None:
            os.system("sed -i 's|TE_BLRn: no|TE_BLRn: yes|' %s" %  self._configFileName)
            os.system("sed -i 's|TE_BLRtx: no|TE_BLRtx: yes|' %s" %  self._configFileName)
            os.system("sed -i 's|TE_nucl_bank: <bank_of_TE_nucleotide_sequences_such_as_Repbase>|TE_nucl_bank: %s|' %s" % (os.path.basename(self._nucl_bank), self._configFileName))

        if  self._prot_bank != "" and self._prot_bank != None:
            os.system("sed -i 's|TE_BLRx: no|TE_BLRx: yes|' %s" %  self._configFileName)
            os.system("sed -i 's|TE_prot_bank: <bank_of_TE_amino-acid_sequences_such_as_Repbase>|TE_prot_bank: %s|' %s" % (os.path.basename(self._prot_bank), self._configFileName))

        if  self._HMM_profiles != "" and self._HMM_profiles != None:
            os.system("sed -i 's|TE_HMMER: no|TE_HMMER: yes|' %s" %  self._configFileName)
            os.system("sed -i 's|TE_HMM_profiles: <bank_of_HMM_profiles>|TE_HMM_profiles: %s|' %s" % (os.path.basename(self._HMM_profiles),self._configFileName))

        if  self._rdna_bank != "" and self._rdna_bank != None:
            os.system("sed -i 's|rDNA_BLRn: no|rDNA_BLRn: yes|' %s" %  self._configFileName)
            os.system("sed -i 's|rDNA_bank: <bank_of_rDNA_sequences_from_eukaryota>|rDNA_bank: %s|' %s" % (os.path.basename(self._rdna_bank),self._configFileName))

        os.system("sed -i 's|filter_host_gene: no|filter_host_gene: yes|' %s" % (self._configFileName))


    def removeNstretches(self,maxNstretchesSize=11,minContigsize=10000):
	if self._verbosity > 0:
            print "Removing Nstretches longer than %d pb and removing conting shorter than %d pb"%(maxNstretchesSize,minContigsize)
        t0=time.time()
        Nstretches=FastaUtils.getNstretchesRangesList(self._fastaFileName,maxNstretchesSize)
        t1=time.time()
        refBSDB = BioseqDB(self._fastaFileName)
        t3=time.time()
        debut=1
        t2=time.time()
        if len(Nstretches)>0:
            currentchrom=Nstretches[0].seqname
            refBS=refBSDB.fetch(currentchrom)
        t3=time.time()
        newBSDB = BioseqDB()
        i=0
        seqInNstretches = []
        for Nstretch in Nstretches :
            i+=1
            tmpBS=""
            if Nstretch.seqname not in seqInNstretches :
                seqInNstretches.append(Nstretch.seqname)
            if currentchrom==Nstretch.seqname:
                    fin=Nstretch.start-1
                    size=fin-debut+1
                    if size>minContigsize :
                            tmpBS=refBS.subseq(debut,fin)
                            newBSDB.add(tmpBS)

                    debut=Nstretch.end+1

            else :
                    fin = refBSDB.getSeqLength(currentchrom)
                    size=fin-debut+1
                    if size>minContigsize :
                        tmpBS=refBS.subseq(debut,fin)
                        newBSDB.add(tmpBS)
                    currentchrom=Nstretch.seqname
                    refBS=refBSDB.fetch(currentchrom)
                    debut=1
                    fin==Nstretch.start
                    size=fin-debut+1
                    if size>minContigsize :
                        tmpBS=refBS.subseq(debut,fin)
                        newBSDB.add(tmpBS)
                    debut=Nstretch.end+1

        if len(Nstretches)>0:
            fin = refBSDB.getSeqLength(currentchrom)
            size=fin-debut+1
            if size>minContigsize :
                tmpBS=refBS.subseq(debut,fin)
                newBSDB.add(tmpBS)

        for refName in refBSDB.getHeaderList() :
            if refName not in seqInNstretches:
                debut=1
                fin=refBSDB.getSeqLength(refName)
                size=fin-debut+1
                if size>minContigsize :
                    refBS=refBSDB.fetch(refName)
                    tmpBS=refBS.subseq(debut,fin)
                    newBSDB.add(tmpBS)


        t5b=time.time()
        if self._verbosity >= 2:
            print "%s contigs selected from %s scaffolds"%(newBSDB.getSize(),refBSDB.getSize())
        #newBSDB.sortByLength(reverse=True)

        return newBSDB



#TODO refactoring about min size of genome for preprocess
    def selectContigs4givenSize(self,BSDB,limit=200000000):
        if self._verbosity > 0:
            print "Selecting contigs to reach %s pb "%limit
        contigsHeadersAndLength=zip(BSDB.getHeaderList(),BSDB.getListOfSequencesLength())
        size=0
        size_small=0
        size_big=500000000
        lselectedContigs=[]	

        for seq in BSDB.db : 
            size+=seq.getLength()
            if size<limit:
                lselectedContigs.append(seq)
                size_small=size
            else :
                size_big=size
                break
    
        if size_big-limit<limit-size_small :
            lselectedContigs.append(seq)
        
        if self._verbosity > 0:
            print "%s contigs selected to reach %s pb (%s contigs initially) "%(len(lselectedContigs),limit,len(contigsHeadersAndLength))
        
        selectedContigsBSDB=BioseqDB()
        selectedContigsBSDB.setData(lselectedContigs)
        return selectedContigsBSDB

    def writeFastaInput(self,BSDB,outFileName=''):
        if self._verbosity > 0:
            print "Writing fasta file"

        if not outFileName:
            outFileName = self._projectName + ".fastaExtract"
        
        BSDB.save(outFileName)
        if self._verbosity > 0:
            print '%d sequences saved.'%BSDB.getSize() 
        
        return outFileName

    def correctHeader(self,BSDB):
        if self._verbosity > 0:
            print "Correcting fasta headers"
        replacedSeqNb=0
        for header in BSDB.getHeaderList() :
            p = re.compile('[^a-zA-Z0-9_:\.\-]', re.IGNORECASE)
            if p.search(header):
                sub=list(set(p.findall(header)))
                correctedHeader=header
                for s in sub : 
                    correctedHeader=correctedHeader.replace(s,'_')
                if self._verbosity>2:
                    print "Correct Header : '%s' replaced by '%s'"%(header,correctedHeader)
                BSDB.fetch(header).setHeader(correctedHeader)
                replacedSeqNb+=1
        if self._verbosity > 0:
            print '%s sequence headers corrected'%replacedSeqNb
        return BSDB


    def _launchTEdenovo(self):
        print "START time: %s" % time.strftime("%Y-%m-%d %H:%M:%S")
        lCmds = []
        lCmds.append( "TEdenovo.py -P %s -C %s -S 1 -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 2 -s Blaster -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Grouper -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Recon -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Piler -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Grouper -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Recon -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Piler -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 5 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 6 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
        lCmds.append( "TEdenovo.py -P %s -C %s -S 7 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )

        for cmd in lCmds:
            returnValue = os.system(cmd)
            if returnValue != 0:
                print "ERROR: command '%s' returned %i" % (cmd, returnValue)
                self._cleanTables()
                sys.exit(1)

        print "END time: %s" % time.strftime("%Y-%m-%d %H:%M:%S")
        outFastaFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*_denovoLibTEs_filtered.fa"%self._projectName) 
        shutil.copy(outFastaFile[0], self._outputFasta)
	outStatsFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*.classif_stats.txt"%self._projectName)
        shutil.copy(outStatsFile[0], self._outputStats)
        if self._classif:
            outClassifFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif/classifConsensus/*_withoutRedundancy_negStrandReversed_WickerH.classif"%self._projectName)
            shutil.copy(outClassifFile[0], self._outputClassif)
        self._renameTE()

    def _renameTE(self):
        name=re.search(r'([^\/\\]*)-\d{8}-denovoLibTEs_filtered\.[fa|fasta|fsa|fas]',self._outputFasta).groups()[0]
        os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputFasta))
        if self._classif:
            os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputClassif))	

    def preprocessFastaFile(self):
        inFileHandler = open(self._fastaFileName, "r")
        cumulLength = FastaUtils.dbCumLength(inFileHandler)
        inFileHandler.close()
        if cumulLength >= self._limitSeqSize:
            print "Preprocess lauched"
            allContigsBSDB=self.removeNstretches()
            selectedContigsBSDB=self.selectContigs4givenSize(allContigsBSDB)
            self.correctHeader(selectedContigsBSDB)
            fastaFile=self.writeFastaInput(selectedContigsBSDB)
            print "Preprocess finished"
        else:
            fastaFile=self._fastaFileName
            print "No preprocess : the genome size %s lower than %s Mbp" % (cumulLength, self._limitSeqSize/1000000)
        os.symlink(fastaFile,"%s/%s.fa" %(os.getcwd(),self._projectName)) #creer repertoire projet

    def _launchListAndDropTables(self):	
        cmd = "ListAndDropTables.py"
        cmd += " -C %s" % self._configFileName
        cmd += " -d '%s'" % self._projectName
        os.system(cmd)

    def _cleanJobsTable(self):
        db = DbFactory.createInstance( configFileName = self._configFileName )
        sql_cmd="DELETE FROM jobs WHERE groupid like '%s%%';"%self._projectName
        db.execute( sql_cmd )
        db.close()

    def _cleanTables(self):
        self._launchListAndDropTables()
        self. _cleanJobsTable()


    def run(self):
        os.mkdir(self._projectName)
        os.chdir(self._projectName)
        self._writeConfigFile()
        self.preprocessFastaFile()
        if  self._nucl_bank != "" and self._nucl_bank != None:
            os.symlink(self._nucl_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._nucl_bank)))
        if  self._prot_bank != "" and self._prot_bank != None:
            os.symlink(self._prot_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._prot_bank)))
        if  self._HMM_profiles != "" and self._HMM_profiles != None:
            os.symlink(self._HMM_profiles,"%s/%s" %(os.getcwd(),os.path.basename(self._HMM_profiles)))
        if  self._rdna_bank != "" and self._rdna_bank != None:
            os.symlink(self._rdna_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._rdna_bank)))

        self._launchTEdenovo()
        self._cleanTables()

if __name__ == '__main__':
    iTEdenovo = TEdenovo_lite()
    iTEdenovo.setAttributesFromCommandLine()
    iTEdenovo.run()