Mercurial > repos > yufei-luo > s_mart
view commons/tools/GetMultAlignAndPhylogenyPerTErefSeq.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python ##@file # For each TE reference sequence, it computes a multiple alignment and a phylogeny of all its copies. # usage: GetMultAlignAndPhylogenyPerTErefSeq.py [ options ] # options: # -h: this help # -S: step (0: all steps [default], 1:file generation, 2:multiple alignements, 3:phylogenies) # -p: table with the annotations (format=path) # -s: table with the TE reference sequences (format=seq) # -g: table with the genome sequence (format=seq) # -r: name or file with TE reference sequence(s) (all by default) # -m: MSA method (default=Refalign/Map) # -l: minimum length of copies (default=100) # -n: number of longest copies to use (default=20) # -y: minimum copy proportion compare to references (default=0.5) # -R: keep the reference sequence (only with Refalign) # -C: configuration file # -q: queue name # -c: clean # -d: temporary directory # -v: verbosity level (default=0/1) import os import sys import glob import ConfigParser import pyRepet.launcher.programLauncher from commons.core.coord.PathUtils import PathUtils from commons.core.seq.FastaUtils import FastaUtils from commons.core.coord.SetUtils import SetUtils from commons.core.sql.TablePathAdaptator import TablePathAdaptator from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator from commons.tools.OrientSequences import OrientSequences from ConfigParser import MissingSectionHeaderError from commons.core.utils.RepetOptionParser import RepetOptionParser from commons.core.LoggerFactory import LoggerFactory from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB from commons.launcher import LaunchMap from commons.core.sql.DbFactory import DbFactory from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory from commons.core.launcher.Launcher import Launcher from commons.core.utils.FileUtils import FileUtils LOG_DEPTH = "repet.tools" ## For each TE reference sequence, it computes a multiple alignment and a phylogeny of all its copies. # class GetMultAlignAndPhylogenyPerTErefSeq(object): def __init__(self, pathTableName="",refSeqTableName="", genomeSeqTableName="", step=0, mSAmethod="RefAlign",keepRefseq=False, configFileName= "", clean = True, verbosity=3): """ Constructor. """ self.step = step self._pathTable = pathTableName self._refSeqTable = refSeqTableName self._genomeSeqTable = genomeSeqTableName self._TErefseq = "" self._MSAmethod = mSAmethod self._minCopyLength = 100 self._nbLongestCopies = 20 self._minPropCopy = 0.5 self._keepRefseq = keepRefseq self.setConfigFileName(configFileName) self._queue = "" self._tmpDir = "" self._clean = clean self._verbosity = verbosity self._db = None self._tpaAnnot = None self._tsaRef = None self._pL = pyRepet.launcher.programLauncher.programLauncher() self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) def _logAndRaise(self, errorMsg): self._log.error(errorMsg) raise Exception(errorMsg) def setAttributesFromCmdLine(self): desc = "For each TE reference sequence, it computes a multiple alignment and a phylogeny of all its copies.\n" #Commented: it's not true, Config File is mandatory! # desc += "Connection to the database parameters are retrieved from the environment" #TODO: format options as other scripts (have a look at LaunchTemplate) parser = RepetOptionParser(description = desc, epilog = "") parser.add_option("-S", "--step", dest = "step" , action = "store", type = "int", help = "step (0: all steps [default], 1:file generation, 2:multiple alignments, 3:phylogenies)", default = 0 ) parser.add_option("-p", "--pathTable", dest = "path", action= "store", type = "string", help = "(mandatory) table with the annotations (format=path)", default = "") parser.add_option("-s", "--refSeqTable",dest = "refSeqTable", action= "store", type = "string", help = "(mandatory) table with the TE reference sequences (format=seq)", default = "") parser.add_option("-g", "--genomeSeqTable",dest = "genomeSeqTable",action= "store", type = "string", help = "(mandatory) table with the genome sequence (format=seq)", default = "") parser.add_option("-r", "--TErefseq",dest = "TErefseq", action= "store", type = "string", help = "name or file with TE reference sequence(s) (all by default)", default = "") parser.add_option("-m", "--MSAmethod",dest = "MSAmethod", action= "store", type = "string", help = "MSA method (default=RefAlign/Map)", default = "RefAlign") parser.add_option("-l", "--minCopyLength",dest = "minCopyLength", action= "store", type = "int", help = "minimum length of copies (default=100)", default = 100) parser.add_option("-n", "--nbLongestCopies",dest = "nbLongestCopies", action= "store", type = "int", help = "number of longest copies to use (default=20)", default = 20) parser.add_option("-y", "--minPropCopy",dest = "minPropCopy", action= "store", type = "float", help = "minimum copy proportion compare to references (default=0.5)", default = 0.5) parser.add_option("-R", "--keepRefseq",dest = "keepRefseq", action="store_true", help = "keep the reference sequence (only with Refalign)", default = False) parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "(mandatory) config file name to set database connection", default = "") parser.add_option("-q", "--queue",dest = "queue", action= "store", type = "string", help = "queue name", default = "") parser.add_option("-c", "--clean", action="store_false", dest = "clean", help = "don't clean", default = True) parser.add_option("-d", "--tmpDir",dest = "tmpDir", action= "store", type = "string", help = "temporary directory", default = "") parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity level (default=0)", default = 0) options = parser.parse_args()[0] self._setAttributesFromOptions(options) def _setAttributesFromOptions(self, options): self.setConfigFileName(options.configFileName) self.setStep(options.step) self.setPathTable(options.path) self.setRefSeqTable(options.refSeqTable) self.setGenomeSeqTable(options.genomeSeqTable) self.setTErefseq(options.TErefseq) self.setMSAmethod(options.MSAmethod) self.setMinCopyLength(options.minCopyLength) self.setNbLongestCopies(options.nbLongestCopies) self.setMinPropCopy(options.minPropCopy) self.setKeepRefseq(options.keepRefseq) self.setQueue(options.queue) self.setClean(options.clean) self.setTmpDir(options.tmpDir) self.setVerbosity(options.verbosity) def setStep(self, step): self.step = step def setPathTable(self, path): self._pathTable = path def setRefSeqTable(self, refSeqTable): self._refSeqTable = refSeqTable def setGenomeSeqTable(self, genomeSeqTable): self._genomeSeqTable = genomeSeqTable def setTErefseq(self, TErefseq): self._TErefseq = TErefseq def setMSAmethod(self, MSAmethod): self._MSAmethod = MSAmethod def setMinCopyLength(self, minCopyLength): self._minCopyLength = minCopyLength def setNbLongestCopies(self, nbLongestCopies): self._nbLongestCopies = nbLongestCopies def setMinPropCopy(self, minPropCopy): self._minPropCopy = minPropCopy def setKeepRefseq(self, keepRefseq): self._keepRefseq = keepRefseq def setQueue(self, queue): self._queue = queue def setClean(self, clean): self._clean = clean def setTmpDir(self, tmpDir): self._tmpDir = tmpDir def setVerbosity(self, verbosity): self._verbosity = verbosity def setup_env(self): configFileHandle = open(self._configFileName) # Use RepetConfigParser? config = ConfigParser.ConfigParser() try : config.readfp(configFileHandle) except MissingSectionHeaderError: self._logAndRaise("Configuration file %s must begin with a section header" % self._configFileName) os.environ["REPET_HOST"] = config.get("repet_env", "repet_host") os.environ["REPET_USER"] = config.get("repet_env", "repet_user") os.environ["REPET_PW"] = config.get("repet_env", "repet_pw") os.environ["REPET_DB"] = config.get("repet_env", "repet_db") os.environ["REPET_PORT"] = config.get("repet_env", "repet_port") os.environ["REPET_JOB_MANAGER"] = config.get("repet_env", "repet_job_manager") def setConfigFileName(self, configFileName): self._configFileName = configFileName def checkAttributes( self ): """ Check the attributes are valid before running the algorithm. """ if self._pathTable == "": self._logAndRaise("PathTable is mandatory") if self._refSeqTable == "": self._logAndRaise("RefSeqTable is mandatory") if self._genomeSeqTable == "": self._logAndRaise("GenomeSeqTable is mandatory") if self._configFileName == "": self._logAndRaise("Configuration file is mandatory") else: if FileUtils.isRessourceExists(self._configFileName): self.setup_env() else: self._logAndRaise("Configuration file '%s' does not exist!" % self._configFileName) if ( self.step == 2 or self.step == 3 ) and self._MSAmethod not in ["RefAlign","Map"]: if self._MSAmethod == None or self._MSAmethod == "": self._logAndRaise("Missing method option") self._logAndRaise("Method '%s' not yet available" % ( self._MSAmethod )) if self._tmpDir == "": self._tmpDir = os.getcwd() def connectSql(self): self._db = DbFactory().createInstance(configFileName = self._configFileName, verbosity = 1) self._tpaAnnot = TablePathAdaptator(self._db, self._pathTable) self._tsaRef = TableSeqAdaptator(self._db,self._refSeqTable) def getNamesOfTErefSeq( self ): """ Return a list with the names of reference TEs. """ lNamesTErefSeq = [] if self._TErefseq == "": lNamesTErefSeq = self._tsaRef.getAccessionsList() else: if os.path.isfile( self._TErefseq ): refseqFileHandler = open( self._TErefseq, "r" ) while True: line = refseqFileHandler.readline() if line == "": break lNamesTErefSeq.append( line[:-1].split("\t")[0] ) refseqFileHandler.close() else: lNamesTErefSeq = [ self._TErefseq ] for name in lNamesTErefSeq: if not self._tsaRef.isAccessionInTable( name ): self._log.warning("'%s' not in table '%s'" % (name, self._refSeqTable)) lNamesTErefSeq.remove( name ) lNamesTErefSeq.sort() self._log.info("nb of TE reference sequences: %d" % (len(lNamesTErefSeq))) return lNamesTErefSeq def getTErefSeqInFastaFiles( self, lNamesTErefSeq ): """ Save sequences of reference TEs in fasta files. """ for name in lNamesTErefSeq: self._log.debug("save sequence of '%s'..." % ( name )) self._tsaRef.saveAccessionsListInFastaFile( [name], name+".fa" ) def getCopiesInFastaFilesPerTErefSeq( self, lNamesTErefSeq ): """ Save sequences of TE copies in fasta files. """ self._log.info("retrieve the copies...") tsaChr = TableSeqAdaptator( self._db, self._genomeSeqTable ) totalNbCopies = 0 totalNbSavedCopies = 0 for name in lNamesTErefSeq: nbCopies = 0 if os.path.exists(name+"_copies.fa"): continue outFile = open( name+"_copies.fa", "w" ) self._log.debug("Fetching path nums for subject: %s " % name) lPathNums = self._tpaAnnot.getIdListSortedByDecreasingChainLengthFromSubject(name) nbCopies = len(lPathNums) totalNbCopies += nbCopies self._log.debug("refseq '%s': %d copies" % ( name, nbCopies )) i = 0 nbSavedCopies = 0 nbSavedFragments = 0 lengthRefseq = self._tsaRef.getSeqLengthFromAccession( name ) while i < len(lPathNums) and nbSavedCopies < self._nbLongestCopies: lPaths = self._tpaAnnot.getPathListFromId( lPathNums[i] ) lSets = PathUtils.getSetListFromQueries( lPaths ) copyLength = SetUtils.getCumulLength( lSets ) if copyLength >= self._minCopyLength \ and copyLength >= self._minPropCopy * lengthRefseq: bs = tsaChr.getBioseqFromSetList( lSets ) bs.write(outFile) nbSavedCopies += 1 nbSavedFragments += len(lPaths) i += 1 self._log.debug(" (saved: %d copies, %d fragments)\n" % ( nbSavedCopies, nbSavedFragments ) ) outFile.close() totalNbSavedCopies += nbSavedCopies if nbSavedCopies == 0 and nbCopies != 0: self._log.warning("No copy >= %d" % ( self._minCopyLength )) self._log.info("nb of copies: %d (%d saved)" % ( totalNbCopies, totalNbSavedCopies )) def filter4Alignments( self, lNamesTErefSeq ): """ Filter TE copy sequences according to their length. """ self._log.info("filtering copies...") if len(lNamesTErefSeq) == 1: if not os.path.exists( "%s_copies.fa" % ( lNamesTErefSeq[0] ) ): self._logAndRaise("first run step 1") lInFiles = [ "%s_copies.fa" % ( lNamesTErefSeq[0] ) ] else: lInFiles = glob.glob( "*_copies.fa" ) if len(lInFiles) == 0: self._logAndRaise("no file '*_copies.fa'") count = 0 for inFileName in lInFiles: if os.path.exists( "%s.filtered" % ( inFileName ) ): continue count += 1 self._log.debug("TE %d --> %s" %(count,inFileName)) FastaUtils.dbLengthFilter( self._minCopyLength, inFileName, verbose=self._verbosity ) FastaUtils.dbLongestSequences( self._nbLongestCopies, inFileName+".Sup"+str(self._minCopyLength), verbose=self._verbosity ) os.rename( inFileName+".Sup"+str(self._minCopyLength)+".best"+str(self._nbLongestCopies), inFileName+".filtered" ) os.remove( inFileName+".Sup"+str(self._minCopyLength) ) os.remove( inFileName+".Inf"+str(self._minCopyLength) ) self._log.info("files filtered: %d" % (count)) def buildInFiles4Launcher( self, lNamesTErefSeq ): """ Save sequences of TE copies with reference in fasta files for launcher usage. """ self._log.info("building input files for launcher...") for name in lNamesTErefSeq: if os.path.exists( "%s_all.fa.oriented" % ( name ) ): continue if FastaUtils.dbSize( "%s_copies.fa.filtered" % ( name ) ) > 0: os.system( "cat "+name+".fa "+ name+"_copies.fa.filtered > " + name+"_all.fa" ) ors = OrientSequences(inFileName= "%s_all.fa" %(name), prgToOrient="mummer", clean=True, verbosity =self._verbosity - 1) ors.run() ors.clean() if len( glob.glob("*_all.fa.oriented") ) == 0: self._logAndRaise("no copies") self._log.info("done building input files for launcher...") def renameFile( self, fromName, toName): lFiles = glob.glob( "*%s" %fromName ) for f in lFiles: os.rename( f, f.replace(fromName,toName)) def _preparejobs(self, iLauncher, cDir): self._log.info("Preparing Align jobs") lCmdsTuples = [] print(self.queriesDir,self.alignFileSuffix) queryFiles = glob.glob("%s/%s" % (self.queriesDir,self.alignFileSuffix)) print("queryFiles",queryFiles) for queryFile in queryFiles:#os.listdir(self.queriesDir): lCmds = [] lCmdStart = [] lCmdFinish = [] #['shutil.move("%s/*" %newDir, "../" )'] queryFilePath = os.path.join(self.queriesDir,queryFile) lCmds.append(self._createLaunchAlignCommands(iLauncher, queryFilePath)) #lCmdFinish.append("shutil.move(\"%s.%s\", \"%s/%s.%s\")" % (benchName,self.outputformat,self.resultsDir,benchName,self.outputformat)) lCmdsTuples.append(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) self._log.info("Finished preparing Align jobs") return lCmdsTuples def _preparePhyMljobs(self, iLauncher, cDir): self._log.info("Preparing PhyMl jobs") lCmdsTuples = [] queryFiles = glob.glob("%s/%s" % (self.queriesDir,self.phyloFileSuffix)) print("queryFiles",queryFiles) for queryFile in queryFiles:#os.listdir(self.queriesDir): lCmds = [] lCmdStart = [] lCmdFinish = [] #['shutil.move("%s/*" %newDir, "../" )'] queryFilePath = os.path.join(self.queriesDir,queryFile) lCmds.append(self._createLaunchPhyMLCommands(iLauncher, queryFilePath)) lCmdsTuples.append(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) self._log.info("Finished preparing PhyMl jobs") return lCmdsTuples def _createLaunchAlignCommands(self, iLauncher, query): if self._MSAmethod == "Map": prg = "LaunchMap.py" elif self._MSAmethod == "RefAlign": prg = "LaunchRefAlign.py" lArgs = [] lArgs.append("-i %s" % query) lArgs.append(" -o %s.fa_aln" % query) lArgs.append("-v 1" ) if self._MSAmethod == "RefAlign" and self._keepRefseq: lArgs.append("-r " ) self._log.debug("Prepared Align commands : %s %s" % (prg, " ".join(lArgs))) return iLauncher.getSystemCommand("%s" % prg, lArgs) def launchMultAlignments(self, lNamesTErefSeq): """ Make multiple alignments via Map for each TE family """ self.queriesDir = os.getcwd() if len(lNamesTErefSeq) == 1: self.alignFileSuffix = "%s_all.fa.oriented" % (lNamesTErefSeq[0]) else: self.alignFileSuffix = "*_all.fa.oriented" queue = self._queue cDir = os.getcwd() tmpDir = self._tmpDir groupid = "%s_%s" % ( self._refSeqTable, self._MSAmethod ) acronym = "Align" iDb = DbFactory.createInstance(configFileName=self._configFileName) iTJA = TableJobAdaptatorFactory.createInstance(iDb, "jobs") iLauncher = Launcher(iTJA, os.getcwd(), "", "", cDir, tmpDir, "jobs", queue, groupid) lCmdsTuples = self._preparejobs(iLauncher, cDir) iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, self._clean) self.renameFile("_all.fa.oriented.fa_aln", "_all.fa.oriented_%s.fa_aln" % (self._MSAmethod.lower()) ) # def __makeMultAlignments( self, lNamesTErefSeq ): # """ # Make multiple alignments via Map or Refalign for each TE family # """ # self._pL.reset("") # if self._MSAmethod == "Map": # # prg = os.environ["REPET_PATH"] + "/bin/srptMAP.py" # elif self._MSAmethod == "RefAlign": # prg = os.environ["REPET_PATH"] + "/bin/srptRefalign.py" # cmd = prg # cmd += " -g %s_%s" % ( self._refSeqTable, self._MSAmethod ) # cmd += " -q %s" % ( os.getcwd() ) # if len(lNamesTErefSeq) == 1: # cmd += " -S %s_all.fa.oriented" % ( lNamesTErefSeq[0] ) # else: # cmd += " -S '*_all.fa.oriented'" # cmd += " -Q %s" % ( self._queue ) # cmd += " -C %s" % ( self._configFileName ) # if self._MSAmethod == "Refalign" and self._keepRefseq: # cmd += " -r" # if not self._clean : # cmd += " -c" # if self._tmpDir != "": # cmd += " -d %s" % ( self._tmpDir ) # cmd += " -v %d" % ( self._verbosity ) # self._pL.launch( prg, cmd ) # lFiles = glob.glob( "*_all.fa.oriented.fa_aln" ) # for f in lFiles: # os.rename( f, f.replace("_all.fa.oriented.fa_aln","_all.fa.oriented_%s.fa_aln" % (self._MSAmethod.lower()) ) ) def filter4phylogenies( self, verbosity=0 ): """ Filter TE copy alignment for better phylogenies. """ self._log.info("Filtering MSA") lInFiles = glob.glob( "*_all.fa.oriented_%s.fa_aln" % ( self._MSAmethod.lower() ) ) count = 0 for inFileName in lInFiles: count += 1 self._log.debug("clean MSA %d --> %s" % (count,inFileName)) alignDB = AlignedBioseqDB() alignDB.load(inFileName) alignDB.cleanMSA() if alignDB.getSize() > 2: alignDB.save( inFileName + ".clean" ) self._log.debug("clean!") else: self._log.debug("skip!") self._log.info("MSA cleaned: %d" % count) def _createLaunchPhyMLCommands(self, iLauncher, query): # prg = os.environ["REPET_PATH"] + "/bin/srptPhyML.py" # cmd = prg # cmd += " -g %s_PHY_%s" % ( self._refSeqTable, os.getpid() ) # cmd += " -q %s" % ( os.getcwd() ) # cmd += " -S '*_all.fa.oriented_%s.fa_aln.clean'" % ( self._MSAmethod.lower() ) # cmd += " -Q %s" % ( self._queue ) # cmd += " -C %s" % ( self._configFileName ) prg = "LaunchPhyML.py" lArgs = [] lArgs.append("-i %s" % query) lArgs.append("-o %s.fa_phylo" % query) lArgs.append("-v %d" % (self._verbosity-1)) self._log.debug("Prepared Phyml commands : %s %s" % (prg, " ".join(lArgs))) return iLauncher.getSystemCommand("%s" % prg, lArgs) def makePhylogenies( self ): """ Launch PhyML on each TE family. """ self.phyloFileSuffix = "*_all.fa.oriented_%s.fa_aln.clean" % ( self._MSAmethod.lower() ) queue = self._queue cDir = os.getcwd() tmpDir = self._tmpDir groupid = "%s_PHY_%s" % ( self._refSeqTable, os.getpid() ) acronym = "Phylo" iDb = DbFactory.createInstance(configFileName=self._configFileName) iTJA = TableJobAdaptatorFactory.createInstance(iDb, "jobs") iLauncher = Launcher(iTJA, os.getcwd(), "", "", cDir, tmpDir, "jobs", queue, groupid) lCmdsTuples = self._preparePhyMljobs(iLauncher, cDir) iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, self._clean) def start( self ): """ Useful commands before running the program. """ self.checkAttributes() self._log.info("START GetMultAlignAndPhylogenyPerTErefSeq.py STEP %d" % self.step) self.connectSql() def end( self ): """ Useful commands before ending the program. """ self._db.close() self._log.info("END GetMultAlignAndPhylogenyPerTErefSeq.py STEP %d" % self.step) def run( self ): """ Run the program. """ LoggerFactory.setLevel(self._log, self._verbosity) self.start() lNamesTErefSeq = self.getNamesOfTErefSeq() self._log.debug("lNamesTErefSeq: %s" % " ".join(lNamesTErefSeq)) if self.step in [0, 1]: self.getTErefSeqInFastaFiles( lNamesTErefSeq ) self.getCopiesInFastaFilesPerTErefSeq( lNamesTErefSeq ) if self.step in [0, 2]: self.filter4Alignments(lNamesTErefSeq) self.buildInFiles4Launcher(lNamesTErefSeq) self.launchMultAlignments(lNamesTErefSeq) if self.step in [0, 3]: self.filter4phylogenies(verbosity=self._verbosity) self.makePhylogenies() self.end() if __name__ == "__main__": iGetMultAlignAndPhylogenyPerTErefSeq = GetMultAlignAndPhylogenyPerTErefSeq() iGetMultAlignAndPhylogenyPerTErefSeq.setAttributesFromCmdLine() iGetMultAlignAndPhylogenyPerTErefSeq.run()