s_mart: commons/launcher/LaunchBlastclust.py comparison

comparison commons/launcher/LaunchBlastclust.py @ 18:94ab73e8a190

Uploaded

author	m-zytnicki
date	Mon, 29 Apr 2013 03:20:15 -0400
parents
children

comparison

equal deleted inserted replaced

-:b0e8584489e6
+:94ab73e8a190
+#!/usr/bin/env python
+"""
+Launch Blastclust on nucleotide sequences and return a fasta file.
+"""
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and,  more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+import os
+import sys
+import subprocess
+from commons.core.seq.BioseqDB import BioseqDB
+from commons.core.seq.Bioseq import Bioseq
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
+class LaunchBlastclust(object):
+"""
+Launch Blastclust on nucleotide sequences and return a fasta file.
+"""
+def __init__(self, input = "", outFilePrefix = "", clean = False, verbose = 0):
+"""
+Constructor.
+"""
+self._inFileName = input
+self._identityThreshold = 95
+self._coverageThreshold = 0.9
+self._bothSeq = "T"
+self._filterUnclusteredSeq = False
+self._outFilePrefix = outFilePrefix
+self._isBlastToMap = False
+self._isHeaderForTEdenovo = False
+self._nbCPUs = 1
+self._clean = clean
+self._verbose = verbose
+self._tmpFileName = ""
+def setAttributesFromCmdLine(self):
+"""
+Set the attributes from the command-line.
+"""
+description = "Launch Blastclust on nucleotide sequences and return a fasta file."
+usage = "LaunchBlastclust.py -i inputFileName [options]"
+examples = "\nExample 1: launch Blastclust with default options, highest verbose and clean temporary files.\n"
+examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -v 2 -c"
+examples += "\n\t"
+examples += "\t\nExample 2: launch Blastclust with an identity threshold of 90%, rename output files and generate a map file corresponding to the fasta output.\n"
+examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -S 90 -o SpecialOutputName -m"
+examples += "\n\tWARNING: Please refer to -m option limitations in the description above.\n"
+#TODO: check if the optionParser can handle '\' into strings for a better code readability in -m option
+parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
+parser.add_option("-i", "--input",          dest = "inFileName",            type = "string",    help = "name of the input fasta file (nucleotides)",                    default = "")
+parser.add_option("-L", "--length",         dest = "coverageThreshold",     type = "float",     help = "length coverage threshold (default=0.9)",                       default = 0.9)
+parser.add_option("-S", "--ident",          dest = "identityThreshold",     type = "int",       help = "identity threshold (default=95)",                               default = 95)
+parser.add_option("-b", "--both",           dest = "bothSeq",               type = "string",    help = "require coverage on both neighbours (default=T/F)",             default = "T")
+parser.add_option("-f", "--filter",         dest = "filterUnclusteredSeq",                      help = "filter unclustered sequences",                                  default = False,    action="store_true")
+parser.add_option("-o", "--out",            dest = "outFilePrefix",         type = "string",    help = "prefix of the output files (default=input fasta file name)",    default = "")
+parser.add_option("-m", "--map",            dest = "isBlast2Map",                               help = "generate an additional output file in map format (Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output)",   default = False,    action="store_true")
+parser.add_option("", "--TEdenovoHeader",   dest = "isHeaderForTEdenovo",                       help = "format headers for TEdenovo pipeline",                          default = False,    action="store_true")
+parser.add_option("-n", "--num",            dest = "nbCPUs",                type = "int",       help = "number of CPU's to use (default=1)",                            default = 1)
+parser.add_option("-c", "--clean",          dest = "clean",                                     help = "clean temporary files",                                         default = False,    action="store_true")
+parser.add_option("-v", "--verbose",        dest = "verbose",               type = "int",       help = "verbosity level (default=0/1/2)",                               default = 0)
+options = parser.parse_args()[0]
+self._setAttributesFromOptions(options)
+def _setAttributesFromOptions(self, options):
+self.setInputFileName(options.inFileName)
+self.setCoverageThreshold(options.coverageThreshold)
+self.setIdentityThreshold(options.identityThreshold)
+self.setBothSequences(options.bothSeq)
+self.setNbCPUs(options.nbCPUs)
+self.setIsHeaderForTEdenovo(options.isHeaderForTEdenovo)
+if options.filterUnclusteredSeq:
+self.setFilterUnclusteredSequences()
+if options.outFilePrefix != "":
+self.setOutputFilePrefix(options.outFilePrefix)
+else:
+self._outFilePrefix = self._inFileName
+if options.isBlast2Map:
+self.setIsBlastToMap()
+if options.clean:
+self.setClean()
+self.setVerbosityLevel(options.verbose)
+def setInputFileName(self , inFileName):
+self._inFileName = inFileName
+def setCoverageThreshold(self, lengthThresh):
+self._coverageThreshold = float(lengthThresh)
+def setIdentityThreshold(self, identityThresh):
+self._identityThreshold = int(identityThresh)
+def setBothSequences(self, bothSeq):
+self._bothSeq = bothSeq
+def setNbCPUs(self, nbCPUs):
+self._nbCPUs = int(nbCPUs)
+def setFilterUnclusteredSequences(self):
+self._filterUnclusteredSeq = True
+def setOutputFilePrefix(self, outFilePrefix):
+self._outFilePrefix = outFilePrefix
+def setIsBlastToMap(self):
+self._isBlastToMap = True
+def setIsHeaderForTEdenovo(self, isHeaderForTEdenovo):
+self._isHeaderForTEdenovo = isHeaderForTEdenovo
+def setClean(self):
+self._clean = True
+def setVerbosityLevel(self, verbose):
+self._verbose = int(verbose)
+def setTmpFileName(self, tmpFileName):
+self._tmpFileName = tmpFileName
+def checkAttributes(self):
+"""
+Check the attributes are valid before running the algorithm.
+"""
+if self._inFileName == "":
+print "ERROR: missing input file name (-i)"
+sys.exit(1)
+if self._outFilePrefix == "":
+self._outFilePrefix = self._inFileName
+self._tmpFileName = "%s_blastclust.txt" % (self._outFilePrefix)
+def launchBlastclust(self, inFile):
+"""
+Launch Blastclust in command-line.
+"""
+if os.path.exists(os.path.basename(inFile)):
+inFile = os.path.basename(inFile)
+prg = "blastclust"
+cmd = prg
+cmd += " -i %s" % (inFile)
+cmd += " -o %s" % (self._tmpFileName)
+cmd += " -S %i" % (self._identityThreshold)
+cmd += " -L %f" % (self._coverageThreshold)
+cmd += " -b %s" % (self._bothSeq)
+cmd += " -p F"
+cmd += " -a %i" % (self._nbCPUs)
+if self._verbose == 0:
+cmd += " -v blastclust.log"
+if self._verbose > 0:
+print cmd
+sys.stdout.flush()
+process = subprocess.Popen(cmd, shell = True)
+process.communicate()
+if process.returncode != 0:
+raise Exception("ERROR when launching '%s'" % cmd)
+if self._clean and os.path.exists("error.log"):
+os.remove("error.log")
+if self._clean and os.path.exists("blastclust.log"):
+os.remove("blastclust.log")
+def getClustersFromTxtFile(self):
+"""
+Return a dictionary with cluster IDs as keys and sequence headers as values.
+"""
+dClusterId2SeqHeaders = {}
+inF = open(self._tmpFileName, "r")
+line = inF.readline()
+clusterId = 1
+while True:
+if line == "":
+break
+tokens = line[:-1].split(" ")
+dClusterId2SeqHeaders[clusterId] = []
+for seqHeader in tokens:
+if seqHeader != "":
+dClusterId2SeqHeaders[clusterId].append(seqHeader)
+line = inF.readline()
+clusterId += 1
+inF.close()
+if self._verbose > 0:
+print "nb of clusters: %i" % (len(dClusterId2SeqHeaders.keys()))
+sys.stdout.flush()
+return dClusterId2SeqHeaders
+def filterUnclusteredSequences(self, dClusterId2SeqHeaders):
+"""
+Filter clusters having only one sequence.
+"""
+for clusterId in dClusterId2SeqHeaders.keys():
+if len(dClusterId2SeqHeaders[clusterId]) == 1:
+del dClusterId2SeqHeaders[clusterId]
+if self._verbose > 0:
+print "nb of clusters (>1seq): %i" % (len(dClusterId2SeqHeaders.keys()))
+sys.stdout.flush()
+return dClusterId2SeqHeaders
+def getClusteringResultsInFasta(self, inFile):
+"""
+Write a fasta file whose sequence headers contain the clustering IDs.
+"""
+dClusterId2SeqHeaders = self.getClustersFromTxtFile()
+if self._filterUnclusteredSeq:
+dClusterId2SeqHeaders = self.filterUnclusteredSequences(dClusterId2SeqHeaders)
+inDB = BioseqDB(inFile)
+outFileName = "%s_Blastclust.fa" % (inFile)
+outF = open(outFileName, "w")
+for clusterId in dClusterId2SeqHeaders.keys():
+memberId = 1
+for seqHeader in dClusterId2SeqHeaders[clusterId]:
+bs = inDB.fetch(seqHeader)
+bs.header = "BlastclustCluster%iMb%i_%s" % (clusterId, memberId, seqHeader)
+bs.write(outF)
+memberId += 1
+outF.close()
+def getLinkInitNewHeaders(self):
+dNew2Init = {}
+linkFileName = "%s.shortHlink" % (self._inFileName)
+linkFile = open(linkFileName,"r")
+line = linkFile.readline()
+while True:
+if line == "":
+break
+data = line.split("\t")
+dNew2Init[data[0]] = data[1]
+line = linkFile.readline()
+linkFile.close()
+return dNew2Init
+def retrieveInitHeaders(self, dNewH2InitH):
+tmpFaFile = "%s.shortH_Blastclust.fa" % (self._inFileName)
+tmpFaFileHandler = open(tmpFaFile, "r")
+outFaFile = "%s_Blastclust.fa" % (self._outFilePrefix)
+outFaFileHandler = open(outFaFile, "w")
+while True:
+line = tmpFaFileHandler.readline()
+if line == "":
+break
+if line[0] == ">":
+tokens = line[1:-1].split("_")
+initHeader = dNewH2InitH[tokens[1]]
+if self._isHeaderForTEdenovo:
+classif = initHeader.split("_")[0]
+consensusName = "_".join(initHeader.split("_")[1:])
+clusterId = tokens[0].split("Cluster")[1].split("Mb")[0]
+newHeader = "%s_Blc%s_%s" % (classif, clusterId, consensusName)
+else:
+newHeader = "%s_%s" % (tokens[0], initHeader)
+outFaFileHandler.write(">%s\n" % (newHeader))
+else:
+outFaFileHandler.write(line)
+tmpFaFileHandler.close()
+outFaFileHandler.close()
+if self._clean:
+os.remove(tmpFaFile)
+def blastclustToMap(self, blastclustFastaOut):
+"""
+Write a map file from blastclust fasta output.
+Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output.
+"""
+fileDb = open(blastclustFastaOut , "r")
+mapFilename = "%s.map" % (os.path.splitext(blastclustFastaOut)[0])
+fileMap = open(mapFilename, "w")
+seq = Bioseq()
+numseq = 0
+while 1:
+seq.read(fileDb)
+if seq.sequence == None:
+break
+numseq = numseq + 1
+ID = seq.header.split(' ')[0].split('_')[0]
+chunk = seq.header.split(' ')[0].split('_')[1]
+start = seq.header.split(' ')[-1].split(',')[0][1:]
+end = seq.header.split(' ')[-1].split(',')[1][:-1]
+line= '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
+fileMap.write(line + "\n")
+fileDb.close()
+fileMap.close()
+print "saved in %s" % mapFilename
+def start(self):
+"""
+Useful commands before running the program.
+"""
+self.checkAttributes()
+if self._verbose > 0:
+print "START %s" % (type(self).__name__)
+def end(self):
+"""
+Useful commands before ending the program.
+"""
+if self._verbose > 0:
+print "END %s" % (type(self).__name__)
+def run(self):
+"""
+Run the program.
+"""
+self.start()
+iCSH = ChangeSequenceHeaders(inFile = self._inFileName, format = "fasta", step = 1, outFile = "%s.shortH" % self._inFileName, linkFile = "%s.shortHlink" % self._inFileName)
+iCSH.run()
+self.launchBlastclust("%s.shortH" % (self._inFileName))
+self.getClusteringResultsInFasta("%s.shortH" % (self._inFileName))
+dNewH2InitH = self.getLinkInitNewHeaders()
+self.retrieveInitHeaders(dNewH2InitH)
+if self._isBlastToMap:
+blastclustFileName = "%s_Blastclust.fa" % (self._outFilePrefix)
+self.blastclustToMap(blastclustFileName)
+if self._clean:
+os.remove("%s.shortH" % (self._inFileName))
+os.remove("%s.shortHlink" % (self._inFileName))
+self.end()
+if __name__ == "__main__":
+i = LaunchBlastclust()
+i.setAttributesFromCmdLine()
+i.run()

Mercurial > repos > yufei-luo > s_mart

comparison commons/launcher/LaunchBlastclust.py @ 18:94ab73e8a190