view commons/launcher/LaunchBlastclust.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
line wrap: on
line source

#!/usr/bin/env python

"""
Launch Blastclust on nucleotide sequences and return a fasta file.
"""

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import os
import sys
import subprocess
from commons.core.seq.BioseqDB import BioseqDB
from commons.core.seq.Bioseq import Bioseq
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders

class LaunchBlastclust(object):
    """
    Launch Blastclust on nucleotide sequences and return a fasta file.
    """
    
    def __init__(self, input = "", outFilePrefix = "", clean = False, verbose = 0):
        """
        Constructor.
        """
        self._inFileName = input
        self._identityThreshold = 95
        self._coverageThreshold = 0.9
        self._bothSeq = "T"
        self._filterUnclusteredSeq = False
        self._outFilePrefix = outFilePrefix
        self._isBlastToMap = False
        self._isHeaderForTEdenovo = False
        self._nbCPUs = 1
        self._clean = clean
        self._verbose = verbose
        self._tmpFileName = ""
        
    def setAttributesFromCmdLine(self):
        """
        Set the attributes from the command-line.
        """
        
        description = "Launch Blastclust on nucleotide sequences and return a fasta file."
        usage = "LaunchBlastclust.py -i inputFileName [options]"
        
        examples = "\nExample 1: launch Blastclust with default options, highest verbose and clean temporary files.\n"
        examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -v 2 -c"
        examples += "\n\t"
        examples += "\t\nExample 2: launch Blastclust with an identity threshold of 90%, rename output files and generate a map file corresponding to the fasta output.\n"
        examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -S 90 -o SpecialOutputName -m"
        examples += "\n\tWARNING: Please refer to -m option limitations in the description above.\n"
        
        #TODO: check if the optionParser can handle '\' into strings for a better code readability in -m option
        
        parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
        parser.add_option("-i", "--input",          dest = "inFileName",            type = "string",    help = "name of the input fasta file (nucleotides)",                    default = "")
        parser.add_option("-L", "--length",         dest = "coverageThreshold",     type = "float",     help = "length coverage threshold (default=0.9)",                       default = 0.9)
        parser.add_option("-S", "--ident",          dest = "identityThreshold",     type = "int",       help = "identity threshold (default=95)",                               default = 95)
        parser.add_option("-b", "--both",           dest = "bothSeq",               type = "string",    help = "require coverage on both neighbours (default=T/F)",             default = "T")
        parser.add_option("-f", "--filter",         dest = "filterUnclusteredSeq",                      help = "filter unclustered sequences",                                  default = False,    action="store_true")
        parser.add_option("-o", "--out",            dest = "outFilePrefix",         type = "string",    help = "prefix of the output files (default=input fasta file name)",    default = "")
        parser.add_option("-m", "--map",            dest = "isBlast2Map",                               help = "generate an additional output file in map format (Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output)",   default = False,    action="store_true")
        parser.add_option("", "--TEdenovoHeader",   dest = "isHeaderForTEdenovo",                       help = "format headers for TEdenovo pipeline",                          default = False,    action="store_true")
        parser.add_option("-n", "--num",            dest = "nbCPUs",                type = "int",       help = "number of CPU's to use (default=1)",                            default = 1)
        parser.add_option("-c", "--clean",          dest = "clean",                                     help = "clean temporary files",                                         default = False,    action="store_true")
        parser.add_option("-v", "--verbose",        dest = "verbose",               type = "int",       help = "verbosity level (default=0/1/2)",                               default = 0)

        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)
        
    def _setAttributesFromOptions(self, options):
        self.setInputFileName(options.inFileName)
        self.setCoverageThreshold(options.coverageThreshold)
        self.setIdentityThreshold(options.identityThreshold)
        self.setBothSequences(options.bothSeq)
        self.setNbCPUs(options.nbCPUs)
        self.setIsHeaderForTEdenovo(options.isHeaderForTEdenovo)
        if options.filterUnclusteredSeq:
            self.setFilterUnclusteredSequences()
        if options.outFilePrefix != "":
            self.setOutputFilePrefix(options.outFilePrefix)
        else:
            self._outFilePrefix = self._inFileName
        if options.isBlast2Map:
            self.setIsBlastToMap()
        if options.clean:
            self.setClean()
        self.setVerbosityLevel(options.verbose)
                
    def setInputFileName(self , inFileName):
        self._inFileName = inFileName
        
    def setCoverageThreshold(self, lengthThresh):
        self._coverageThreshold = float(lengthThresh)
                
    def setIdentityThreshold(self, identityThresh):
        self._identityThreshold = int(identityThresh)
        
    def setBothSequences(self, bothSeq):
        self._bothSeq = bothSeq
        
    def setNbCPUs(self, nbCPUs):
        self._nbCPUs = int(nbCPUs)
        
    def setFilterUnclusteredSequences(self):
        self._filterUnclusteredSeq = True
        
    def setOutputFilePrefix(self, outFilePrefix):
        self._outFilePrefix = outFilePrefix
        
    def setIsBlastToMap(self):
        self._isBlastToMap = True
        
    def setIsHeaderForTEdenovo(self, isHeaderForTEdenovo):
        self._isHeaderForTEdenovo = isHeaderForTEdenovo
        
    def setClean(self):
        self._clean = True
        
    def setVerbosityLevel(self, verbose):
        self._verbose = int(verbose)
    
    def setTmpFileName(self, tmpFileName):    
        self._tmpFileName = tmpFileName
        
        
    def checkAttributes(self):
        """
        Check the attributes are valid before running the algorithm.
        """
        if self._inFileName == "":
            print "ERROR: missing input file name (-i)"
            sys.exit(1)
        if self._outFilePrefix == "":
            self._outFilePrefix = self._inFileName
        self._tmpFileName = "%s_blastclust.txt" % (self._outFilePrefix)
        
        
    def launchBlastclust(self, inFile):
        """
        Launch Blastclust in command-line.
        """
        if os.path.exists(os.path.basename(inFile)):
            inFile = os.path.basename(inFile)
        prg = "blastclust"
        cmd = prg
        cmd += " -i %s" % (inFile)
        cmd += " -o %s" % (self._tmpFileName)
        cmd += " -S %i" % (self._identityThreshold)
        cmd += " -L %f" % (self._coverageThreshold)
        cmd += " -b %s" % (self._bothSeq)
        cmd += " -p F"
        cmd += " -a %i" % (self._nbCPUs)
        if self._verbose == 0:
            cmd += " -v blastclust.log"
        if self._verbose > 0:
            print cmd
            sys.stdout.flush()
        process = subprocess.Popen(cmd, shell = True)
        process.communicate()
        if process.returncode != 0:
            raise Exception("ERROR when launching '%s'" % cmd)
        if self._clean and os.path.exists("error.log"):
            os.remove("error.log")
        if self._clean and os.path.exists("blastclust.log"):
            os.remove("blastclust.log")
            
            
    def getClustersFromTxtFile(self):
        """
        Return a dictionary with cluster IDs as keys and sequence headers as values.
        """
        dClusterId2SeqHeaders = {}
        inF = open(self._tmpFileName, "r")
        line = inF.readline()
        clusterId = 1
        while True:
            if line == "":
                break
            tokens = line[:-1].split(" ")
            dClusterId2SeqHeaders[clusterId] = []
            for seqHeader in tokens:
                if seqHeader != "":
                    dClusterId2SeqHeaders[clusterId].append(seqHeader)
            line = inF.readline()
            clusterId += 1
        inF.close()
        if self._verbose > 0:
            print "nb of clusters: %i" % (len(dClusterId2SeqHeaders.keys()))
            sys.stdout.flush()
        return dClusterId2SeqHeaders
    
    
    def filterUnclusteredSequences(self, dClusterId2SeqHeaders):
        """
        Filter clusters having only one sequence.
        """
        for clusterId in dClusterId2SeqHeaders.keys():
            if len(dClusterId2SeqHeaders[clusterId]) == 1:
                del dClusterId2SeqHeaders[clusterId]
        if self._verbose > 0:
            print "nb of clusters (>1seq): %i" % (len(dClusterId2SeqHeaders.keys()))
            sys.stdout.flush()
        return dClusterId2SeqHeaders
    
    
    def getClusteringResultsInFasta(self, inFile):
        """
        Write a fasta file whose sequence headers contain the clustering IDs.
        """
        dClusterId2SeqHeaders = self.getClustersFromTxtFile()
        if self._filterUnclusteredSeq:
            dClusterId2SeqHeaders = self.filterUnclusteredSequences(dClusterId2SeqHeaders)
        inDB = BioseqDB(inFile)
        outFileName = "%s_Blastclust.fa" % (inFile)
        outF = open(outFileName, "w")
        for clusterId in dClusterId2SeqHeaders.keys():
            memberId = 1
            for seqHeader in dClusterId2SeqHeaders[clusterId]:
                bs = inDB.fetch(seqHeader)
                bs.header = "BlastclustCluster%iMb%i_%s" % (clusterId, memberId, seqHeader)
                bs.write(outF)
                memberId += 1
        outF.close()
        
        
    def getLinkInitNewHeaders(self):
        dNew2Init = {}
        linkFileName = "%s.shortHlink" % (self._inFileName)
        linkFile = open(linkFileName,"r")
        line = linkFile.readline()
        while True:
            if line == "":
                break
            data = line.split("\t")
            dNew2Init[data[0]] = data[1]
            line = linkFile.readline()
        linkFile.close()
        return dNew2Init
    
    
    def retrieveInitHeaders(self, dNewH2InitH):
        tmpFaFile = "%s.shortH_Blastclust.fa" % (self._inFileName)
        tmpFaFileHandler = open(tmpFaFile, "r")
        outFaFile = "%s_Blastclust.fa" % (self._outFilePrefix)
        outFaFileHandler = open(outFaFile, "w")
        while True:
            line = tmpFaFileHandler.readline()
            if line == "":
                break
            if line[0] == ">":
                tokens = line[1:-1].split("_")
                initHeader = dNewH2InitH[tokens[1]]
                if self._isHeaderForTEdenovo:
                    classif = initHeader.split("_")[0]
                    consensusName = "_".join(initHeader.split("_")[1:])
                    clusterId = tokens[0].split("Cluster")[1].split("Mb")[0]
                    newHeader = "%s_Blc%s_%s" % (classif, clusterId, consensusName)
                else:
                    newHeader = "%s_%s" % (tokens[0], initHeader)
                outFaFileHandler.write(">%s\n" % (newHeader))
            else:
                outFaFileHandler.write(line)
        tmpFaFileHandler.close()
        outFaFileHandler.close()
        if self._clean:
            os.remove(tmpFaFile)


    def blastclustToMap(self, blastclustFastaOut):
        """
        Write a map file from blastclust fasta output.
        Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output.
        """
        fileDb = open(blastclustFastaOut , "r")
        mapFilename = "%s.map" % (os.path.splitext(blastclustFastaOut)[0])
        fileMap = open(mapFilename, "w")
        seq = Bioseq()
        numseq = 0
        while 1:
            seq.read(fileDb)
            if seq.sequence == None:
                break
            numseq = numseq + 1
            ID = seq.header.split(' ')[0].split('_')[0]
            chunk = seq.header.split(' ')[0].split('_')[1]
            start = seq.header.split(' ')[-1].split(',')[0][1:]
            end = seq.header.split(' ')[-1].split(',')[1][:-1]
            line= '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
            fileMap.write(line + "\n")
    
        fileDb.close()
        fileMap.close()
        print "saved in %s" % mapFilename 
        
        
    def start(self):
        """
        Useful commands before running the program.
        """
        self.checkAttributes()
        if self._verbose > 0:
            print "START %s" % (type(self).__name__)
            
            
    def end(self):
        """
        Useful commands before ending the program.
        """
        if self._verbose > 0:
            print "END %s" % (type(self).__name__)
            
            
    def run(self):
        """
        Run the program.
        """
        self.start()
        
        iCSH = ChangeSequenceHeaders(inFile = self._inFileName, format = "fasta", step = 1, outFile = "%s.shortH" % self._inFileName, linkFile = "%s.shortHlink" % self._inFileName)
        iCSH.run()
        
        self.launchBlastclust("%s.shortH" % (self._inFileName))
        
        self.getClusteringResultsInFasta("%s.shortH" % (self._inFileName))
        
        dNewH2InitH = self.getLinkInitNewHeaders()
        self.retrieveInitHeaders(dNewH2InitH)
        
        if self._isBlastToMap:
            blastclustFileName = "%s_Blastclust.fa" % (self._outFilePrefix)
            self.blastclustToMap(blastclustFileName)
        
        if self._clean:
            os.remove("%s.shortH" % (self._inFileName))
            os.remove("%s.shortHlink" % (self._inFileName))
        
        self.end()
        
if __name__ == "__main__":
    i = LaunchBlastclust()
    i.setAttributesFromCmdLine()
    i.run()