view commons/launcher/LaunchTallymer.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
line wrap: on
line source

#!/usr/bin/env python

"""
Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data
"""

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import os
import shutil
import subprocess
import time
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.LoggerFactory import LoggerFactory
from SMART.Java.Python.convertTranscriptFile import ConvertTranscriptFile
from commons.core.seq.BioseqUtils import BioseqUtils
from commons.core.seq.BioseqDB import BioseqDB
from Tallymer_pipe.PlotBenchMarkGFFFiles import PlotBenchMarkGFFFiles

LOG_DEPTH = "repet.tools"


class LaunchTallymer(object):
    """
    Launch Tallymer's sub programs, generate map file, and convert output to
    gff and wig, as well as visual (RPlot) data
    """
    
    _lValidFormats = ["gff", "gff3", "wig", "bed", "map"]
    
    def __init__(self, inputFasta="", indexationFasta=None, merSize=20, minOccs=4, outputFormats="gff", nLargestScaffoldsToPlot=0, clean=False, verbosity=0):
        self.inputFasta = inputFasta
        self.indexationFasta = indexationFasta if indexationFasta != None else inputFasta 
        self.merSize = merSize
        self.minOccs = minOccs
        self.outputFormats = outputFormats
        self.nLargestScaffoldsToPlot = nLargestScaffoldsToPlot
        self.doClean = clean
        self.verbosity = verbosity
        
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
        self._workdir = os.path.join(os.getcwd(), "launchTallymer_%s" % time.strftime("%Y%m%d%H%M%S"))
        self._tmpSearchFileName = None
        self._tmpMapFileName = None
        self._tmpStatFileName = None   
        self._tmpPngFileName = None
        self._plot_data = {}
        self._plot_data2 = {}
        
    def setAttributesFromCmdLine(self):
        description = "Generates stats from the results of the tallymer search ."    
        parser = RepetOptionParser(description=description)
        parser.add_option("-i", "--inputFasta", dest="inputFasta", default = "",  action="store", type="string", help="input fasta file [compulsory] [format: fasta]")
        parser.add_option("-u", "--indexationFasta", dest="indexationFasta", default = "",  action="store", type="string", help="input indexation fasta file used to generate kmer index (defaults to input fasta)[optional] [format: fasta]")
        parser.add_option("-s", "--merSize", dest="merSize", default = 20,  action="store", type="int", help="input merSize [optional][default:20]")
        parser.add_option("-m", "--minOccs", dest="minOccs", default = 4,  action="store", type="int", help="input minimal kmer occurence count [default:4]")
        parser.add_option("-f", "--outputFormats", dest="outputFormats", default = "gff",  action="store", type="string", help="comma separated list of output file formats (can  be %s) [optional] [default:gff]" % ", ".join(self._lValidFormats))
        parser.add_option("-n", "--nLargestScaffoldsToPlot",default = 0, type="int", action="store", dest = "nLargestScaffoldsToPlot", help = "generate graph of Kmer repartition along the input sequence for the n biggest scaffolds")
        parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true")
        parser.add_option("-v", "--verbosity", dest="verbosity", default = 1,  action="store", type="int", help="verbosity [optional] ")
        options, args = parser.parse_args()
        self._setAttributesFromOptions(options)
        
    def _setAttributesFromOptions(self, options):
        self.inputFasta = options.inputFasta
        self.indexationFasta = options.indexationFasta if options.indexationFasta != "" else options.inputFasta
        self.merSize = options.merSize
        self.minOccs = options.minOccs
        self.outputFormats = options.outputFormats
        self.nLargestScaffoldsToPlot = options.nLargestScaffoldsToPlot
        self.doClean = options.clean
        self.verbosity = options.verbosity
        
    def _checkOptions(self):
        if self.inputFasta == "":
            self._logAndRaise("Error: missing input fasta file")
        if self.merSize < 1:
            self._logAndRaise("Error: invalid kmer size '%i'; must be a positive integer" % self.merSize)
            
        self.outputFormats = self.outputFormats.lower().split(',')
        sOutFormats = set(self.outputFormats)
        sValidFormats = set(self._lValidFormats)
        lInvalidFormats = list(sOutFormats - sValidFormats)
        self.outputFormats = list(sValidFormats.intersection(sOutFormats))
        if lInvalidFormats:
            self._log.warning("Warning: ignoring invalid formats: <%s>" % " ".join(lInvalidFormats))
        if not self.outputFormats:
            self._logAndRaise("Error: no valid output formats specified")

    def _logAndRaise(self, errorMsg):
        self._log.error(errorMsg)
        raise Exception(errorMsg)
    
    def clean(self):
        try:
            shutil.rmtree(self._workdir)
        except Exception as inst: 
            self._log.error(inst)
                    
    def run(self):
        LoggerFactory.setLevel(self._log, self.verbosity)
        self._checkOptions()
        self._log.debug("Input fasta file: %s; K-mer size: %s; Output formats: %s; Cleaning: %s" % (self.inputFasta, self.merSize, str(self.outputFormats), self.doClean))
        try:
            os.makedirs(self._workdir)
        except:pass
        
        srcPath = os.path.abspath(self.inputFasta)
        dstPath = os.path.join(self._workdir,os.path.basename(self.inputFasta))
        os.symlink(srcPath, dstPath)
        
        if (self.indexationFasta !=  self.inputFasta):
            srcPath = os.path.abspath(self.indexationFasta)
            dstPath = os.path.join(self._workdir,os.path.basename(self.indexationFasta))
            try:
                os.symlink(srcPath, dstPath)
            except OSError as inst:
                pass
        
        os.chdir(self._workdir)
        
        if (self.indexationFasta !=  self.inputFasta):
            self.indexationFasta = os.path.basename(self.indexationFasta)
        else:
            self.indexationFasta = os.path.basename(self.inputFasta)
        self.inputFasta = os.path.basename(self.inputFasta)
        
        self._tmpSearchFileName = "%s.tallymer" % os.path.splitext(os.path.basename(self.inputFasta))[0]
        self._tmpMapFileName = "%s_tmp.map" % self._tmpSearchFileName
        self._tmpStatFileName = "%s.stat" % self._tmpSearchFileName
        self._tmpPngFileName = "%s.png" % self._tmpSearchFileName
        
        
            
        
        
        self._runTallymerSearch()
        self._convertTallymerToMap()
        self._writeOutputFiles()
            
        if self.nLargestScaffoldsToPlot > 0:
            self._doPlot()
            shutil.copy2(self._tmpPngFileName, "../.")
        
        shutil.copy2(self._tmpStatFileName, "../.")
        os.chdir("..")
        
        if self.doClean:
            self.clean()

    def _runTallymerSearch(self):
        self._log.info("Starting to run tallymer search of sequence %s " % (self.inputFasta))
        self._indexInputFastaFile()
        self._countAndIndexKmersForGivenK()
        self._searchKmersListInTallymerIndex()
        self._log.info("Finished running tallymer scan of sequence %s " % (self.inputFasta))
        
    def _convertTallymerToMap(self):
        self._log.info("Starting to run tallymer search to map conversion")
        totalNbOcc, dKmer2Occ, self._plot_data, self._plot_data2 = ConvertUtils.convertTallymerFormatIntoMapFormatAndGenerateData(self.inputFasta, self._tmpSearchFileName, self._tmpMapFileName)
        self._log.debug("totalNbOcc=%i" % totalNbOcc)
        self._writeOccurencesNbAndFrequencyOfKmers(totalNbOcc, dKmer2Occ)        
        self._log.info("Finished tallymer search to map conversion")
    
    def _doPlot(self):
        iBioseqDB = BioseqDB()
        iBioseqDB.load(self.inputFasta)
        largest_seqsDb = iBioseqDB.bestLength(self.nLargestScaffoldsToPlot)
        
        for seq in largest_seqsDb.db:
            headerCleaned = seq.header.replace(" ", "_")
            shortFastaName = "%s_%s" % (os.path.basename(self.inputFasta),headerCleaned)
            data = self._plot_data2[seq.header]
            gffPlotter = PlotBenchMarkGFFFiles(yLabel="Number of Kmer Occurences", maxLength=1, title="Kmer repartition along the input sequence: %s; MerSize: %i" % (shortFastaName, self.merSize) )
            gffPlotter.setOutFileName("%s_%s.png" % (self._tmpSearchFileName, headerCleaned))
            gffPlotter._title = "Mers along the input sequence: %s MerSize: %i" % (shortFastaName, self.merSize) 
            gffPlotter._xLabel = "Coordinates along the input sequence (%s)" % shortFastaName
            gffPlotter._rawData = data
            gffPlotter.run()

    def _indexInputFastaFile(self):
        self._log.debug("index the input fasta file: get an enhanced suffix array.")
        cmd = "gt suffixerator -dna -pl -tis -suf -lcp -v -db %s -indexname %s.reads" % (self.indexationFasta, self.indexationFasta)
        process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self._log.debug("Running suffixerator with following params %s" % cmd)
        output = process.communicate()
        self._log.debug("Suffixerator Output:\n%s" % output[0])
        if process.returncode != 0:
            self._logAndRaise("Error in generating enhanced suffix array in %s" % self.indexationFasta)

    def _countAndIndexKmersForGivenK(self):
        self._log.debug("Counting and indexing k-mers for k = %i " % self.merSize)
        cmd = "gt tallymer mkindex -mersize %i -minocc %i -indexname %s.tyr-reads -counts -pl -esa %s.reads" % (self.merSize, self.minOccs, self.indexationFasta, self.indexationFasta)
        process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self._log.debug("Running tallymer mkindex with following params %s" % cmd)
        output = process.communicate()
        self._log.debug("Tallymer mkindex Output:\n%s" % output[0])
        if process.returncode != 0:
            self._logAndRaise("Error in indexing kmers in %s.reads" % self.indexationFasta)

    def _searchKmersListInTallymerIndex(self):
        self._log.debug("Searching list of kmers in tallymer-index ")
        cmd = "gt tallymer search -output qseqnum qpos counts sequence -tyr %s.tyr-reads -q %s" % (self.indexationFasta, self.inputFasta)
        process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self._log.debug("Running tallymer search with following params %s" % cmd)
        output = process.communicate()
        self._log.debug("Tallymer search Output:\n%s" % output[0])
        if process.returncode != 0:
            self._logAndRaise("Error in searching for kmers in %s.tyr-reads" % self.indexationFasta)
        tmpOutputFile = open(self._tmpSearchFileName,'w')
        tmpOutputFile.write(output[0])
        tmpOutputFile.close()

    def _writeOccurencesNbAndFrequencyOfKmers(self, totalNbOcc, dKmer2Occ):
        with open(self._tmpStatFileName, "w") as statFile:
            statFile.write("kmer\tocc\tfreq\n")
            for kmer in dKmer2Occ.keys():
                statFile.write("%s\t%i\t%.10f\n" % (kmer, dKmer2Occ[kmer], dKmer2Occ[kmer] / float(totalNbOcc)))

    def _writeOutputFiles(self):
        for format in self.outputFormats:
            self._log.info("Generating %s file" % format)
            outputFileName = "%s.tallymer.%s" % (os.path.splitext(self.inputFasta)[0], format)
            try:
                iConvertTranscriptFile = ConvertTranscriptFile(inputFileName=self._tmpMapFileName, name="Tallymer",\
                         inputFormat="map", outputFileName=outputFileName, outputFormat=format,feature= "Match", featurePart="Match-part", verbosity=0) #self.verbosity
                iConvertTranscriptFile.run()
            except Exception as inst:
                self._log.error("Error: %s - Failed to generate %s format ouput, skipping" % (inst, format))
            shutil.copy2(outputFileName, "../.")


class ConvertUtils(object):
    
    def convertTallymerFormatIntoMapFormatAndGenerateData(fastaFileName, searchFileName, mapFileName):
        dIndex2NameLengthList = ConvertUtils._createDictOfNameLengthAccordingToSeqOrder(fastaFileName)
        plotData = {}
        plotData2 = {}
        with open(searchFileName, "r") as talFile:
            with open(mapFileName, "w") as mapFile:
                totalNbOcc = 0
                dKmer2Occ = {}
                line = talFile.readline()
                while line:
                    data = line[:-1].split("\t")
                    name = "%s_%s" % (data[3], data[2])
                    nbOccs = int(data[2])
                    chrname = dIndex2NameLengthList[int(data[0])][0]
                    if data[1][0] == "+":
                        start = int(data[1][1:]) + 1
                        end = start + len(data[3])
                    elif data[1][0] == "-":
                        start_revcomp = int(data[1][1:])
                        start = dIndex2NameLengthList[int(data[0])][1] - start_revcomp - 1
                        end = end - len(data[3]) + 1
                    mapLine = "%s\t%s\t%s\t%s\t%i\n" % (name, chrname, start, end, nbOccs)
                    mapFile.write(mapLine)
                    
                    if plotData2.get(chrname,None) is None:
                        plotData2[chrname] = {}
                    if plotData2[chrname].get(start, None) is None:
                        plotData2[chrname][start]=0
                    plotData2[chrname][start] += nbOccs
                    
                    totalNbOcc += 1
                    if dKmer2Occ.has_key(data[3]):
                        dKmer2Occ[data[3]] += 1
                    else:
                        dKmer2Occ[data[3]] = 1
                    plotData[start] = nbOccs
                    line = talFile.readline()
        return totalNbOcc, dKmer2Occ, plotData, plotData2
    
    convertTallymerFormatIntoMapFormatAndGenerateData = staticmethod(convertTallymerFormatIntoMapFormatAndGenerateData)
    
    def _createDictOfNameLengthAccordingToSeqOrder(fastaFileName):
        with open(fastaFileName) as fastaFile:
            line = fastaFile.readline()
            i = 0
            length = 0
            dIndex2Name = {}
            while line:
                if line[0] == ">":
                    dIndex2Name[i] = [line[1:-1]]
                    if i > 0:
                        dIndex2Name[i - 1].append(length)
                        length = 0
                    i += 1
                else:
                    length += len(line[:-1])
                line = fastaFile.readline()
            dIndex2Name[i - 1].append(length)
        return dIndex2Name
    
    _createDictOfNameLengthAccordingToSeqOrder = staticmethod(_createDictOfNameLengthAccordingToSeqOrder)

if __name__ == "__main__":
    iLaunchTallymer = LaunchTallymer()
    iLaunchTallymer.setAttributesFromCmdLine()
    iLaunchTallymer.run()