diff commons/launcher/LaunchTallymer.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/launcher/LaunchTallymer.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,328 @@
+#!/usr/bin/env python
+
+"""
+Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data
+"""
+
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+import os
+import shutil
+import subprocess
+import time
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from commons.core.LoggerFactory import LoggerFactory
+from SMART.Java.Python.convertTranscriptFile import ConvertTranscriptFile
+from commons.core.seq.BioseqUtils import BioseqUtils
+from commons.core.seq.BioseqDB import BioseqDB
+from Tallymer_pipe.PlotBenchMarkGFFFiles import PlotBenchMarkGFFFiles
+
+LOG_DEPTH = "repet.tools"
+
+
+class LaunchTallymer(object):
+    """
+    Launch Tallymer's sub programs, generate map file, and convert output to
+    gff and wig, as well as visual (RPlot) data
+    """
+    
+    _lValidFormats = ["gff", "gff3", "wig", "bed", "map"]
+    
+    def __init__(self, inputFasta="", indexationFasta=None, merSize=20, minOccs=4, outputFormats="gff", nLargestScaffoldsToPlot=0, clean=False, verbosity=0):
+        self.inputFasta = inputFasta
+        self.indexationFasta = indexationFasta if indexationFasta != None else inputFasta 
+        self.merSize = merSize
+        self.minOccs = minOccs
+        self.outputFormats = outputFormats
+        self.nLargestScaffoldsToPlot = nLargestScaffoldsToPlot
+        self.doClean = clean
+        self.verbosity = verbosity
+        
+        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
+        self._workdir = os.path.join(os.getcwd(), "launchTallymer_%s" % time.strftime("%Y%m%d%H%M%S"))
+        self._tmpSearchFileName = None
+        self._tmpMapFileName = None
+        self._tmpStatFileName = None   
+        self._tmpPngFileName = None
+        self._plot_data = {}
+        self._plot_data2 = {}
+        
+    def setAttributesFromCmdLine(self):
+        description = "Generates stats from the results of the tallymer search ."    
+        parser = RepetOptionParser(description=description)
+        parser.add_option("-i", "--inputFasta", dest="inputFasta", default = "",  action="store", type="string", help="input fasta file [compulsory] [format: fasta]")
+        parser.add_option("-u", "--indexationFasta", dest="indexationFasta", default = "",  action="store", type="string", help="input indexation fasta file used to generate kmer index (defaults to input fasta)[optional] [format: fasta]")
+        parser.add_option("-s", "--merSize", dest="merSize", default = 20,  action="store", type="int", help="input merSize [optional][default:20]")
+        parser.add_option("-m", "--minOccs", dest="minOccs", default = 4,  action="store", type="int", help="input minimal kmer occurence count [default:4]")
+        parser.add_option("-f", "--outputFormats", dest="outputFormats", default = "gff",  action="store", type="string", help="comma separated list of output file formats (can  be %s) [optional] [default:gff]" % ", ".join(self._lValidFormats))
+        parser.add_option("-n", "--nLargestScaffoldsToPlot",default = 0, type="int", action="store", dest = "nLargestScaffoldsToPlot", help = "generate graph of Kmer repartition along the input sequence for the n biggest scaffolds")
+        parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true")
+        parser.add_option("-v", "--verbosity", dest="verbosity", default = 1,  action="store", type="int", help="verbosity [optional] ")
+        options, args = parser.parse_args()
+        self._setAttributesFromOptions(options)
+        
+    def _setAttributesFromOptions(self, options):
+        self.inputFasta = options.inputFasta
+        self.indexationFasta = options.indexationFasta if options.indexationFasta != "" else options.inputFasta
+        self.merSize = options.merSize
+        self.minOccs = options.minOccs
+        self.outputFormats = options.outputFormats
+        self.nLargestScaffoldsToPlot = options.nLargestScaffoldsToPlot
+        self.doClean = options.clean
+        self.verbosity = options.verbosity
+        
+    def _checkOptions(self):
+        if self.inputFasta == "":
+            self._logAndRaise("Error: missing input fasta file")
+        if self.merSize < 1:
+            self._logAndRaise("Error: invalid kmer size '%i'; must be a positive integer" % self.merSize)
+            
+        self.outputFormats = self.outputFormats.lower().split(',')
+        sOutFormats = set(self.outputFormats)
+        sValidFormats = set(self._lValidFormats)
+        lInvalidFormats = list(sOutFormats - sValidFormats)
+        self.outputFormats = list(sValidFormats.intersection(sOutFormats))
+        if lInvalidFormats:
+            self._log.warning("Warning: ignoring invalid formats: <%s>" % " ".join(lInvalidFormats))
+        if not self.outputFormats:
+            self._logAndRaise("Error: no valid output formats specified")
+
+    def _logAndRaise(self, errorMsg):
+        self._log.error(errorMsg)
+        raise Exception(errorMsg)
+    
+    def clean(self):
+        try:
+            shutil.rmtree(self._workdir)
+        except Exception as inst: 
+            self._log.error(inst)
+                    
+    def run(self):
+        LoggerFactory.setLevel(self._log, self.verbosity)
+        self._checkOptions()
+        self._log.debug("Input fasta file: %s; K-mer size: %s; Output formats: %s; Cleaning: %s" % (self.inputFasta, self.merSize, str(self.outputFormats), self.doClean))
+        try:
+            os.makedirs(self._workdir)
+        except:pass
+        
+        srcPath = os.path.abspath(self.inputFasta)
+        dstPath = os.path.join(self._workdir,os.path.basename(self.inputFasta))
+        os.symlink(srcPath, dstPath)
+        
+        if (self.indexationFasta !=  self.inputFasta):
+            srcPath = os.path.abspath(self.indexationFasta)
+            dstPath = os.path.join(self._workdir,os.path.basename(self.indexationFasta))
+            try:
+                os.symlink(srcPath, dstPath)
+            except OSError as inst:
+                pass
+        
+        os.chdir(self._workdir)
+        
+        if (self.indexationFasta !=  self.inputFasta):
+            self.indexationFasta = os.path.basename(self.indexationFasta)
+        else:
+            self.indexationFasta = os.path.basename(self.inputFasta)
+        self.inputFasta = os.path.basename(self.inputFasta)
+        
+        self._tmpSearchFileName = "%s.tallymer" % os.path.splitext(os.path.basename(self.inputFasta))[0]
+        self._tmpMapFileName = "%s_tmp.map" % self._tmpSearchFileName
+        self._tmpStatFileName = "%s.stat" % self._tmpSearchFileName
+        self._tmpPngFileName = "%s.png" % self._tmpSearchFileName
+        
+        
+            
+        
+        
+        self._runTallymerSearch()
+        self._convertTallymerToMap()
+        self._writeOutputFiles()
+            
+        if self.nLargestScaffoldsToPlot > 0:
+            self._doPlot()
+            shutil.copy2(self._tmpPngFileName, "../.")
+        
+        shutil.copy2(self._tmpStatFileName, "../.")
+        os.chdir("..")
+        
+        if self.doClean:
+            self.clean()
+
+    def _runTallymerSearch(self):
+        self._log.info("Starting to run tallymer search of sequence %s " % (self.inputFasta))
+        self._indexInputFastaFile()
+        self._countAndIndexKmersForGivenK()
+        self._searchKmersListInTallymerIndex()
+        self._log.info("Finished running tallymer scan of sequence %s " % (self.inputFasta))
+        
+    def _convertTallymerToMap(self):
+        self._log.info("Starting to run tallymer search to map conversion")
+        totalNbOcc, dKmer2Occ, self._plot_data, self._plot_data2 = ConvertUtils.convertTallymerFormatIntoMapFormatAndGenerateData(self.inputFasta, self._tmpSearchFileName, self._tmpMapFileName)
+        self._log.debug("totalNbOcc=%i" % totalNbOcc)
+        self._writeOccurencesNbAndFrequencyOfKmers(totalNbOcc, dKmer2Occ)        
+        self._log.info("Finished tallymer search to map conversion")
+    
+    def _doPlot(self):
+        iBioseqDB = BioseqDB()
+        iBioseqDB.load(self.inputFasta)
+        largest_seqsDb = iBioseqDB.bestLength(self.nLargestScaffoldsToPlot)
+        
+        for seq in largest_seqsDb.db:
+            headerCleaned = seq.header.replace(" ", "_")
+            shortFastaName = "%s_%s" % (os.path.basename(self.inputFasta),headerCleaned)
+            data = self._plot_data2[seq.header]
+            gffPlotter = PlotBenchMarkGFFFiles(yLabel="Number of Kmer Occurences", maxLength=1, title="Kmer repartition along the input sequence: %s; MerSize: %i" % (shortFastaName, self.merSize) )
+            gffPlotter.setOutFileName("%s_%s.png" % (self._tmpSearchFileName, headerCleaned))
+            gffPlotter._title = "Mers along the input sequence: %s MerSize: %i" % (shortFastaName, self.merSize) 
+            gffPlotter._xLabel = "Coordinates along the input sequence (%s)" % shortFastaName
+            gffPlotter._rawData = data
+            gffPlotter.run()
+
+    def _indexInputFastaFile(self):
+        self._log.debug("index the input fasta file: get an enhanced suffix array.")
+        cmd = "gt suffixerator -dna -pl -tis -suf -lcp -v -db %s -indexname %s.reads" % (self.indexationFasta, self.indexationFasta)
+        process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        self._log.debug("Running suffixerator with following params %s" % cmd)
+        output = process.communicate()
+        self._log.debug("Suffixerator Output:\n%s" % output[0])
+        if process.returncode != 0:
+            self._logAndRaise("Error in generating enhanced suffix array in %s" % self.indexationFasta)
+
+    def _countAndIndexKmersForGivenK(self):
+        self._log.debug("Counting and indexing k-mers for k = %i " % self.merSize)
+        cmd = "gt tallymer mkindex -mersize %i -minocc %i -indexname %s.tyr-reads -counts -pl -esa %s.reads" % (self.merSize, self.minOccs, self.indexationFasta, self.indexationFasta)
+        process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        self._log.debug("Running tallymer mkindex with following params %s" % cmd)
+        output = process.communicate()
+        self._log.debug("Tallymer mkindex Output:\n%s" % output[0])
+        if process.returncode != 0:
+            self._logAndRaise("Error in indexing kmers in %s.reads" % self.indexationFasta)
+
+    def _searchKmersListInTallymerIndex(self):
+        self._log.debug("Searching list of kmers in tallymer-index ")
+        cmd = "gt tallymer search -output qseqnum qpos counts sequence -tyr %s.tyr-reads -q %s" % (self.indexationFasta, self.inputFasta)
+        process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        self._log.debug("Running tallymer search with following params %s" % cmd)
+        output = process.communicate()
+        self._log.debug("Tallymer search Output:\n%s" % output[0])
+        if process.returncode != 0:
+            self._logAndRaise("Error in searching for kmers in %s.tyr-reads" % self.indexationFasta)
+        tmpOutputFile = open(self._tmpSearchFileName,'w')
+        tmpOutputFile.write(output[0])
+        tmpOutputFile.close()
+
+    def _writeOccurencesNbAndFrequencyOfKmers(self, totalNbOcc, dKmer2Occ):
+        with open(self._tmpStatFileName, "w") as statFile:
+            statFile.write("kmer\tocc\tfreq\n")
+            for kmer in dKmer2Occ.keys():
+                statFile.write("%s\t%i\t%.10f\n" % (kmer, dKmer2Occ[kmer], dKmer2Occ[kmer] / float(totalNbOcc)))
+
+    def _writeOutputFiles(self):
+        for format in self.outputFormats:
+            self._log.info("Generating %s file" % format)
+            outputFileName = "%s.tallymer.%s" % (os.path.splitext(self.inputFasta)[0], format)
+            try:
+                iConvertTranscriptFile = ConvertTranscriptFile(inputFileName=self._tmpMapFileName, name="Tallymer",\
+                         inputFormat="map", outputFileName=outputFileName, outputFormat=format,feature= "Match", featurePart="Match-part", verbosity=0) #self.verbosity
+                iConvertTranscriptFile.run()
+            except Exception as inst:
+                self._log.error("Error: %s - Failed to generate %s format ouput, skipping" % (inst, format))
+            shutil.copy2(outputFileName, "../.")
+
+
+class ConvertUtils(object):
+    
+    def convertTallymerFormatIntoMapFormatAndGenerateData(fastaFileName, searchFileName, mapFileName):
+        dIndex2NameLengthList = ConvertUtils._createDictOfNameLengthAccordingToSeqOrder(fastaFileName)
+        plotData = {}
+        plotData2 = {}
+        with open(searchFileName, "r") as talFile:
+            with open(mapFileName, "w") as mapFile:
+                totalNbOcc = 0
+                dKmer2Occ = {}
+                line = talFile.readline()
+                while line:
+                    data = line[:-1].split("\t")
+                    name = "%s_%s" % (data[3], data[2])
+                    nbOccs = int(data[2])
+                    chrname = dIndex2NameLengthList[int(data[0])][0]
+                    if data[1][0] == "+":
+                        start = int(data[1][1:]) + 1
+                        end = start + len(data[3])
+                    elif data[1][0] == "-":
+                        start_revcomp = int(data[1][1:])
+                        start = dIndex2NameLengthList[int(data[0])][1] - start_revcomp - 1
+                        end = end - len(data[3]) + 1
+                    mapLine = "%s\t%s\t%s\t%s\t%i\n" % (name, chrname, start, end, nbOccs)
+                    mapFile.write(mapLine)
+                    
+                    if plotData2.get(chrname,None) is None:
+                        plotData2[chrname] = {}
+                    if plotData2[chrname].get(start, None) is None:
+                        plotData2[chrname][start]=0
+                    plotData2[chrname][start] += nbOccs
+                    
+                    totalNbOcc += 1
+                    if dKmer2Occ.has_key(data[3]):
+                        dKmer2Occ[data[3]] += 1
+                    else:
+                        dKmer2Occ[data[3]] = 1
+                    plotData[start] = nbOccs
+                    line = talFile.readline()
+        return totalNbOcc, dKmer2Occ, plotData, plotData2
+    
+    convertTallymerFormatIntoMapFormatAndGenerateData = staticmethod(convertTallymerFormatIntoMapFormatAndGenerateData)
+    
+    def _createDictOfNameLengthAccordingToSeqOrder(fastaFileName):
+        with open(fastaFileName) as fastaFile:
+            line = fastaFile.readline()
+            i = 0
+            length = 0
+            dIndex2Name = {}
+            while line:
+                if line[0] == ">":
+                    dIndex2Name[i] = [line[1:-1]]
+                    if i > 0:
+                        dIndex2Name[i - 1].append(length)
+                        length = 0
+                    i += 1
+                else:
+                    length += len(line[:-1])
+                line = fastaFile.readline()
+            dIndex2Name[i - 1].append(length)
+        return dIndex2Name
+    
+    _createDictOfNameLengthAccordingToSeqOrder = staticmethod(_createDictOfNameLengthAccordingToSeqOrder)
+
+if __name__ == "__main__":
+    iLaunchTallymer = LaunchTallymer()
+    iLaunchTallymer.setAttributesFromCmdLine()
+    iLaunchTallymer.run()        
\ No newline at end of file