s_mart: commons/launcher/LaunchTallymer.py comparison

comparison commons/launcher/LaunchTallymer.py @ 18:94ab73e8a190

Uploaded

author	m-zytnicki
date	Mon, 29 Apr 2013 03:20:15 -0400
parents
children

comparison

equal deleted inserted replaced

-:b0e8584489e6
+:94ab73e8a190
+#!/usr/bin/env python
+"""
+Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data
+"""
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and,  more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+import os
+import shutil
+import subprocess
+import time
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from commons.core.LoggerFactory import LoggerFactory
+from SMART.Java.Python.convertTranscriptFile import ConvertTranscriptFile
+from commons.core.seq.BioseqUtils import BioseqUtils
+from commons.core.seq.BioseqDB import BioseqDB
+from Tallymer_pipe.PlotBenchMarkGFFFiles import PlotBenchMarkGFFFiles
+LOG_DEPTH = "repet.tools"
+class LaunchTallymer(object):
+"""
+Launch Tallymer's sub programs, generate map file, and convert output to
+gff and wig, as well as visual (RPlot) data
+"""
+_lValidFormats = ["gff", "gff3", "wig", "bed", "map"]
+def __init__(self, inputFasta="", indexationFasta=None, merSize=20, minOccs=4, outputFormats="gff", nLargestScaffoldsToPlot=0, clean=False, verbosity=0):
+self.inputFasta = inputFasta
+self.indexationFasta = indexationFasta if indexationFasta != None else inputFasta
+self.merSize = merSize
+self.minOccs = minOccs
+self.outputFormats = outputFormats
+self.nLargestScaffoldsToPlot = nLargestScaffoldsToPlot
+self.doClean = clean
+self.verbosity = verbosity
+self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
+self._workdir = os.path.join(os.getcwd(), "launchTallymer_%s" % time.strftime("%Y%m%d%H%M%S"))
+self._tmpSearchFileName = None
+self._tmpMapFileName = None
+self._tmpStatFileName = None
+self._tmpPngFileName = None
+self._plot_data = {}
+self._plot_data2 = {}
+def setAttributesFromCmdLine(self):
+description = "Generates stats from the results of the tallymer search ."
+parser = RepetOptionParser(description=description)
+parser.add_option("-i", "--inputFasta", dest="inputFasta", default = "",  action="store", type="string", help="input fasta file [compulsory] [format: fasta]")
+parser.add_option("-u", "--indexationFasta", dest="indexationFasta", default = "",  action="store", type="string", help="input indexation fasta file used to generate kmer index (defaults to input fasta)[optional] [format: fasta]")
+parser.add_option("-s", "--merSize", dest="merSize", default = 20,  action="store", type="int", help="input merSize [optional][default:20]")
+parser.add_option("-m", "--minOccs", dest="minOccs", default = 4,  action="store", type="int", help="input minimal kmer occurence count [default:4]")
+parser.add_option("-f", "--outputFormats", dest="outputFormats", default = "gff",  action="store", type="string", help="comma separated list of output file formats (can  be %s) [optional] [default:gff]" % ", ".join(self._lValidFormats))
+parser.add_option("-n", "--nLargestScaffoldsToPlot",default = 0, type="int", action="store", dest = "nLargestScaffoldsToPlot", help = "generate graph of Kmer repartition along the input sequence for the n biggest scaffolds")
+parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true")
+parser.add_option("-v", "--verbosity", dest="verbosity", default = 1,  action="store", type="int", help="verbosity [optional] ")
+options, args = parser.parse_args()
+self._setAttributesFromOptions(options)
+def _setAttributesFromOptions(self, options):
+self.inputFasta = options.inputFasta
+self.indexationFasta = options.indexationFasta if options.indexationFasta != "" else options.inputFasta
+self.merSize = options.merSize
+self.minOccs = options.minOccs
+self.outputFormats = options.outputFormats
+self.nLargestScaffoldsToPlot = options.nLargestScaffoldsToPlot
+self.doClean = options.clean
+self.verbosity = options.verbosity
+def _checkOptions(self):
+if self.inputFasta == "":
+self._logAndRaise("Error: missing input fasta file")
+if self.merSize < 1:
+self._logAndRaise("Error: invalid kmer size '%i'; must be a positive integer" % self.merSize)
+self.outputFormats = self.outputFormats.lower().split(',')
+sOutFormats = set(self.outputFormats)
+sValidFormats = set(self._lValidFormats)
+lInvalidFormats = list(sOutFormats - sValidFormats)
+self.outputFormats = list(sValidFormats.intersection(sOutFormats))
+if lInvalidFormats:
+self._log.warning("Warning: ignoring invalid formats: <%s>" % " ".join(lInvalidFormats))
+if not self.outputFormats:
+self._logAndRaise("Error: no valid output formats specified")
+def _logAndRaise(self, errorMsg):
+self._log.error(errorMsg)
+raise Exception(errorMsg)
+def clean(self):
+try:
+shutil.rmtree(self._workdir)
+except Exception as inst:
+self._log.error(inst)
+def run(self):
+LoggerFactory.setLevel(self._log, self.verbosity)
+self._checkOptions()
+self._log.debug("Input fasta file: %s; K-mer size: %s; Output formats: %s; Cleaning: %s" % (self.inputFasta, self.merSize, str(self.outputFormats), self.doClean))
+try:
+os.makedirs(self._workdir)
+except:pass
+srcPath = os.path.abspath(self.inputFasta)
+dstPath = os.path.join(self._workdir,os.path.basename(self.inputFasta))
+os.symlink(srcPath, dstPath)
+if (self.indexationFasta !=  self.inputFasta):
+srcPath = os.path.abspath(self.indexationFasta)
+dstPath = os.path.join(self._workdir,os.path.basename(self.indexationFasta))
+try:
+os.symlink(srcPath, dstPath)
+except OSError as inst:
+pass
+os.chdir(self._workdir)
+if (self.indexationFasta !=  self.inputFasta):
+self.indexationFasta = os.path.basename(self.indexationFasta)
+else:
+self.indexationFasta = os.path.basename(self.inputFasta)
+self.inputFasta = os.path.basename(self.inputFasta)
+self._tmpSearchFileName = "%s.tallymer" % os.path.splitext(os.path.basename(self.inputFasta))[0]
+self._tmpMapFileName = "%s_tmp.map" % self._tmpSearchFileName
+self._tmpStatFileName = "%s.stat" % self._tmpSearchFileName
+self._tmpPngFileName = "%s.png" % self._tmpSearchFileName
+self._runTallymerSearch()
+self._convertTallymerToMap()
+self._writeOutputFiles()
+if self.nLargestScaffoldsToPlot > 0:
+self._doPlot()
+shutil.copy2(self._tmpPngFileName, "../.")
+shutil.copy2(self._tmpStatFileName, "../.")
+os.chdir("..")
+if self.doClean:
+self.clean()
+def _runTallymerSearch(self):
+self._log.info("Starting to run tallymer search of sequence %s " % (self.inputFasta))
+self._indexInputFastaFile()
+self._countAndIndexKmersForGivenK()
+self._searchKmersListInTallymerIndex()
+self._log.info("Finished running tallymer scan of sequence %s " % (self.inputFasta))
+def _convertTallymerToMap(self):
+self._log.info("Starting to run tallymer search to map conversion")
+totalNbOcc, dKmer2Occ, self._plot_data, self._plot_data2 = ConvertUtils.convertTallymerFormatIntoMapFormatAndGenerateData(self.inputFasta, self._tmpSearchFileName, self._tmpMapFileName)
+self._log.debug("totalNbOcc=%i" % totalNbOcc)
+self._writeOccurencesNbAndFrequencyOfKmers(totalNbOcc, dKmer2Occ)
+self._log.info("Finished tallymer search to map conversion")
+def _doPlot(self):
+iBioseqDB = BioseqDB()
+iBioseqDB.load(self.inputFasta)
+largest_seqsDb = iBioseqDB.bestLength(self.nLargestScaffoldsToPlot)
+for seq in largest_seqsDb.db:
+headerCleaned = seq.header.replace(" ", "_")
+shortFastaName = "%s_%s" % (os.path.basename(self.inputFasta),headerCleaned)
+data = self._plot_data2[seq.header]
+gffPlotter = PlotBenchMarkGFFFiles(yLabel="Number of Kmer Occurences", maxLength=1, title="Kmer repartition along the input sequence: %s; MerSize: %i" % (shortFastaName, self.merSize) )
+gffPlotter.setOutFileName("%s_%s.png" % (self._tmpSearchFileName, headerCleaned))
+gffPlotter._title = "Mers along the input sequence: %s MerSize: %i" % (shortFastaName, self.merSize)
+gffPlotter._xLabel = "Coordinates along the input sequence (%s)" % shortFastaName
+gffPlotter._rawData = data
+gffPlotter.run()
+def _indexInputFastaFile(self):
+self._log.debug("index the input fasta file: get an enhanced suffix array.")
+cmd = "gt suffixerator -dna -pl -tis -suf -lcp -v -db %s -indexname %s.reads" % (self.indexationFasta, self.indexationFasta)
+process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+self._log.debug("Running suffixerator with following params %s" % cmd)
+output = process.communicate()
+self._log.debug("Suffixerator Output:\n%s" % output[0])
+if process.returncode != 0:
+self._logAndRaise("Error in generating enhanced suffix array in %s" % self.indexationFasta)
+def _countAndIndexKmersForGivenK(self):
+self._log.debug("Counting and indexing k-mers for k = %i " % self.merSize)
+cmd = "gt tallymer mkindex -mersize %i -minocc %i -indexname %s.tyr-reads -counts -pl -esa %s.reads" % (self.merSize, self.minOccs, self.indexationFasta, self.indexationFasta)
+process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+self._log.debug("Running tallymer mkindex with following params %s" % cmd)
+output = process.communicate()
+self._log.debug("Tallymer mkindex Output:\n%s" % output[0])
+if process.returncode != 0:
+self._logAndRaise("Error in indexing kmers in %s.reads" % self.indexationFasta)
+def _searchKmersListInTallymerIndex(self):
+self._log.debug("Searching list of kmers in tallymer-index ")
+cmd = "gt tallymer search -output qseqnum qpos counts sequence -tyr %s.tyr-reads -q %s" % (self.indexationFasta, self.inputFasta)
+process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+self._log.debug("Running tallymer search with following params %s" % cmd)
+output = process.communicate()
+self._log.debug("Tallymer search Output:\n%s" % output[0])
+if process.returncode != 0:
+self._logAndRaise("Error in searching for kmers in %s.tyr-reads" % self.indexationFasta)
+tmpOutputFile = open(self._tmpSearchFileName,'w')
+tmpOutputFile.write(output[0])
+tmpOutputFile.close()
+def _writeOccurencesNbAndFrequencyOfKmers(self, totalNbOcc, dKmer2Occ):
+with open(self._tmpStatFileName, "w") as statFile:
+statFile.write("kmer\tocc\tfreq\n")
+for kmer in dKmer2Occ.keys():
+statFile.write("%s\t%i\t%.10f\n" % (kmer, dKmer2Occ[kmer], dKmer2Occ[kmer] / float(totalNbOcc)))
+def _writeOutputFiles(self):
+for format in self.outputFormats:
+self._log.info("Generating %s file" % format)
+outputFileName = "%s.tallymer.%s" % (os.path.splitext(self.inputFasta)[0], format)
+try:
+iConvertTranscriptFile = ConvertTranscriptFile(inputFileName=self._tmpMapFileName, name="Tallymer",\
+inputFormat="map", outputFileName=outputFileName, outputFormat=format,feature= "Match", featurePart="Match-part", verbosity=0) #self.verbosity
+iConvertTranscriptFile.run()
+except Exception as inst:
+self._log.error("Error: %s - Failed to generate %s format ouput, skipping" % (inst, format))
+shutil.copy2(outputFileName, "../.")
+class ConvertUtils(object):
+def convertTallymerFormatIntoMapFormatAndGenerateData(fastaFileName, searchFileName, mapFileName):
+dIndex2NameLengthList = ConvertUtils._createDictOfNameLengthAccordingToSeqOrder(fastaFileName)
+plotData = {}
+plotData2 = {}
+with open(searchFileName, "r") as talFile:
+with open(mapFileName, "w") as mapFile:
+totalNbOcc = 0
+dKmer2Occ = {}
+line = talFile.readline()
+while line:
+data = line[:-1].split("\t")
+name = "%s_%s" % (data[3], data[2])
+nbOccs = int(data[2])
+chrname = dIndex2NameLengthList[int(data[0])][0]
+if data[1][0] == "+":
+start = int(data[1][1:]) + 1
+end = start + len(data[3])
+elif data[1][0] == "-":
+start_revcomp = int(data[1][1:])
+start = dIndex2NameLengthList[int(data[0])][1] - start_revcomp - 1
+end = end - len(data[3]) + 1
+mapLine = "%s\t%s\t%s\t%s\t%i\n" % (name, chrname, start, end, nbOccs)
+mapFile.write(mapLine)
+if plotData2.get(chrname,None) is None:
+plotData2[chrname] = {}
+if plotData2[chrname].get(start, None) is None:
+plotData2[chrname][start]=0
+plotData2[chrname][start] += nbOccs
+totalNbOcc += 1
+if dKmer2Occ.has_key(data[3]):
+dKmer2Occ[data[3]] += 1
+else:
+dKmer2Occ[data[3]] = 1
+plotData[start] = nbOccs
+line = talFile.readline()
+return totalNbOcc, dKmer2Occ, plotData, plotData2
+convertTallymerFormatIntoMapFormatAndGenerateData = staticmethod(convertTallymerFormatIntoMapFormatAndGenerateData)
+def _createDictOfNameLengthAccordingToSeqOrder(fastaFileName):
+with open(fastaFileName) as fastaFile:
+line = fastaFile.readline()
+i = 0
+length = 0
+dIndex2Name = {}
+while line:
+if line[0] == ">":
+dIndex2Name[i] = [line[1:-1]]
+if i > 0:
+dIndex2Name[i - 1].append(length)
+length = 0
+i += 1
+else:
+length += len(line[:-1])
+line = fastaFile.readline()
+dIndex2Name[i - 1].append(length)
+return dIndex2Name
+_createDictOfNameLengthAccordingToSeqOrder = staticmethod(_createDictOfNameLengthAccordingToSeqOrder)
+if __name__ == "__main__":
+iLaunchTallymer = LaunchTallymer()
+iLaunchTallymer.setAttributesFromCmdLine()
+iLaunchTallymer.run()

Mercurial > repos > yufei-luo > s_mart

comparison commons/launcher/LaunchTallymer.py @ 18:94ab73e8a190