comparison commons/launcher/LaunchTallymer.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #!/usr/bin/env python
2
3 """
4 Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data
5 """
6
7 # Copyright INRA (Institut National de la Recherche Agronomique)
8 # http://www.inra.fr
9 # http://urgi.versailles.inra.fr
10 #
11 # This software is governed by the CeCILL license under French law and
12 # abiding by the rules of distribution of free software. You can use,
13 # modify and/ or redistribute the software under the terms of the CeCILL
14 # license as circulated by CEA, CNRS and INRIA at the following URL
15 # "http://www.cecill.info".
16 #
17 # As a counterpart to the access to the source code and rights to copy,
18 # modify and redistribute granted by the license, users are provided only
19 # with a limited warranty and the software's author, the holder of the
20 # economic rights, and the successive licensors have only limited
21 # liability.
22 #
23 # In this respect, the user's attention is drawn to the risks associated
24 # with loading, using, modifying and/or developing or reproducing the
25 # software by the user in light of its specific status of free software,
26 # that may mean that it is complicated to manipulate, and that also
27 # therefore means that it is reserved for developers and experienced
28 # professionals having in-depth computer knowledge. Users are therefore
29 # encouraged to load and test the software's suitability as regards their
30 # requirements in conditions enabling the security of their systems and/or
31 # data to be ensured and, more generally, to use and operate it in the
32 # same conditions as regards security.
33 #
34 # The fact that you are presently reading this means that you have had
35 # knowledge of the CeCILL license and that you accept its terms.
36
37 import os
38 import shutil
39 import subprocess
40 import time
41 from commons.core.utils.RepetOptionParser import RepetOptionParser
42 from commons.core.LoggerFactory import LoggerFactory
43 from SMART.Java.Python.convertTranscriptFile import ConvertTranscriptFile
44 from commons.core.seq.BioseqUtils import BioseqUtils
45 from commons.core.seq.BioseqDB import BioseqDB
46 from Tallymer_pipe.PlotBenchMarkGFFFiles import PlotBenchMarkGFFFiles
47
48 LOG_DEPTH = "repet.tools"
49
50
51 class LaunchTallymer(object):
52 """
53 Launch Tallymer's sub programs, generate map file, and convert output to
54 gff and wig, as well as visual (RPlot) data
55 """
56
57 _lValidFormats = ["gff", "gff3", "wig", "bed", "map"]
58
59 def __init__(self, inputFasta="", indexationFasta=None, merSize=20, minOccs=4, outputFormats="gff", nLargestScaffoldsToPlot=0, clean=False, verbosity=0):
60 self.inputFasta = inputFasta
61 self.indexationFasta = indexationFasta if indexationFasta != None else inputFasta
62 self.merSize = merSize
63 self.minOccs = minOccs
64 self.outputFormats = outputFormats
65 self.nLargestScaffoldsToPlot = nLargestScaffoldsToPlot
66 self.doClean = clean
67 self.verbosity = verbosity
68
69 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
70 self._workdir = os.path.join(os.getcwd(), "launchTallymer_%s" % time.strftime("%Y%m%d%H%M%S"))
71 self._tmpSearchFileName = None
72 self._tmpMapFileName = None
73 self._tmpStatFileName = None
74 self._tmpPngFileName = None
75 self._plot_data = {}
76 self._plot_data2 = {}
77
78 def setAttributesFromCmdLine(self):
79 description = "Generates stats from the results of the tallymer search ."
80 parser = RepetOptionParser(description=description)
81 parser.add_option("-i", "--inputFasta", dest="inputFasta", default = "", action="store", type="string", help="input fasta file [compulsory] [format: fasta]")
82 parser.add_option("-u", "--indexationFasta", dest="indexationFasta", default = "", action="store", type="string", help="input indexation fasta file used to generate kmer index (defaults to input fasta)[optional] [format: fasta]")
83 parser.add_option("-s", "--merSize", dest="merSize", default = 20, action="store", type="int", help="input merSize [optional][default:20]")
84 parser.add_option("-m", "--minOccs", dest="minOccs", default = 4, action="store", type="int", help="input minimal kmer occurence count [default:4]")
85 parser.add_option("-f", "--outputFormats", dest="outputFormats", default = "gff", action="store", type="string", help="comma separated list of output file formats (can be %s) [optional] [default:gff]" % ", ".join(self._lValidFormats))
86 parser.add_option("-n", "--nLargestScaffoldsToPlot",default = 0, type="int", action="store", dest = "nLargestScaffoldsToPlot", help = "generate graph of Kmer repartition along the input sequence for the n biggest scaffolds")
87 parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true")
88 parser.add_option("-v", "--verbosity", dest="verbosity", default = 1, action="store", type="int", help="verbosity [optional] ")
89 options, args = parser.parse_args()
90 self._setAttributesFromOptions(options)
91
92 def _setAttributesFromOptions(self, options):
93 self.inputFasta = options.inputFasta
94 self.indexationFasta = options.indexationFasta if options.indexationFasta != "" else options.inputFasta
95 self.merSize = options.merSize
96 self.minOccs = options.minOccs
97 self.outputFormats = options.outputFormats
98 self.nLargestScaffoldsToPlot = options.nLargestScaffoldsToPlot
99 self.doClean = options.clean
100 self.verbosity = options.verbosity
101
102 def _checkOptions(self):
103 if self.inputFasta == "":
104 self._logAndRaise("Error: missing input fasta file")
105 if self.merSize < 1:
106 self._logAndRaise("Error: invalid kmer size '%i'; must be a positive integer" % self.merSize)
107
108 self.outputFormats = self.outputFormats.lower().split(',')
109 sOutFormats = set(self.outputFormats)
110 sValidFormats = set(self._lValidFormats)
111 lInvalidFormats = list(sOutFormats - sValidFormats)
112 self.outputFormats = list(sValidFormats.intersection(sOutFormats))
113 if lInvalidFormats:
114 self._log.warning("Warning: ignoring invalid formats: <%s>" % " ".join(lInvalidFormats))
115 if not self.outputFormats:
116 self._logAndRaise("Error: no valid output formats specified")
117
118 def _logAndRaise(self, errorMsg):
119 self._log.error(errorMsg)
120 raise Exception(errorMsg)
121
122 def clean(self):
123 try:
124 shutil.rmtree(self._workdir)
125 except Exception as inst:
126 self._log.error(inst)
127
128 def run(self):
129 LoggerFactory.setLevel(self._log, self.verbosity)
130 self._checkOptions()
131 self._log.debug("Input fasta file: %s; K-mer size: %s; Output formats: %s; Cleaning: %s" % (self.inputFasta, self.merSize, str(self.outputFormats), self.doClean))
132 try:
133 os.makedirs(self._workdir)
134 except:pass
135
136 srcPath = os.path.abspath(self.inputFasta)
137 dstPath = os.path.join(self._workdir,os.path.basename(self.inputFasta))
138 os.symlink(srcPath, dstPath)
139
140 if (self.indexationFasta != self.inputFasta):
141 srcPath = os.path.abspath(self.indexationFasta)
142 dstPath = os.path.join(self._workdir,os.path.basename(self.indexationFasta))
143 try:
144 os.symlink(srcPath, dstPath)
145 except OSError as inst:
146 pass
147
148 os.chdir(self._workdir)
149
150 if (self.indexationFasta != self.inputFasta):
151 self.indexationFasta = os.path.basename(self.indexationFasta)
152 else:
153 self.indexationFasta = os.path.basename(self.inputFasta)
154 self.inputFasta = os.path.basename(self.inputFasta)
155
156 self._tmpSearchFileName = "%s.tallymer" % os.path.splitext(os.path.basename(self.inputFasta))[0]
157 self._tmpMapFileName = "%s_tmp.map" % self._tmpSearchFileName
158 self._tmpStatFileName = "%s.stat" % self._tmpSearchFileName
159 self._tmpPngFileName = "%s.png" % self._tmpSearchFileName
160
161
162
163
164
165 self._runTallymerSearch()
166 self._convertTallymerToMap()
167 self._writeOutputFiles()
168
169 if self.nLargestScaffoldsToPlot > 0:
170 self._doPlot()
171 shutil.copy2(self._tmpPngFileName, "../.")
172
173 shutil.copy2(self._tmpStatFileName, "../.")
174 os.chdir("..")
175
176 if self.doClean:
177 self.clean()
178
179 def _runTallymerSearch(self):
180 self._log.info("Starting to run tallymer search of sequence %s " % (self.inputFasta))
181 self._indexInputFastaFile()
182 self._countAndIndexKmersForGivenK()
183 self._searchKmersListInTallymerIndex()
184 self._log.info("Finished running tallymer scan of sequence %s " % (self.inputFasta))
185
186 def _convertTallymerToMap(self):
187 self._log.info("Starting to run tallymer search to map conversion")
188 totalNbOcc, dKmer2Occ, self._plot_data, self._plot_data2 = ConvertUtils.convertTallymerFormatIntoMapFormatAndGenerateData(self.inputFasta, self._tmpSearchFileName, self._tmpMapFileName)
189 self._log.debug("totalNbOcc=%i" % totalNbOcc)
190 self._writeOccurencesNbAndFrequencyOfKmers(totalNbOcc, dKmer2Occ)
191 self._log.info("Finished tallymer search to map conversion")
192
193 def _doPlot(self):
194 iBioseqDB = BioseqDB()
195 iBioseqDB.load(self.inputFasta)
196 largest_seqsDb = iBioseqDB.bestLength(self.nLargestScaffoldsToPlot)
197
198 for seq in largest_seqsDb.db:
199 headerCleaned = seq.header.replace(" ", "_")
200 shortFastaName = "%s_%s" % (os.path.basename(self.inputFasta),headerCleaned)
201 data = self._plot_data2[seq.header]
202 gffPlotter = PlotBenchMarkGFFFiles(yLabel="Number of Kmer Occurences", maxLength=1, title="Kmer repartition along the input sequence: %s; MerSize: %i" % (shortFastaName, self.merSize) )
203 gffPlotter.setOutFileName("%s_%s.png" % (self._tmpSearchFileName, headerCleaned))
204 gffPlotter._title = "Mers along the input sequence: %s MerSize: %i" % (shortFastaName, self.merSize)
205 gffPlotter._xLabel = "Coordinates along the input sequence (%s)" % shortFastaName
206 gffPlotter._rawData = data
207 gffPlotter.run()
208
209 def _indexInputFastaFile(self):
210 self._log.debug("index the input fasta file: get an enhanced suffix array.")
211 cmd = "gt suffixerator -dna -pl -tis -suf -lcp -v -db %s -indexname %s.reads" % (self.indexationFasta, self.indexationFasta)
212 process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
213 self._log.debug("Running suffixerator with following params %s" % cmd)
214 output = process.communicate()
215 self._log.debug("Suffixerator Output:\n%s" % output[0])
216 if process.returncode != 0:
217 self._logAndRaise("Error in generating enhanced suffix array in %s" % self.indexationFasta)
218
219 def _countAndIndexKmersForGivenK(self):
220 self._log.debug("Counting and indexing k-mers for k = %i " % self.merSize)
221 cmd = "gt tallymer mkindex -mersize %i -minocc %i -indexname %s.tyr-reads -counts -pl -esa %s.reads" % (self.merSize, self.minOccs, self.indexationFasta, self.indexationFasta)
222 process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
223 self._log.debug("Running tallymer mkindex with following params %s" % cmd)
224 output = process.communicate()
225 self._log.debug("Tallymer mkindex Output:\n%s" % output[0])
226 if process.returncode != 0:
227 self._logAndRaise("Error in indexing kmers in %s.reads" % self.indexationFasta)
228
229 def _searchKmersListInTallymerIndex(self):
230 self._log.debug("Searching list of kmers in tallymer-index ")
231 cmd = "gt tallymer search -output qseqnum qpos counts sequence -tyr %s.tyr-reads -q %s" % (self.indexationFasta, self.inputFasta)
232 process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
233 self._log.debug("Running tallymer search with following params %s" % cmd)
234 output = process.communicate()
235 self._log.debug("Tallymer search Output:\n%s" % output[0])
236 if process.returncode != 0:
237 self._logAndRaise("Error in searching for kmers in %s.tyr-reads" % self.indexationFasta)
238 tmpOutputFile = open(self._tmpSearchFileName,'w')
239 tmpOutputFile.write(output[0])
240 tmpOutputFile.close()
241
242 def _writeOccurencesNbAndFrequencyOfKmers(self, totalNbOcc, dKmer2Occ):
243 with open(self._tmpStatFileName, "w") as statFile:
244 statFile.write("kmer\tocc\tfreq\n")
245 for kmer in dKmer2Occ.keys():
246 statFile.write("%s\t%i\t%.10f\n" % (kmer, dKmer2Occ[kmer], dKmer2Occ[kmer] / float(totalNbOcc)))
247
248 def _writeOutputFiles(self):
249 for format in self.outputFormats:
250 self._log.info("Generating %s file" % format)
251 outputFileName = "%s.tallymer.%s" % (os.path.splitext(self.inputFasta)[0], format)
252 try:
253 iConvertTranscriptFile = ConvertTranscriptFile(inputFileName=self._tmpMapFileName, name="Tallymer",\
254 inputFormat="map", outputFileName=outputFileName, outputFormat=format,feature= "Match", featurePart="Match-part", verbosity=0) #self.verbosity
255 iConvertTranscriptFile.run()
256 except Exception as inst:
257 self._log.error("Error: %s - Failed to generate %s format ouput, skipping" % (inst, format))
258 shutil.copy2(outputFileName, "../.")
259
260
261 class ConvertUtils(object):
262
263 def convertTallymerFormatIntoMapFormatAndGenerateData(fastaFileName, searchFileName, mapFileName):
264 dIndex2NameLengthList = ConvertUtils._createDictOfNameLengthAccordingToSeqOrder(fastaFileName)
265 plotData = {}
266 plotData2 = {}
267 with open(searchFileName, "r") as talFile:
268 with open(mapFileName, "w") as mapFile:
269 totalNbOcc = 0
270 dKmer2Occ = {}
271 line = talFile.readline()
272 while line:
273 data = line[:-1].split("\t")
274 name = "%s_%s" % (data[3], data[2])
275 nbOccs = int(data[2])
276 chrname = dIndex2NameLengthList[int(data[0])][0]
277 if data[1][0] == "+":
278 start = int(data[1][1:]) + 1
279 end = start + len(data[3])
280 elif data[1][0] == "-":
281 start_revcomp = int(data[1][1:])
282 start = dIndex2NameLengthList[int(data[0])][1] - start_revcomp - 1
283 end = end - len(data[3]) + 1
284 mapLine = "%s\t%s\t%s\t%s\t%i\n" % (name, chrname, start, end, nbOccs)
285 mapFile.write(mapLine)
286
287 if plotData2.get(chrname,None) is None:
288 plotData2[chrname] = {}
289 if plotData2[chrname].get(start, None) is None:
290 plotData2[chrname][start]=0
291 plotData2[chrname][start] += nbOccs
292
293 totalNbOcc += 1
294 if dKmer2Occ.has_key(data[3]):
295 dKmer2Occ[data[3]] += 1
296 else:
297 dKmer2Occ[data[3]] = 1
298 plotData[start] = nbOccs
299 line = talFile.readline()
300 return totalNbOcc, dKmer2Occ, plotData, plotData2
301
302 convertTallymerFormatIntoMapFormatAndGenerateData = staticmethod(convertTallymerFormatIntoMapFormatAndGenerateData)
303
304 def _createDictOfNameLengthAccordingToSeqOrder(fastaFileName):
305 with open(fastaFileName) as fastaFile:
306 line = fastaFile.readline()
307 i = 0
308 length = 0
309 dIndex2Name = {}
310 while line:
311 if line[0] == ">":
312 dIndex2Name[i] = [line[1:-1]]
313 if i > 0:
314 dIndex2Name[i - 1].append(length)
315 length = 0
316 i += 1
317 else:
318 length += len(line[:-1])
319 line = fastaFile.readline()
320 dIndex2Name[i - 1].append(length)
321 return dIndex2Name
322
323 _createDictOfNameLengthAccordingToSeqOrder = staticmethod(_createDictOfNameLengthAccordingToSeqOrder)
324
325 if __name__ == "__main__":
326 iLaunchTallymer = LaunchTallymer()
327 iLaunchTallymer.setAttributesFromCmdLine()
328 iLaunchTallymer.run()