annotate commons/launcher/LaunchTallymer.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #!/usr/bin/env python
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7 # Copyright INRA (Institut National de la Recherche Agronomique)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8 # http://www.inra.fr
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9 # http://urgi.versailles.inra.fr
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11 # This software is governed by the CeCILL license under French law and
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
12 # abiding by the rules of distribution of free software. You can use,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
13 # modify and/ or redistribute the software under the terms of the CeCILL
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
14 # license as circulated by CEA, CNRS and INRIA at the following URL
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
15 # "http://www.cecill.info".
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
16 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
17 # As a counterpart to the access to the source code and rights to copy,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
18 # modify and redistribute granted by the license, users are provided only
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
19 # with a limited warranty and the software's author, the holder of the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
20 # economic rights, and the successive licensors have only limited
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
21 # liability.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23 # In this respect, the user's attention is drawn to the risks associated
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
24 # with loading, using, modifying and/or developing or reproducing the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
25 # software by the user in light of its specific status of free software,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
26 # that may mean that it is complicated to manipulate, and that also
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
27 # therefore means that it is reserved for developers and experienced
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
28 # professionals having in-depth computer knowledge. Users are therefore
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
29 # encouraged to load and test the software's suitability as regards their
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
30 # requirements in conditions enabling the security of their systems and/or
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
31 # data to be ensured and, more generally, to use and operate it in the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
32 # same conditions as regards security.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
33 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
34 # The fact that you are presently reading this means that you have had
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
35 # knowledge of the CeCILL license and that you accept its terms.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
36
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
37 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
38 import shutil
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
39 import subprocess
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
40 import time
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
41 from commons.core.utils.RepetOptionParser import RepetOptionParser
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
42 from commons.core.LoggerFactory import LoggerFactory
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
43 from SMART.Java.Python.convertTranscriptFile import ConvertTranscriptFile
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
44 from commons.core.seq.BioseqUtils import BioseqUtils
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
45 from commons.core.seq.BioseqDB import BioseqDB
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
46 from Tallymer_pipe.PlotBenchMarkGFFFiles import PlotBenchMarkGFFFiles
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
47
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
48 LOG_DEPTH = "repet.tools"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
49
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
50
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
51 class LaunchTallymer(object):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
52 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
53 Launch Tallymer's sub programs, generate map file, and convert output to
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
54 gff and wig, as well as visual (RPlot) data
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
55 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
56
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
57 _lValidFormats = ["gff", "gff3", "wig", "bed", "map"]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
58
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
59 def __init__(self, inputFasta="", indexationFasta=None, merSize=20, minOccs=4, outputFormats="gff", nLargestScaffoldsToPlot=0, clean=False, verbosity=0):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
60 self.inputFasta = inputFasta
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
61 self.indexationFasta = indexationFasta if indexationFasta != None else inputFasta
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
62 self.merSize = merSize
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
63 self.minOccs = minOccs
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
64 self.outputFormats = outputFormats
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
65 self.nLargestScaffoldsToPlot = nLargestScaffoldsToPlot
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
66 self.doClean = clean
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
67 self.verbosity = verbosity
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
68
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
69 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
70 self._workdir = os.path.join(os.getcwd(), "launchTallymer_%s" % time.strftime("%Y%m%d%H%M%S"))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
71 self._tmpSearchFileName = None
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
72 self._tmpMapFileName = None
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
73 self._tmpStatFileName = None
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
74 self._tmpPngFileName = None
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
75 self._plot_data = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
76 self._plot_data2 = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
77
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
78 def setAttributesFromCmdLine(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
79 description = "Generates stats from the results of the tallymer search ."
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
80 parser = RepetOptionParser(description=description)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
81 parser.add_option("-i", "--inputFasta", dest="inputFasta", default = "", action="store", type="string", help="input fasta file [compulsory] [format: fasta]")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
82 parser.add_option("-u", "--indexationFasta", dest="indexationFasta", default = "", action="store", type="string", help="input indexation fasta file used to generate kmer index (defaults to input fasta)[optional] [format: fasta]")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
83 parser.add_option("-s", "--merSize", dest="merSize", default = 20, action="store", type="int", help="input merSize [optional][default:20]")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
84 parser.add_option("-m", "--minOccs", dest="minOccs", default = 4, action="store", type="int", help="input minimal kmer occurence count [default:4]")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
85 parser.add_option("-f", "--outputFormats", dest="outputFormats", default = "gff", action="store", type="string", help="comma separated list of output file formats (can be %s) [optional] [default:gff]" % ", ".join(self._lValidFormats))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
86 parser.add_option("-n", "--nLargestScaffoldsToPlot",default = 0, type="int", action="store", dest = "nLargestScaffoldsToPlot", help = "generate graph of Kmer repartition along the input sequence for the n biggest scaffolds")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
87 parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
88 parser.add_option("-v", "--verbosity", dest="verbosity", default = 1, action="store", type="int", help="verbosity [optional] ")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
89 options, args = parser.parse_args()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
90 self._setAttributesFromOptions(options)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
91
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
92 def _setAttributesFromOptions(self, options):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
93 self.inputFasta = options.inputFasta
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
94 self.indexationFasta = options.indexationFasta if options.indexationFasta != "" else options.inputFasta
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
95 self.merSize = options.merSize
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
96 self.minOccs = options.minOccs
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
97 self.outputFormats = options.outputFormats
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
98 self.nLargestScaffoldsToPlot = options.nLargestScaffoldsToPlot
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
99 self.doClean = options.clean
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
100 self.verbosity = options.verbosity
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
101
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
102 def _checkOptions(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
103 if self.inputFasta == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
104 self._logAndRaise("Error: missing input fasta file")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
105 if self.merSize < 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
106 self._logAndRaise("Error: invalid kmer size '%i'; must be a positive integer" % self.merSize)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
107
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
108 self.outputFormats = self.outputFormats.lower().split(',')
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
109 sOutFormats = set(self.outputFormats)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
110 sValidFormats = set(self._lValidFormats)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
111 lInvalidFormats = list(sOutFormats - sValidFormats)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
112 self.outputFormats = list(sValidFormats.intersection(sOutFormats))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
113 if lInvalidFormats:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
114 self._log.warning("Warning: ignoring invalid formats: <%s>" % " ".join(lInvalidFormats))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
115 if not self.outputFormats:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
116 self._logAndRaise("Error: no valid output formats specified")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
117
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
118 def _logAndRaise(self, errorMsg):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
119 self._log.error(errorMsg)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
120 raise Exception(errorMsg)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
121
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
122 def clean(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
123 try:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
124 shutil.rmtree(self._workdir)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
125 except Exception as inst:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
126 self._log.error(inst)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
127
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
128 def run(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
129 LoggerFactory.setLevel(self._log, self.verbosity)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
130 self._checkOptions()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
131 self._log.debug("Input fasta file: %s; K-mer size: %s; Output formats: %s; Cleaning: %s" % (self.inputFasta, self.merSize, str(self.outputFormats), self.doClean))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
132 try:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
133 os.makedirs(self._workdir)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
134 except:pass
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
135
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
136 srcPath = os.path.abspath(self.inputFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
137 dstPath = os.path.join(self._workdir,os.path.basename(self.inputFasta))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
138 os.symlink(srcPath, dstPath)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
139
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
140 if (self.indexationFasta != self.inputFasta):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
141 srcPath = os.path.abspath(self.indexationFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
142 dstPath = os.path.join(self._workdir,os.path.basename(self.indexationFasta))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
143 try:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
144 os.symlink(srcPath, dstPath)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
145 except OSError as inst:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
146 pass
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
147
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
148 os.chdir(self._workdir)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
149
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
150 if (self.indexationFasta != self.inputFasta):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
151 self.indexationFasta = os.path.basename(self.indexationFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
152 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
153 self.indexationFasta = os.path.basename(self.inputFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
154 self.inputFasta = os.path.basename(self.inputFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
155
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
156 self._tmpSearchFileName = "%s.tallymer" % os.path.splitext(os.path.basename(self.inputFasta))[0]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
157 self._tmpMapFileName = "%s_tmp.map" % self._tmpSearchFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
158 self._tmpStatFileName = "%s.stat" % self._tmpSearchFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
159 self._tmpPngFileName = "%s.png" % self._tmpSearchFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
160
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
161
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
162
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
163
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
164
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
165 self._runTallymerSearch()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
166 self._convertTallymerToMap()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
167 self._writeOutputFiles()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
168
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
169 if self.nLargestScaffoldsToPlot > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
170 self._doPlot()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
171 shutil.copy2(self._tmpPngFileName, "../.")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
172
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
173 shutil.copy2(self._tmpStatFileName, "../.")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
174 os.chdir("..")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
175
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
176 if self.doClean:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
177 self.clean()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
178
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
179 def _runTallymerSearch(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
180 self._log.info("Starting to run tallymer search of sequence %s " % (self.inputFasta))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
181 self._indexInputFastaFile()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
182 self._countAndIndexKmersForGivenK()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
183 self._searchKmersListInTallymerIndex()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
184 self._log.info("Finished running tallymer scan of sequence %s " % (self.inputFasta))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
185
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
186 def _convertTallymerToMap(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
187 self._log.info("Starting to run tallymer search to map conversion")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
188 totalNbOcc, dKmer2Occ, self._plot_data, self._plot_data2 = ConvertUtils.convertTallymerFormatIntoMapFormatAndGenerateData(self.inputFasta, self._tmpSearchFileName, self._tmpMapFileName)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
189 self._log.debug("totalNbOcc=%i" % totalNbOcc)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
190 self._writeOccurencesNbAndFrequencyOfKmers(totalNbOcc, dKmer2Occ)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
191 self._log.info("Finished tallymer search to map conversion")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
192
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
193 def _doPlot(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
194 iBioseqDB = BioseqDB()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
195 iBioseqDB.load(self.inputFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
196 largest_seqsDb = iBioseqDB.bestLength(self.nLargestScaffoldsToPlot)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
197
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
198 for seq in largest_seqsDb.db:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
199 headerCleaned = seq.header.replace(" ", "_")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
200 shortFastaName = "%s_%s" % (os.path.basename(self.inputFasta),headerCleaned)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
201 data = self._plot_data2[seq.header]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
202 gffPlotter = PlotBenchMarkGFFFiles(yLabel="Number of Kmer Occurences", maxLength=1, title="Kmer repartition along the input sequence: %s; MerSize: %i" % (shortFastaName, self.merSize) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
203 gffPlotter.setOutFileName("%s_%s.png" % (self._tmpSearchFileName, headerCleaned))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
204 gffPlotter._title = "Mers along the input sequence: %s MerSize: %i" % (shortFastaName, self.merSize)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
205 gffPlotter._xLabel = "Coordinates along the input sequence (%s)" % shortFastaName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
206 gffPlotter._rawData = data
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
207 gffPlotter.run()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
208
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
209 def _indexInputFastaFile(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
210 self._log.debug("index the input fasta file: get an enhanced suffix array.")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
211 cmd = "gt suffixerator -dna -pl -tis -suf -lcp -v -db %s -indexname %s.reads" % (self.indexationFasta, self.indexationFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
212 process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
213 self._log.debug("Running suffixerator with following params %s" % cmd)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
214 output = process.communicate()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
215 self._log.debug("Suffixerator Output:\n%s" % output[0])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
216 if process.returncode != 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
217 self._logAndRaise("Error in generating enhanced suffix array in %s" % self.indexationFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
218
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
219 def _countAndIndexKmersForGivenK(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
220 self._log.debug("Counting and indexing k-mers for k = %i " % self.merSize)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
221 cmd = "gt tallymer mkindex -mersize %i -minocc %i -indexname %s.tyr-reads -counts -pl -esa %s.reads" % (self.merSize, self.minOccs, self.indexationFasta, self.indexationFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
222 process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
223 self._log.debug("Running tallymer mkindex with following params %s" % cmd)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
224 output = process.communicate()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
225 self._log.debug("Tallymer mkindex Output:\n%s" % output[0])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
226 if process.returncode != 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
227 self._logAndRaise("Error in indexing kmers in %s.reads" % self.indexationFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
228
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
229 def _searchKmersListInTallymerIndex(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
230 self._log.debug("Searching list of kmers in tallymer-index ")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
231 cmd = "gt tallymer search -output qseqnum qpos counts sequence -tyr %s.tyr-reads -q %s" % (self.indexationFasta, self.inputFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
232 process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
233 self._log.debug("Running tallymer search with following params %s" % cmd)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
234 output = process.communicate()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
235 self._log.debug("Tallymer search Output:\n%s" % output[0])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
236 if process.returncode != 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
237 self._logAndRaise("Error in searching for kmers in %s.tyr-reads" % self.indexationFasta)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
238 tmpOutputFile = open(self._tmpSearchFileName,'w')
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
239 tmpOutputFile.write(output[0])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
240 tmpOutputFile.close()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
241
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
242 def _writeOccurencesNbAndFrequencyOfKmers(self, totalNbOcc, dKmer2Occ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
243 with open(self._tmpStatFileName, "w") as statFile:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
244 statFile.write("kmer\tocc\tfreq\n")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
245 for kmer in dKmer2Occ.keys():
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
246 statFile.write("%s\t%i\t%.10f\n" % (kmer, dKmer2Occ[kmer], dKmer2Occ[kmer] / float(totalNbOcc)))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
247
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
248 def _writeOutputFiles(self):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
249 for format in self.outputFormats:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
250 self._log.info("Generating %s file" % format)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
251 outputFileName = "%s.tallymer.%s" % (os.path.splitext(self.inputFasta)[0], format)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
252 try:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
253 iConvertTranscriptFile = ConvertTranscriptFile(inputFileName=self._tmpMapFileName, name="Tallymer",\
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
254 inputFormat="map", outputFileName=outputFileName, outputFormat=format,feature= "Match", featurePart="Match-part", verbosity=0) #self.verbosity
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
255 iConvertTranscriptFile.run()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
256 except Exception as inst:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
257 self._log.error("Error: %s - Failed to generate %s format ouput, skipping" % (inst, format))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
258 shutil.copy2(outputFileName, "../.")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
259
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
260
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
261 class ConvertUtils(object):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
262
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
263 def convertTallymerFormatIntoMapFormatAndGenerateData(fastaFileName, searchFileName, mapFileName):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
264 dIndex2NameLengthList = ConvertUtils._createDictOfNameLengthAccordingToSeqOrder(fastaFileName)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
265 plotData = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
266 plotData2 = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
267 with open(searchFileName, "r") as talFile:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
268 with open(mapFileName, "w") as mapFile:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
269 totalNbOcc = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
270 dKmer2Occ = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
271 line = talFile.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
272 while line:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
273 data = line[:-1].split("\t")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
274 name = "%s_%s" % (data[3], data[2])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
275 nbOccs = int(data[2])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
276 chrname = dIndex2NameLengthList[int(data[0])][0]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
277 if data[1][0] == "+":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
278 start = int(data[1][1:]) + 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
279 end = start + len(data[3])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
280 elif data[1][0] == "-":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
281 start_revcomp = int(data[1][1:])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
282 start = dIndex2NameLengthList[int(data[0])][1] - start_revcomp - 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
283 end = end - len(data[3]) + 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
284 mapLine = "%s\t%s\t%s\t%s\t%i\n" % (name, chrname, start, end, nbOccs)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
285 mapFile.write(mapLine)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
286
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
287 if plotData2.get(chrname,None) is None:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
288 plotData2[chrname] = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
289 if plotData2[chrname].get(start, None) is None:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
290 plotData2[chrname][start]=0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
291 plotData2[chrname][start] += nbOccs
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
292
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
293 totalNbOcc += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
294 if dKmer2Occ.has_key(data[3]):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
295 dKmer2Occ[data[3]] += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
296 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
297 dKmer2Occ[data[3]] = 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
298 plotData[start] = nbOccs
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
299 line = talFile.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
300 return totalNbOcc, dKmer2Occ, plotData, plotData2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
301
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
302 convertTallymerFormatIntoMapFormatAndGenerateData = staticmethod(convertTallymerFormatIntoMapFormatAndGenerateData)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
303
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
304 def _createDictOfNameLengthAccordingToSeqOrder(fastaFileName):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
305 with open(fastaFileName) as fastaFile:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
306 line = fastaFile.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
307 i = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
308 length = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
309 dIndex2Name = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
310 while line:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
311 if line[0] == ">":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
312 dIndex2Name[i] = [line[1:-1]]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
313 if i > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
314 dIndex2Name[i - 1].append(length)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
315 length = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
316 i += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
317 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
318 length += len(line[:-1])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
319 line = fastaFile.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
320 dIndex2Name[i - 1].append(length)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
321 return dIndex2Name
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
322
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
323 _createDictOfNameLengthAccordingToSeqOrder = staticmethod(_createDictOfNameLengthAccordingToSeqOrder)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
324
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
325 if __name__ == "__main__":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
326 iLaunchTallymer = LaunchTallymer()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
327 iLaunchTallymer.setAttributesFromCmdLine()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
328 iLaunchTallymer.run()