comparison smart_toolShed/SMART/Java/Python/GetReadDistribution.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import random, os, glob, subprocess
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.parsing.GffParser import GffParser
34 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
35 from SMART.Java.Python.misc.Progress import Progress
36 from SMART.Java.Python.misc import Utils
37 from commons.core.LoggerFactory import LoggerFactory
38 from commons.core.utils.RepetOptionParser import RepetOptionParser
39
40 LOG_DEPTH = "smart"
41 DEFAULT_REGION = "_all_"
42 MULTIPLE_STR = {1: "", 1000: " (in kpb)", 1000000: " (in Gbp)"}
43
44 class GetReadDistribution(object):
45
46 def __init__(self, verbosity = 0):
47 self.xLab = ""
48 self.yLab = "# reads"
49 self.verbosity = verbosity
50 self.number = random.randint(0, 100000)
51 self.log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
52 self.parsers = {}
53 self.distribution = {}
54 self.factors = {}
55 self.regions = None
56 self.tmpDatName = None
57 self.tmpRName = None
58 self.quorum = 1
59 self.width = 800
60 self.height = 300
61
62 def setNames(self, names):
63 self.names = names
64
65 def setInputFiles(self, fileNames, format):
66 chooser = ParserChooser(self.verbosity)
67 chooser.findFormat(format)
68 for cpt, fileName in enumerate(fileNames):
69 self.parsers[self.names[cpt]] = chooser.getParser(fileName)
70
71 def setOutputFileName(self, fileName):
72 self.outputFileName = fileName
73
74 def setLabs(self, xLab, yLab):
75 self.xLab = xLab
76 self.yLab = yLab
77
78 def setBinSize(self, binSize):
79 self.binSize = binSize
80
81 def setColors(self, colors):
82 self.colors = colors
83
84 def setFactors(self, factors):
85 self.factors = dict(zip(self.names, factors))
86
87 def setMultiple(self, boolean):
88 self.multiple = boolean
89
90 def setImageSize(self, width, height):
91 if width != None:
92 self.width = width
93 if height != None:
94 self.height = height
95
96 def setQuorum(self, quorum):
97 self.quorum = quorum
98
99 def setRegionsFile(self, fileName):
100 if fileName != None:
101 self._loadRegions(fileName)
102
103 def _checkOptions(self):
104 if not self.parsers:
105 self.logAndRaise("ERROR: Missing input file names")
106
107 def _logAndRaise(self, errorMsg):
108 self.log.error(errorMsg)
109 raise Exception(errorMsg)
110
111 def _loadRegions(self, fileName):
112 self.regions = {}
113 parser = GffParser(fileName, self.verbosity)
114 for transcript in parser.getIterator():
115 chromosome = transcript.getChromosome()
116 start = transcript.getStart()
117 end = transcript.getEnd()
118 name = transcript.getName()
119 if chromosome not in self.regions:
120 self.regions[chromosome] = {}
121 if start not in self.regions[chromosome]:
122 self.regions[chromosome][start] = {}
123 if end not in self.regions[chromosome][start]:
124 self.regions[chromosome][start][end] = []
125 self.regions[chromosome][start][end].append(name)
126
127 def _getRegions(self, transcript):
128 if self.regions == None:
129 return [DEFAULT_REGION]
130 chromosome = transcript.getChromosome()
131 start = transcript.getStart()
132 end = transcript.getEnd()
133 if chromosome not in self.regions:
134 return []
135 names = []
136 for loadedStart in sorted(self.regions[chromosome].keys()):
137 if loadedStart > end:
138 return names
139 for loadedEnd in reversed(sorted(self.regions[chromosome][loadedStart].keys())):
140 if loadedEnd < start:
141 break
142 names.extend(self.regions[chromosome][loadedStart][loadedEnd])
143 return names
144
145 def _parse(self, name):
146 progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
147 for transcript in self.parsers[name].getIterator():
148 if transcript.__class__.__name__ == "Mapping":
149 transcript = transcript.getTranscript()
150 regions = self._getRegions(transcript)
151 for region in regions:
152 if region not in self.distribution:
153 self.distribution[region] = {}
154 if name not in self.distribution[region]:
155 self.distribution[region][name] = {}
156 chromosome = transcript.getChromosome()
157 nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
158 nbElements *= self.factors.get(name, 1)
159 if chromosome not in self.distribution[region][name]:
160 self.distribution[region][name][chromosome] = {}
161 previousBin = None
162 for exon in transcript.getExons():
163 for pos in range(exon.getStart(), exon.getEnd()+1):
164 bin = pos / self.binSize
165 if bin != previousBin:
166 self.distribution[region][name][chromosome][bin] = self.distribution[region][name][chromosome].get(bin, 0) + nbElements
167 previousBin = bin
168 progress.inc()
169 progress.done()
170
171 def _checkQuorum(self, region):
172 if self.quorum == None:
173 return True
174 return max([max([max(self.distribution[region][name][chromosome].values()) for chromosome in self.distribution[region][name]]) for name in self.distribution[region]]) >= self.quorum
175
176 def _writeData(self, region):
177 self.tmpDatName = "tmpFile%d.dat" % (self.number)
178 handle = open(self.tmpDatName, "w")
179 handle.write("Chr\tPos\tCount\tSample\n")
180 for name in self.distribution[region]:
181 for chromosome in sorted(self.distribution[region][name].keys()):
182 for pos in sorted(self.distribution[region][name][chromosome].keys()):
183 handle.write("%s\t%d\t%d\t\"%s\"\n" % (chromosome, pos * self.binSize, self.distribution[region][name][chromosome].get(pos, 0), name))
184 handle.close()
185
186 def _findMultiple(self, region):
187 if not self.multiple:
188 return 1
189 maxPosition = max([self.distribution[region][name][chromosome].keys() for name in self.distribution[region] for chromosome in self.distribution[region][name]])
190 if maxPosition > 2000000:
191 return 1000000
192 elif maxPosition > 2000:
193 return 1000
194 return 1
195
196 def _writeScript(self, region):
197 self.tmpRName = "tmpFile%d.R" % (self.number)
198 fileName = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
199 colors = "scale_fill_brewer(palette=\"Set1\") + scale_color_brewer(palette=\"Set1\")" if self.colors == None else "scale_fill_manual(values = c(%s)) + scale_color_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors]), ", ".join(["\"%s\"" % (color) for color in self.colors]))
200 title = "" if region == DEFAULT_REGION else " of %s" % (region)
201 facet = "Sample ~ Chr" if region == DEFAULT_REGION else "Sample ~ ."
202 handle = open(self.tmpRName, "w")
203 multiple = self._findMultiple(region)
204 handle.write("library(ggplot2)\n")
205 handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
206 handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
207 handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
208 handle.write("ggplot(data, aes(x = Pos/%d, y = Count, fill = Sample, color = Sample)) + opts(title = \"Distribution%s\") + geom_bar(stat = \"identity\") + facet_grid(%s, space=\"free\") + xlab(\"%s%s\") + ylab(\"%s\") + %s + opts(legend.position = \"none\", panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(), panel.background = theme_blank())\n" % (multiple, title, facet, self.xLab, MULTIPLE_STR[multiple], self.yLab, colors))
209 handle.write("dev.off()\n")
210
211 def _runR(self):
212 rCommand = "R"
213 if "SMARTRPATH" in os.environ:
214 rCommand = os.environ["SMARTRPATH"]
215 command = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
216 status = subprocess.call(command, shell=True)
217 if status != 0:
218 raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))
219
220 def _plot(self):
221 progress = Progress(len(self.distribution), "Plotting data", self.verbosity)
222 for region in self.distribution:
223 if not self._checkQuorum(region):
224 self.log.info("Not displaying '%s' for it contains insufficient data." % (region))
225 else:
226 self._writeData(region)
227 self._writeScript(region)
228 self._runR()
229 progress.inc()
230 progress.done()
231
232 def _cleanFiles(self):
233 for fileName in (self.tmpDatName, self.tmpRName):
234 if fileName != None and os.path.exists(fileName):
235 os.remove(fileName)
236 for otherFileName in glob.glob("%s*" % (fileName)):
237 os.remove(otherFileName)
238
239 def run(self):
240 LoggerFactory.setLevel(self.log, self.verbosity)
241 self._checkOptions()
242 self.log.info("START Get Read Distribution")
243 for name in self.names:
244 self._parse(name)
245 self._plot()
246 self._cleanFiles()
247 self.log.info("END Get Read Distribution")
248
249
250 if __name__ == "__main__":
251 description = "Usage: GetReadDistribution.py [options]\n\nGet Read Distribution v1.0.1: Get the distribution of a set of reads. [Category: Personal]\n"
252 epilog = ""
253 parser = RepetOptionParser(description = description, epilog = epilog)
254 parser.add_option("-i", "--input", dest="inputFileNames", action="store", default=None, type="string", help="input files, separated by commas [compulsory] [format: string]")
255 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
256 parser.add_option("-n", "--names", dest="names", action="store", default=None, type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
257 parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]")
258 parser.add_option("-s", "--binSize", dest="binSize", action="store", default=10000, type="int", help="bin size [format: int] [default: 10000]")
259 parser.add_option("-l", "--xLabel", dest="xLab", action="store", default="", type="string", help="x-axis label name [format: string]")
260 parser.add_option("-L", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y-axis label name [format: string] [default: Reads]")
261 parser.add_option("-c", "--colors", dest="colors", action="store", default=None, type="string", help="colors of the bars, separated by commas [format: string]")
262 parser.add_option("-a", "--factors", dest="factors", action="store", default=None, type="string", help="normalization factors, separated by commas [format: string]")
263 parser.add_option("-r", "--regions", dest="regionsFileName", action="store", default=None, type="string", help="regions to plot [format: transcript file in GFF format]")
264 parser.add_option("-m", "--multiple", dest="multiple", action="store_true", default=False, help="print position using multiples (k, G) [format: boolean] [default: False]")
265 parser.add_option("-q", "--quorum", dest="quorum", action="store", default=1, type="int", help="minimum number of intervals to plot a region [format: int] [default: 1]")
266 parser.add_option("-z", "--width", dest="width", action="store", default=800, type="int", help="width of the image [format: int] [default: 800]")
267 parser.add_option("-Z", "--height", dest="height", action="store", default=300, type="int", help="height of the image [format: int] [default: 300]")
268 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
269 options = parser.parse_args()[0]
270 iGetReadDistribution = GetReadDistribution(options.verbosity)
271 iGetReadDistribution.setNames(options.names.split(","))
272 iGetReadDistribution.setInputFiles(options.inputFileNames.split(","), options.format)
273 iGetReadDistribution.setOutputFileName(options.outputFileName)
274 iGetReadDistribution.setLabs(options.xLab, options.yLab)
275 iGetReadDistribution.setBinSize(options.binSize)
276 iGetReadDistribution.setColors(None if options.colors == None else options.colors.split(","))
277 iGetReadDistribution.setFactors(None if options.factors == None else map(float, options.factors.split(",")))
278 iGetReadDistribution.setRegionsFile(options.regionsFileName)
279 iGetReadDistribution.setMultiple(options.multiple)
280 iGetReadDistribution.setQuorum(options.quorum)
281 iGetReadDistribution.setImageSize(options.width, options.height)
282 iGetReadDistribution.run()
283