comparison smart_toolShed/SMART/Java/Python/GetReadSizes.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import random, os, glob, subprocess
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.parsing.GffParser import GffParser
34 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
35 from SMART.Java.Python.misc.Progress import Progress
36 from SMART.Java.Python.misc import Utils
37 from commons.core.LoggerFactory import LoggerFactory
38 from commons.core.utils.RepetOptionParser import RepetOptionParser
39
40 LOG_DEPTH = "smart"
41 DEFAULT_REGION = "_all_"
42
43 class GetReadSizes(object):
44
45 def __init__(self, verbosity = 0):
46 self.xLab = "Size"
47 self.yLab = "# reads"
48 self.verbosity = verbosity
49 self.number = random.randint(0, 100000)
50 self.log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
51 self.parsers = {}
52 self.sizes = {}
53 self.factors = {}
54 self.regions = None
55 self.tmpDatName = None
56 self.tmpRName = None
57 self.width = 800
58 self.height = 300
59
60 def setNames(self, names):
61 self.names = names
62
63 def setInputFiles(self, fileNames, format):
64 chooser = ParserChooser(self.verbosity)
65 chooser.findFormat(format)
66 for cpt, fileName in enumerate(fileNames):
67 self.parsers[self.names[cpt]] = chooser.getParser(fileName)
68
69 def setOutputFileName(self, fileName):
70 self.outputFileName = fileName
71
72 def setLabs(self, xLab, yLab):
73 self.xLab = xLab
74 self.yLab = yLab
75
76 def setSizes(self, minSize, maxSize):
77 self.minSize = minSize
78 self.maxSize = maxSize
79
80 def setColors(self, colors):
81 self.colors = colors
82
83 def setFactors(self, factors):
84 self.factors = dict(zip(self.names, factors))
85
86 def setRegionsFile(self, fileName):
87 if fileName != None:
88 self._loadRegions(fileName)
89
90 def setImageSize(self, width, height):
91 if width != None:
92 self.width = width
93 if height != None:
94 self.height = height
95
96 def _checkOptions(self):
97 if not self.parsers:
98 self.logAndRaise("ERROR: Missing input file names")
99
100 def _logAndRaise(self, errorMsg):
101 self.log.error(errorMsg)
102 raise Exception(errorMsg)
103
104 def _loadRegions(self, fileName):
105 self.regions = {}
106 parser = GffParser(fileName, self.verbosity)
107 for transcript in parser.getIterator():
108 chromosome = transcript.getChromosome()
109 start = transcript.getStart()
110 end = transcript.getEnd()
111 name = transcript.getName()
112 if chromosome not in self.regions:
113 self.regions[chromosome] = {}
114 if start not in self.regions[chromosome]:
115 self.regions[chromosome][start] = {}
116 if end not in self.regions[chromosome][start]:
117 self.regions[chromosome][start][end] = []
118 self.regions[chromosome][start][end].append(name)
119
120 def _getRegions(self, transcript):
121 if self.regions == None:
122 return [DEFAULT_REGION]
123 chromosome = transcript.getChromosome()
124 start = transcript.getStart()
125 end = transcript.getEnd()
126 if chromosome not in self.regions:
127 return []
128 names = []
129 for loadedStart in sorted(self.regions[chromosome].keys()):
130 if loadedStart > end:
131 return names
132 for loadedEnd in reversed(sorted(self.regions[chromosome][loadedStart].keys())):
133 if loadedEnd < start:
134 break
135 names.extend(self.regions[chromosome][loadedStart][loadedEnd])
136 return names
137
138 def _parse(self, name):
139 progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
140 for transcript in self.parsers[name].getIterator():
141 if transcript.__class__.__name__ == "Mapping":
142 transcript = transcript.getTranscript()
143 regions = self._getRegions(transcript)
144 for region in regions:
145 if region not in self.sizes:
146 self.sizes[region] = {}
147 if name not in self.sizes[region]:
148 self.sizes[region][name] = {}
149 size = transcript.getSize()
150 if (self.minSize == None or size >= self.minSize) and (self.maxSize == None or size <= self.maxSize):
151 nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
152 nbElements *= self.factors.get(name, 1)
153 self.sizes[region][name][size] = self.sizes[region][name].get(size, 0) + nbElements
154 progress.inc()
155 progress.done()
156 if self.minSize == None:
157 self.minSize = min([min(self.sizes[region][name].keys()) for name in self.names for region in region])
158 if self.maxSize == None:
159 self.maxSize = max([max(self.sizes[region][name].keys()) for name in self.names for region in region])
160
161 def _checkQuorum(self, region):
162 return (max([sum(self.sizes[region][name].values()) for name in self.sizes[region]]) > 0)
163
164 def _writeData(self, region):
165 self.tmpDatName = "tmpFile%d.dat" % (self.number)
166 handle = open(self.tmpDatName, "w")
167 handle.write("Size\tCount\tSample\n")
168 for name in self.sizes[region]:
169 for size in sorted(self.sizes[region][name].keys()):
170 handle.write("%d\t%d\t\"%s\"\n" % (size, self.sizes[region][name].get(size, 0), name))
171 handle.close()
172
173 def _writeScript(self, region):
174 self.tmpRName = "tmpFile%d.R" % (self.number)
175 fileName = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
176 colors = "scale_fill_brewer(palette=\"Set1\")" if self.colors == None else "scale_fill_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors]))
177 title = "" if region == DEFAULT_REGION else " of %s" % (region)
178 handle = open(self.tmpRName, "w")
179 handle.write("library(ggplot2)\n")
180 handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
181 handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
182 handle.write("data$Size <- factor(data$Size, levels=c(%s))\n" % (", ".join(["%d" % (size) for size in range(self.minSize, self.maxSize+1)])))
183 handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
184 handle.write("ggplot(data, aes(x = Size, y = Count, fill = Size)) + opts(title = \"Size distribution%s\") + geom_bar(stat = \"identity\") + facet_grid(. ~ Sample, space=\"free_x\") + xlab(\"%s\") + ylab(\"%s\") + %s + opts(legend.position = \"none\", panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(), panel.background = theme_blank())\n" % (title, self.xLab, self.yLab, colors))
185 handle.write("dev.off()\n")
186
187 def _runR(self):
188 rCommand = "R"
189 if "SMARTRPATH" in os.environ:
190 rCommand = os.environ["SMARTRPATH"]
191 command = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
192 status = subprocess.call(command, shell=True)
193 if status != 0:
194 raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))
195
196 def _plot(self):
197 progress = Progress(len(self.sizes), "Plotting data", self.verbosity)
198 for region in self.sizes:
199 if not self._checkQuorum(region):
200 self.log.info("Not displaying '%s' for it contains no data." % (region))
201 else:
202 self._writeData(region)
203 self._writeScript(region)
204 self._runR()
205 progress.inc()
206 progress.done()
207
208 def _cleanFiles(self):
209 for fileName in (self.tmpDatName, self.tmpRName):
210 if fileName != None and os.path.exists(fileName):
211 os.remove(fileName)
212 for otherFileName in glob.glob("%s*" % (fileName)):
213 os.remove(otherFileName)
214
215 def run(self):
216 LoggerFactory.setLevel(self.log, self.verbosity)
217 self._checkOptions()
218 self.log.info("START Get Read Sizes")
219 for name in self.names:
220 self._parse(name)
221 self._plot()
222 self._cleanFiles()
223 self.log.info("END Get Read Sizes")
224
225
226 if __name__ == "__main__":
227 description = "Usage: GetReadSizes.py [options]\n\nGet Read Sizes v1.0.1: Get the sizes of a set of reads. [Category: Personal]\n"
228 epilog = ""
229 parser = RepetOptionParser(description = description, epilog = epilog)
230 parser.add_option("-i", "--input", dest="inputFileNames", action="store", default=None, type="string", help="input files, separated by commas [compulsory] [format: string]")
231 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
232 parser.add_option("-n", "--names", dest="names", action="store", default=None, type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
233 parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]")
234 parser.add_option("-s", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size [format: int]")
235 parser.add_option("-S", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size [format: int]")
236 parser.add_option("-l", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x-axis label name [format: string] [default: Size]")
237 parser.add_option("-L", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y-axis label name [format: string] [default: Reads]")
238 parser.add_option("-c", "--colors", dest="colors", action="store", default=None, type="string", help="colors of the bars, separated by commas [format: string]")
239 parser.add_option("-a", "--factors", dest="factors", action="store", default=None, type="string", help="normalization factors, separated by commas [format: string]")
240 parser.add_option("-r", "--regions", dest="regionsFileName", action="store", default=None, type="string", help="regions to plot [format: transcript file in GFF format]")
241 parser.add_option("-z", "--width", dest="width", action="store", default=800, type="int", help="width of the image [format: int] [default: 800]")
242 parser.add_option("-Z", "--height", dest="height", action="store", default=300, type="int", help="height of the image [format: int] [default: 300]")
243 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
244 options = parser.parse_args()[0]
245 iGetReadSizes = GetReadSizes(options.verbosity)
246 iGetReadSizes.setNames(options.names.split(","))
247 iGetReadSizes.setInputFiles(options.inputFileNames.split(","), options.format)
248 iGetReadSizes.setOutputFileName(options.outputFileName)
249 iGetReadSizes.setLabs(options.xLab, options.yLab)
250 iGetReadSizes.setSizes(options.minSize, options.maxSize)
251 iGetReadSizes.setColors(None if options.colors == None else options.colors.split(","))
252 iGetReadSizes.setFactors(None if options.factors == None else map(float, options.factors.split(",")))
253 iGetReadSizes.setRegionsFile(options.regionsFileName)
254 iGetReadSizes.setImageSize(options.width, options.height)
255 iGetReadSizes.run()