comparison SMART/Java/Python/GetReadSizes.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents
children 169d364ddd91
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import random, os, glob, subprocess
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.parsing.GffParser import GffParser
34 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
35 from SMART.Java.Python.misc.Progress import Progress
36 from SMART.Java.Python.misc import Utils
37 from commons.core.LoggerFactory import LoggerFactory
38 from commons.core.utils.RepetOptionParser import RepetOptionParser
39
40 LOG_DEPTH = "smart"
41 DEFAULT_REGION = "_all_"
42
43 class GetReadSizes(object):
44
45 def __init__(self, verbosity = 0):
46 self.xLab = "Size"
47 self.yLab = "# reads"
48 self.verbosity = verbosity
49 self.number = random.randint(0, 100000)
50 self.log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
51 self.parsers = {}
52 self.sizes = {}
53 self.factors = {}
54 self.regions = None
55 self.tmpDatName = None
56 self.tmpRName = None
57 self.width = 800
58 self.height = 300
59 self.arial = False
60
61 def setNames(self, names):
62 self.names = names
63
64 def setInputFiles(self, fileNames, format):
65 chooser = ParserChooser(self.verbosity)
66 chooser.findFormat(format)
67 for cpt, fileName in enumerate(fileNames):
68 self.parsers[self.names[cpt]] = chooser.getParser(fileName)
69
70 def setOutputFileName(self, fileName):
71 self.outputFileName = fileName
72
73 def setLabs(self, xLab, yLab):
74 self.xLab = xLab
75 self.yLab = yLab
76
77 def setSizes(self, minSize, maxSize):
78 self.minSize = minSize
79 self.maxSize = maxSize
80
81 def setColors(self, colors):
82 self.colors = colors
83
84 def setFactors(self, factors):
85 self.factors = dict(zip(self.names, factors))
86
87 def setRegionsFile(self, fileName):
88 if fileName != None:
89 self._loadRegions(fileName)
90
91 def setImageSize(self, width, height):
92 if width != None:
93 self.width = width
94 if height != None:
95 self.height = height
96
97 def setArial(self, arial):
98 self.arial = arial
99
100 def _checkOptions(self):
101 if not self.parsers:
102 self.logAndRaise("ERROR: Missing input file names")
103
104 def _logAndRaise(self, errorMsg):
105 self.log.error(errorMsg)
106 raise Exception(errorMsg)
107
108 def _loadRegions(self, fileName):
109 self.regions = {}
110 parser = GffParser(fileName, self.verbosity)
111 for transcript in parser.getIterator():
112 chromosome = transcript.getChromosome()
113 start = transcript.getStart()
114 end = transcript.getEnd()
115 name = transcript.getName()
116 if chromosome not in self.regions:
117 self.regions[chromosome] = {}
118 if start not in self.regions[chromosome]:
119 self.regions[chromosome][start] = {}
120 if end not in self.regions[chromosome][start]:
121 self.regions[chromosome][start][end] = []
122 self.regions[chromosome][start][end].append(name)
123
124 def _getRegions(self, transcript):
125 if self.regions == None:
126 return [DEFAULT_REGION]
127 chromosome = transcript.getChromosome()
128 start = transcript.getStart()
129 end = transcript.getEnd()
130 if chromosome not in self.regions:
131 return []
132 names = []
133 for loadedStart in sorted(self.regions[chromosome].keys()):
134 if loadedStart > end:
135 return names
136 for loadedEnd in reversed(sorted(self.regions[chromosome][loadedStart].keys())):
137 if loadedEnd < start:
138 break
139 names.extend(self.regions[chromosome][loadedStart][loadedEnd])
140 return names
141
142 def _parse(self, name):
143 progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
144 for transcript in self.parsers[name].getIterator():
145 if transcript.__class__.__name__ == "Mapping":
146 transcript = transcript.getTranscript()
147 regions = self._getRegions(transcript)
148 for region in regions:
149 if region not in self.sizes:
150 self.sizes[region] = {}
151 if name not in self.sizes[region]:
152 self.sizes[region][name] = {}
153 size = transcript.getSize()
154 if (self.minSize == None or size >= self.minSize) and (self.maxSize == None or size <= self.maxSize):
155 nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
156 nbElements *= self.factors.get(name, 1)
157 self.sizes[region][name][size] = self.sizes[region][name].get(size, 0) + nbElements
158 progress.inc()
159 progress.done()
160 if self.minSize == None:
161 self.minSize = min([min(self.sizes[region][name].keys()) for name in self.names for region in region])
162 if self.maxSize == None:
163 self.maxSize = max([max(self.sizes[region][name].keys()) for name in self.names for region in region])
164
165 def _checkQuorum(self, region):
166 return (max([sum(self.sizes[region][name].values()) for name in self.sizes[region]]) > 0)
167
168 def _writeData(self, region):
169 self.tmpDatName = "tmpFile%d.dat" % (self.number)
170 handle = open(self.tmpDatName, "w")
171 handle.write("Size\tCount\tSample\n")
172 for name in self.sizes[region]:
173 for size in sorted(self.sizes[region][name].keys()):
174 handle.write("%d\t%d\t\"%s\"\n" % (size, self.sizes[region][name].get(size, 0), name))
175 handle.close()
176
177 def _writeScript(self, region):
178 self.tmpRName = "tmpFile%d.R" % (self.number)
179 fileName = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
180 colors = "scale_fill_brewer(palette=\"Set1\")" if self.colors == None else "scale_fill_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors]))
181 title = "" if region == DEFAULT_REGION else " + labs(title = \"Sizes of %s\")" % (region)
182 handle = open(self.tmpRName, "w")
183 arial = ", text = element_text(family=\"Arial\", size=20)" if self.arial else ""
184 if self.arial:
185 handle.write("library(extrafont)\nloadfonts()\n")
186 handle.write("library(ggplot2)\n")
187 handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
188 handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
189 handle.write("data$Size <- factor(data$Size, levels=c(%s))\n" % (", ".join(["%d" % (size) for size in range(self.minSize, self.maxSize+1)])))
190 handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
191 handle.write("ggplot(data, aes(x = Size, y = Count, fill = Size)) %s + geom_bar(stat = \"identity\") + facet_grid(. ~ Sample, space=\"free_x\") + xlab(\"%s\") + ylab(\"%s\") + %s + theme(legend.position = \"none\", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()%s)\n" % (title, self.xLab, self.yLab, colors, arial))
192 handle.write("dev.off()\n")
193
194 def _runR(self):
195 rCommand = os.environ["SMARTRPATH"] if "SMARTRPATH" in os.environ else "R"
196 command = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
197 status = subprocess.call(command, shell=True)
198 if status != 0:
199 raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))
200
201 def _plot(self):
202 progress = Progress(len(self.sizes), "Plotting data", self.verbosity)
203 for region in self.sizes:
204 if not self._checkQuorum(region):
205 self.log.info("Not displaying '%s' for it contains no data." % (region))
206 else:
207 self._writeData(region)
208 self._writeScript(region)
209 self._runR()
210 progress.inc()
211 progress.done()
212
213 def _cleanFiles(self):
214 for fileName in (self.tmpDatName, self.tmpRName):
215 if fileName != None and os.path.exists(fileName):
216 os.remove(fileName)
217 for otherFileName in glob.glob("%s*" % (fileName)):
218 os.remove(otherFileName)
219
220 def run(self):
221 LoggerFactory.setLevel(self.log, self.verbosity)
222 self._checkOptions()
223 self.log.info("START Get Read Sizes")
224 for name in self.names:
225 self._parse(name)
226 self._plot()
227 self._cleanFiles()
228 self.log.info("END Get Read Sizes")
229
230
231 if __name__ == "__main__":
232 description = "Usage: GetReadSizes.py [options]\n\nGet Read Sizes v1.0.1: Get the sizes of a set of reads. [Category: Personal]\n"
233 epilog = ""
234 parser = RepetOptionParser(description = description, epilog = epilog)
235 parser.add_option("-i", "--input", dest="inputFileNames", action="store", default=None, type="string", help="input files, separated by commas [compulsory] [format: string]")
236 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
237 parser.add_option("-n", "--names", dest="names", action="store", default=None, type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
238 parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]")
239 parser.add_option("-s", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size [format: int]")
240 parser.add_option("-S", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size [format: int]")
241 parser.add_option("-l", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x-axis label name [format: string] [default: Size]")
242 parser.add_option("-L", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y-axis label name [format: string] [default: Reads]")
243 parser.add_option("-c", "--colors", dest="colors", action="store", default=None, type="string", help="colors of the bars, separated by commas [format: string]")
244 parser.add_option("-a", "--factors", dest="factors", action="store", default=None, type="string", help="normalization factors, separated by commas [format: string]")
245 parser.add_option("-r", "--regions", dest="regionsFileName", action="store", default=None, type="string", help="regions to plot [format: transcript file in GFF format]")
246 parser.add_option("-z", "--width", dest="width", action="store", default=800, type="int", help="width of the image [format: int] [default: 800]")
247 parser.add_option("-Z", "--height", dest="height", action="store", default=300, type="int", help="height of the image [format: int] [default: 300]")
248 parser.add_option("-A", "--arial", dest="arial", action="store_true", default=False, help="use Arial font [format: boolean] [default: false]")
249 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
250 options = parser.parse_args()[0]
251 iGetReadSizes = GetReadSizes(options.verbosity)
252 iGetReadSizes.setNames(options.names.split(","))
253 iGetReadSizes.setInputFiles(options.inputFileNames.split(","), options.format)
254 iGetReadSizes.setOutputFileName(options.outputFileName)
255 iGetReadSizes.setLabs(options.xLab, options.yLab)
256 iGetReadSizes.setSizes(options.minSize, options.maxSize)
257 iGetReadSizes.setColors(None if options.colors == None else options.colors.split(","))
258 iGetReadSizes.setFactors(None if options.factors == None else map(float, options.factors.split(",")))
259 iGetReadSizes.setRegionsFile(options.regionsFileName)
260 iGetReadSizes.setImageSize(options.width, options.height)
261 iGetReadSizes.setArial(options.arial)
262 iGetReadSizes.run()