comparison SMART/Java/Python/GetDistribution.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents
children 169d364ddd91
comparison
equal deleted inserted replaced
35:d94018ca4ada 36:44d5973c188c
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2012
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os
32 from optparse import OptionParser
33 from commons.core.parsing.ParserChooser import ParserChooser
34 from commons.core.parsing.FastaParser import FastaParser
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from commons.core.writer.Gff3Writer import Gff3Writer
37 from SMART.Java.Python.misc.RPlotter import RPlotter
38 from SMART.Java.Python.misc.MultipleRPlotter import MultipleRPlotter
39 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
40 from SMART.Java.Python.misc.Progress import Progress
41
42 TWOSTRANDS = {True: [1, -1], False: [0]}
43 STRANDTOSTR = {1: "(+)", -1: "(-)", 0: ""}
44
45 class GetDistribution(object):
46
47 def __init__(self, verbosity):
48 self.verbosity = verbosity
49 self.sizes = None
50 self.twoStrands = False
51 self.start = 1
52 self.names = ["nbElements"]
53 self.average = False
54 self.nbValues = {}
55 self.height = 300
56 self.width = 600
57 self.colors = None
58 self.gffFileName = None
59 self.csvFileName = None
60 self.yMin = None
61 self.yMax = None
62 self.chromosome = None
63 self.merge = False
64 self.nbTranscripts = None
65
66 def setInputFile(self, fileName, format):
67 chooser = ParserChooser(self.verbosity)
68 chooser.findFormat(format)
69 self.parser = chooser.getParser(fileName)
70
71 def setReferenceFile(self, fileName):
72 if fileName == None:
73 return
74 fastaParser = FastaParser(fileName, self.verbosity)
75 self.chromosomes = fastaParser.getRegions()
76 self.sizes = dict([region, fastaParser.getSizeOfRegion(region)] for region in self.chromosomes)
77 self.maxSize = max(self.sizes.values())
78
79 def setRegion(self, chromosome, start, end):
80 if chromosome == None:
81 return
82 self.maxSize = options.end
83 self.sizes = {chromosome: end}
84 self.chromosomes = [chromosome]
85 self.chromosome = chromosome
86 self.start = start
87 self.end = end
88
89 def setOutputFile(self, fileName):
90 self.outputFileName = fileName
91
92 def setNbBins(self, nbBins):
93 self.nbBins = nbBins
94
95 def set2Strands(self, twoStrands):
96 self.twoStrands = twoStrands
97
98 def setNames(self, names):
99 self.names = names
100
101 def setAverage(self, average):
102 self.average = average
103
104 def setNormalization(self, normalization):
105 self.normalization = normalization
106
107 def setImageSize(self, height, width):
108 self.height = height
109 self.width = width
110
111 def setYLimits(self, yMin, yMax):
112 self.yMin = yMin
113 self.yMax = yMax
114
115 def setColors(self, colors):
116 self.colors = colors
117
118 def writeGff(self, fileName):
119 self.gffFileName = fileName
120
121 def writeCsv(self, fileName):
122 self.csvFileName = fileName
123
124 def mergePlots(self, merge):
125 self.merge = merge
126
127 def _estimateSizes(self):
128 progress = UnlimitedProgress(10000, "Reading input for chromosome size estimate", self.verbosity)
129 self.sizes = {}
130 for self.nbTranscripts, transcript in enumerate(self.parser.getIterator()):
131 chromosome = transcript.getChromosome()
132 start = transcript.getStart()
133 self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0))
134 progress.inc()
135 progress.done()
136
137 def _computeSliceSize(self):
138 if self.nbBins == 0:
139 return
140 tmp1 = int(max(self.sizes.values()) / float(self.nbBins))
141 tmp2 = 10 ** (len("%d" % (tmp1))-2)
142 self.sliceSize = max(1, int((tmp1 / tmp2) * tmp2))
143 if self.verbosity > 0:
144 print "choosing bin size of %d" % (self.sliceSize)
145
146 def _initBins(self):
147 self.bins = {}
148 for chromosome in self.sizes:
149 self.bins[chromosome] = {}
150 for name in self.names:
151 self.bins[chromosome][name] = {}
152 for strand in TWOSTRANDS[self.twoStrands]:
153 if self.nbBins == 0:
154 self.bins[chromosome][name][strand] = {}
155 else:
156 self.bins[chromosome][name][strand] = dict([(i * self.sliceSize + 1, 0.0) for i in range(self.start / self.sliceSize, self.sizes[chromosome] / self.sliceSize + 1)])
157
158 def _populateBins(self):
159 if self.nbTranscripts == None:
160 progress = UnlimitedProgress(10000, "Counting data", self.verbosity)
161 else:
162 progress = Progress(self.nbTranscripts, "Counting data", self.verbosity)
163 for transcript in self.parser.getIterator():
164 if transcript.__class__.__name__ == "Mapping":
165 transcript = transcript.getTranscript()
166 progress.inc()
167 chromosome = transcript.getChromosome()
168 start = transcript.getStart()
169 if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end):
170 continue
171 strand = transcript.getDirection() if self.twoStrands else 0
172 if self.nbBins != 0:
173 bin = (start / self.sliceSize) * self.sliceSize + 1
174 else:
175 bin = start
176 for name in self.names:
177 value = float(transcript.tags.get(name, 1))
178 self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value
179 self.nbValues[name] = self.nbValues.get(name, 0) + value
180 progress.done()
181
182 def _normalize(self):
183 average = float(sum(self.nbValues)) / len(self.nbValues.keys())
184 factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues)
185 for chromosome in self.bins:
186 for name in self.bins[chromosome]:
187 for strand in self.bins[chromosome][name]:
188 for bin in self.bins[chromosome][name][strand]:
189 self.bins[chromosome][name][strand][bin] *= factors[name]
190
191 def _computeAverage(self):
192 for chromosome in self.bins:
193 for name in self.bins[chromosome]:
194 for strand in self.bins[chromosome][name]:
195 for bin in self.bins[chromosome][name][strand]:
196 self.bins[chromosome][name][strand][bin] = float(self.bins[chromosome][name][strand][bin]) / self.sliceSize
197
198 def _getPlotter(self, chromosome):
199 plot = RPlotter("%s_%s.png" % (os.path.splitext(self.outputFileName)[0], chromosome), self.verbosity)
200 plot.setImageSize(self.width, self.height)
201 if self.sizes[chromosome] <= 1000:
202 unit = "nt."
203 ratio = 1.0
204 elif self.sizes[chromosome] <= 1000000:
205 unit = "kb"
206 ratio = 1000.0
207 else:
208 unit = "Mb"
209 ratio = 1000000.0
210 if self.yMin != None:
211 plot.setMinimumY(self.yMin)
212 if self.yMax != None:
213 plot.setMaximumY(self.yMax)
214 plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit))
215 plot.setLegend(True)
216 for i, name in enumerate(self.bins[chromosome]):
217 for strand in self.bins[chromosome][name]:
218 fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand])
219 factor = 1 if strand == 0 else strand
220 correctedLine = dict([(key / ratio, value * factor) for key, value in self.bins[chromosome][name][strand].iteritems()])
221 plot.addLine(correctedLine, fullName, self.colors[i] if self.colors else None)
222 return plot
223
224 def _plot(self):
225 if self.merge:
226 multiplePlot = MultipleRPlotter(self.outputFileName, self.verbosity)
227 multiplePlot.setImageSize(self.width, self.height * len(self.bins.keys()))
228 progress = Progress(len(self.bins.keys()), "Plotting", options.verbosity)
229 for chromosome in sorted(self.bins.keys()):
230 plot = self._getPlotter(chromosome)
231 if self.merge:
232 multiplePlot.addPlot(plot)
233 else:
234 plot.plot()
235 progress.inc()
236 if self.merge:
237 multiplePlot.plot()
238 progress.done()
239
240 def _writeCsv(self):
241 if self.verbosity > 1:
242 print "Writing CSV file..."
243 csvHandle = open(self.csvFileName, "w")
244 csvHandle.write("chromosome;tag;strand")
245 if self.nbBins != 0:
246 xValues = range(self.start / self.sliceSize, max(self.sizes.values()) / self.sliceSize + 1)
247 for value in xValues:
248 csvHandle.write(";%d-%d" % (value * self.sliceSize + 1, (value+1) * self.sliceSize))
249 csvHandle.write("\n")
250 else:
251 xValues = []
252 for chromosome in self.bins:
253 for name in self.bins[chromosome]:
254 for strand in self.bins[chromosome][name]:
255 for bin in self.bins[chromosome][name][strand]:
256 xValues.extend(self.bins[chromosome][name][strand].keys())
257 xValues = sorted(list(set(xValues)))
258 for value in xValues:
259 csvHandle.write(";%d" % (value))
260 csvHandle.write("\n")
261 for chromosome in self.bins:
262 csvHandle.write("%s" % (chromosome))
263 for name in self.bins[chromosome]:
264 csvHandle.write(";%s" % (name))
265 for strand in self.bins[chromosome][name]:
266 csvHandle.write(";%s" % (STRANDTOSTR[strand]))
267 for bin in xValues:
268 csvHandle.write(";%.2f" % (self.bins[chromosome][name][strand].get(bin, 0)))
269 csvHandle.write("\n")
270 csvHandle.write(";")
271 csvHandle.write(";")
272 csvHandle.close()
273 if self.verbosity > 1:
274 print "...done"
275
276 def _writeGff(self):
277 if self.verbosity > 1:
278 print "Writing GFF file..."
279 writer = Gff3Writer(self.gffFileName, self.verbosity)
280 cpt = 1
281 for chromosome in self.bins:
282 for name in self.bins[chromosome]:
283 for strand in self.bins[chromosome][name]:
284 for bin in self.bins[chromosome][name][strand]:
285 transcript = Transcript()
286 transcript.setChromosome(chromosome)
287 transcript.setStart(bin)
288 if self.nbBins > 0:
289 transcript.setEnd(bin + self.sliceSize)
290 else:
291 transcript.setEnd(self.start)
292 transcript.setDirection(1 if strand == 0 else strand)
293 transcript.setTagValue("ID", "region%d" % (cpt))
294 cpt += 1
295 writer.write()
296 if self.verbosity > 1:
297 print "...done"
298
299 def run(self):
300 if self.sizes == None:
301 self._estimateSizes()
302 self._computeSliceSize()
303 self._initBins()
304 self._populateBins()
305 if self.normalization:
306 self._normalize()
307 if self.average:
308 self._computeAverage()
309 self._plot()
310 if self.csvFileName != None:
311 self._writeCsv()
312 if self.gffFileName != None:
313 self._writeGff()
314
315
316 if __name__ == "__main__":
317
318 description = "Get Distribution v1.0.2: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]"
319
320 parser = OptionParser(description = description)
321 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
322 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
323 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
324 parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]")
325 parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]")
326 parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]")
327 parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]")
328 parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]")
329 parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]")
330 parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]")
331 parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]")
332 parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]")
333 parser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]")
334 parser.add_option("-H", "--height", dest="height", action="store", default=300, type="int", help="height of the graphics [format: int] [default: 300]")
335 parser.add_option("-W", "--width", dest="width", action="store", default=600, type="int", help="width of the graphics [format: int] [default: 1000]")
336 parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]")
337 parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]")
338 parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]")
339 parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]")
340 parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]")
341 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
342 (options, args) = parser.parse_args()
343
344 gt = GetDistribution(options.verbosity)
345 gt.setInputFile(options.inputFileName, options.format)
346 gt.setOutputFile(options.outputFileName)
347 gt.setReferenceFile(options.referenceFileName)
348 gt.setNbBins(int(options.nbBins))
349 gt.set2Strands(options.bothStrands)
350 gt.setRegion(options.chromosome, options.start, options.end)
351 gt.setNormalization(options.normalize)
352 gt.setAverage(options.average)
353 gt.setYLimits(options.yMin, options.yMax)
354 gt.writeCsv(options.csv)
355 gt.writeGff(options.gff)
356 gt.setImageSize(options.height, options.width)
357 gt.setNames(options.names.split(","))
358 gt.setColors(None if options.colors == None else options.colors.split(","))
359 gt.setNormalization(options.normalize)
360 gt.mergePlots(options.mergePlots)
361 gt.run()
362