comparison SMART/Java/Python/GetDistribution.py @ 46:169d364ddd91

Uploaded
author m-zytnicki
date Mon, 30 Sep 2013 03:19:26 -0400
parents 44d5973c188c
children
comparison
equal deleted inserted replaced
45:e454402ba9d9 46:169d364ddd91
43 STRANDTOSTR = {1: "(+)", -1: "(-)", 0: ""} 43 STRANDTOSTR = {1: "(+)", -1: "(-)", 0: ""}
44 44
45 class GetDistribution(object): 45 class GetDistribution(object):
46 46
47 def __init__(self, verbosity): 47 def __init__(self, verbosity):
48 self.verbosity = verbosity 48 self.verbosity = verbosity
49 self.sizes = None 49 self.sizes = None
50 self.twoStrands = False 50 self.nbBins = None
51 self.start = 1 51 self.sliceSize = None
52 self.names = ["nbElements"] 52 self.twoStrands = False
53 self.average = False 53 self.start = 1
54 self.nbValues = {} 54 self.names = ["nbElements"]
55 self.height = 300 55 self.average = False
56 self.width = 600 56 self.nbValues = {}
57 self.colors = None 57 self.height = 300
58 self.gffFileName = None 58 self.width = 600
59 self.csvFileName = None 59 self.dots = False
60 self.yMin = None 60 self.colors = None
61 self.yMax = None 61 self.gffFileName = None
62 self.chromosome = None 62 self.csvFileName = None
63 self.merge = False 63 self.yMin = None
64 self.nbTranscripts = None 64 self.yMax = None
65 65 self.chromosome = None
66 def setInputFile(self, fileName, format): 66 self.merge = False
67 chooser = ParserChooser(self.verbosity) 67 self.nbTranscripts = None
68 chooser.findFormat(format) 68 self.factors = None
69 self.parser = chooser.getParser(fileName) 69 self.thicknessCurve = 1
70 self.sizePoliceLegend = 1.5
71
72 def setInputFiles(self, fileNames, format):
73 self.fileNames = fileNames
74 self.format = format
70 75
71 def setReferenceFile(self, fileName): 76 def setReferenceFile(self, fileName):
72 if fileName == None: 77 if fileName == None:
73 return 78 return
74 fastaParser = FastaParser(fileName, self.verbosity) 79 fastaParser = FastaParser(fileName, self.verbosity)
75 self.chromosomes = fastaParser.getRegions() 80 self.chromosomes = fastaParser.getRegions()
76 self.sizes = dict([region, fastaParser.getSizeOfRegion(region)] for region in self.chromosomes) 81 self.sizes = dict([region, fastaParser.getSizeOfRegion(region)] for region in self.chromosomes)
77 self.maxSize = max(self.sizes.values()) 82 self.maxSize = max(self.sizes.values())
78 83
79 def setRegion(self, chromosome, start, end): 84 def setRegion(self, chromosome, start, end):
80 if chromosome == None: 85 if chromosome == None or start == None or end == None:
81 return 86 return
82 self.maxSize = options.end 87 self.maxSize = options.end
83 self.sizes = {chromosome: end} 88 self.sizes = {chromosome: end}
84 self.chromosomes = [chromosome] 89 self.chromosomes = [chromosome]
85 self.chromosome = chromosome 90 self.chromosome = chromosome
88 93
89 def setOutputFile(self, fileName): 94 def setOutputFile(self, fileName):
90 self.outputFileName = fileName 95 self.outputFileName = fileName
91 96
92 def setNbBins(self, nbBins): 97 def setNbBins(self, nbBins):
93 self.nbBins = nbBins 98 if nbBins != None:
99 self.nbBins = int(nbBins)
100
101 def setBinSize(self, binSize):
102 if binSize != None:
103 self.sliceSize = int(binSize)
94 104
95 def set2Strands(self, twoStrands): 105 def set2Strands(self, twoStrands):
96 self.twoStrands = twoStrands 106 self.twoStrands = twoStrands
97 107
98 def setNames(self, names): 108 def setNames(self, names):
99 self.names = names 109 self.names = names
110 if len(self.names) == 1 and len(self.fileNames) > 1:
111 self.names = ["file %d" % (i+1) for i in range(len(self.fileNames))]
100 112
101 def setAverage(self, average): 113 def setAverage(self, average):
102 self.average = average 114 self.average = average
103 115
104 def setNormalization(self, normalization): 116 def setNormalization(self, normalization):
105 self.normalization = normalization 117 self.normalization = normalization
118
119 def setNormalizationFactors(self, factors):
120 self.factors = dict([name, 1.0] for name in self.names) if factors == None else dict(zip(self.names, factors))
106 121
107 def setImageSize(self, height, width): 122 def setImageSize(self, height, width):
108 self.height = height 123 self.height = height
109 self.width = width 124 self.width = width
110 125
126 def setDots(self, dots):
127 self.dots = dots
128
111 def setYLimits(self, yMin, yMax): 129 def setYLimits(self, yMin, yMax):
112 self.yMin = yMin 130 self.yMin = yMin
113 self.yMax = yMax 131 self.yMax = yMax
114 132
115 def setColors(self, colors): 133 def setColors(self, colors):
122 self.csvFileName = fileName 140 self.csvFileName = fileName
123 141
124 def mergePlots(self, merge): 142 def mergePlots(self, merge):
125 self.merge = merge 143 self.merge = merge
126 144
145 def setThicknessCurve(self, thickness) :
146 self.thickness = thickness
147
148 def setSizePoliceLegend(self, sizePoliceLegend):
149 self.sizePoliceLegend = sizePoliceLegend
150
127 def _estimateSizes(self): 151 def _estimateSizes(self):
128 progress = UnlimitedProgress(10000, "Reading input for chromosome size estimate", self.verbosity) 152 self.sizes = {}
129 self.sizes = {} 153 self.nbTranscripts = {}
130 for self.nbTranscripts, transcript in enumerate(self.parser.getIterator()): 154 for fileName in self.fileNames:
131 chromosome = transcript.getChromosome() 155 progress = UnlimitedProgress(10000, "Reading %s for chromosome size estimate" % (fileName), self.verbosity)
132 start = transcript.getStart() 156 parserChooser = ParserChooser(self.verbosity)
133 self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0)) 157 parserChooser.findFormat(self.format)
134 progress.inc() 158 parser = parserChooser.getParser(fileName)
135 progress.done() 159 for nbTranscripts, transcript in enumerate(parser.getIterator()):
160 if transcript.__class__.__name__ == "Mapping":
161 transcript = transcript.getTranscript()
162 chromosome = transcript.getChromosome()
163 start = transcript.getStart()
164 self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0))
165 progress.inc()
166 progress.done()
167 self.nbTranscripts[fileName] = nbTranscripts
136 168
137 def _computeSliceSize(self): 169 def _computeSliceSize(self):
138 if self.nbBins == 0: 170 if self.nbBins == 0:
139 return 171 return
140 tmp1 = int(max(self.sizes.values()) / float(self.nbBins)) 172 tmp1 = int(max(self.sizes.values()) / float(self.nbBins))
154 self.bins[chromosome][name][strand] = {} 186 self.bins[chromosome][name][strand] = {}
155 else: 187 else:
156 self.bins[chromosome][name][strand] = dict([(i * self.sliceSize + 1, 0.0) for i in range(self.start / self.sliceSize, self.sizes[chromosome] / self.sliceSize + 1)]) 188 self.bins[chromosome][name][strand] = dict([(i * self.sliceSize + 1, 0.0) for i in range(self.start / self.sliceSize, self.sizes[chromosome] / self.sliceSize + 1)])
157 189
158 def _populateBins(self): 190 def _populateBins(self):
159 if self.nbTranscripts == None: 191 for id, fileName in enumerate(self.fileNames):
160 progress = UnlimitedProgress(10000, "Counting data", self.verbosity) 192 if self.nbTranscripts == None:
161 else: 193 progress = UnlimitedProgress(10000, "Counting data", self.verbosity)
162 progress = Progress(self.nbTranscripts, "Counting data", self.verbosity)
163 for transcript in self.parser.getIterator():
164 if transcript.__class__.__name__ == "Mapping":
165 transcript = transcript.getTranscript()
166 progress.inc()
167 chromosome = transcript.getChromosome()
168 start = transcript.getStart()
169 if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end):
170 continue
171 strand = transcript.getDirection() if self.twoStrands else 0
172 if self.nbBins != 0:
173 bin = (start / self.sliceSize) * self.sliceSize + 1
174 else: 194 else:
175 bin = start 195 progress = Progress(self.nbTranscripts[fileName], "Counting data", self.verbosity)
176 for name in self.names: 196 parserChooser = ParserChooser(self.verbosity)
177 value = float(transcript.tags.get(name, 1)) 197 parserChooser.findFormat(self.format)
178 self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value 198 parser = parserChooser.getParser(fileName)
179 self.nbValues[name] = self.nbValues.get(name, 0) + value 199 for transcript in parser.getIterator():
180 progress.done() 200 if transcript.__class__.__name__ == "Mapping":
181 201 transcript = transcript.getTranscript()
182 def _normalize(self): 202 progress.inc()
183 average = float(sum(self.nbValues)) / len(self.nbValues.keys()) 203 chromosome = transcript.getChromosome()
184 factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues) 204 start = transcript.getStart()
205 if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end):
206 continue
207 strand = transcript.getDirection() if self.twoStrands else 0
208 if self.nbBins != 0:
209 bin = (start / self.sliceSize) * self.sliceSize + 1
210 else:
211 bin = start
212 if len(self.fileNames) > 1:
213 nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1
214 name = self.names[id]
215 self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + nbElements
216 self.nbValues[name] = self.nbValues.get(name, 0) + nbElements
217 else:
218 for name in self.names:
219 value = float(transcript.tags.get(name, 1))
220 self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value
221 self.nbValues[name] = self.nbValues.get(name, 0) + value
222 progress.done()
223
224 def _normalizeFactors(self):
185 for chromosome in self.bins: 225 for chromosome in self.bins:
186 for name in self.bins[chromosome]: 226 for name in self.bins[chromosome]:
187 for strand in self.bins[chromosome][name]: 227 for strand in self.bins[chromosome][name]:
188 for bin in self.bins[chromosome][name][strand]: 228 for bin in self.bins[chromosome][name][strand]:
189 self.bins[chromosome][name][strand][bin] *= factors[name] 229 self.bins[chromosome][name][strand][bin] *= self.factors[name]
230
231 def _normalize(self):
232 average = float(sum(self.nbValues.values())) / len(self.nbValues.keys())
233 self.factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues)
234 self._normalizeFactors()
190 235
191 def _computeAverage(self): 236 def _computeAverage(self):
192 for chromosome in self.bins: 237 for chromosome in self.bins:
193 for name in self.bins[chromosome]: 238 for name in self.bins[chromosome]:
194 for strand in self.bins[chromosome][name]: 239 for strand in self.bins[chromosome][name]:
196 self.bins[chromosome][name][strand][bin] = float(self.bins[chromosome][name][strand][bin]) / self.sliceSize 241 self.bins[chromosome][name][strand][bin] = float(self.bins[chromosome][name][strand][bin]) / self.sliceSize
197 242
198 def _getPlotter(self, chromosome): 243 def _getPlotter(self, chromosome):
199 plot = RPlotter("%s_%s.png" % (os.path.splitext(self.outputFileName)[0], chromosome), self.verbosity) 244 plot = RPlotter("%s_%s.png" % (os.path.splitext(self.outputFileName)[0], chromosome), self.verbosity)
200 plot.setImageSize(self.width, self.height) 245 plot.setImageSize(self.width, self.height)
246 plot.setLineWidth(self.thickness)
247 plot.setSizePoliceLegend(self.sizePoliceLegend)
248 if self.dots:
249 plot.setPoints(True)
201 if self.sizes[chromosome] <= 1000: 250 if self.sizes[chromosome] <= 1000:
202 unit = "nt." 251 unit = "nt."
203 ratio = 1.0 252 ratio = 1.0
204 elif self.sizes[chromosome] <= 1000000: 253 elif self.sizes[chromosome] <= 1000000:
205 unit = "kb" 254 unit = "kb"
210 if self.yMin != None: 259 if self.yMin != None:
211 plot.setMinimumY(self.yMin) 260 plot.setMinimumY(self.yMin)
212 if self.yMax != None: 261 if self.yMax != None:
213 plot.setMaximumY(self.yMax) 262 plot.setMaximumY(self.yMax)
214 plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit)) 263 plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit))
215 plot.setLegend(True) 264 if len(self.names) > 1:
265 plot.setLegend(True, True)
216 for i, name in enumerate(self.bins[chromosome]): 266 for i, name in enumerate(self.bins[chromosome]):
217 for strand in self.bins[chromosome][name]: 267 for strand in self.bins[chromosome][name]:
218 fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand]) 268 #fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand])
269 fullName = name.replace("_", " ")[:6]
219 factor = 1 if strand == 0 else strand 270 factor = 1 if strand == 0 else strand
220 correctedLine = dict([(key / ratio, value * factor) for key, value in self.bins[chromosome][name][strand].iteritems()]) 271 correctedLine = dict([(key / ratio, value * factor) for key, value in self.bins[chromosome][name][strand].iteritems()])
221 plot.addLine(correctedLine, fullName, self.colors[i] if self.colors else None) 272 plot.addLine(correctedLine, fullName, self.colors[i] if self.colors else None)
222 return plot 273 return plot
223 274
297 print "...done" 348 print "...done"
298 349
299 def run(self): 350 def run(self):
300 if self.sizes == None: 351 if self.sizes == None:
301 self._estimateSizes() 352 self._estimateSizes()
302 self._computeSliceSize() 353 if self.sliceSize == None:
354 self._computeSliceSize()
303 self._initBins() 355 self._initBins()
304 self._populateBins() 356 self._populateBins()
305 if self.normalization: 357 if self.normalization:
306 self._normalize() 358 self._normalize()
359 if self.factors != None:
360 self._normalizeFactors()
307 if self.average: 361 if self.average:
308 self._computeAverage() 362 self._computeAverage()
309 self._plot() 363 self._plot()
310 if self.csvFileName != None: 364 if self.csvFileName != None:
311 self._writeCsv() 365 self._writeCsv()
316 if __name__ == "__main__": 370 if __name__ == "__main__":
317 371
318 description = "Get Distribution v1.0.2: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]" 372 description = "Get Distribution v1.0.2: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]"
319 373
320 parser = OptionParser(description = description) 374 parser = OptionParser(description = description)
321 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") 375 parser.add_option("-i", "--input", dest="inputFileNames", action="store", type="string", help="input files separated by commas [compulsory] [format: string]")
322 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") 376 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
323 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") 377 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
324 parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]") 378 parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]")
325 parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") 379 parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]")
326 parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") 380 parser.add_option("-B", "--binSize", dest="binSize", action="store", default=None, type="int", help="bin size [default: None] [format: int]")
327 parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") 381 parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]")
328 parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") 382 parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]")
329 parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") 383 parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]")
330 parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") 384 parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]")
331 parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") 385 parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]")
332 parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]") 386 parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]")
333 parser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]") 387 parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]")
334 parser.add_option("-H", "--height", dest="height", action="store", default=300, type="int", help="height of the graphics [format: int] [default: 300]") 388 parser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]")
335 parser.add_option("-W", "--width", dest="width", action="store", default=600, type="int", help="width of the graphics [format: int] [default: 1000]") 389 parser.add_option("-H", "--height", dest="height", action="store", default=500, type="int", help="height of the graphics [format: int] [default: 300]")
336 parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]") 390 parser.add_option("-W", "--width", dest="width", action="store", default=800, type="int", help="width of the graphics [format: int] [default: 1000]")
337 parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]") 391 parser.add_option("-t", "--thickness", dest="lineThickness", action="store", default=1, type="int", help="thickness of the lines [format : int] [default : 1]")
338 parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]") 392 parser.add_option("-d", "--policeLegend", dest="sizePoliceLegend", action="store", default=1.5, type="float", help="size of the police of the legend [format : float] [default : 1.5]")
339 parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]") 393 parser.add_option("-D", "--dots", dest="dots", action="store_true", default=False, help="plot dots instead of lines [format : bool] [default : false]")
340 parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]") 394 parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]")
341 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") 395 parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]")
396 parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]")
397 parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]")
398 parser.add_option("-Z", "--normalizeFac", dest="normalizeFactors", action="store", default=None, help="normalize data with given factors (when panels are different) [format: string]")
399 parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]")
400 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
342 (options, args) = parser.parse_args() 401 (options, args) = parser.parse_args()
343 402
344 gt = GetDistribution(options.verbosity) 403 gt = GetDistribution(options.verbosity)
345 gt.setInputFile(options.inputFileName, options.format) 404 gt.setInputFiles(options.inputFileNames.split(","), options.format)
346 gt.setOutputFile(options.outputFileName) 405 gt.setOutputFile(options.outputFileName)
347 gt.setReferenceFile(options.referenceFileName) 406 gt.setReferenceFile(options.referenceFileName)
348 gt.setNbBins(int(options.nbBins)) 407 gt.setNbBins(options.nbBins)
408 gt.setBinSize(options.binSize)
349 gt.set2Strands(options.bothStrands) 409 gt.set2Strands(options.bothStrands)
350 gt.setRegion(options.chromosome, options.start, options.end) 410 gt.setRegion(options.chromosome, options.start, options.end)
351 gt.setNormalization(options.normalize) 411 gt.setNormalization(options.normalize)
352 gt.setAverage(options.average) 412 gt.setAverage(options.average)
353 gt.setYLimits(options.yMin, options.yMax) 413 gt.setYLimits(options.yMin, options.yMax)
354 gt.writeCsv(options.csv) 414 gt.writeCsv(options.csv)
355 gt.writeGff(options.gff) 415 gt.writeGff(options.gff)
356 gt.setImageSize(options.height, options.width) 416 gt.setImageSize(options.height, options.width)
357 gt.setNames(options.names.split(",")) 417 gt.setNames(options.names.split(","))
418 gt.setThicknessCurve(options.lineThickness)
419 gt.setSizePoliceLegend(options.sizePoliceLegend)
358 gt.setColors(None if options.colors == None else options.colors.split(",")) 420 gt.setColors(None if options.colors == None else options.colors.split(","))
421 gt.setDots(options.dots)
359 gt.setNormalization(options.normalize) 422 gt.setNormalization(options.normalize)
423 gt.setNormalizationFactors(None if options.normalizeFactors == None else [float(factor) for factor in options.normalizeFactors.split(",")])
360 gt.mergePlots(options.mergePlots) 424 gt.mergePlots(options.mergePlots)
361 gt.run() 425 gt.run()
362 426