comparison SMART/Java/Python/plotCoverage.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 44d5973c188c
children 169d364ddd91
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os, os.path, subprocess, glob, random
32 from optparse import OptionParser
33 from SMART.Java.Python.structure.Interval import Interval
34 from SMART.Java.Python.structure.Transcript import Transcript
35 from commons.core.parsing.ParserChooser import ParserChooser
36 from SMART.Java.Python.misc.RPlotter import RPlotter
37 from SMART.Java.Python.misc.Progress import Progress
38 from commons.core.parsing.FastaParser import FastaParser
39
40 strands = [-1, 1]
41 colors = {-1: "blue", 1: "red", 0: "black"}
42 colorLine = "black"
43
44 def parseTargetField(field):
45 strand = "+"
46 splittedFieldSpace = field.split()
47 splittedFieldPlus = field.split("+", 4)
48 if len(splittedFieldSpace) == 3:
49 id, start, end = splittedFieldSpace
50 elif len(splittedFieldSpace) == 4:
51 id, start, end, strand = splittedFieldSpace
52 elif len(splittedFieldPlus) == 3:
53 id, start, end = splittedFieldPlus
54 elif len(splittedFieldPlus) == 4:
55 id, start, end, strand = splittedFieldPlus
56 else:
57 raise Exception("Cannot parse Target field '%s'." % (field))
58 return (id, int(start), int(end), strand)
59
60
61 class SimpleTranscript(object):
62 def __init__(self, transcript1, transcript2, color = None):
63 self.start = max(0, transcript1.getStart() - transcript2.getStart())
64 self.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart())
65 self.strand = transcript1.getDirection() * transcript2.getDirection()
66 self.exons = []
67 for exon in transcript1.getExons():
68 if exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd():
69 start = max(0, exon.getStart() - transcript2.getStart())
70 end = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart())
71 self.addExon(start, end, self.strand, color)
72
73 def addExon(self, start, end, strand, color):
74 exon = SimpleExon(start, end, strand, color)
75 self.exons.append(exon)
76
77 def getRScript(self, yOffset, height):
78 rString = ""
79 previousEnd = None
80 for exon in sorted(self.exons, key=lambda exon: exon.start):
81 if previousEnd != None:
82 rString += "segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine)
83 rString += exon.getRScript(yOffset, height)
84 previousEnd = exon.end
85 return rString
86
87
88 class SimpleExon(object):
89 def __init__(self, start, end, strand, color = None):
90 self.start = start
91 self.end = end
92 self.strand = strand
93 self.color = color
94
95 def getRScript(self, yOffset, height):
96 color = self.color if self.color != None else colors[self.strand]
97 return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine)
98
99
100 class Plotter(object):
101
102 def __init__(self, seed, index, verbosity):
103 self.seed = seed
104 self.index = index
105 self.verbosity = verbosity
106 self.maxCoverage = 0
107 self.maxOverlap = 0
108 self.log = ""
109 self.merge = False
110 self.width = 1500
111 self.heigth = 1000
112 self.xLabel = ""
113 self.yLabel = ""
114 self.title = None
115 self.absPath = os.getcwd()
116 self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index)
117 self.coverageScript = ""
118 self.overlapScript = ""
119 self.outputFileName = None
120
121 def setOutputFileName(self, fileName):
122 self.outputFileName = fileName
123
124 def setTranscript(self, transcript):
125 self.transcript = transcript
126 self.name = transcript.getName()
127 self.size = transcript.getEnd() - transcript.getStart() + 1
128 if self.title == None:
129 self.title = self.name
130 else:
131 self.title += " " + self.name
132
133 def setTitle(self, title):
134 self.title = title + " " + self.name
135
136 def setPlotSize(self, width, height):
137 self.width = width
138 self.height = height
139
140 def setLabels(self, xLabel, yLabel):
141 self.xLabel = xLabel
142 self.yLabel = yLabel
143
144 def setMerge(self, merge):
145 self.merge = merge
146
147 def setCoverageData(self, coverage):
148 outputCoveragePerStrand = dict([strand, 0] for strand in strands)
149 outputCoverage = 0
150 dataFile = open(os.path.abspath(self.coverageDataFileName), "w")
151 for position in range(self.size+1):
152 sumValue = 0
153 found = False
154 dataFile.write("%d\t" % (position))
155 for strand in strands:
156 value = coverage[strand].get(position, 0)
157 sumValue += value
158 dataFile.write("%d\t" % (value))
159 if value > 0:
160 found = True
161 outputCoveragePerStrand[strand] += 1
162 self.maxCoverage = max(self.maxCoverage, sumValue)
163 dataFile.write("%d\n" % (sumValue))
164 if found:
165 outputCoverage += 1
166 dataFile.close()
167 self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100)
168 self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % (os.path.abspath(self.coverageDataFileName))
169 self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1])
170 self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1])
171 self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0])
172
173 def setOverlapData(self, overlap):
174 height = 1
175 self.maxOverlap = (len(overlap) + 1) * height
176 thisElement = SimpleTranscript(self.transcript, self.transcript, "black")
177 self.overlapScript += thisElement.getRScript(0, height)
178 for cpt, transcript in enumerate(sorted(overlap, cmp=lambda c1, c2: c1.start - c2.start if c1.start != c2.start else c1.end - c2.end)):
179 self.overlapScript += transcript.getRScript((cpt + 1) * height, height)
180
181 def getFirstLine(self, suffix = None):
182 return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix == None or self.merge else "_%s" % (suffix), self.width, self.height)
183
184 def getLastLine(self):
185 return "dev.off()\n"
186
187 def startR(self, fileName, script):
188 scriptFile = open(fileName, "w")
189 scriptFile.write(script)
190 scriptFile.close()
191 command = "R CMD BATCH %s" % (fileName)
192 status = subprocess.call(command, shell=True)
193 if status != 0:
194 raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status))
195
196 def plot(self):
197 if self.merge:
198 fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index)
199 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, max(self.maxCoverage, self.maxOverlap), self.title)
200 script = self.getFirstLine() + plotLine + self.overlapScript + self.coverageScript + self.getLastLine()
201 self.startR(fileName, script)
202 else:
203 fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index)
204 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxOverlap, self.title)
205 script = self.getFirstLine("overlap") + plotLine + self.overlapScript + self.getLastLine()
206 self.startR(fileName, script)
207 fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index)
208 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxCoverage, self.title)
209 script = self.getFirstLine("coverage") + plotLine + self.coverageScript + self.getLastLine()
210 self.startR(fileName, script)
211
212
213 class PlotParser(object):
214
215 def __init__(self, verbosity):
216 self.verbosity = verbosity
217 self.parsers = [None, None]
218 self.sequenceParser = None
219 self.seed = random.randint(0, 10000)
220 self.title = ""
221 self.merge = False
222
223 def __del__(self):
224 for fileName in glob.glob("tmpFile_%d*.dat" % (self.seed)):
225 os.remove(fileName)
226 for fileName in glob.glob("%s*.R" % (os.path.abspath(self.outputFileName))):
227 os.remove(fileName)
228 for fileName in glob.glob("%s*.Rout" % (os.path.abspath(self.outputFileName))):
229 os.remove(fileName)
230
231 def addInput(self, inputNb, fileName, fileFormat):
232 if fileName == None:
233 return
234 chooser = ParserChooser(self.verbosity)
235 chooser.findFormat(fileFormat)
236 self.parsers[inputNb] = chooser.getParser(fileName)
237 if inputNb == 0:
238 self.parsers[1] = self.parsers[0]
239
240 def addSequence(self, fileName):
241 if fileName == None:
242 return
243 self.sequenceParser = FastaParser(fileName, self.verbosity)
244
245 def setOutput(self, fileName):
246 self.outputFileName = fileName
247
248 def setPlotSize(self, width, height):
249 self.width = width
250 self.height = height
251
252 def setLabels(self, xLabel, yLabel):
253 self.xLabel = xLabel
254 self.yLabel = yLabel
255
256 def setTitle(self, title):
257 self.title = title
258
259 def setMerge(self, merge):
260 self.merge = merge
261
262 def initializeDataFromSequences(self):
263 self.sizes = {}
264 self.coverage = {}
265 self.overlap = {}
266 for region in self.sequenceParser.getRegions():
267 self.sizes[region] = self.sequenceParser.getSizeOfRegion(region)
268 self.coverage[region] = {}
269 self.overlap[region] = []
270 for strand in strands:
271 self.coverage[region][strand] = {}
272 self.coverage[region][strand][1] = 0
273 self.coverage[region][strand][self.sizes[region]] = 0
274
275 def initializeDataFromTranscripts(self):
276 self.coverage = dict([i, None] for i in range(self.parsers[1].getNbTranscripts()))
277 self.overlap = dict([i, None] for i in range(self.parsers[1].getNbTranscripts()))
278 self.sizes = dict([i, 0] for i in range(self.parsers[1].getNbTranscripts()))
279 progress = Progress(self.parsers[1].getNbTranscripts(), "Reading regions", self.verbosity)
280 for cpt, transcript in enumerate(self.parsers[1].getIterator()):
281 self.coverage[cpt] = {}
282 self.overlap[cpt] = []
283 for strand in strands:
284 self.coverage[cpt][strand] = {}
285 self.coverage[cpt][strand][0] = 0
286 self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0
287 for exon in transcript.getExons():
288 self.sizes[cpt] += exon.getSize()
289 progress.inc()
290 progress.done()
291
292 def initialize(self):
293 if self.sequenceParser == None:
294 self.initializeDataFromTranscripts()
295 else:
296 self.initializeDataFromSequences()
297
298 def computeCoverage(self, transcript1, transcript2, id):
299 strand = transcript1.getDirection() * transcript2.getDirection()
300 for exon1 in transcript1.getExons():
301 for exon2 in transcript2.getExons():
302 if exon1.overlapWith(exon2):
303 for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1):
304 relativePosition = position - transcript2.getStart() + 1
305 self.coverage[id][strand][relativePosition] = self.coverage[id][strand].get(relativePosition, 0) + 1
306
307 def computeOverlap(self, transcript1, transcript2, id):
308 simpleTranscript = SimpleTranscript(transcript1, transcript2)
309 self.overlap[id].append(simpleTranscript)
310
311 def compute2TranscriptFiles(self):
312 progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity)
313 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
314 for transcript1 in self.parsers[0].getIterator():
315 if transcript1.overlapWithExon(transcript2):
316 self.computeCoverage(transcript1, transcript2, cpt2)
317 self.computeOverlap(transcript1, transcript2, cpt2)
318 progress.inc()
319 progress.done()
320
321 def extractReferenceQueryMapping(self, mapping):
322 queryTranscript = mapping.getTranscript()
323 referenceTranscript = Transcript()
324 referenceTranscript.setChromosome(queryTranscript.getChromosome())
325 referenceTranscript.setName(queryTranscript.getChromosome())
326 referenceTranscript.setDirection("+")
327 referenceTranscript.setEnd(self.sizes[queryTranscript.getChromosome()])
328 referenceTranscript.setStart(1)
329 return (referenceTranscript, queryTranscript)
330
331 def extractReferenceQuery(self, inputTranscript):
332 if "Target" not in inputTranscript.getTagNames():
333 raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript))
334 id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target"))
335 if id not in self.sizes:
336 raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." % (id, inputTranscript))
337 referenceTranscript = Transcript()
338 referenceTranscript.setChromosome(id)
339 referenceTranscript.setName(id)
340 referenceTranscript.setDirection("+")
341 referenceTranscript.setEnd(self.sizes[id])
342 referenceTranscript.setStart(1)
343 queryTranscript = Transcript()
344 queryTranscript.setChromosome(id)
345 queryTranscript.setName(id)
346 queryTranscript.setStart(start)
347 queryTranscript.setEnd(end)
348 queryTranscript.setDirection(strand)
349 if inputTranscript.getNbExons() > 1:
350 factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart())
351 for exon in inputTranscript.getExons():
352 newExon = Interval()
353 newExon.setChromosome(id)
354 newExon.setDirection(strand)
355 if "Target" in inputTranscript.getTagNames():
356 id, start, end, strand = parseTargetField(exon.getTagValue("Target"))
357 newExon.setStart(start)
358 newExon.setEnd(end)
359 else:
360 newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start)
361 newExon.setEnd( int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start)
362 queryTranscript.addExon(newExon)
363 return (referenceTranscript, queryTranscript)
364
365 def compute1TranscriptFiles(self):
366 progress = Progress(self.parsers[1].getNbItems(), "Comparing regions", self.verbosity)
367 for transcript in self.parsers[1].getIterator():
368 if transcript.__class__.__name__ == "Mapping":
369 referenceTranscript, queryTranscript = self.extractReferenceQueryMapping(transcript)
370 else:
371 referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript)
372 self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName())
373 self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName())
374 progress.inc()
375 progress.done()
376
377 def compute(self):
378 if self.sequenceParser == None:
379 self.compute2TranscriptFiles()
380 else:
381 self.compute1TranscriptFiles()
382
383 def plotTranscript(self, index, transcript):
384 plotter = Plotter(self.seed, index, self.verbosity)
385 plotter.setOutputFileName(self.outputFileName)
386 plotter.setTranscript(transcript)
387 plotter.setTitle(self.title)
388 plotter.setLabels(self.xLabel, self.yLabel)
389 plotter.setPlotSize(self.width, self.height)
390 plotter.setCoverageData(self.coverage[index])
391 plotter.setOverlapData(self.overlap[index])
392 plotter.setMerge(self.merge)
393 plotter.plot()
394 output = plotter.log
395 return output
396
397 def plot1TranscriptFile(self):
398 self.outputCoverage = {}
399 self.outputCoveragePerStrand = {}
400 output = ""
401 progress = Progress(len(self.sequenceParser.getRegions()), "Plotting regions", self.verbosity)
402 for cpt2, region in enumerate(self.sequenceParser.getRegions()):
403 transcript = Transcript()
404 transcript.setName(region)
405 transcript.setDirection("+")
406 transcript.setEnd(self.sizes[region])
407 transcript.setStart(1)
408 output += self.plotTranscript(region, transcript)
409 progress.inc()
410 progress.done()
411 if self.verbosity > 0:
412 print output
413
414 def plot2TranscriptFiles(self):
415 self.outputCoverage = [0] * self.parsers[1].getNbTranscripts()
416 self.outputCoveragePerStrand = [None] * self.parsers[1].getNbTranscripts()
417 for cpt in range(self.parsers[1].getNbTranscripts()):
418 self.outputCoveragePerStrand[cpt] = dict([strand, 0] for strand in strands)
419 progress = Progress(self.parsers[1].getNbTranscripts(), "Plotting regions", self.verbosity)
420 output = ""
421 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
422 output += self.plotTranscript(cpt2, transcript2)
423 progress.inc()
424 progress.done()
425 if self.verbosity > 0:
426 print output
427
428 def plot(self):
429 if self.sequenceParser == None:
430 self.plot2TranscriptFiles()
431 else:
432 self.plot1TranscriptFile()
433
434 def start(self):
435 self.initialize()
436 self.compute()
437 self.plot()
438
439
440 if __name__ == "__main__":
441
442 # parse command line
443 description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. [Category: Visualization]"
444
445 parser = OptionParser(description = description)
446 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript or mapping format given by -f]")
447 parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript or mapping file format]")
448 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
449 parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]")
450 parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]")
451 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
452 parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]")
453 parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]")
454 parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]")
455 parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]")
456 parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]")
457 parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]")
458 parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]")
459 parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]")
460 parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]")
461 parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]")
462 parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
463 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
464 (options, args) = parser.parse_args()
465
466 colors[1] = options.plusColor
467 colors[-1] = options.minusColor
468 colors[0] = options.sumColor
469 colorLine = options.lineColor
470
471 pp = PlotParser(options.verbosity)
472 pp.addInput(0, options.inputFileName1, options.inputFormat1)
473 pp.addInput(1, options.inputFileName2, options.inputFormat2)
474 pp.addSequence(options.inputSequence)
475 pp.setOutput(options.outputFileName if os.path.isabs(options.outputFileName) else os.path.join(options.working_Dir, options.outputFileName))
476 pp.setPlotSize(options.width, options.height)
477 pp.setLabels(options.xLabel, options.yLabel)
478 pp.setTitle(options.title)
479 pp.setMerge(options.merge)
480 pp.start()
481