comparison SMART/Java/Python/plotCoverage.py @ 46:169d364ddd91

Uploaded
author m-zytnicki
date Mon, 30 Sep 2013 03:19:26 -0400
parents 44d5973c188c
children
comparison
equal deleted inserted replaced
45:e454402ba9d9 46:169d364ddd91
30 # 30 #
31 import os, os.path, subprocess, glob, random 31 import os, os.path, subprocess, glob, random
32 from optparse import OptionParser 32 from optparse import OptionParser
33 from SMART.Java.Python.structure.Interval import Interval 33 from SMART.Java.Python.structure.Interval import Interval
34 from SMART.Java.Python.structure.Transcript import Transcript 34 from SMART.Java.Python.structure.Transcript import Transcript
35 from commons.core.parsing.ParserChooser import ParserChooser 35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
36 from SMART.Java.Python.misc.RPlotter import RPlotter 36 from SMART.Java.Python.misc.RPlotter import RPlotter
37 from SMART.Java.Python.misc.Progress import Progress 37 from SMART.Java.Python.misc.Progress import Progress
38 from commons.core.parsing.FastaParser import FastaParser 38 from commons.core.parsing.FastaParser import FastaParser
39 39
40 strands = [-1, 1] 40 strands = [-1, 1]
41 colors = {-1: "blue", 1: "red", 0: "black"} 41 colors = {-1: "blue", 1: "red", 0: "black"}
42 colorLine = "black" 42 colorLine = "black"
43 43
44 def parseTargetField(field): 44 def parseTargetField(field):
45 strand = "+" 45 strand = "+"
46 splittedFieldSpace = field.split() 46 splittedFieldSpace = field.split()
47 splittedFieldPlus = field.split("+", 4) 47 splittedFieldPlus = field.split("+", 4)
48 if len(splittedFieldSpace) == 3: 48 if len(splittedFieldSpace) == 3:
49 id, start, end = splittedFieldSpace 49 id, start, end = splittedFieldSpace
50 elif len(splittedFieldSpace) == 4: 50 elif len(splittedFieldSpace) == 4:
51 id, start, end, strand = splittedFieldSpace 51 id, start, end, strand = splittedFieldSpace
52 elif len(splittedFieldPlus) == 3: 52 elif len(splittedFieldPlus) == 3:
53 id, start, end = splittedFieldPlus 53 id, start, end = splittedFieldPlus
54 elif len(splittedFieldPlus) == 4: 54 elif len(splittedFieldPlus) == 4:
55 id, start, end, strand = splittedFieldPlus 55 id, start, end, strand = splittedFieldPlus
56 else: 56 else:
57 raise Exception("Cannot parse Target field '%s'." % (field)) 57 raise Exception("Cannot parse Target field '%s'." % (field))
58 return (id, int(start), int(end), strand) 58 return (id, int(start), int(end), strand)
59 59
60 60
61 class SimpleTranscript(object): 61 class SimpleTranscript(object):
62 def __init__(self, transcript1, transcript2, color = None): 62 def __init__(self, transcript1, transcript2, color = None):
63 self.start = max(0, transcript1.getStart() - transcript2.getStart()) 63 self.start = max(0, transcript1.getStart() - transcript2.getStart())
64 self.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart()) 64 self.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart())
65 self.strand = transcript1.getDirection() * transcript2.getDirection() 65 self.strand = transcript1.getDirection() * transcript2.getDirection()
66 self.exons = [] 66 self.exons = []
67 for exon in transcript1.getExons(): 67 for exon in transcript1.getExons():
68 if exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd(): 68 if exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd():
69 start = max(0, exon.getStart() - transcript2.getStart()) 69 start = max(0, exon.getStart() - transcript2.getStart())
70 end = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart()) 70 end = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart())
71 self.addExon(start, end, self.strand, color) 71 self.addExon(start, end, self.strand, color)
72 72
73 def addExon(self, start, end, strand, color): 73 def addExon(self, start, end, strand, color):
74 exon = SimpleExon(start, end, strand, color) 74 exon = SimpleExon(start, end, strand, color)
75 self.exons.append(exon) 75 self.exons.append(exon)
76 76
77 def getRScript(self, yOffset, height): 77 def getRScript(self, yOffset, height):
78 rString = "" 78 rString = ""
79 previousEnd = None 79 previousEnd = None
80 for exon in sorted(self.exons, key=lambda exon: exon.start): 80 for exon in sorted(self.exons, key=lambda exon: exon.start):
81 if previousEnd != None: 81 if previousEnd != None:
82 rString += "segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine) 82 rString += "segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine)
83 rString += exon.getRScript(yOffset, height) 83 rString += exon.getRScript(yOffset, height)
84 previousEnd = exon.end 84 previousEnd = exon.end
85 return rString 85 return rString
86 86
87 87
88 class SimpleExon(object): 88 class SimpleExon(object):
89 def __init__(self, start, end, strand, color = None): 89 def __init__(self, start, end, strand, color = None):
90 self.start = start 90 self.start = start
91 self.end = end 91 self.end = end
92 self.strand = strand 92 self.strand = strand
93 self.color = color 93 self.color = color
94 94
95 def getRScript(self, yOffset, height): 95 def getRScript(self, yOffset, height):
96 color = self.color if self.color != None else colors[self.strand] 96 color = self.color if self.color != None else colors[self.strand]
97 return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine) 97 return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine)
98 98
99 99
100 class Plotter(object): 100 class Plotter(object):
101 101
102 def __init__(self, seed, index, verbosity): 102 def __init__(self, seed, index, verbosity):
103 self.seed = seed 103 self.seed = seed
104 self.index = index 104 self.index = index
105 self.verbosity = verbosity 105 self.verbosity = verbosity
106 self.maxCoverage = 0 106 self.maxCoverage = 0
107 self.maxOverlap = 0 107 self.maxOverlap = 0
108 self.log = "" 108 self.log = ""
109 self.merge = False 109 self.merge = False
110 self.width = 1500 110 self.width = 1500
111 self.heigth = 1000 111 self.heigth = 1000
112 self.xLabel = "" 112 self.xLabel = ""
113 self.yLabel = "" 113 self.yLabel = ""
114 self.title = None 114 self.title = None
115 self.absPath = os.getcwd() 115 self.absPath = os.getcwd()
116 self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index) 116 self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index)
117 self.coverageScript = "" 117 self.coverageScript = ""
118 self.overlapScript = "" 118 self.overlapScript = ""
119 self.outputFileName = None 119 self.outputFileName = None
120 120
121 def setOutputFileName(self, fileName): 121 def setOutputFileName(self, fileName):
122 self.outputFileName = fileName 122 self.outputFileName = fileName
123 123
124 def setTranscript(self, transcript): 124 def setTranscript(self, transcript):
125 self.transcript = transcript 125 self.transcript = transcript
126 self.name = transcript.getName() 126 self.name = transcript.getName()
127 self.size = transcript.getEnd() - transcript.getStart() + 1 127 self.size = transcript.getEnd() - transcript.getStart() + 1
128 if self.title == None: 128 if self.title == None:
129 self.title = self.name 129 self.title = self.name
130 else: 130 else:
131 self.title += " " + self.name 131 self.title += " " + self.name
132 132
133 def setTitle(self, title): 133 def setTitle(self, title):
134 self.title = title + " " + self.name 134 self.title = title + " " + self.name
135 135
136 def setPlotSize(self, width, height): 136 def setPlotSize(self, width, height):
137 self.width = width 137 self.width = width
138 self.height = height 138 self.height = height
139 139
140 def setLabels(self, xLabel, yLabel): 140 def setLabels(self, xLabel, yLabel):
141 self.xLabel = xLabel 141 self.xLabel = xLabel
142 self.yLabel = yLabel 142 self.yLabel = yLabel
143 143
144 def setMerge(self, merge): 144 def setMerge(self, merge):
145 self.merge = merge 145 self.merge = merge
146 146
147 def setCoverageData(self, coverage): 147 def setCoverageData(self, coverage):
148 outputCoveragePerStrand = dict([strand, 0] for strand in strands) 148 outputCoveragePerStrand = dict([strand, 0] for strand in strands)
149 outputCoverage = 0 149 outputCoverage = 0
150 dataFile = open(os.path.abspath(self.coverageDataFileName), "w") 150 dataFile = open(os.path.abspath(self.coverageDataFileName), "w")
151 for position in range(self.size+1): 151 for position in range(self.size+1):
152 sumValue = 0 152 sumValue = 0
153 found = False 153 found = False
154 dataFile.write("%d\t" % (position)) 154 dataFile.write("%d\t" % (position))
155 for strand in strands: 155 for strand in strands:
156 value = coverage[strand].get(position, 0) 156 value = coverage[strand].get(position, 0)
157 sumValue += value 157 sumValue += value
158 dataFile.write("%d\t" % (value)) 158 dataFile.write("%d\t" % (value))
159 if value > 0: 159 if value > 0:
160 found = True 160 found = True
161 outputCoveragePerStrand[strand] += 1 161 outputCoveragePerStrand[strand] += 1
162 self.maxCoverage = max(self.maxCoverage, sumValue) 162 self.maxCoverage = max(self.maxCoverage, sumValue)
163 dataFile.write("%d\n" % (sumValue)) 163 dataFile.write("%d\n" % (sumValue))
164 if found: 164 if found:
165 outputCoverage += 1 165 outputCoverage += 1
166 dataFile.close() 166 dataFile.close()
167 self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100) 167 self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100)
168 self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % (os.path.abspath(self.coverageDataFileName)) 168 self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % (os.path.abspath(self.coverageDataFileName))
169 self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1]) 169 self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1])
170 self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1]) 170 self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1])
171 self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0]) 171 self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0])
172 172
173 def setOverlapData(self, overlap): 173 def setOverlapData(self, overlap):
174 height = 1 174 height = 1
175 self.maxOverlap = (len(overlap) + 1) * height 175 self.maxOverlap = (len(overlap) + 1) * height
176 thisElement = SimpleTranscript(self.transcript, self.transcript, "black") 176 thisElement = SimpleTranscript(self.transcript, self.transcript, "black")
177 self.overlapScript += thisElement.getRScript(0, height) 177 self.overlapScript += thisElement.getRScript(0, height)
178 for cpt, transcript in enumerate(sorted(overlap, cmp=lambda c1, c2: c1.start - c2.start if c1.start != c2.start else c1.end - c2.end)): 178 for cpt, transcript in enumerate(sorted(overlap, cmp=lambda c1, c2: c1.start - c2.start if c1.start != c2.start else c1.end - c2.end)):
179 self.overlapScript += transcript.getRScript((cpt + 1) * height, height) 179 self.overlapScript += transcript.getRScript((cpt + 1) * height, height)
180 180
181 def getFirstLine(self, suffix = None): 181 def getFirstLine(self, suffix = None):
182 return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix == None or self.merge else "_%s" % (suffix), self.width, self.height) 182 return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix == None or self.merge else "_%s" % (suffix), self.width, self.height)
183 183
184 def getLastLine(self): 184 def getLastLine(self):
185 return "dev.off()\n" 185 return "dev.off()\n"
186 186
187 def startR(self, fileName, script): 187 def startR(self, fileName, script):
188 scriptFile = open(fileName, "w") 188 scriptFile = open(fileName, "w")
189 scriptFile.write(script) 189 scriptFile.write(script)
190 scriptFile.close() 190 scriptFile.close()
191 command = "R CMD BATCH %s" % (fileName) 191 command = "R CMD BATCH %s" % (fileName)
192 status = subprocess.call(command, shell=True) 192 status = subprocess.call(command, shell=True)
193 if status != 0: 193 if status != 0:
194 raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status)) 194 raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status))
195 195
196 def plot(self): 196 def plot(self):
197 if self.merge: 197 print "outputfileName is written in :", self.outputFileName
198 fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index) 198 if self.merge:
199 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, max(self.maxCoverage, self.maxOverlap), self.title) 199 fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index)
200 script = self.getFirstLine() + plotLine + self.overlapScript + self.coverageScript + self.getLastLine() 200 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, max(self.maxCoverage, self.maxOverlap), self.title)
201 self.startR(fileName, script) 201 script = self.getFirstLine() + plotLine + self.overlapScript + self.coverageScript + self.getLastLine()
202 else: 202 self.startR(fileName, script)
203 fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index) 203 else:
204 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxOverlap, self.title) 204 fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index)
205 script = self.getFirstLine("overlap") + plotLine + self.overlapScript + self.getLastLine() 205 print "overlap file is written in :", fileName
206 self.startR(fileName, script) 206 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxOverlap, self.title)
207 fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index) 207 script = self.getFirstLine("overlap") + plotLine + self.overlapScript + self.getLastLine()
208 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxCoverage, self.title) 208 self.startR(fileName, script)
209 script = self.getFirstLine("coverage") + plotLine + self.coverageScript + self.getLastLine() 209 fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index)
210 self.startR(fileName, script) 210 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxCoverage, self.title)
211 script = self.getFirstLine("coverage") + plotLine + self.coverageScript + self.getLastLine()
212 self.startR(fileName, script)
211 213
212 214
213 class PlotParser(object): 215 class PlotParser(object):
214 216
215 def __init__(self, verbosity): 217 def __init__(self, verbosity):
216 self.verbosity = verbosity 218 self.verbosity = verbosity
217 self.parsers = [None, None] 219 self.parsers = [None, None]
218 self.sequenceParser = None 220 self.sequenceParser = None
219 self.seed = random.randint(0, 10000) 221 self.seed = random.randint(0, 10000)
220 self.title = "" 222 self.title = ""
221 self.merge = False 223 self.merge = False
222 224
223 def __del__(self): 225 def __del__(self):
224 for fileName in glob.glob("tmpFile_%d*.dat" % (self.seed)): 226 for fileName in glob.glob("tmpFile_%d*.dat" % (self.seed)):
225 os.remove(fileName) 227 os.remove(fileName)
226 for fileName in glob.glob("%s*.R" % (os.path.abspath(self.outputFileName))): 228 for fileName in glob.glob("%s*.R" % (os.path.abspath(self.outputFileName))):
227 os.remove(fileName) 229 os.remove(fileName)
228 for fileName in glob.glob("%s*.Rout" % (os.path.abspath(self.outputFileName))): 230 for fileName in glob.glob("%s*.Rout" % (os.path.abspath(self.outputFileName))):
229 os.remove(fileName) 231 os.remove(fileName)
230 232
231 def addInput(self, inputNb, fileName, fileFormat): 233 def addInput(self, inputNb, fileName, fileFormat):
232 if fileName == None: 234 if fileName == None:
233 return 235 return
234 chooser = ParserChooser(self.verbosity) 236 self.parsers[inputNb] = TranscriptContainer(fileName, fileFormat, self.verbosity)
235 chooser.findFormat(fileFormat) 237 if inputNb == 0:
236 self.parsers[inputNb] = chooser.getParser(fileName) 238 self.parsers[1] = self.parsers[0]
237 if inputNb == 0: 239
238 self.parsers[1] = self.parsers[0] 240 def addSequence(self, fileName):
239 241 if fileName == None:
240 def addSequence(self, fileName): 242 return
241 if fileName == None: 243 self.sequenceParser = FastaParser(fileName, self.verbosity)
242 return 244
243 self.sequenceParser = FastaParser(fileName, self.verbosity) 245 def setOutput(self, fileName):
244 246 self.outputFileName = fileName
245 def setOutput(self, fileName): 247
246 self.outputFileName = fileName 248 def setPlotSize(self, width, height):
247 249 self.width = width
248 def setPlotSize(self, width, height): 250 self.height = height
249 self.width = width 251
250 self.height = height 252 def setLabels(self, xLabel, yLabel):
251 253 self.xLabel = xLabel
252 def setLabels(self, xLabel, yLabel): 254 self.yLabel = yLabel
253 self.xLabel = xLabel 255
254 self.yLabel = yLabel 256 def setTitle(self, title):
255 257 self.title = title
256 def setTitle(self, title): 258
257 self.title = title 259 def setMerge(self, merge):
258 260 self.merge = merge
259 def setMerge(self, merge): 261
260 self.merge = merge 262 def initializeDataFromSequences(self):
261 263 self.sizes = {}
262 def initializeDataFromSequences(self): 264 self.coverage = {}
263 self.sizes = {} 265 self.overlap = {}
264 self.coverage = {} 266 for region in self.sequenceParser.getRegions():
265 self.overlap = {} 267 self.sizes[region] = self.sequenceParser.getSizeOfRegion(region)
266 for region in self.sequenceParser.getRegions(): 268 self.coverage[region] = {}
267 self.sizes[region] = self.sequenceParser.getSizeOfRegion(region) 269 self.overlap[region] = []
268 self.coverage[region] = {} 270 for strand in strands:
269 self.overlap[region] = [] 271 self.coverage[region][strand] = {}
270 for strand in strands: 272 self.coverage[region][strand][1] = 0
271 self.coverage[region][strand] = {} 273 self.coverage[region][strand][self.sizes[region]] = 0
272 self.coverage[region][strand][1] = 0 274
273 self.coverage[region][strand][self.sizes[region]] = 0 275
274 276 def initializeDataFromTranscripts(self):
275 def initializeDataFromTranscripts(self): 277 self.coverage = dict([i, None] for i in range(self.parsers[1].getNbTranscripts()))
276 self.coverage = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) 278 self.overlap = dict([i, None] for i in range(self.parsers[1].getNbTranscripts()))
277 self.overlap = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) 279 self.sizes = dict([i, 0] for i in range(self.parsers[1].getNbTranscripts()))
278 self.sizes = dict([i, 0] for i in range(self.parsers[1].getNbTranscripts())) 280 self.parsers[0].findData()
279 progress = Progress(self.parsers[1].getNbTranscripts(), "Reading regions", self.verbosity) 281 progress = Progress(self.parsers[1].getNbTranscripts(), "Reading regions", self.verbosity)
280 for cpt, transcript in enumerate(self.parsers[1].getIterator()): 282 for cpt, transcript in enumerate(self.parsers[1].getIterator()):
281 self.coverage[cpt] = {} 283 self.coverage[cpt] = {}
282 self.overlap[cpt] = [] 284 self.overlap[cpt] = []
283 for strand in strands: 285 for strand in strands:
284 self.coverage[cpt][strand] = {} 286 self.coverage[cpt][strand] = {}
285 self.coverage[cpt][strand][0] = 0 287 self.coverage[cpt][strand][0] = 0
286 self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0 288 self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0
287 for exon in transcript.getExons(): 289 for exon in transcript.getExons():
288 self.sizes[cpt] += exon.getSize() 290 self.sizes[cpt] += exon.getSize()
289 progress.inc() 291 progress.inc()
290 progress.done() 292 progress.done()
291 293
292 def initialize(self): 294 def initialize(self):
293 if self.sequenceParser == None: 295 if self.sequenceParser == None:
294 self.initializeDataFromTranscripts() 296 self.initializeDataFromTranscripts()
295 else: 297 else:
296 self.initializeDataFromSequences() 298 self.initializeDataFromSequences()
297 299
298 def computeCoverage(self, transcript1, transcript2, id): 300 def computeCoverage(self, transcript1, transcript2, id):
299 strand = transcript1.getDirection() * transcript2.getDirection() 301 strand = transcript1.getDirection() * transcript2.getDirection()
300 for exon1 in transcript1.getExons(): 302 for exon1 in transcript1.getExons():
301 for exon2 in transcript2.getExons(): 303 for exon2 in transcript2.getExons():
302 if exon1.overlapWith(exon2): 304 if exon1.overlapWith(exon2):
303 for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1): 305 for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1):
304 relativePosition = position - transcript2.getStart() + 1 306 relativePosition = position - transcript2.getStart() + 1
305 self.coverage[id][strand][relativePosition] = self.coverage[id][strand].get(relativePosition, 0) + 1 307 self.coverage[id][strand][relativePosition] = self.coverage[id][strand].get(relativePosition, 0) + 1
306 308
307 def computeOverlap(self, transcript1, transcript2, id): 309 def computeOverlap(self, transcript1, transcript2, id):
308 simpleTranscript = SimpleTranscript(transcript1, transcript2) 310 simpleTranscript = SimpleTranscript(transcript1, transcript2)
309 self.overlap[id].append(simpleTranscript) 311 self.overlap[id].append(simpleTranscript)
310 312
311 def compute2TranscriptFiles(self): 313 def compute2TranscriptFiles(self):
312 progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity) 314 progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity)
313 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): 315 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
314 for transcript1 in self.parsers[0].getIterator(): 316 for transcript1 in self.parsers[0].getIterator():
315 if transcript1.overlapWithExon(transcript2): 317 if transcript1.overlapWithExon(transcript2):
316 self.computeCoverage(transcript1, transcript2, cpt2) 318 self.computeCoverage(transcript1, transcript2, cpt2)
317 self.computeOverlap(transcript1, transcript2, cpt2) 319 self.computeOverlap(transcript1, transcript2, cpt2)
318 progress.inc() 320 progress.inc()
319 progress.done() 321 progress.done()
320 322
321 def extractReferenceQueryMapping(self, mapping): 323 def extractReferenceQuery(self, inputTranscript):
322 queryTranscript = mapping.getTranscript() 324 if "Target" not in inputTranscript.getTagNames():
323 referenceTranscript = Transcript() 325 raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript))
324 referenceTranscript.setChromosome(queryTranscript.getChromosome()) 326 id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target"))
325 referenceTranscript.setName(queryTranscript.getChromosome()) 327 if id not in self.sizes:
326 referenceTranscript.setDirection("+") 328 raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." % (id, inputTranscript))
327 referenceTranscript.setEnd(self.sizes[queryTranscript.getChromosome()]) 329 referenceTranscript = Transcript()
328 referenceTranscript.setStart(1) 330 referenceTranscript.setChromosome(id)
329 return (referenceTranscript, queryTranscript) 331 referenceTranscript.setName(id)
330 332 referenceTranscript.setDirection("+")
331 def extractReferenceQuery(self, inputTranscript): 333 referenceTranscript.setEnd(self.sizes[id])
332 if "Target" not in inputTranscript.getTagNames(): 334 referenceTranscript.setStart(1)
333 raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript)) 335 queryTranscript = Transcript()
334 id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target")) 336 queryTranscript.setChromosome(id)
335 if id not in self.sizes: 337 queryTranscript.setName(id)
336 raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." % (id, inputTranscript)) 338 queryTranscript.setStart(start)
337 referenceTranscript = Transcript() 339 queryTranscript.setEnd(end)
338 referenceTranscript.setChromosome(id) 340 queryTranscript.setDirection(strand)
339 referenceTranscript.setName(id) 341 if inputTranscript.getNbExons() > 1:
340 referenceTranscript.setDirection("+") 342 factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart())
341 referenceTranscript.setEnd(self.sizes[id]) 343 for exon in inputTranscript.getExons():
342 referenceTranscript.setStart(1) 344 newExon = Interval()
343 queryTranscript = Transcript() 345 newExon.setChromosome(id)
344 queryTranscript.setChromosome(id) 346 newExon.setDirection(strand)
345 queryTranscript.setName(id) 347 if "Target" in inputTranscript.getTagNames():
346 queryTranscript.setStart(start) 348 id, start, end, strand = parseTargetField(exon.getTagValue("Target"))
347 queryTranscript.setEnd(end) 349 newExon.setStart(start)
348 queryTranscript.setDirection(strand) 350 newExon.setEnd(end)
349 if inputTranscript.getNbExons() > 1: 351 else:
350 factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart()) 352 newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start)
351 for exon in inputTranscript.getExons(): 353 newExon.setEnd( int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start)
352 newExon = Interval() 354 queryTranscript.addExon(newExon)
353 newExon.setChromosome(id) 355 return (referenceTranscript, queryTranscript)
354 newExon.setDirection(strand) 356
355 if "Target" in inputTranscript.getTagNames(): 357 def compute1TranscriptFiles(self):
356 id, start, end, strand = parseTargetField(exon.getTagValue("Target")) 358 progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity)
357 newExon.setStart(start) 359 for transcript in self.parsers[1].getIterator():
358 newExon.setEnd(end) 360 referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript)
359 else: 361 self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName())
360 newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start) 362 self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName())
361 newExon.setEnd( int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start) 363 progress.inc()
362 queryTranscript.addExon(newExon) 364 progress.done()
363 return (referenceTranscript, queryTranscript) 365
364 366 def compute(self):
365 def compute1TranscriptFiles(self): 367 if self.sequenceParser == None:
366 progress = Progress(self.parsers[1].getNbItems(), "Comparing regions", self.verbosity) 368 self.compute2TranscriptFiles()
367 for transcript in self.parsers[1].getIterator(): 369 else:
368 if transcript.__class__.__name__ == "Mapping": 370 self.compute1TranscriptFiles()
369 referenceTranscript, queryTranscript = self.extractReferenceQueryMapping(transcript) 371
370 else: 372 def plotTranscript(self, index, transcript):
371 referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript) 373 plotter = Plotter(self.seed, index, self.verbosity)
372 self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName()) 374 plotter.setOutputFileName(self.outputFileName)
373 self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName()) 375 plotter.setTranscript(transcript)
374 progress.inc() 376 plotter.setTitle(self.title)
375 progress.done() 377 plotter.setLabels(self.xLabel, self.yLabel)
376 378 plotter.setPlotSize(self.width, self.height)
377 def compute(self): 379 plotter.setCoverageData(self.coverage[index])
378 if self.sequenceParser == None: 380 plotter.setOverlapData(self.overlap[index])
379 self.compute2TranscriptFiles() 381 plotter.setMerge(self.merge)
380 else: 382 plotter.plot()
381 self.compute1TranscriptFiles() 383 output = plotter.log
382 384 return output
383 def plotTranscript(self, index, transcript): 385
384 plotter = Plotter(self.seed, index, self.verbosity) 386 def plot1TranscriptFile(self):
385 plotter.setOutputFileName(self.outputFileName) 387 self.outputCoverage = {}
386 plotter.setTranscript(transcript) 388 self.outputCoveragePerStrand = {}
387 plotter.setTitle(self.title) 389 output = ""
388 plotter.setLabels(self.xLabel, self.yLabel) 390 progress = Progress(len(self.sequenceParser.getRegions()), "Plotting regions", self.verbosity)
389 plotter.setPlotSize(self.width, self.height) 391 for cpt2, region in enumerate(self.sequenceParser.getRegions()):
390 plotter.setCoverageData(self.coverage[index]) 392 transcript = Transcript()
391 plotter.setOverlapData(self.overlap[index]) 393 transcript.setName(region)
392 plotter.setMerge(self.merge) 394 transcript.setDirection("+")
393 plotter.plot() 395 transcript.setEnd(self.sizes[region])
394 output = plotter.log 396 transcript.setStart(1)
395 return output 397 output += self.plotTranscript(region, transcript)
396 398 progress.inc()
397 def plot1TranscriptFile(self): 399 progress.done()
398 self.outputCoverage = {} 400 if self.verbosity > 0:
399 self.outputCoveragePerStrand = {} 401 print output
400 output = "" 402
401 progress = Progress(len(self.sequenceParser.getRegions()), "Plotting regions", self.verbosity) 403 def plot2TranscriptFiles(self):
402 for cpt2, region in enumerate(self.sequenceParser.getRegions()): 404 self.outputCoverage = [0] * self.parsers[1].getNbTranscripts()
403 transcript = Transcript() 405 self.outputCoveragePerStrand = [None] * self.parsers[1].getNbTranscripts()
404 transcript.setName(region) 406 for cpt in range(self.parsers[1].getNbTranscripts()):
405 transcript.setDirection("+") 407 self.outputCoveragePerStrand[cpt] = dict([strand, 0] for strand in strands)
406 transcript.setEnd(self.sizes[region]) 408 progress = Progress(self.parsers[1].getNbTranscripts(), "Plotting regions", self.verbosity)
407 transcript.setStart(1) 409 output = ""
408 output += self.plotTranscript(region, transcript) 410 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
409 progress.inc() 411 output += self.plotTranscript(cpt2, transcript2)
410 progress.done() 412 progress.inc()
411 if self.verbosity > 0: 413 progress.done()
412 print output 414 if self.verbosity > 0:
413 415 print output
414 def plot2TranscriptFiles(self): 416
415 self.outputCoverage = [0] * self.parsers[1].getNbTranscripts() 417 def plot(self):
416 self.outputCoveragePerStrand = [None] * self.parsers[1].getNbTranscripts() 418 if self.sequenceParser == None:
417 for cpt in range(self.parsers[1].getNbTranscripts()): 419 self.plot2TranscriptFiles()
418 self.outputCoveragePerStrand[cpt] = dict([strand, 0] for strand in strands) 420 else:
419 progress = Progress(self.parsers[1].getNbTranscripts(), "Plotting regions", self.verbosity) 421 self.plot1TranscriptFile()
420 output = "" 422
421 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): 423 def start(self):
422 output += self.plotTranscript(cpt2, transcript2) 424 self.initialize()
423 progress.inc() 425 self.compute()
424 progress.done() 426 self.plot()
425 if self.verbosity > 0:
426 print output
427
428 def plot(self):
429 if self.sequenceParser == None:
430 self.plot2TranscriptFiles()
431 else:
432 self.plot1TranscriptFile()
433
434 def start(self):
435 self.initialize()
436 self.compute()
437 self.plot()
438 427
439 428
440 if __name__ == "__main__": 429 if __name__ == "__main__":
441 430
442 # parse command line 431 # parse command line
443 description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. [Category: Visualization]" 432 description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. [Category: Visualization]"
444 433
445 parser = OptionParser(description = description) 434 parser = OptionParser(description = description)
446 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript or mapping format given by -f]") 435 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
447 parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript or mapping file format]") 436 parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript file format]")
448 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") 437 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
449 parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]") 438 parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]")
450 parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]") 439 parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]")
451 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") 440 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
452 parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]") 441 parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]")
453 parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]") 442 parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]")
454 parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]") 443 parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]")
455 parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]") 444 parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]")
456 parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]") 445 parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]")
457 parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]") 446 parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]")
458 parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]") 447 parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]")
459 parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]") 448 parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]")
460 parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]") 449 parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]")
461 parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]") 450 parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]")
462 parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") 451 parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
463 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") 452 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
464 (options, args) = parser.parse_args() 453 (options, args) = parser.parse_args()
465 454
466 colors[1] = options.plusColor 455 colors[1] = options.plusColor
467 colors[-1] = options.minusColor 456 colors[-1] = options.minusColor
468 colors[0] = options.sumColor 457 colors[0] = options.sumColor
469 colorLine = options.lineColor 458 colorLine = options.lineColor
470 459
471 pp = PlotParser(options.verbosity) 460 pp = PlotParser(options.verbosity)
472 pp.addInput(0, options.inputFileName1, options.inputFormat1) 461 pp.addInput(0, options.inputFileName1, options.inputFormat1)
473 pp.addInput(1, options.inputFileName2, options.inputFormat2) 462 pp.addInput(1, options.inputFileName2, options.inputFormat2)
474 pp.addSequence(options.inputSequence) 463 pp.addSequence(options.inputSequence)
475 pp.setOutput(options.outputFileName if os.path.isabs(options.outputFileName) else os.path.join(options.working_Dir, options.outputFileName)) 464 pp.setOutput(options.outputFileName if os.path.isabs(options.outputFileName) else os.path.join(options.working_Dirpath, options.outputFileName))
476 pp.setPlotSize(options.width, options.height) 465 pp.setPlotSize(options.width, options.height)
477 pp.setLabels(options.xLabel, options.yLabel) 466 pp.setLabels(options.xLabel, options.yLabel)
478 pp.setTitle(options.title) 467 pp.setTitle(options.title)
479 pp.setMerge(options.merge) 468 pp.setMerge(options.merge)
480 pp.start() 469 pp.start()
481 470
471