comparison SMART/Java/Python/plotCoverage.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents 769e306b7933
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
26 # same conditions as regards security. 26 # same conditions as regards security.
27 # 27 #
28 # The fact that you are presently reading this means that you have had 28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms. 29 # knowledge of the CeCILL license and that you accept its terms.
30 # 30 #
31 import os, subprocess, glob, random 31 import os, os.path, subprocess, glob, random
32 from optparse import OptionParser 32 from optparse import OptionParser
33 from SMART.Java.Python.structure.Interval import Interval 33 from SMART.Java.Python.structure.Interval import Interval
34 from SMART.Java.Python.structure.Transcript import Transcript 34 from SMART.Java.Python.structure.Transcript import Transcript
35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer 35 from commons.core.parsing.ParserChooser import ParserChooser
36 from SMART.Java.Python.misc.RPlotter import RPlotter 36 from SMART.Java.Python.misc.RPlotter import RPlotter
37 from SMART.Java.Python.misc.Progress import Progress 37 from SMART.Java.Python.misc.Progress import Progress
38 from commons.core.parsing.FastaParser import FastaParser 38 from commons.core.parsing.FastaParser import FastaParser
39 39
40 strands = [-1, 1] 40 strands = [-1, 1]
41 colors = {-1: "blue", 1: "red", 0: "black"} 41 colors = {-1: "blue", 1: "red", 0: "black"}
42 colorLine = "black" 42 colorLine = "black"
43 43
44 def parseTargetField(field): 44 def parseTargetField(field):
45 strand = "+" 45 strand = "+"
46 splittedFieldSpace = field.split() 46 splittedFieldSpace = field.split()
47 splittedFieldPlus = field.split("+", 4) 47 splittedFieldPlus = field.split("+", 4)
48 if len(splittedFieldSpace) == 3: 48 if len(splittedFieldSpace) == 3:
49 id, start, end = splittedFieldSpace 49 id, start, end = splittedFieldSpace
50 elif len(splittedFieldSpace) == 4: 50 elif len(splittedFieldSpace) == 4:
51 id, start, end, strand = splittedFieldSpace 51 id, start, end, strand = splittedFieldSpace
52 elif len(splittedFieldPlus) == 3: 52 elif len(splittedFieldPlus) == 3:
53 id, start, end = splittedFieldPlus 53 id, start, end = splittedFieldPlus
54 elif len(splittedFieldPlus) == 4: 54 elif len(splittedFieldPlus) == 4:
55 id, start, end, strand = splittedFieldPlus 55 id, start, end, strand = splittedFieldPlus
56 else: 56 else:
57 raise Exception("Cannot parse Target field '%s'." % (field)) 57 raise Exception("Cannot parse Target field '%s'." % (field))
58 return (id, int(start), int(end), strand) 58 return (id, int(start), int(end), strand)
59 59
60 60
61 class SimpleTranscript(object): 61 class SimpleTranscript(object):
62 def __init__(self, transcript1, transcript2, color = None): 62 def __init__(self, transcript1, transcript2, color = None):
63 self.start = max(0, transcript1.getStart() - transcript2.getStart()) 63 self.start = max(0, transcript1.getStart() - transcript2.getStart())
64 self.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart()) 64 self.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart())
65 self.strand = transcript1.getDirection() * transcript2.getDirection() 65 self.strand = transcript1.getDirection() * transcript2.getDirection()
66 self.exons = [] 66 self.exons = []
67 for exon in transcript1.getExons(): 67 for exon in transcript1.getExons():
68 if exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd(): 68 if exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd():
69 start = max(0, exon.getStart() - transcript2.getStart()) 69 start = max(0, exon.getStart() - transcript2.getStart())
70 end = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart()) 70 end = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart())
71 self.addExon(start, end, self.strand, color) 71 self.addExon(start, end, self.strand, color)
72 72
73 def addExon(self, start, end, strand, color): 73 def addExon(self, start, end, strand, color):
74 exon = SimpleExon(start, end, strand, color) 74 exon = SimpleExon(start, end, strand, color)
75 self.exons.append(exon) 75 self.exons.append(exon)
76 76
77 def getRScript(self, yOffset, height): 77 def getRScript(self, yOffset, height):
78 rString = "" 78 rString = ""
79 previousEnd = None 79 previousEnd = None
80 for exon in sorted(self.exons, key=lambda exon: exon.start): 80 for exon in sorted(self.exons, key=lambda exon: exon.start):
81 if previousEnd != None: 81 if previousEnd != None:
82 rString += "segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine) 82 rString += "segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine)
83 rString += exon.getRScript(yOffset, height) 83 rString += exon.getRScript(yOffset, height)
84 previousEnd = exon.end 84 previousEnd = exon.end
85 return rString 85 return rString
86 86
87 87
88 class SimpleExon(object): 88 class SimpleExon(object):
89 def __init__(self, start, end, strand, color = None): 89 def __init__(self, start, end, strand, color = None):
90 self.start = start 90 self.start = start
91 self.end = end 91 self.end = end
92 self.strand = strand 92 self.strand = strand
93 self.color = color 93 self.color = color
94 94
95 def getRScript(self, yOffset, height): 95 def getRScript(self, yOffset, height):
96 color = self.color if self.color != None else colors[self.strand] 96 color = self.color if self.color != None else colors[self.strand]
97 return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine) 97 return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine)
98 98
99 99
100 class Plotter(object): 100 class Plotter(object):
101 101
102 def __init__(self, seed, index, verbosity): 102 def __init__(self, seed, index, verbosity):
103 self.seed = seed 103 self.seed = seed
104 self.index = index 104 self.index = index
105 self.verbosity = verbosity 105 self.verbosity = verbosity
106 self.maxCoverage = 0 106 self.maxCoverage = 0
107 self.maxOverlap = 0 107 self.maxOverlap = 0
108 self.log = "" 108 self.log = ""
109 self.merge = False 109 self.merge = False
110 self.width = 1500 110 self.width = 1500
111 self.heigth = 1000 111 self.heigth = 1000
112 self.xLabel = "" 112 self.xLabel = ""
113 self.yLabel = "" 113 self.yLabel = ""
114 self.title = None 114 self.title = None
115 self.absPath = os.getcwd() 115 self.absPath = os.getcwd()
116 self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index) 116 self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index)
117 self.coverageScript = "" 117 self.coverageScript = ""
118 self.overlapScript = "" 118 self.overlapScript = ""
119 self.outputFileName = None 119 self.outputFileName = None
120 120
121 def setOutputFileName(self, fileName): 121 def setOutputFileName(self, fileName):
122 self.outputFileName = fileName 122 self.outputFileName = fileName
123 123
124 def setTranscript(self, transcript): 124 def setTranscript(self, transcript):
125 self.transcript = transcript 125 self.transcript = transcript
126 self.name = transcript.getName() 126 self.name = transcript.getName()
127 self.size = transcript.getEnd() - transcript.getStart() + 1 127 self.size = transcript.getEnd() - transcript.getStart() + 1
128 if self.title == None: 128 if self.title == None:
129 self.title = self.name 129 self.title = self.name
130 else: 130 else:
131 self.title += " " + self.name 131 self.title += " " + self.name
132 132
133 def setTitle(self, title): 133 def setTitle(self, title):
134 self.title = title + " " + self.name 134 self.title = title + " " + self.name
135 135
136 def setPlotSize(self, width, height): 136 def setPlotSize(self, width, height):
137 self.width = width 137 self.width = width
138 self.height = height 138 self.height = height
139 139
140 def setLabels(self, xLabel, yLabel): 140 def setLabels(self, xLabel, yLabel):
141 self.xLabel = xLabel 141 self.xLabel = xLabel
142 self.yLabel = yLabel 142 self.yLabel = yLabel
143 143
144 def setMerge(self, merge): 144 def setMerge(self, merge):
145 self.merge = merge 145 self.merge = merge
146 146
147 def setCoverageData(self, coverage): 147 def setCoverageData(self, coverage):
148 outputCoveragePerStrand = dict([strand, 0] for strand in strands) 148 outputCoveragePerStrand = dict([strand, 0] for strand in strands)
149 outputCoverage = 0 149 outputCoverage = 0
150 dataFile = open(os.path.abspath(self.coverageDataFileName), "w") 150 dataFile = open(os.path.abspath(self.coverageDataFileName), "w")
151 for position in range(self.size+1): 151 for position in range(self.size+1):
152 sumValue = 0 152 sumValue = 0
153 found = False 153 found = False
154 dataFile.write("%d\t" % (position)) 154 dataFile.write("%d\t" % (position))
155 for strand in strands: 155 for strand in strands:
156 value = coverage[strand].get(position, 0) 156 value = coverage[strand].get(position, 0)
157 sumValue += value 157 sumValue += value
158 dataFile.write("%d\t" % (value)) 158 dataFile.write("%d\t" % (value))
159 if value > 0: 159 if value > 0:
160 found = True 160 found = True
161 outputCoveragePerStrand[strand] += 1 161 outputCoveragePerStrand[strand] += 1
162 self.maxCoverage = max(self.maxCoverage, sumValue) 162 self.maxCoverage = max(self.maxCoverage, sumValue)
163 dataFile.write("%d\n" % (sumValue)) 163 dataFile.write("%d\n" % (sumValue))
164 if found: 164 if found:
165 outputCoverage += 1 165 outputCoverage += 1
166 dataFile.close() 166 dataFile.close()
167 self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100) 167 self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100)
168 self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % (os.path.abspath(self.coverageDataFileName)) 168 self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % (os.path.abspath(self.coverageDataFileName))
169 self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1]) 169 self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1])
170 self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1]) 170 self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1])
171 self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0]) 171 self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0])
172 172
173 def setOverlapData(self, overlap): 173 def setOverlapData(self, overlap):
174 height = 1 174 height = 1
175 self.maxOverlap = (len(overlap) + 1) * height 175 self.maxOverlap = (len(overlap) + 1) * height
176 thisElement = SimpleTranscript(self.transcript, self.transcript, "black") 176 thisElement = SimpleTranscript(self.transcript, self.transcript, "black")
177 self.overlapScript += thisElement.getRScript(0, height) 177 self.overlapScript += thisElement.getRScript(0, height)
178 for cpt, transcript in enumerate(sorted(overlap, cmp=lambda c1, c2: c1.start - c2.start if c1.start != c2.start else c1.end - c2.end)): 178 for cpt, transcript in enumerate(sorted(overlap, cmp=lambda c1, c2: c1.start - c2.start if c1.start != c2.start else c1.end - c2.end)):
179 self.overlapScript += transcript.getRScript((cpt + 1) * height, height) 179 self.overlapScript += transcript.getRScript((cpt + 1) * height, height)
180 180
181 def getFirstLine(self, suffix = None): 181 def getFirstLine(self, suffix = None):
182 return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix == None or self.merge else "_%s" % (suffix), self.width, self.height) 182 return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix == None or self.merge else "_%s" % (suffix), self.width, self.height)
183 183
184 def getLastLine(self): 184 def getLastLine(self):
185 return "dev.off()\n" 185 return "dev.off()\n"
186 186
187 def startR(self, fileName, script): 187 def startR(self, fileName, script):
188 scriptFile = open(fileName, "w") 188 scriptFile = open(fileName, "w")
189 scriptFile.write(script) 189 scriptFile.write(script)
190 scriptFile.close() 190 scriptFile.close()
191 command = "R CMD BATCH %s" % (fileName) 191 command = "R CMD BATCH %s" % (fileName)
192 status = subprocess.call(command, shell=True) 192 status = subprocess.call(command, shell=True)
193 if status != 0: 193 if status != 0:
194 raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status)) 194 raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status))
195 195
196 def plot(self): 196 def plot(self):
197 print "outputfileName is written in :", self.outputFileName 197 if self.merge:
198 if self.merge: 198 fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index)
199 fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index) 199 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, max(self.maxCoverage, self.maxOverlap), self.title)
200 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, max(self.maxCoverage, self.maxOverlap), self.title) 200 script = self.getFirstLine() + plotLine + self.overlapScript + self.coverageScript + self.getLastLine()
201 script = self.getFirstLine() + plotLine + self.overlapScript + self.coverageScript + self.getLastLine() 201 self.startR(fileName, script)
202 self.startR(fileName, script) 202 else:
203 else: 203 fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index)
204 fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index) 204 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxOverlap, self.title)
205 print "overlap file is written in :", fileName 205 script = self.getFirstLine("overlap") + plotLine + self.overlapScript + self.getLastLine()
206 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxOverlap, self.title) 206 self.startR(fileName, script)
207 script = self.getFirstLine("overlap") + plotLine + self.overlapScript + self.getLastLine() 207 fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index)
208 self.startR(fileName, script) 208 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxCoverage, self.title)
209 fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index) 209 script = self.getFirstLine("coverage") + plotLine + self.coverageScript + self.getLastLine()
210 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxCoverage, self.title) 210 self.startR(fileName, script)
211 script = self.getFirstLine("coverage") + plotLine + self.coverageScript + self.getLastLine()
212 self.startR(fileName, script)
213 211
214 212
215 class PlotParser(object): 213 class PlotParser(object):
216 214
217 def __init__(self, verbosity): 215 def __init__(self, verbosity):
218 self.verbosity = verbosity 216 self.verbosity = verbosity
219 self.parsers = [None, None] 217 self.parsers = [None, None]
220 self.sequenceParser = None 218 self.sequenceParser = None
221 self.seed = random.randint(0, 10000) 219 self.seed = random.randint(0, 10000)
222 self.title = "" 220 self.title = ""
223 self.merge = False 221 self.merge = False
224 222
225 def __del__(self): 223 def __del__(self):
226 for fileName in glob.glob("tmpFile_%d*.dat" % (self.seed)): 224 for fileName in glob.glob("tmpFile_%d*.dat" % (self.seed)):
227 os.remove(fileName) 225 os.remove(fileName)
228 for fileName in glob.glob("%s*.R" % (os.path.abspath(self.outputFileName))): 226 for fileName in glob.glob("%s*.R" % (os.path.abspath(self.outputFileName))):
229 os.remove(fileName) 227 os.remove(fileName)
230 for fileName in glob.glob("%s*.Rout" % (os.path.abspath(self.outputFileName))): 228 for fileName in glob.glob("%s*.Rout" % (os.path.abspath(self.outputFileName))):
231 os.remove(fileName) 229 os.remove(fileName)
232 230
233 def addInput(self, inputNb, fileName, fileFormat): 231 def addInput(self, inputNb, fileName, fileFormat):
234 if fileName == None: 232 if fileName == None:
235 return 233 return
236 self.parsers[inputNb] = TranscriptContainer(fileName, fileFormat, self.verbosity) 234 chooser = ParserChooser(self.verbosity)
237 if inputNb == 0: 235 chooser.findFormat(fileFormat)
238 self.parsers[1] = self.parsers[0] 236 self.parsers[inputNb] = chooser.getParser(fileName)
239 237 if inputNb == 0:
240 def addSequence(self, fileName): 238 self.parsers[1] = self.parsers[0]
241 if fileName == None: 239
242 return 240 def addSequence(self, fileName):
243 self.sequenceParser = FastaParser(fileName, self.verbosity) 241 if fileName == None:
244 242 return
245 def setOutput(self, fileName): 243 self.sequenceParser = FastaParser(fileName, self.verbosity)
246 self.outputFileName = fileName 244
247 245 def setOutput(self, fileName):
248 def setPlotSize(self, width, height): 246 self.outputFileName = fileName
249 self.width = width 247
250 self.height = height 248 def setPlotSize(self, width, height):
251 249 self.width = width
252 def setLabels(self, xLabel, yLabel): 250 self.height = height
253 self.xLabel = xLabel 251
254 self.yLabel = yLabel 252 def setLabels(self, xLabel, yLabel):
255 253 self.xLabel = xLabel
256 def setTitle(self, title): 254 self.yLabel = yLabel
257 self.title = title 255
258 256 def setTitle(self, title):
259 def setMerge(self, merge): 257 self.title = title
260 self.merge = merge 258
261 259 def setMerge(self, merge):
262 def initializeDataFromSequences(self): 260 self.merge = merge
263 self.sizes = {} 261
264 self.coverage = {} 262 def initializeDataFromSequences(self):
265 self.overlap = {} 263 self.sizes = {}
266 for region in self.sequenceParser.getRegions(): 264 self.coverage = {}
267 self.sizes[region] = self.sequenceParser.getSizeOfRegion(region) 265 self.overlap = {}
268 self.coverage[region] = {} 266 for region in self.sequenceParser.getRegions():
269 self.overlap[region] = [] 267 self.sizes[region] = self.sequenceParser.getSizeOfRegion(region)
270 for strand in strands: 268 self.coverage[region] = {}
271 self.coverage[region][strand] = {} 269 self.overlap[region] = []
272 self.coverage[region][strand][1] = 0 270 for strand in strands:
273 self.coverage[region][strand][self.sizes[region]] = 0 271 self.coverage[region][strand] = {}
274 272 self.coverage[region][strand][1] = 0
275 273 self.coverage[region][strand][self.sizes[region]] = 0
276 def initializeDataFromTranscripts(self): 274
277 self.coverage = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) 275 def initializeDataFromTranscripts(self):
278 self.overlap = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) 276 self.coverage = dict([i, None] for i in range(self.parsers[1].getNbTranscripts()))
279 self.sizes = dict([i, 0] for i in range(self.parsers[1].getNbTranscripts())) 277 self.overlap = dict([i, None] for i in range(self.parsers[1].getNbTranscripts()))
280 self.parsers[0].findData() 278 self.sizes = dict([i, 0] for i in range(self.parsers[1].getNbTranscripts()))
281 progress = Progress(self.parsers[1].getNbTranscripts(), "Reading regions", self.verbosity) 279 progress = Progress(self.parsers[1].getNbTranscripts(), "Reading regions", self.verbosity)
282 for cpt, transcript in enumerate(self.parsers[1].getIterator()): 280 for cpt, transcript in enumerate(self.parsers[1].getIterator()):
283 self.coverage[cpt] = {} 281 self.coverage[cpt] = {}
284 self.overlap[cpt] = [] 282 self.overlap[cpt] = []
285 for strand in strands: 283 for strand in strands:
286 self.coverage[cpt][strand] = {} 284 self.coverage[cpt][strand] = {}
287 self.coverage[cpt][strand][0] = 0 285 self.coverage[cpt][strand][0] = 0
288 self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0 286 self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0
289 for exon in transcript.getExons(): 287 for exon in transcript.getExons():
290 self.sizes[cpt] += exon.getSize() 288 self.sizes[cpt] += exon.getSize()
291 progress.inc() 289 progress.inc()
292 progress.done() 290 progress.done()
293 291
294 def initialize(self): 292 def initialize(self):
295 if self.sequenceParser == None: 293 if self.sequenceParser == None:
296 self.initializeDataFromTranscripts() 294 self.initializeDataFromTranscripts()
297 else: 295 else:
298 self.initializeDataFromSequences() 296 self.initializeDataFromSequences()
299 297
300 def computeCoverage(self, transcript1, transcript2, id): 298 def computeCoverage(self, transcript1, transcript2, id):
301 strand = transcript1.getDirection() * transcript2.getDirection() 299 strand = transcript1.getDirection() * transcript2.getDirection()
302 for exon1 in transcript1.getExons(): 300 for exon1 in transcript1.getExons():
303 for exon2 in transcript2.getExons(): 301 for exon2 in transcript2.getExons():
304 if exon1.overlapWith(exon2): 302 if exon1.overlapWith(exon2):
305 for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1): 303 for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1):
306 relativePosition = position - transcript2.getStart() + 1 304 relativePosition = position - transcript2.getStart() + 1
307 self.coverage[id][strand][relativePosition] = self.coverage[id][strand].get(relativePosition, 0) + 1 305 self.coverage[id][strand][relativePosition] = self.coverage[id][strand].get(relativePosition, 0) + 1
308 306
309 def computeOverlap(self, transcript1, transcript2, id): 307 def computeOverlap(self, transcript1, transcript2, id):
310 simpleTranscript = SimpleTranscript(transcript1, transcript2) 308 simpleTranscript = SimpleTranscript(transcript1, transcript2)
311 self.overlap[id].append(simpleTranscript) 309 self.overlap[id].append(simpleTranscript)
312 310
313 def compute2TranscriptFiles(self): 311 def compute2TranscriptFiles(self):
314 progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity) 312 progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity)
315 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): 313 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
316 for transcript1 in self.parsers[0].getIterator(): 314 for transcript1 in self.parsers[0].getIterator():
317 if transcript1.overlapWithExon(transcript2): 315 if transcript1.overlapWithExon(transcript2):
318 self.computeCoverage(transcript1, transcript2, cpt2) 316 self.computeCoverage(transcript1, transcript2, cpt2)
319 self.computeOverlap(transcript1, transcript2, cpt2) 317 self.computeOverlap(transcript1, transcript2, cpt2)
320 progress.inc() 318 progress.inc()
321 progress.done() 319 progress.done()
322 320
323 def extractReferenceQuery(self, inputTranscript): 321 def extractReferenceQueryMapping(self, mapping):
324 if "Target" not in inputTranscript.getTagNames(): 322 queryTranscript = mapping.getTranscript()
325 raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript)) 323 referenceTranscript = Transcript()
326 id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target")) 324 referenceTranscript.setChromosome(queryTranscript.getChromosome())
327 if id not in self.sizes: 325 referenceTranscript.setName(queryTranscript.getChromosome())
328 raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." % (id, inputTranscript)) 326 referenceTranscript.setDirection("+")
329 referenceTranscript = Transcript() 327 referenceTranscript.setEnd(self.sizes[queryTranscript.getChromosome()])
330 referenceTranscript.setChromosome(id) 328 referenceTranscript.setStart(1)
331 referenceTranscript.setName(id) 329 return (referenceTranscript, queryTranscript)
332 referenceTranscript.setDirection("+") 330
333 referenceTranscript.setEnd(self.sizes[id]) 331 def extractReferenceQuery(self, inputTranscript):
334 referenceTranscript.setStart(1) 332 if "Target" not in inputTranscript.getTagNames():
335 queryTranscript = Transcript() 333 raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript))
336 queryTranscript.setChromosome(id) 334 id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target"))
337 queryTranscript.setName(id) 335 if id not in self.sizes:
338 queryTranscript.setStart(start) 336 raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." % (id, inputTranscript))
339 queryTranscript.setEnd(end) 337 referenceTranscript = Transcript()
340 queryTranscript.setDirection(strand) 338 referenceTranscript.setChromosome(id)
341 if inputTranscript.getNbExons() > 1: 339 referenceTranscript.setName(id)
342 factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart()) 340 referenceTranscript.setDirection("+")
343 for exon in inputTranscript.getExons(): 341 referenceTranscript.setEnd(self.sizes[id])
344 newExon = Interval() 342 referenceTranscript.setStart(1)
345 newExon.setChromosome(id) 343 queryTranscript = Transcript()
346 newExon.setDirection(strand) 344 queryTranscript.setChromosome(id)
347 if "Target" in inputTranscript.getTagNames(): 345 queryTranscript.setName(id)
348 id, start, end, strand = parseTargetField(exon.getTagValue("Target")) 346 queryTranscript.setStart(start)
349 newExon.setStart(start) 347 queryTranscript.setEnd(end)
350 newExon.setEnd(end) 348 queryTranscript.setDirection(strand)
351 else: 349 if inputTranscript.getNbExons() > 1:
352 newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start) 350 factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart())
353 newExon.setEnd( int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start) 351 for exon in inputTranscript.getExons():
354 queryTranscript.addExon(newExon) 352 newExon = Interval()
355 return (referenceTranscript, queryTranscript) 353 newExon.setChromosome(id)
356 354 newExon.setDirection(strand)
357 def compute1TranscriptFiles(self): 355 if "Target" in inputTranscript.getTagNames():
358 progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity) 356 id, start, end, strand = parseTargetField(exon.getTagValue("Target"))
359 for transcript in self.parsers[1].getIterator(): 357 newExon.setStart(start)
360 referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript) 358 newExon.setEnd(end)
361 self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName()) 359 else:
362 self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName()) 360 newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start)
363 progress.inc() 361 newExon.setEnd( int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start)
364 progress.done() 362 queryTranscript.addExon(newExon)
365 363 return (referenceTranscript, queryTranscript)
366 def compute(self): 364
367 if self.sequenceParser == None: 365 def compute1TranscriptFiles(self):
368 self.compute2TranscriptFiles() 366 progress = Progress(self.parsers[1].getNbItems(), "Comparing regions", self.verbosity)
369 else: 367 for transcript in self.parsers[1].getIterator():
370 self.compute1TranscriptFiles() 368 if transcript.__class__.__name__ == "Mapping":
371 369 referenceTranscript, queryTranscript = self.extractReferenceQueryMapping(transcript)
372 def plotTranscript(self, index, transcript): 370 else:
373 plotter = Plotter(self.seed, index, self.verbosity) 371 referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript)
374 plotter.setOutputFileName(self.outputFileName) 372 self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName())
375 plotter.setTranscript(transcript) 373 self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName())
376 plotter.setTitle(self.title) 374 progress.inc()
377 plotter.setLabels(self.xLabel, self.yLabel) 375 progress.done()
378 plotter.setPlotSize(self.width, self.height) 376
379 plotter.setCoverageData(self.coverage[index]) 377 def compute(self):
380 plotter.setOverlapData(self.overlap[index]) 378 if self.sequenceParser == None:
381 plotter.setMerge(self.merge) 379 self.compute2TranscriptFiles()
382 plotter.plot() 380 else:
383 output = plotter.log 381 self.compute1TranscriptFiles()
384 return output 382
385 383 def plotTranscript(self, index, transcript):
386 def plot1TranscriptFile(self): 384 plotter = Plotter(self.seed, index, self.verbosity)
387 self.outputCoverage = {} 385 plotter.setOutputFileName(self.outputFileName)
388 self.outputCoveragePerStrand = {} 386 plotter.setTranscript(transcript)
389 output = "" 387 plotter.setTitle(self.title)
390 progress = Progress(len(self.sequenceParser.getRegions()), "Plotting regions", self.verbosity) 388 plotter.setLabels(self.xLabel, self.yLabel)
391 for cpt2, region in enumerate(self.sequenceParser.getRegions()): 389 plotter.setPlotSize(self.width, self.height)
392 transcript = Transcript() 390 plotter.setCoverageData(self.coverage[index])
393 transcript.setName(region) 391 plotter.setOverlapData(self.overlap[index])
394 transcript.setDirection("+") 392 plotter.setMerge(self.merge)
395 transcript.setEnd(self.sizes[region]) 393 plotter.plot()
396 transcript.setStart(1) 394 output = plotter.log
397 output += self.plotTranscript(region, transcript) 395 return output
398 progress.inc() 396
399 progress.done() 397 def plot1TranscriptFile(self):
400 if self.verbosity > 0: 398 self.outputCoverage = {}
401 print output 399 self.outputCoveragePerStrand = {}
402 400 output = ""
403 def plot2TranscriptFiles(self): 401 progress = Progress(len(self.sequenceParser.getRegions()), "Plotting regions", self.verbosity)
404 self.outputCoverage = [0] * self.parsers[1].getNbTranscripts() 402 for cpt2, region in enumerate(self.sequenceParser.getRegions()):
405 self.outputCoveragePerStrand = [None] * self.parsers[1].getNbTranscripts() 403 transcript = Transcript()
406 for cpt in range(self.parsers[1].getNbTranscripts()): 404 transcript.setName(region)
407 self.outputCoveragePerStrand[cpt] = dict([strand, 0] for strand in strands) 405 transcript.setDirection("+")
408 progress = Progress(self.parsers[1].getNbTranscripts(), "Plotting regions", self.verbosity) 406 transcript.setEnd(self.sizes[region])
409 output = "" 407 transcript.setStart(1)
410 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): 408 output += self.plotTranscript(region, transcript)
411 output += self.plotTranscript(cpt2, transcript2) 409 progress.inc()
412 progress.inc() 410 progress.done()
413 progress.done() 411 if self.verbosity > 0:
414 if self.verbosity > 0: 412 print output
415 print output 413
416 414 def plot2TranscriptFiles(self):
417 def plot(self): 415 self.outputCoverage = [0] * self.parsers[1].getNbTranscripts()
418 if self.sequenceParser == None: 416 self.outputCoveragePerStrand = [None] * self.parsers[1].getNbTranscripts()
419 self.plot2TranscriptFiles() 417 for cpt in range(self.parsers[1].getNbTranscripts()):
420 else: 418 self.outputCoveragePerStrand[cpt] = dict([strand, 0] for strand in strands)
421 self.plot1TranscriptFile() 419 progress = Progress(self.parsers[1].getNbTranscripts(), "Plotting regions", self.verbosity)
422 420 output = ""
423 def start(self): 421 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
424 self.initialize() 422 output += self.plotTranscript(cpt2, transcript2)
425 self.compute() 423 progress.inc()
426 self.plot() 424 progress.done()
425 if self.verbosity > 0:
426 print output
427
428 def plot(self):
429 if self.sequenceParser == None:
430 self.plot2TranscriptFiles()
431 else:
432 self.plot1TranscriptFile()
433
434 def start(self):
435 self.initialize()
436 self.compute()
437 self.plot()
427 438
428 439
429 if __name__ == "__main__": 440 if __name__ == "__main__":
430 441
431 # parse command line 442 # parse command line
432 description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. [Category: Visualization]" 443 description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. [Category: Visualization]"
433 444
434 parser = OptionParser(description = description) 445 parser = OptionParser(description = description)
435 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]") 446 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript or mapping format given by -f]")
436 parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript file format]") 447 parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript or mapping file format]")
437 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") 448 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
438 parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]") 449 parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]")
439 parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]") 450 parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]")
440 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") 451 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
441 parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]") 452 parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]")
442 parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]") 453 parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]")
443 parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]") 454 parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]")
444 parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]") 455 parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]")
445 parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]") 456 parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]")
446 parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]") 457 parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]")
447 parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]") 458 parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]")
448 parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]") 459 parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]")
449 parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]") 460 parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]")
450 parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]") 461 parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]")
451 parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") 462 parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
452 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") 463 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
453 (options, args) = parser.parse_args() 464 (options, args) = parser.parse_args()
454 465
455 colors[1] = options.plusColor 466 colors[1] = options.plusColor
456 colors[-1] = options.minusColor 467 colors[-1] = options.minusColor
457 colors[0] = options.sumColor 468 colors[0] = options.sumColor
458 colorLine = options.lineColor 469 colorLine = options.lineColor
459 470
460 pp = PlotParser(options.verbosity) 471 pp = PlotParser(options.verbosity)
461 pp.addInput(0, options.inputFileName1, options.inputFormat1) 472 pp.addInput(0, options.inputFileName1, options.inputFormat1)
462 pp.addInput(1, options.inputFileName2, options.inputFormat2) 473 pp.addInput(1, options.inputFileName2, options.inputFormat2)
463 pp.addSequence(options.inputSequence) 474 pp.addSequence(options.inputSequence)
464 if options.working_Dir[-1] != '/': 475 pp.setOutput(options.outputFileName if os.path.isabs(options.outputFileName) else os.path.join(options.working_Dir, options.outputFileName))
465 path = options.working_Dir + '/' 476 pp.setPlotSize(options.width, options.height)
466 pp.setOutput(path + options.outputFileName) 477 pp.setLabels(options.xLabel, options.yLabel)
467 pp.setPlotSize(options.width, options.height) 478 pp.setTitle(options.title)
468 pp.setLabels(options.xLabel, options.yLabel) 479 pp.setMerge(options.merge)
469 pp.setTitle(options.title) 480 pp.start()
470 pp.setMerge(options.merge) 481
471 pp.start()
472
473