| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 import random, os, glob, subprocess | 
|  | 32 from commons.core.parsing.ParserChooser import ParserChooser | 
|  | 33 from commons.core.parsing.GffParser import GffParser | 
|  | 34 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress | 
|  | 35 from SMART.Java.Python.misc.Progress import Progress | 
|  | 36 from SMART.Java.Python.misc import Utils | 
|  | 37 from commons.core.LoggerFactory import LoggerFactory | 
|  | 38 from commons.core.utils.RepetOptionParser import RepetOptionParser | 
|  | 39 | 
|  | 40 LOG_DEPTH      = "smart" | 
|  | 41 DEFAULT_REGION = "_all_" | 
|  | 42 | 
|  | 43 class GetReadSizes(object): | 
|  | 44 | 
|  | 45 	def __init__(self, verbosity = 0): | 
|  | 46 		self.xLab       = "Size" | 
|  | 47 		self.yLab       = "# reads" | 
|  | 48 		self.verbosity  = verbosity | 
|  | 49 		self.number     = random.randint(0, 100000) | 
|  | 50 		self.log        = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity) | 
| 18 | 51 		self.parsers    = {} | 
| 6 | 52 		self.sizes      = {} | 
|  | 53 		self.factors    = {} | 
|  | 54 		self.regions    = None | 
|  | 55 		self.tmpDatName = None | 
|  | 56 		self.tmpRName   = None | 
|  | 57 		self.width      = 800 | 
|  | 58 		self.height     = 300 | 
| 18 | 59 		self.arial      = False | 
| 6 | 60 | 
|  | 61 	def setNames(self, names): | 
|  | 62 		self.names = names | 
|  | 63 | 
|  | 64 	def setInputFiles(self, fileNames, format): | 
|  | 65 		chooser = ParserChooser(self.verbosity) | 
|  | 66 		chooser.findFormat(format) | 
|  | 67 		for cpt, fileName in enumerate(fileNames): | 
|  | 68 			self.parsers[self.names[cpt]] = chooser.getParser(fileName) | 
|  | 69 | 
|  | 70 	def setOutputFileName(self, fileName): | 
|  | 71 		self.outputFileName = fileName | 
|  | 72 | 
|  | 73 	def setLabs(self, xLab, yLab): | 
|  | 74 		self.xLab = xLab | 
|  | 75 		self.yLab = yLab | 
|  | 76 | 
|  | 77 	def setSizes(self, minSize, maxSize): | 
|  | 78 		self.minSize = minSize | 
|  | 79 		self.maxSize = maxSize | 
|  | 80 | 
|  | 81 	def setColors(self, colors): | 
|  | 82 		self.colors = colors | 
|  | 83 | 
|  | 84 	def setFactors(self, factors): | 
|  | 85 		self.factors = dict(zip(self.names, factors)) | 
|  | 86 | 
|  | 87 	def setRegionsFile(self, fileName): | 
|  | 88 		if fileName != None: | 
|  | 89 			self._loadRegions(fileName) | 
|  | 90 | 
|  | 91 	def setImageSize(self, width, height): | 
|  | 92 		if width != None: | 
|  | 93 			self.width = width | 
|  | 94 		if height != None: | 
|  | 95 			self.height = height | 
|  | 96 | 
| 18 | 97 	def setArial(self, arial): | 
|  | 98 		self.arial = arial | 
|  | 99 | 
| 6 | 100 	def _checkOptions(self): | 
|  | 101 		if not self.parsers: | 
|  | 102 			self.logAndRaise("ERROR: Missing input file names") | 
|  | 103 | 
|  | 104 	def _logAndRaise(self, errorMsg): | 
|  | 105 		self.log.error(errorMsg) | 
|  | 106 		raise Exception(errorMsg) | 
|  | 107 | 
|  | 108 	def _loadRegions(self, fileName): | 
|  | 109 		self.regions = {} | 
|  | 110 		parser       = GffParser(fileName, self.verbosity) | 
|  | 111 		for transcript in parser.getIterator(): | 
|  | 112 			chromosome = transcript.getChromosome() | 
|  | 113 			start      = transcript.getStart() | 
|  | 114 			end        = transcript.getEnd() | 
|  | 115 			name       = transcript.getName() | 
|  | 116 			if chromosome not in self.regions: | 
|  | 117 				self.regions[chromosome] = {} | 
|  | 118 			if start not in self.regions[chromosome]: | 
|  | 119 				self.regions[chromosome][start] = {} | 
|  | 120 			if end not in self.regions[chromosome][start]: | 
|  | 121 				self.regions[chromosome][start][end] = [] | 
|  | 122 			self.regions[chromosome][start][end].append(name) | 
|  | 123 | 
|  | 124 	def _getRegions(self, transcript): | 
|  | 125 		if self.regions == None: | 
|  | 126 			return [DEFAULT_REGION] | 
|  | 127 		chromosome = transcript.getChromosome() | 
|  | 128 		start      = transcript.getStart() | 
|  | 129 		end        = transcript.getEnd() | 
|  | 130 		if chromosome not in self.regions: | 
|  | 131 			return [] | 
|  | 132 		names = [] | 
|  | 133 		for loadedStart in sorted(self.regions[chromosome].keys()): | 
|  | 134 			if loadedStart > end: | 
|  | 135 				return names | 
|  | 136 			for loadedEnd in reversed(sorted(self.regions[chromosome][loadedStart].keys())): | 
|  | 137 				if loadedEnd < start: | 
|  | 138 					break | 
|  | 139 				names.extend(self.regions[chromosome][loadedStart][loadedEnd]) | 
|  | 140 		return names | 
|  | 141 | 
|  | 142 	def _parse(self, name): | 
|  | 143 		progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity) | 
|  | 144 		for transcript in self.parsers[name].getIterator(): | 
|  | 145 			if transcript.__class__.__name__ == "Mapping": | 
|  | 146 				transcript = transcript.getTranscript() | 
|  | 147 			regions = self._getRegions(transcript) | 
|  | 148 			for region in regions: | 
|  | 149 				if region not in self.sizes: | 
|  | 150 					self.sizes[region] = {} | 
|  | 151 				if name not in self.sizes[region]: | 
|  | 152 					self.sizes[region][name] = {} | 
|  | 153 				size = transcript.getSize() | 
|  | 154 				if (self.minSize == None or size >= self.minSize) and (self.maxSize == None or size <= self.maxSize): | 
|  | 155 					nbElements                     = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1 | 
|  | 156 					nbElements                    *= self.factors.get(name, 1) | 
|  | 157 					self.sizes[region][name][size] = self.sizes[region][name].get(size, 0) + nbElements | 
|  | 158 			progress.inc() | 
|  | 159 		progress.done() | 
|  | 160 		if self.minSize == None: | 
|  | 161 			self.minSize = min([min(self.sizes[region][name].keys()) for name in self.names for region in region]) | 
|  | 162 		if self.maxSize == None: | 
|  | 163 			self.maxSize = max([max(self.sizes[region][name].keys()) for name in self.names for region in region]) | 
|  | 164 | 
|  | 165 	def _checkQuorum(self, region): | 
|  | 166 		return (max([sum(self.sizes[region][name].values()) for name in self.sizes[region]]) > 0) | 
|  | 167 | 
|  | 168 	def _writeData(self, region): | 
|  | 169 		self.tmpDatName = "tmpFile%d.dat" % (self.number) | 
|  | 170 		handle          = open(self.tmpDatName, "w") | 
|  | 171 		handle.write("Size\tCount\tSample\n") | 
|  | 172 		for name in self.sizes[region]: | 
|  | 173 			for size in sorted(self.sizes[region][name].keys()): | 
|  | 174 				handle.write("%d\t%d\t\"%s\"\n" % (size, self.sizes[region][name].get(size, 0), name)) | 
|  | 175 		handle.close() | 
|  | 176 | 
|  | 177 	def _writeScript(self, region): | 
|  | 178 		self.tmpRName = "tmpFile%d.R" % (self.number) | 
|  | 179 		fileName      = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region) | 
|  | 180 		colors        = "scale_fill_brewer(palette=\"Set1\")" if self.colors == None else "scale_fill_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors])) | 
| 18 | 181 		title         = "" if region == DEFAULT_REGION else " + labs(title = \"Sizes of %s\")" % (region) | 
| 6 | 182 		handle        = open(self.tmpRName, "w") | 
| 18 | 183 		arial         = ", text = element_text(family=\"Arial\", size=20)" if self.arial else "" | 
|  | 184 		if self.arial: | 
|  | 185 			handle.write("library(extrafont)\nloadfonts()\n") | 
| 6 | 186 		handle.write("library(ggplot2)\n") | 
|  | 187 		handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName)) | 
|  | 188 		handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names]))) | 
|  | 189 		handle.write("data$Size <- factor(data$Size, levels=c(%s))\n" % (", ".join(["%d" % (size) for size in range(self.minSize, self.maxSize+1)]))) | 
|  | 190 		handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height)) | 
| 18 | 191 		handle.write("ggplot(data, aes(x = Size, y = Count, fill = Size)) %s + geom_bar(stat = \"identity\") + facet_grid(. ~ Sample, space=\"free_x\") + xlab(\"%s\") + ylab(\"%s\") + %s + theme(legend.position = \"none\", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()%s)\n" % (title, self.xLab, self.yLab, colors, arial)) | 
| 6 | 192 		handle.write("dev.off()\n") | 
|  | 193 | 
|  | 194 	def _runR(self): | 
| 18 | 195 		rCommand = os.environ["SMARTRPATH"] if "SMARTRPATH" in os.environ else "R" | 
|  | 196 		command  = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName) | 
|  | 197 		status   = subprocess.call(command, shell=True) | 
| 6 | 198 		if status != 0: | 
|  | 199 			raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status)) | 
|  | 200 | 
|  | 201 	def _plot(self): | 
|  | 202 		progress = Progress(len(self.sizes), "Plotting data", self.verbosity) | 
|  | 203 		for region in self.sizes: | 
|  | 204 			if not self._checkQuorum(region): | 
|  | 205 				self.log.info("Not displaying '%s' for it contains no data." % (region)) | 
|  | 206 			else: | 
|  | 207 				self._writeData(region) | 
|  | 208 				self._writeScript(region) | 
|  | 209 				self._runR() | 
|  | 210 			progress.inc() | 
|  | 211 		progress.done() | 
|  | 212 | 
|  | 213 	def _cleanFiles(self): | 
|  | 214 		for fileName in (self.tmpDatName, self.tmpRName): | 
|  | 215 			if fileName != None and os.path.exists(fileName): | 
|  | 216 				os.remove(fileName) | 
|  | 217 				for otherFileName in glob.glob("%s*" % (fileName)): | 
|  | 218 					os.remove(otherFileName) | 
|  | 219 | 
|  | 220 	def run(self): | 
|  | 221 		LoggerFactory.setLevel(self.log, self.verbosity) | 
|  | 222 		self._checkOptions() | 
|  | 223 		self.log.info("START Get Read Sizes") | 
|  | 224 		for name in self.names: | 
|  | 225 			self._parse(name) | 
|  | 226 		self._plot() | 
|  | 227 		self._cleanFiles() | 
|  | 228 		self.log.info("END Get Read Sizes") | 
|  | 229 | 
|  | 230 | 
|  | 231 if __name__ == "__main__": | 
|  | 232 	description = "Usage: GetReadSizes.py [options]\n\nGet Read Sizes v1.0.1: Get the sizes of a set of reads. [Category: Personal]\n" | 
|  | 233 	epilog = "" | 
|  | 234 	parser = RepetOptionParser(description = description, epilog = epilog) | 
|  | 235 	parser.add_option("-i", "--input",     dest="inputFileNames",  action="store",      default=None,     type="string", help="input files, separated by commas [compulsory] [format: string]") | 
|  | 236 	parser.add_option("-f", "--format",    dest="format",          action="store",      default=None,     type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") | 
|  | 237 	parser.add_option("-n", "--names",     dest="names",           action="store",      default=None,     type="string", help="name of the input data, separated by commas [compulsory] [format: string]") | 
|  | 238 	parser.add_option("-o", "--output",    dest="outputFileName",  action="store",      default=None,      type="string", help="output file [format: output file in PNG format]") | 
|  | 239 	parser.add_option("-s", "--minSize",   dest="minSize",         action="store",      default=None,      type="int",    help="minimum size [format: int]") | 
|  | 240 	parser.add_option("-S", "--maxSize",   dest="maxSize",         action="store",      default=None,      type="int",    help="maximum size [format: int]") | 
|  | 241 	parser.add_option("-l", "--xLabel",    dest="xLab",            action="store",      default="Size",    type="string", help="x-axis label name [format: string] [default: Size]") | 
|  | 242 	parser.add_option("-L", "--yLabel",    dest="yLab",            action="store",      default="# reads", type="string", help="y-axis label name [format: string] [default: Reads]") | 
|  | 243 	parser.add_option("-c", "--colors",    dest="colors",          action="store",      default=None,      type="string", help="colors of the bars, separated by commas  [format: string]") | 
|  | 244 	parser.add_option("-a", "--factors",   dest="factors",         action="store",      default=None,      type="string", help="normalization factors, separated by commas  [format: string]") | 
|  | 245 	parser.add_option("-r", "--regions",   dest="regionsFileName", action="store",      default=None,      type="string", help="regions to plot [format: transcript file in GFF format]") | 
|  | 246 	parser.add_option("-z", "--width",     dest="width",           action="store",      default=800,       type="int",    help="width of the image [format: int] [default: 800]") | 
|  | 247 	parser.add_option("-Z", "--height",    dest="height",          action="store",      default=300,       type="int",    help="height of the image [format: int] [default: 300]") | 
| 18 | 248 	parser.add_option("-A", "--arial",     dest="arial",           action="store_true", default=False,                    help="use Arial font [format: boolean] [default: false]") | 
| 6 | 249 	parser.add_option("-v", "--verbosity", dest="verbosity",       action="store",      default=1,         type="int",    help="trace level [format: int]") | 
|  | 250 	options = parser.parse_args()[0] | 
|  | 251 	iGetReadSizes = GetReadSizes(options.verbosity) | 
|  | 252 	iGetReadSizes.setNames(options.names.split(",")) | 
|  | 253 	iGetReadSizes.setInputFiles(options.inputFileNames.split(","), options.format) | 
|  | 254 	iGetReadSizes.setOutputFileName(options.outputFileName) | 
|  | 255 	iGetReadSizes.setLabs(options.xLab, options.yLab) | 
|  | 256 	iGetReadSizes.setSizes(options.minSize, options.maxSize) | 
|  | 257 	iGetReadSizes.setColors(None if options.colors == None else options.colors.split(",")) | 
|  | 258 	iGetReadSizes.setFactors(None if options.factors == None else map(float, options.factors.split(","))) | 
|  | 259 	iGetReadSizes.setRegionsFile(options.regionsFileName) | 
|  | 260 	iGetReadSizes.setImageSize(options.width, options.height) | 
| 18 | 261 	iGetReadSizes.setArial(options.arial) | 
| 6 | 262 	iGetReadSizes.run() |