view SMART/Java/Python/GetReadSizes.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents 769e306b7933
children
line wrap: on
line source

#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import random, os, glob, subprocess
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.GffParser import GffParser
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc import Utils
from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser

# Prefix of the logger name built in GetReadSizes.__init__ ("<LOG_DEPTH>.<class name>").
LOG_DEPTH      = "smart"
# Name of the single pseudo-region used when no regions file has been loaded.
DEFAULT_REGION = "_all_"

class GetReadSizes(object):
	"""Count reads per size for one or several samples and plot them with R.

	Typical use: call setNames() before setInputFiles() and setFactors()
	(both index their argument by the sample names), set the output file
	name with setOutputFileName(), then call run().  One PNG is produced per
	region (or a single one when no regions file is given) by writing a
	temporary data file and a temporary R script (ggplot2) in the current
	directory and executing the script with "R CMD BATCH".
	"""

	def __init__(self, verbosity = 0):
		"""Initialise default settings.

		verbosity -- trace level forwarded to the logger, parsers and
		             progress bars.
		"""
		self.xLab       = "Size"
		self.yLab       = "# reads"
		self.verbosity  = verbosity
		# Random suffix used to build the names of the temporary files.
		self.number     = random.randint(0, 100000)
		self.log        = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
		self.parsers    = {}
		# region name -> sample name -> size -> (weighted) number of reads
		self.sizes      = {}
		self.factors    = {}
		self.regions    = None
		self.tmpDatName = None
		self.tmpRName   = None
		self.width      = 800
		self.height     = 300
		self.arial      = False
		# Defaults for settings that previously only existed once their
		# setter had been called (reading them raised AttributeError).
		self.names      = []
		self.minSize    = None
		self.maxSize    = None
		self.colors     = None

	def setNames(self, names):
		"""Set the sample names (one per input file, in the same order)."""
		self.names = names

	def setInputFiles(self, fileNames, format):
		"""Create one parser per input file, keyed by the matching sample name.

		setNames() must have been called first: the i-th file is associated
		with the i-th name.
		"""
		chooser = ParserChooser(self.verbosity)
		chooser.findFormat(format)
		for cpt, fileName in enumerate(fileNames):
			self.parsers[self.names[cpt]] = chooser.getParser(fileName)

	def setOutputFileName(self, fileName):
		"""Set the name of the PNG file to produce."""
		self.outputFileName = fileName

	def setLabs(self, xLab, yLab):
		"""Set the labels of the x and y axes."""
		self.xLab = xLab
		self.yLab = yLab

	def setSizes(self, minSize, maxSize):
		"""Set the size bounds (inclusive); None means "use the observed bound"."""
		self.minSize = minSize
		self.maxSize = maxSize

	def setColors(self, colors):
		"""Set the list of bar colors, or None to use the "Set1" brewer palette."""
		self.colors = colors

	def setFactors(self, factors):
		"""Set the per-sample normalization factors.

		factors -- iterable of numbers in the same order as the sample names,
		           or None for no normalization (every factor defaults to 1).
		"""
		if factors is None:
			# zip(names, None) raises TypeError; None simply means "no factors".
			self.factors = {}
		else:
			self.factors = dict(zip(self.names, factors))

	def setRegionsFile(self, fileName):
		"""Load the regions from a GFF file; None keeps the single default region."""
		if fileName is not None:
			self._loadRegions(fileName)

	def setImageSize(self, width, height):
		"""Set the image size; a None value keeps the current width/height."""
		if width is not None:
			self.width = width
		if height is not None:
			self.height = height

	def setArial(self, arial):
		"""Choose whether the R script loads extrafont and uses the Arial family."""
		self.arial = arial

	def _checkOptions(self):
		"""Raise an Exception if no input file has been registered."""
		if not self.parsers:
			# The original called the non-existent self.logAndRaise here.
			self._logAndRaise("ERROR: Missing input file names")

	def _logAndRaise(self, errorMsg):
		"""Log errorMsg as an error and raise it as an Exception."""
		self.log.error(errorMsg)
		raise Exception(errorMsg)

	def _loadRegions(self, fileName):
		"""Index the region names of a GFF file by chromosome, start and end."""
		self.regions = {}
		parser       = GffParser(fileName, self.verbosity)
		for transcript in parser.getIterator():
			chromosome = transcript.getChromosome()
			start      = transcript.getStart()
			end        = transcript.getEnd()
			name       = transcript.getName()
			self.regions.setdefault(chromosome, {}).setdefault(start, {}).setdefault(end, []).append(name)

	def _getRegions(self, transcript):
		"""Return the names of the loaded regions that overlap transcript.

		When no regions file was loaded, the single default region is
		returned.  A region overlaps when it lies on the same chromosome,
		starts at or before the transcript end and ends at or after the
		transcript start.
		"""
		if self.regions is None:
			return [DEFAULT_REGION]
		chromosome = transcript.getChromosome()
		start      = transcript.getStart()
		end        = transcript.getEnd()
		if chromosome not in self.regions:
			return []
		names = []
		for loadedStart in sorted(self.regions[chromosome].keys()):
			# Starts are visited in increasing order: nothing further can overlap.
			if loadedStart > end:
				return names
			# Ends are visited in decreasing order: stop at the first one that is too small.
			for loadedEnd in reversed(sorted(self.regions[chromosome][loadedStart].keys())):
				if loadedEnd < start:
					break
				names.extend(self.regions[chromosome][loadedStart][loadedEnd])
		return names

	def _parse(self, name):
		"""Read the input file of sample name and accumulate its size counts.

		Each read contributes its "nbElements" tag value (1 when the tag is
		absent) multiplied by the normalization factor of the sample, to
		every region it overlaps.  Reads outside the requested size bounds
		are ignored.
		"""
		progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
		for transcript in self.parsers[name].getIterator():
			if transcript.__class__.__name__ == "Mapping":
				transcript = transcript.getTranscript()
			size    = transcript.getSize()
			inRange = (self.minSize is None or size >= self.minSize) and (self.maxSize is None or size <= self.maxSize)
			for region in self._getRegions(transcript):
				# The (region, sample) entry is created even when the read is
				# filtered out, so that the sample is known for this region.
				counts = self.sizes.setdefault(region, {}).setdefault(name, {})
				if inRange:
					nbElements   = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
					nbElements  *= self.factors.get(name, 1)
					counts[size] = counts.get(size, 0) + nbElements
			progress.inc()
		progress.done()

	def _setDefaultSizeRange(self):
		"""Replace unset (None) size bounds by the observed smallest/largest size.

		Must be called once every sample has been parsed: deriving the bounds
		after the first sample only would filter the following samples with
		the range of the first one.  The bounds are left untouched when no
		size was recorded at all (nothing will be plotted in that case).
		"""
		observed = set()
		for samples in self.sizes.values():
			for counts in samples.values():
				observed.update(counts)
		if not observed:
			return
		if self.minSize is None:
			self.minSize = min(observed)
		if self.maxSize is None:
			self.maxSize = max(observed)

	def _checkQuorum(self, region):
		"""Return True if at least one sample has a positive total count in region."""
		return (max([sum(self.sizes[region][name].values()) for name in self.sizes[region]]) > 0)

	def _writeData(self, region):
		"""Write the counts of region to the temporary tab-separated data file."""
		self.tmpDatName = "tmpFile%d.dat" % (self.number)
		with open(self.tmpDatName, "w") as handle:
			handle.write("Size\tCount\tSample\n")
			for name in self.sizes[region]:
				for size in sorted(self.sizes[region][name].keys()):
					# %s rather than %d: counts scaled by a normalization
					# factor may be fractional and must not be truncated.
					handle.write("%d\t%s\t\"%s\"\n" % (size, self.sizes[region][name][size], name))

	def _writeScript(self, region):
		"""Write the temporary R script that plots the data file of region.

		The output is the configured file for the default region, and
		"<output base name>_<region>.png" otherwise.
		"""
		self.tmpRName = "tmpFile%d.R" % (self.number)
		fileName      = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
		colors        = "scale_fill_brewer(palette=\"Set1\")" if self.colors is None else "scale_fill_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors]))
		title         = "" if region == DEFAULT_REGION else " + labs(title = \"Sizes of %s\")" % (region)
		arial         = ", text = element_text(family=\"Arial\", size=20)" if self.arial else ""
		# The with-block guarantees the script is flushed and closed before R reads it.
		with open(self.tmpRName, "w") as handle:
			if self.arial:
				handle.write("library(extrafont)\nloadfonts()\n")
			handle.write("library(ggplot2)\n")
			handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
			handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
			handle.write("data$Size <- factor(data$Size, levels=c(%s))\n" % (", ".join(["%d" % (size) for size in range(self.minSize, self.maxSize+1)])))
			handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
			handle.write("ggplot(data, aes(x = Size, y = Count, fill = Size)) %s + geom_bar(stat = \"identity\") + facet_grid(. ~ Sample, space=\"free_x\") + xlab(\"%s\") + ylab(\"%s\") + %s + theme(legend.position = \"none\", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()%s)\n" % (title, self.xLab, self.yLab, colors, arial))
			handle.write("dev.off()\n")

	def _runR(self):
		"""Execute the temporary R script with "R CMD BATCH".

		The R executable is taken from the SMARTRPATH environment variable
		when it is set, and is "R" otherwise.  Raises an Exception when the
		command exits with a non-zero status.
		"""
		rCommand = os.environ["SMARTRPATH"] if "SMARTRPATH" in os.environ else "R"
		command  = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
		status   = subprocess.call(command, shell=True)
		if status != 0:
			raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))

	def _plot(self):
		"""Produce one plot per region that contains data."""
		progress = Progress(len(self.sizes), "Plotting data", self.verbosity)
		for region in self.sizes:
			if not self._checkQuorum(region):
				self.log.info("Not displaying '%s' for it contains no data." % (region))
			else:
				self._writeData(region)
				self._writeScript(region)
				self._runR()
			progress.inc()
		progress.done()

	def _cleanFiles(self):
		"""Remove the temporary data file and R script, plus files sharing their prefix."""
		for fileName in (self.tmpDatName, self.tmpRName):
			if fileName is not None and os.path.exists(fileName):
				os.remove(fileName)
				# Also removes by-products named after the file (same prefix).
				for otherFileName in glob.glob("%s*" % (fileName)):
					os.remove(otherFileName)

	def run(self):
		"""Parse every sample, plot the size distributions and clean up."""
		LoggerFactory.setLevel(self.log, self.verbosity)
		self._checkOptions()
		self.log.info("START Get Read Sizes")
		for name in self.names:
			self._parse(name)
		# Unset bounds are derived from all the samples, once they are all read.
		self._setDefaultSizeRange()
		self._plot()
		self._cleanFiles()
		self.log.info("END Get Read Sizes")


if __name__ == "__main__":
	# Command-line entry point: parse the options, configure a GetReadSizes
	# instance accordingly and run it.
	description = "Usage: GetReadSizes.py [options]\n\nGet Read Sizes v1.0.1: Get the sizes of a set of reads. [Category: Personal]\n"
	epilog = ""
	parser = RepetOptionParser(description = description, epilog = epilog)
	parser.add_option("-i", "--input",     dest="inputFileNames",  action="store",      default=None,     type="string", help="input files, separated by commas [compulsory] [format: string]")
	parser.add_option("-f", "--format",    dest="format",          action="store",      default=None,     type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
	parser.add_option("-n", "--names",     dest="names",           action="store",      default=None,     type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
	parser.add_option("-o", "--output",    dest="outputFileName",  action="store",      default=None,      type="string", help="output file [format: output file in PNG format]")
	parser.add_option("-s", "--minSize",   dest="minSize",         action="store",      default=None,      type="int",    help="minimum size [format: int]")
	parser.add_option("-S", "--maxSize",   dest="maxSize",         action="store",      default=None,      type="int",    help="maximum size [format: int]")
	parser.add_option("-l", "--xLabel",    dest="xLab",            action="store",      default="Size",    type="string", help="x-axis label name [format: string] [default: Size]")
	parser.add_option("-L", "--yLabel",    dest="yLab",            action="store",      default="# reads", type="string", help="y-axis label name [format: string] [default: # reads]")
	parser.add_option("-c", "--colors",    dest="colors",          action="store",      default=None,      type="string", help="colors of the bars, separated by commas  [format: string]")
	parser.add_option("-a", "--factors",   dest="factors",         action="store",      default=None,      type="string", help="normalization factors, separated by commas  [format: string]")
	parser.add_option("-r", "--regions",   dest="regionsFileName", action="store",      default=None,      type="string", help="regions to plot [format: transcript file in GFF format]")
	parser.add_option("-z", "--width",     dest="width",           action="store",      default=800,       type="int",    help="width of the image [format: int] [default: 800]")
	parser.add_option("-Z", "--height",    dest="height",          action="store",      default=300,       type="int",    help="height of the image [format: int] [default: 300]")
	parser.add_option("-A", "--arial",     dest="arial",           action="store_true", default=False,                    help="use Arial font [format: boolean] [default: false]")
	parser.add_option("-v", "--verbosity", dest="verbosity",       action="store",      default=1,         type="int",    help="trace level [format: int]")
	options = parser.parse_args()[0]
	# The options documented as compulsory default to None; without this check
	# a missing one surfaced as "'NoneType' object has no attribute 'split'".
	# NOTE(review): relies on RepetOptionParser providing optparse's error()
	# method (prints the message and exits) -- confirm.
	for optionName, flag in (("inputFileNames", "-i"), ("format", "-f"), ("names", "-n")):
		if getattr(options, optionName) is None:
			parser.error("option %s is compulsory" % (flag))
	iGetReadSizes = GetReadSizes(options.verbosity)
	# Names first: input files and factors are matched to them by position.
	iGetReadSizes.setNames(options.names.split(","))
	iGetReadSizes.setInputFiles(options.inputFileNames.split(","), options.format)
	iGetReadSizes.setOutputFileName(options.outputFileName)
	iGetReadSizes.setLabs(options.xLab, options.yLab)
	iGetReadSizes.setSizes(options.minSize, options.maxSize)
	iGetReadSizes.setColors(None if options.colors is None else options.colors.split(","))
	# Only set the factors when some were given: passing None made the setter
	# fail, and the instance already starts with no factor (i.e. factor 1).
	if options.factors is not None:
		iGetReadSizes.setFactors([float(factor) for factor in options.factors.split(",")])
	iGetReadSizes.setRegionsFile(options.regionsFileName)
	iGetReadSizes.setImageSize(options.width, options.height)
	iGetReadSizes.setArial(options.arial)
	iGetReadSizes.run()