view SMART/Java/Python/GetReadDistribution.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents
children
line wrap: on
line source

#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import random, os, glob, subprocess
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.GffParser import GffParser
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc import Utils
from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser

# Prefix of the logger name created for the classes of this module.
LOG_DEPTH      = "smart"
# Name of the single pseudo-region used when no regions file is provided.
DEFAULT_REGION = "_all_"
# Suffix appended to the x-axis label, indexed by the divisor applied to the
# genomic positions.  Positions divided by 1,000,000 are expressed in Mbp
# (10^6 bp); the former " (in Gbp)" label was off by a factor of 1000.
MULTIPLE_STR   = {1: "", 1000: " (in kbp)", 1000000: " (in Mbp)"}

class GetReadDistribution(object):
	"""Bin reads along the chromosomes and plot the per-bin counts with R.

	Intended use: call the set* methods (setNames first, because
	setInputFiles and setFactors rely on the names), then run().  For every
	region, run() writes a temporary tab-separated data file and a temporary
	R script in the current directory, executes the script with
	"R CMD BATCH" and finally removes the temporary files.
	"""

	def __init__(self, verbosity = 0):
		"""Create a plotter with default settings.

		verbosity -- trace level given to the logger, the parsers and the
		             progress reporters
		"""
		self.xLab           = ""
		self.yLab           = "# reads"
		self.verbosity      = verbosity
		# Suffix of the temporary file names; random, hence not guaranteed unique.
		self.number         = random.randint(0, 100000)
		self.log            = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
		# sample name -> parser
		self.parsers        = {}
		# region -> sample name -> chromosome -> strand -> bin index -> count
		self.distribution   = {}
		# sample name -> normalization factor
		self.factors        = {}
		# chromosome -> start -> end -> [region names], or None when no regions file is set
		self.regions        = None
		self.tmpDatName     = None
		self.tmpRName       = None
		self.quorum         = 1
		self.strands        = False
		self.width          = 800
		self.height         = 300
		self.arial          = False
		# Defaults for the values normally provided through the setters, so that
		# a forgotten setter does not end in an AttributeError later on.
		self.names          = []
		self.outputFileName = None
		self.binSize        = 10000
		self.colors         = None
		self.multiple       = False

	def setNames(self, names):
		"""Set the sample names (one per input file, in the same order)."""
		self.names = names

	def setInputFiles(self, fileNames, format):
		"""Create one parser per input file and associate it with the sample name of the same rank.

		fileNames -- list of input file names
		format    -- format name given to ParserChooser.findFormat

		setNames must have been called before with at least as many names.
		"""
		chooser = ParserChooser(self.verbosity)
		chooser.findFormat(format)
		for cpt, fileName in enumerate(fileNames):
			self.parsers[self.names[cpt]] = chooser.getParser(fileName)

	def setOutputFileName(self, fileName):
		"""Set the name of the PNG file written by R (used as a prefix when regions are plotted)."""
		self.outputFileName = fileName

	def setLabs(self, xLab, yLab):
		"""Set the labels of the x- and y-axis."""
		self.xLab = xLab
		self.yLab = yLab

	def setBinSize(self, binSize):
		"""Set the size of the bins, in base pairs."""
		self.binSize = binSize

	def setColors(self, colors):
		"""Set the list of colors used for the samples (None: use the "Set1" brewer palette)."""
		self.colors = colors

	def setFactors(self, factors):
		"""Set the normalization factors, one per sample name (None: every factor is 1.0).

		setNames must have been called before.
		"""
		if factors is None:
			self.factors = dict.fromkeys(self.names, 1.0)
		else:
			self.factors = dict(zip(self.names, factors))

	def setMultiple(self, boolean):
		"""Choose whether the positions of the x-axis may be divided by 1000 or 1000000."""
		self.multiple = boolean

	def setImageSize(self, width, height):
		"""Set the size of the image; a None value keeps the current dimension."""
		if width is not None:
			self.width = width
		if height is not None:
			self.height = height

	def setQuorum(self, quorum):
		"""Set the minimum value the highest bin of a region must reach for the region to be plotted (None: no filter)."""
		self.quorum = quorum

	def setRegionsFile(self, fileName):
		"""Load the regions to plot from a GFF file; None means no regions file."""
		if fileName is not None:
			self._loadRegions(fileName)

	def setBothStrands(self, strands):
		"""Choose whether the reads are counted separately for each direction."""
		self.strands = strands

	def setArial(self, arial):
		"""Choose whether the R script uses the Arial font (through the extrafont library)."""
		self.arial = arial

	def _checkOptions(self):
		"""Raise an Exception if no input file has been set."""
		if not self.parsers:
			# The method is _logAndRaise: the former call to "logAndRaise" failed with an AttributeError.
			self._logAndRaise("ERROR: Missing input file names")

	def _logAndRaise(self, errorMsg):
		"""Log the message as an error, then raise an Exception carrying it."""
		self.log.error(errorMsg)
		raise Exception(errorMsg)

	def _loadRegions(self, fileName):
		"""Read the GFF file and index the region names by chromosome, start and end."""
		self.regions = {}
		parser       = GffParser(fileName, self.verbosity)
		for transcript in parser.getIterator():
			byStart = self.regions.setdefault(transcript.getChromosome(), {})
			byEnd   = byStart.setdefault(transcript.getStart(), {})
			byEnd.setdefault(transcript.getEnd(), []).append(transcript.getName())

	def _getRegions(self, transcript):
		"""Return the names of the loaded regions which overlap the transcript.

		Without regions file, the single pseudo-region DEFAULT_REGION is returned.
		"""
		if self.regions is None:
			return [DEFAULT_REGION]
		chromosome = transcript.getChromosome()
		start      = transcript.getStart()
		end        = transcript.getEnd()
		if chromosome not in self.regions:
			return []
		names   = []
		byStart = self.regions[chromosome]
		for loadedStart in sorted(byStart.keys()):
			# Starts are visited in increasing order: no further region can overlap.
			if loadedStart > end:
				return names
			# Ends are visited in decreasing order: stop at the first one located before the transcript.
			for loadedEnd in reversed(sorted(byStart[loadedStart].keys())):
				if loadedEnd < start:
					break
				names.extend(byStart[loadedStart][loadedEnd])
		return names

	def _binIntervals(self, intervals):
		"""Return the indices of the bins touched by a list of (start, end) intervals.

		The intervals are processed in the given order.  A bin index is not
		repeated when it is equal to the last index emitted, so that two
		consecutive intervals falling in the same bin count only once.
		Intervals whose end is lower than their start are ignored.
		"""
		bins        = []
		previousBin = None
		for start, end in intervals:
			if end < start:
				continue
			# Floor division gives the same bin indices under Python 2 and Python 3.
			for binId in range(start // self.binSize, end // self.binSize + 1):
				if binId != previousBin:
					bins.append(binId)
					previousBin = binId
		return bins

	def _addTranscript(self, name, transcript, regions):
		"""Add the weight of a transcript to the bins it covers, for each given region."""
		chromosome  = transcript.getChromosome()
		nbElements  = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
		nbElements *= self.factors.get(name, 1)
		strand      = transcript.getDirection() if self.strands else 1
		# The bins depend on the transcript only: compute them once for all the regions.
		bins        = self._binIntervals([(exon.getStart(), exon.getEnd()) for exon in transcript.getExons()])
		for region in regions:
			counts = self.distribution.setdefault(region, {}).setdefault(name, {}).setdefault(chromosome, {}).setdefault(strand, {})
			for binId in bins:
				counts[binId] = counts.get(binId, 0) + nbElements

	def _parse(self, name):
		"""Read the input file of a sample and accumulate its reads into self.distribution."""
		progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
		for transcript in self.parsers[name].getIterator():
			if transcript.__class__.__name__ == "Mapping":
				transcript = transcript.getTranscript()
			regions = self._getRegions(transcript)
			if regions:
				self._addTranscript(name, transcript, regions)
			progress.inc()
		progress.done()

	def _iterStrandCounts(self, region):
		"""Yield every "bin index -> count" dictionary stored for a region."""
		for chromosomes in self.distribution[region].values():
			for strands in chromosomes.values():
				for counts in strands.values():
					yield counts

	def _checkQuorum(self, region):
		"""Tell whether the highest bin of the region reaches the quorum.

		Return True when no quorum is set, and False when the region has no
		bin at all.  The counts compared are the stored ones, i.e. after
		multiplication by the normalization factors.
		"""
		if self.quorum is None:
			return True
		values = [count for counts in self._iterStrandCounts(region) for count in counts.values()]
		if not values:
			return False
		return max(values) >= self.quorum

	def _writeData(self, region):
		"""Write the counts of a region to the temporary tab-separated data file."""
		self.tmpDatName = "tmpFile%d.dat" % (self.number)
		with open(self.tmpDatName, "w") as handle:
			handle.write("Chr\tPos\tStrand\tCount\tSample\n")
			for name in self.distribution[region]:
				chromosomes = self.distribution[region][name]
				for chromosome in sorted(chromosomes.keys()):
					for strand in sorted(chromosomes[chromosome].keys()):
						counts = chromosomes[chromosome][strand]
						for pos in sorted(counts.keys()):
							# The count is multiplied by the strand, so the sign of the strand value becomes the sign of the count.
							# NOTE(review): "%d" truncates fractional counts produced by normalization factors -- confirm this is intended.
							handle.write("%s\t%d\t%d\t%d\t\"%s\"\n" % (chromosome, pos * self.binSize, strand, counts[pos] * strand, name))

	def _findMultiple(self, region):
		"""Return the divisor (1, 1000 or 1000000) to apply to the positions of the x-axis."""
		if not self.multiple:
			return 1
		positions = [binId for counts in self._iterStrandCounts(region) for binId in counts]
		if not positions:
			return 1
		maxPosition = max(positions) * self.binSize
		if maxPosition > 2000000:
			return 1000000
		elif maxPosition > 2000:
			return 1000
		return 1

	def _writeScript(self, region):
		"""Write the temporary R script which plots the data file of a region.

		The image is self.outputFileName for the default region, and
		"<output file name without extension>_<region>.png" otherwise.
		"""
		self.tmpRName = "tmpFile%d.R" % (self.number)
		fileName      = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
		if self.colors is None:
			colors = "scale_fill_brewer(palette=\"Set1\") + scale_color_brewer(palette=\"Set1\")"
		else:
			colorList = ", ".join(["\"%s\"" % (color) for color in self.colors])
			colors    = "scale_fill_manual(values = c(%s)) + scale_color_manual(values = c(%s))" % (colorList, colorList)
		title         = "" if region == DEFAULT_REGION else " + labs(title = \"Distribution of %s\") " % (region)
		facet         = "Sample ~ Chr" if region == DEFAULT_REGION else "Sample ~ ."
		multiple      = self._findMultiple(region)
		arial         = ", text = element_text(family=\"Arial\", size=20)" if self.arial else ""
		# The "with" block guarantees that the script is flushed and closed before R reads it.
		with open(self.tmpRName, "w") as handle:
			if self.arial:
				handle.write("library(extrafont)\nloadfonts()\n")
			handle.write("library(ggplot2)\n")
			handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
			handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
			handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
			handle.write("ggplot(data, aes(x = Pos/%d, y = Count, fill = Sample, color = Sample)) %s + geom_bar(stat = \"identity\") + facet_grid(%s, space=\"free\") + xlab(\"%s%s\") + ylab(\"%s\") + %s + theme(legend.position = \"none\", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()%s)\n" % (multiple, title, facet, self.xLab, MULTIPLE_STR[multiple], self.yLab, colors, arial))
			handle.write("dev.off()\n")

	def _runR(self):
		"""Execute the temporary R script with "R CMD BATCH".

		The R executable is read from the SMARTRPATH environment variable
		when it is set, and is "R" otherwise.  An Exception is raised when
		the command returns a non-zero status.
		"""
		rCommand = os.environ.get("SMARTRPATH", "R")
		command  = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
		status   = subprocess.call(command, shell=True)
		if status != 0:
			raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))

	def _plot(self):
		"""Plot every region which passes the quorum check."""
		progress = Progress(len(self.distribution), "Plotting data", self.verbosity)
		for region in self.distribution:
			if not self._checkQuorum(region):
				self.log.info("Not displaying '%s' for it contains insufficient data." % (region))
			else:
				self._writeData(region)
				self._writeScript(region)
				self._runR()
			progress.inc()
		progress.done()

	def _cleanFiles(self):
		"""Remove the temporary data file and R script, and the files whose name starts with theirs."""
		for fileName in (self.tmpDatName, self.tmpRName):
			if fileName is not None and os.path.exists(fileName):
				os.remove(fileName)
				# Also removes the files derived from the temporary name (same prefix).
				for otherFileName in glob.glob("%s*" % (fileName)):
					os.remove(otherFileName)

	def run(self):
		"""Parse every input file, plot the distributions and remove the temporary files."""
		LoggerFactory.setLevel(self.log, self.verbosity)
		self._checkOptions()
		self.log.info("START Get Read Distribution")
		for name in self.names:
			self._parse(name)
		self._plot()
		self._cleanFiles()
		self.log.info("END Get Read Distribution")


if __name__ == "__main__":
	# Command-line entry point: parse the options, configure a GetReadDistribution instance and run it.
	description = "Usage: GetReadDistribution.py [options]\n\nGet Read Distribution v1.0.1: Get the distribution of a set of reads. [Category: Personal]\n"
	epilog = ""
	parser = RepetOptionParser(description = description, epilog = epilog)
	parser.add_option("-i", "--input",     dest="inputFileNames",  action="store",      default=None,      type="string", help="input files, separated by commas [compulsory] [format: string]")
	parser.add_option("-f", "--format",    dest="format",          action="store",      default=None,      type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
	parser.add_option("-n", "--names",     dest="names",           action="store",      default=None,      type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
	parser.add_option("-o", "--output",    dest="outputFileName",  action="store",      default=None,      type="string", help="output file [format: output file in PNG format]")
	parser.add_option("-s", "--binSize",   dest="binSize",         action="store",      default=10000,     type="int",    help="bin size [format: int] [default: 10000]")
	parser.add_option("-l", "--xLabel",    dest="xLab",            action="store",      default="",        type="string", help="x-axis label name [format: string]")
	parser.add_option("-L", "--yLabel",    dest="yLab",            action="store",      default="# reads", type="string", help="y-axis label name [format: string] [default: # reads]")
	parser.add_option("-c", "--colors",    dest="colors",          action="store",      default=None,      type="string", help="colors of the bars, separated by commas  [format: string]")
	parser.add_option("-a", "--factors",   dest="factors",         action="store",      default=None,      type="string", help="normalization factors, separated by commas  [format: string]")
	parser.add_option("-r", "--regions",   dest="regionsFileName", action="store",      default=None,      type="string", help="regions to plot [format: transcript file in GFF format]")
	parser.add_option("-2", "--strands",   dest="strands",         action="store_true", default=False,                    help="plot negative strands on the negative y-axis [format: boolean] [default: False]")
	parser.add_option("-m", "--multiple",  dest="multiple",        action="store_true", default=False,                    help="use human readable genomic positions (k, M) [format: boolean] [default: False]")
	parser.add_option("-q", "--quorum",    dest="quorum",          action="store",      default=1,         type="int",    help="minimum number of intervals to plot a region [format: int] [default: 1]")
	parser.add_option("-z", "--width",     dest="width",           action="store",      default=800,       type="int",    help="width of the image [format: int] [default: 800]")
	parser.add_option("-Z", "--height",    dest="height",          action="store",      default=300,       type="int",    help="height of the image [format: int] [default: 300]")
	parser.add_option("-A", "--arial",     dest="arial",           action="store_true", default=False,                    help="use Arial font [format: boolean] [default: false]")
	parser.add_option("-v", "--verbosity", dest="verbosity",       action="store",      default=1,         type="int",    help="trace level [format: int]")
	options = parser.parse_args()[0]
	# The compulsory options default to None: report a missing one explicitly
	# instead of failing on None.split() below.
	# NOTE(review): assumes RepetOptionParser provides optparse's error() method,
	# as it already provides add_option() and parse_args() -- confirm.
	for optionName, flag in (("inputFileNames", "-i/--input"), ("format", "-f/--format"), ("names", "-n/--names")):
		if getattr(options, optionName) is None:
			parser.error("option %s is compulsory" % (flag))
	names      = options.names.split(",")
	fileNames  = options.inputFileNames.split(",")
	# Names and files are matched by rank, so both lists must have the same length.
	if len(names) != len(fileNames):
		parser.error("got %d name(s) for %d input file(s)" % (len(names), len(fileNames)))
	# Build real lists (not lazy map objects) so that the values can be read several times.
	colors     = None if options.colors is None else options.colors.split(",")
	factors    = None if options.factors is None else [float(factor) for factor in options.factors.split(",")]
	iGetReadDistribution = GetReadDistribution(options.verbosity)
	iGetReadDistribution.setNames(names)
	iGetReadDistribution.setInputFiles(fileNames, options.format)
	iGetReadDistribution.setOutputFileName(options.outputFileName)
	iGetReadDistribution.setLabs(options.xLab, options.yLab)
	iGetReadDistribution.setBinSize(options.binSize)
	iGetReadDistribution.setColors(colors)
	iGetReadDistribution.setFactors(factors)
	iGetReadDistribution.setRegionsFile(options.regionsFileName)
	iGetReadDistribution.setMultiple(options.multiple)
	iGetReadDistribution.setQuorum(options.quorum)
	iGetReadDistribution.setImageSize(options.width, options.height)
	iGetReadDistribution.setBothStrands(options.strands)
	iGetReadDistribution.setArial(options.arial)
	iGetReadDistribution.run()