diff SMART/Java/Python/GetReadDistribution.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- a/SMART/Java/Python/GetReadDistribution.py	Mon Apr 22 11:11:10 2013 -0400
+++ b/SMART/Java/Python/GetReadDistribution.py	Mon Apr 29 03:20:15 2013 -0400
@@ -39,7 +39,7 @@
 
 LOG_DEPTH      = "smart"
 DEFAULT_REGION = "_all_"
-MULTIPLE_STR   = {1: "", 1000: " (in kpb)", 1000000: " (in Gbp)"}
+MULTIPLE_STR   = {1: "", 1000: " (in kbp)", 1000000: " (in Gbp)"}
 
 class GetReadDistribution(object):
 
@@ -56,8 +56,10 @@
 		self.tmpDatName   = None
 		self.tmpRName     = None
 		self.quorum       = 1
+		self.strands      = False
 		self.width        = 800
 		self.height       = 300
+		self.arial        = False
 
 	def setNames(self, names):
 		self.names = names
@@ -82,7 +84,10 @@
 		self.colors = colors
 
 	def setFactors(self, factors):
-		self.factors = dict(zip(self.names, factors))
+		if factors == None:
+			self.factors = dict([name, 1.0] for name in self.names)
+		else:
+			self.factors = dict(zip(self.names, factors))
 
 	def setMultiple(self, boolean):
 		self.multiple = boolean
@@ -100,6 +105,12 @@
 		if fileName != None:
 			self._loadRegions(fileName)
 
+	def setBothStrands(self, strands):
+		self.strands = strands
+
+	def setArial(self, arial):
+		self.arial = arial
+
 	def _checkOptions(self):
 		if not self.parsers:
 			self.logAndRaise("ERROR: Missing input file names")
@@ -156,14 +167,17 @@
 				chromosome  = transcript.getChromosome()
 				nbElements  = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
 				nbElements *= self.factors.get(name, 1)
+				strand      = transcript.getDirection() if self.strands else 1
 				if chromosome not in self.distribution[region][name]:
 					self.distribution[region][name][chromosome] = {}
+				if strand not in self.distribution[region][name][chromosome]:
+					self.distribution[region][name][chromosome][strand] = {}
 				previousBin = None
 				for exon in transcript.getExons():
 					for pos in range(exon.getStart(), exon.getEnd()+1):
 						bin = pos / self.binSize
 						if bin != previousBin:
-							self.distribution[region][name][chromosome][bin] = self.distribution[region][name][chromosome].get(bin, 0) + nbElements
+							self.distribution[region][name][chromosome][strand][bin] = self.distribution[region][name][chromosome][strand].get(bin, 0) + nbElements
 							previousBin = bin
 			progress.inc()
 		progress.done()
@@ -171,22 +185,23 @@
 	def _checkQuorum(self, region):
 		if self.quorum == None:
 			return True
-		return max([max([max(self.distribution[region][name][chromosome].values()) for chromosome in self.distribution[region][name]]) for name in self.distribution[region]]) >= self.quorum
+		return max([max([max([max(self.distribution[region][name][chromosome][strand].values()) for strand in self.distribution[region][name][chromosome]]) for chromosome in self.distribution[region][name]]) for name in self.distribution[region]])
 
 	def _writeData(self, region):
 		self.tmpDatName = "tmpFile%d.dat" % (self.number)
 		handle          = open(self.tmpDatName, "w")
-		handle.write("Chr\tPos\tCount\tSample\n")
+		handle.write("Chr\tPos\tStrand\tCount\tSample\n")
 		for name in self.distribution[region]:
 			for chromosome in sorted(self.distribution[region][name].keys()):
-				for pos in sorted(self.distribution[region][name][chromosome].keys()):
-					handle.write("%s\t%d\t%d\t\"%s\"\n" % (chromosome, pos * self.binSize, self.distribution[region][name][chromosome].get(pos, 0), name))
+				for strand in sorted(self.distribution[region][name][chromosome].keys()):
+					for pos in sorted(self.distribution[region][name][chromosome][strand].keys()):
+						handle.write("%s\t%d\t%d\t%d\t\"%s\"\n" % (chromosome, pos * self.binSize, strand, self.distribution[region][name][chromosome][strand].get(pos, 0) * strand, name))
 		handle.close()
 
 	def _findMultiple(self, region):
 		if not self.multiple:
 			return 1
-		maxPosition = max([self.distribution[region][name][chromosome].keys() for name in self.distribution[region] for chromosome in self.distribution[region][name]])
+		maxPosition = max([max([max([max(self.distribution[region][name][chromosome][strand].keys()) for strand in self.distribution[region][name][chromosome]]) for chromosome in self.distribution[region][name]]) for name in self.distribution[region]]) * self.binSize
 		if maxPosition > 2000000:
 			return 1000000
 		elif maxPosition > 2000:
@@ -197,23 +212,24 @@
 		self.tmpRName = "tmpFile%d.R" % (self.number)
 		fileName      = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
 		colors        = "scale_fill_brewer(palette=\"Set1\") + scale_color_brewer(palette=\"Set1\")" if self.colors == None else "scale_fill_manual(values = c(%s)) + scale_color_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors]), ", ".join(["\"%s\"" % (color) for color in self.colors]))
-		title         = "" if region == DEFAULT_REGION else " of %s" % (region)
+		title         = "" if region == DEFAULT_REGION else " + labs(title = \"Distribution of %s\") " % (region)
 		facet         = "Sample ~ Chr" if region == DEFAULT_REGION else "Sample ~ ."
 		handle        = open(self.tmpRName, "w")
 		multiple      = self._findMultiple(region)
+		arial         = ", text = element_text(family=\"Arial\", size=20)" if self.arial else ""
+		if self.arial:
+			handle.write("library(extrafont)\nloadfonts()\n")
 		handle.write("library(ggplot2)\n")
 		handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
 		handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
 		handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
-		handle.write("ggplot(data, aes(x = Pos/%d, y = Count, fill = Sample, color = Sample)) + opts(title = \"Distribution%s\") + geom_bar(stat = \"identity\") + facet_grid(%s, space=\"free\") + xlab(\"%s%s\") + ylab(\"%s\") + %s + opts(legend.position = \"none\", panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(), panel.background = theme_blank())\n" % (multiple, title, facet, self.xLab, MULTIPLE_STR[multiple], self.yLab, colors))
+		handle.write("ggplot(data, aes(x = Pos/%d, y = Count, fill = Sample, color = Sample)) %s + geom_bar(stat = \"identity\") + facet_grid(%s, space=\"free\") + xlab(\"%s%s\") + ylab(\"%s\") + %s + theme(legend.position = \"none\", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()%s)\n" % (multiple, title, facet, self.xLab, MULTIPLE_STR[multiple], self.yLab, colors, arial))
 		handle.write("dev.off()\n")
 
 	def _runR(self):
-		rCommand = "R"
-		if "SMARTRPATH" in os.environ:
-			rCommand = os.environ["SMARTRPATH"]
-		command = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
-		status = subprocess.call(command, shell=True)
+		rCommand = os.environ["SMARTRPATH"] if "SMARTRPATH" in os.environ else "R"
+		command  = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
+		status   = subprocess.call(command, shell=True)
 		if status != 0:
 			raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))
 
@@ -261,10 +277,12 @@
 	parser.add_option("-c", "--colors",    dest="colors",          action="store",      default=None,      type="string", help="colors of the bars, separated by commas  [format: string]")
 	parser.add_option("-a", "--factors",   dest="factors",         action="store",      default=None,      type="string", help="normalization factors, separated by commas  [format: string]")
 	parser.add_option("-r", "--regions",   dest="regionsFileName", action="store",      default=None,      type="string", help="regions to plot [format: transcript file in GFF format]")
-	parser.add_option("-m", "--multiple",  dest="multiple",        action="store_true", default=False,                    help="print position using multiples (k, G) [format: boolean] [default: False]")
+	parser.add_option("-2", "--strands",   dest="strands",         action="store_true", default=False,                    help="plot negative strands on the negative x-axis [format: boolean] [default: False]")
+	parser.add_option("-m", "--multiple",  dest="multiple",        action="store_true", default=False,                    help="use human readable genomic positions (k, G) [format: boolean] [default: False]")
 	parser.add_option("-q", "--quorum",    dest="quorum",          action="store",      default=1,         type="int",    help="minimum number of intervals to plot a region [format: int] [default: 1]")
 	parser.add_option("-z", "--width",     dest="width",           action="store",      default=800,       type="int",    help="width of the image [format: int] [default: 800]")
 	parser.add_option("-Z", "--height",    dest="height",          action="store",      default=300,       type="int",    help="height of the image [format: int] [default: 300]")
+	parser.add_option("-A", "--arial",     dest="arial",           action="store_true", default=False,                    help="use Arial font [format: boolean] [default: false]")
 	parser.add_option("-v", "--verbosity", dest="verbosity",       action="store",      default=1,         type="int",    help="trace level [format: int]")
 	options = parser.parse_args()[0]
 	iGetReadDistribution = GetReadDistribution(options.verbosity)
@@ -279,5 +297,7 @@
 	iGetReadDistribution.setMultiple(options.multiple)
 	iGetReadDistribution.setQuorum(options.quorum)
 	iGetReadDistribution.setImageSize(options.width, options.height)
+	iGetReadDistribution.setBothStrands(options.strands)
+	iGetReadDistribution.setArial(options.arial)
 	iGetReadDistribution.run()