diff SMART/Java/Python/GetDistribution.py @ 46:169d364ddd91

Uploaded
author m-zytnicki
date Mon, 30 Sep 2013 03:19:26 -0400
parents 44d5973c188c
children
line wrap: on
line diff
--- a/SMART/Java/Python/GetDistribution.py	Wed Sep 18 08:51:22 2013 -0400
+++ b/SMART/Java/Python/GetDistribution.py	Mon Sep 30 03:19:26 2013 -0400
@@ -45,28 +45,33 @@
 class GetDistribution(object):
 
 	def __init__(self, verbosity):
-		self.verbosity     = verbosity
-		self.sizes         = None
-		self.twoStrands    = False
-		self.start         = 1
-		self.names         = ["nbElements"]
-		self.average       = False
-		self.nbValues      = {}
-		self.height        = 300
-		self.width         = 600
-		self.colors        = None
-		self.gffFileName   = None
-		self.csvFileName   = None
-		self.yMin          = None
-		self.yMax          = None
-		self.chromosome    = None
-		self.merge         = False
-		self.nbTranscripts = None
+		self.verbosity        = verbosity
+		self.sizes            = None
+		self.nbBins           = None  # filled in by setNbBins()
+		self.sliceSize        = None  # filled in by setBinSize(); run() computes it when it is still None
+		self.twoStrands       = False
+		self.start            = 1
+		self.names            = ["nbElements"]
+		self.average          = False
+		self.nbValues         = {}
+		self.height           = 300
+		self.width            = 600
+		self.dots             = False  # when true, _getPlotter() calls plot.setPoints(True)
+		self.colors           = None
+		self.gffFileName      = None
+		self.csvFileName      = None
+		self.yMin             = None
+		self.yMax             = None
+		self.chromosome       = None
+		self.merge            = False
+		self.nbTranscripts    = None
+		self.factors          = None  # name -> multiplier, used by _normalizeFactors()
+		self.thickness        = 1  # same attribute that setThicknessCurve() writes and _getPlotter() reads
+		self.sizePoliceLegend = 1.5  # passed to plot.setSizePoliceLegend() in _getPlotter()
 
-	def setInputFile(self, fileName, format):
-		chooser = ParserChooser(self.verbosity)
-		chooser.findFormat(format)
-		self.parser = chooser.getParser(fileName)
+	def setInputFiles(self, fileNames, format):  # only records the inputs; _estimateSizes() and _populateBins() build one parser per file
+		self.fileNames = fileNames  # sequence of input file names (the command line splits -i on commas)
+		self.format    = format  # format name later given to ParserChooser.findFormat()
 
 	def setReferenceFile(self, fileName):
 		if fileName == None:
@@ -77,7 +82,7 @@
 		self.maxSize     = max(self.sizes.values())
 
 	def setRegion(self, chromosome, start, end):
-		if chromosome == None:
+		if chromosome == None or start == None or end == None:
 			return
 		self.maxSize     = options.end
 		self.sizes       = {chromosome: end}
@@ -90,13 +95,20 @@
 		self.outputFileName = fileName
 
 	def setNbBins(self, nbBins):
-		self.nbBins = nbBins
+		if nbBins != None:  # None leaves self.nbBins untouched
+			self.nbBins = int(nbBins)  # stored as an int whatever type the caller passed
+
+	def setBinSize(self, binSize):
+		if binSize != None:  # None leaves self.sliceSize untouched
+			self.sliceSize = int(binSize)  # bin width; _populateBins() rounds start positions with it
 
 	def set2Strands(self, twoStrands):
 		self.twoStrands = twoStrands
 
 	def setNames(self, names):
 		self.names = names
+		if len(self.names) == 1 and len(self.fileNames) > 1:  # reads self.fileNames, so it must already be set (setInputFiles() assigns it)
+			self.names = ["file %d" % (i+1) for i in range(len(self.fileNames))]  # a single name with several inputs is replaced by one generated label per file
 
 	def setAverage(self, average):
 		self.average = average
@@ -104,10 +116,16 @@
 	def setNormalization(self, normalization):
 		self.normalization = normalization
 	
+	def setNormalizationFactors(self, factors):  # builds self.factors from the current self.names, so names must be final before this call
+		self.factors = dict([name, 1.0] for name in self.names) if factors == None else dict(zip(self.names, factors))  # None -> 1.0 for every name; otherwise names and factors are paired by position (zip stops at the shorter one)
+	
 	def setImageSize(self, height, width):
 		self.height = height
 		self.width  = width
 
+	def setDots(self, dots):
+		self.dots = dots  # when true, _getPlotter() calls plot.setPoints(True)
+
 	def setYLimits(self, yMin, yMax):
 		self.yMin = yMin
 		self.yMax = yMax
@@ -124,15 +142,29 @@
 	def mergePlots(self, merge):
 		self.merge = merge
 
+	def setThicknessCurve(self, thickness) :
+		self.thickness = thickness  # read by _getPlotter() as the argument of plot.setLineWidth()
+
+	def setSizePoliceLegend(self, sizePoliceLegend):
+		self.sizePoliceLegend = sizePoliceLegend  # forwarded to plot.setSizePoliceLegend() in _getPlotter()
+
 	def _estimateSizes(self):
-		progress = UnlimitedProgress(10000, "Reading input for chromosome size estimate", self.verbosity)
-		self.sizes = {}
-		for self.nbTranscripts, transcript in enumerate(self.parser.getIterator()):
-			chromosome = transcript.getChromosome()
-			start      = transcript.getStart()
-			self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0))
-			progress.inc()
-		progress.done()
+		self.sizes         = {}  # chromosome -> largest start coordinate seen in any input file
+		self.nbTranscripts = {}  # input file name -> number of records read from that file
+		for fileName in self.fileNames:
+			progress = UnlimitedProgress(10000, "Reading %s for chromosome size estimate" % (fileName), self.verbosity)
+			parserChooser = ParserChooser(self.verbosity)
+			parserChooser.findFormat(self.format)
+			parser = parserChooser.getParser(fileName)
+			for self.nbTranscripts[fileName], transcript in enumerate(parser.getIterator(), 1):  # counting from 1 stores a record count rather than the last index
+				if transcript.__class__.__name__ == "Mapping":
+					transcript = transcript.getTranscript()
+				chromosome = transcript.getChromosome()
+				start      = transcript.getStart()
+				self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0))
+				progress.inc()
+			progress.done()
+			self.nbTranscripts.setdefault(fileName, 0)  # an input without records never ran the loop body, so give it an explicit 0
 
 	def _computeSliceSize(self):
 		if self.nbBins == 0:
@@ -156,37 +188,50 @@
 						self.bins[chromosome][name][strand] = dict([(i * self.sliceSize + 1, 0.0) for i in range(self.start / self.sliceSize, self.sizes[chromosome] / self.sliceSize + 1)])
 
 	def _populateBins(self):
-		if self.nbTranscripts == None:
-			progress = UnlimitedProgress(10000, "Counting data", self.verbosity)
-		else:
-			progress = Progress(self.nbTranscripts, "Counting data", self.verbosity)
-		for transcript in self.parser.getIterator():
-			if transcript.__class__.__name__ == "Mapping":
-				transcript = transcript.getTranscript()
-			progress.inc()
-			chromosome = transcript.getChromosome()
-			start      = transcript.getStart()
-			if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end):
-				continue
-			strand = transcript.getDirection() if self.twoStrands else 0
-			if self.nbBins != 0:
-				bin = (start / self.sliceSize) * self.sliceSize + 1
+		for fileIndex, fileName in enumerate(self.fileNames):  # one pass per input file; fileIndex selects the matching entry of self.names
+			if self.nbTranscripts == None:  # no per-file counts are available when _estimateSizes() did not run
+				progress = UnlimitedProgress(10000, "Counting data", self.verbosity)
 			else:
-				bin = start
-			for name in self.names:
-				value = float(transcript.tags.get(name, 1))
-				self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value
-				self.nbValues[name] = self.nbValues.get(name, 0) + value
-		progress.done()
+				progress = Progress(self.nbTranscripts[fileName], "Counting data", self.verbosity)
+			parserChooser = ParserChooser(self.verbosity)
+			parserChooser.findFormat(self.format)
+			parser = parserChooser.getParser(fileName)
+			for transcript in parser.getIterator():
+				if transcript.__class__.__name__ == "Mapping":
+					transcript = transcript.getTranscript()
+				progress.inc()
+				chromosome = transcript.getChromosome()
+				start      = transcript.getStart()
+				if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end):  # region filter on the start position
+					continue
+				strand = transcript.getDirection() if self.twoStrands else 0
+				if self.nbBins != 0:
+					bin = (start / self.sliceSize) * self.sliceSize + 1
+				else:
+					bin = start
+				if len(self.fileNames) > 1:  # several inputs: one curve per file, weighted by the nbElements tag
+					nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1  # float() as in the single-file branch; the tag value's type is not visible here
+					name       = self.names[fileIndex]
+					self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + nbElements
+					self.nbValues[name] = self.nbValues.get(name, 0) + nbElements
+				else:  # single input: one curve per requested tag name
+					for name in self.names:
+						value = float(transcript.tags.get(name, 1))
+						self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value
+						self.nbValues[name] = self.nbValues.get(name, 0) + value
+			progress.done()
 
-	def _normalize(self):
-		average = float(sum(self.nbValues)) / len(self.nbValues.keys())
-		factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues)
+	def _normalizeFactors(self):  # multiplies every bin in place by self.factors[name] of the curve it belongs to
 		for chromosome in self.bins:
 			for name in self.bins[chromosome]:
 				for strand in self.bins[chromosome][name]:
 					for bin in self.bins[chromosome][name][strand]:
-						self.bins[chromosome][name][strand][bin] *= factors[name]
+						self.bins[chromosome][name][strand][bin] *= self.factors[name]  # every name present in self.bins needs an entry in self.factors
+
+	def _normalize(self):  # derives self.factors from the totals in self.nbValues, then applies them
+		average      = float(sum(self.nbValues.values())) / len(self.nbValues.keys())  # mean of the per-name totals accumulated by _populateBins()
+		self.factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues)  # factor = mean / total, so each name's scaled total equals the mean
+		self._normalizeFactors()  # the bins are rescaled right here
 
 	def _computeAverage(self):
 		for chromosome in self.bins:
@@ -198,6 +243,10 @@
 	def _getPlotter(self, chromosome):
 		plot = RPlotter("%s_%s.png" % (os.path.splitext(self.outputFileName)[0], chromosome), self.verbosity)
 		plot.setImageSize(self.width, self.height)
+		plot.setLineWidth(self.thickness)
+		plot.setSizePoliceLegend(self.sizePoliceLegend)
+		if self.dots:
+			plot.setPoints(True)
 		if self.sizes[chromosome] <= 1000:
 			unit  = "nt."
 			ratio = 1.0
@@ -212,10 +261,12 @@
 		if self.yMax != None:
 			plot.setMaximumY(self.yMax)
 		plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit))
-		plot.setLegend(True)
+		if len(self.names) > 1:
+			plot.setLegend(True, True)
 		for i, name in enumerate(self.bins[chromosome]):
 			for strand in self.bins[chromosome][name]:
-				fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand])
+				#fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand])
+				fullName = name.replace("_", " ")[:6]
 				factor = 1 if strand == 0 else strand
 				correctedLine = dict([(key / ratio, value * factor) for key, value in self.bins[chromosome][name][strand].iteritems()])
 				plot.addLine(correctedLine, fullName, self.colors[i] if self.colors else None)
@@ -299,11 +350,14 @@
 	def run(self):
 		if self.sizes == None:
 			self._estimateSizes()
-		self._computeSliceSize()
+		if self.sliceSize == None:  # a bin size given through setBinSize() is kept as is
+			self._computeSliceSize()
 		self._initBins()
 		self._populateBins()
 		if self.normalization:
 			self._normalize()
+		elif self.factors != None:  # _normalize() applies its own factors, so only user-supplied factors are applied here
+			self._normalizeFactors()
 		if self.average:
 			self._computeAverage()
 		self._plot()
@@ -318,34 +372,40 @@
 	description = "Get Distribution v1.0.2: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]"
 
 	parser = OptionParser(description = description)
-	parser.add_option("-i", "--input",       dest="inputFileName",     action="store",                            type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
-	parser.add_option("-f", "--format",      dest="format",            action="store",                            type="string", help="format of the input file [compulsory] [format: transcript file format]")
-	parser.add_option("-o", "--output",      dest="outputFileName",    action="store",                            type="string", help="output file [compulsory] [format: output file in GFF3 format]")
-	parser.add_option("-r", "--reference",   dest="referenceFileName", action="store",      default=None,         type="string", help="file containing the genome [format: file in FASTA format]")
-	parser.add_option("-b", "--nbBins",      dest="nbBins",            action="store",      default=1000,         type="int",    help="number of bins [default: 1000] [format: int]")
-	parser.add_option("-2", "--bothStrands", dest="bothStrands",       action="store_true", default=False,                       help="plot one curve per strand [format: bool] [default: false]")
-	parser.add_option("-c", "--chromosome",  dest="chromosome",        action="store",      default=None,         type="string", help="plot only a chromosome [format: string]")
-	parser.add_option("-s", "--start",       dest="start",             action="store",      default=None,         type="int",    help="start from a given region [format: int]")
-	parser.add_option("-e", "--end",         dest="end",               action="store",      default=None,         type="int",    help="end from a given region [format: int]")
-	parser.add_option("-y", "--yMin",        dest="yMin",              action="store",      default=None,         type="int",    help="minimum value on the y-axis to plot [format: int]")
-	parser.add_option("-Y", "--yMax",        dest="yMax",              action="store",      default=None,         type="int",    help="maximum value on the y-axis to plot [format: int]")
-	parser.add_option("-x", "--csv",         dest="csv",               action="store",      default=None,                        help="write a .csv file [format: output file in CSV format] [default: None]")
-	parser.add_option("-g", "--gff",         dest="gff",               action="store",      default=None,                        help="also write GFF3 file [format: output file in GFF format] [default: None]")
-	parser.add_option("-H", "--height",      dest="height",            action="store",      default=300,          type="int",    help="height of the graphics [format: int] [default: 300]")
-	parser.add_option("-W", "--width",       dest="width",             action="store",      default=600,          type="int",    help="width of the graphics [format: int] [default: 1000]")
-	parser.add_option("-a", "--average",     dest="average",           action="store_true", default=False,                       help="plot average (instead of sum) [default: false] [format: boolean]")
-	parser.add_option("-n", "--names",       dest="names",             action="store",      default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]")
-	parser.add_option("-l", "--color",       dest="colors",            action="store",      default=None,         type="string", help="color of the lines (separated by commas and no space) [format: string]")
-	parser.add_option("-z", "--normalize",   dest="normalize",         action="store_true", default=False,                       help="normalize data (when panels are different) [format: bool] [default: false]")
-	parser.add_option("-m", "--merge",       dest="mergePlots",        action="store_true", default=False,                       help="merge all plots in one figure [format: bool] [default: false]")
-	parser.add_option("-v", "--verbosity",   dest="verbosity",         action="store",      default=1,            type="int",    help="trace level [default: 1] [format: int]")
+	parser.add_option("-i", "--input",        dest="inputFileNames",      action="store",                            type="string", help="input files separated by commas [compulsory] [format: string]")
+	parser.add_option("-f", "--format",       dest="format",              action="store",                            type="string", help="format of the input file [compulsory] [format: transcript file format]")
+	parser.add_option("-o", "--output",       dest="outputFileName",      action="store",                            type="string", help="output file [compulsory] [format: output file in GFF3 format]")
+	parser.add_option("-r", "--reference",    dest="referenceFileName",   action="store",      default=None,         type="string", help="file containing the genome [format: file in FASTA format]")
+	parser.add_option("-b", "--nbBins",       dest="nbBins",              action="store",      default=1000,         type="int",    help="number of bins [default: 1000] [format: int]")
+	parser.add_option("-B", "--binSize",      dest="binSize",             action="store",      default=None,         type="int",    help="bin size [default: None] [format: int]")
+	parser.add_option("-2", "--bothStrands",  dest="bothStrands",         action="store_true", default=False,                       help="plot one curve per strand [format: bool] [default: false]")
+	parser.add_option("-c", "--chromosome",   dest="chromosome",          action="store",      default=None,         type="string", help="plot only a chromosome [format: string]")
+	parser.add_option("-s", "--start",        dest="start",               action="store",      default=None,         type="int",    help="start from a given region [format: int]")
+	parser.add_option("-e", "--end",          dest="end",                 action="store",      default=None,         type="int",    help="end from a given region [format: int]")
+	parser.add_option("-y", "--yMin",         dest="yMin",                action="store",      default=None,         type="int",    help="minimum value on the y-axis to plot [format: int]")
+	parser.add_option("-Y", "--yMax",         dest="yMax",                action="store",      default=None,         type="int",    help="maximum value on the y-axis to plot [format: int]")
+	parser.add_option("-x", "--csv",          dest="csv",                 action="store",      default=None,                        help="write a .csv file [format: output file in CSV format] [default: None]")
+	parser.add_option("-g", "--gff",          dest="gff",                 action="store",      default=None,                        help="also write GFF3 file [format: output file in GFF format] [default: None]")
+	parser.add_option("-H", "--height",       dest="height",              action="store",      default=500,          type="int",    help="height of the graphics [format: int] [default: 500]")
+	parser.add_option("-W", "--width",        dest="width",               action="store",      default=800,          type="int",    help="width of the graphics [format: int] [default: 800]")
+	parser.add_option("-t", "--thickness", 	  dest="lineThickness", 	  action="store",      default=1,            type="int",    help="thickness of the lines [format : int] [default : 1]")
+	parser.add_option("-d", "--policeLegend", dest="sizePoliceLegend",    action="store",      default=1.5,          type="float",  help="size of the police of the legend  [format : float] [default : 1.5]")
+	parser.add_option("-D", "--dots",         dest="dots",                action="store_true", default=False,                       help="plot dots instead of lines  [format : bool] [default : false]")
+	parser.add_option("-a", "--average",      dest="average",             action="store_true", default=False,                       help="plot average (instead of sum) [default: false] [format: boolean]")
+	parser.add_option("-n", "--names",        dest="names",               action="store",      default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]")
+	parser.add_option("-l", "--color",        dest="colors",              action="store",      default=None,         type="string", help="color of the lines (separated by commas and no space) [format: string]")
+	parser.add_option("-z", "--normalize",    dest="normalize",           action="store_true", default=False,                       help="normalize data (when panels are different) [format: bool] [default: false]")
+	parser.add_option("-Z", "--normalizeFac", dest="normalizeFactors",    action="store",      default=None,                        help="normalize data with given factors (when panels are different) [format: string]")
+	parser.add_option("-m", "--merge",        dest="mergePlots",          action="store_true", default=False,                       help="merge all plots in one figure [format: bool] [default: false]")
+	parser.add_option("-v", "--verbosity",    dest="verbosity",           action="store",      default=1,            type="int",    help="trace level [default: 1] [format: int]")
 	(options, args) = parser.parse_args()
 
 	gt = GetDistribution(options.verbosity)
-	gt.setInputFile(options.inputFileName, options.format)
+	gt.setInputFiles(options.inputFileNames.split(","), options.format)
 	gt.setOutputFile(options.outputFileName)
 	gt.setReferenceFile(options.referenceFileName)
-	gt.setNbBins(int(options.nbBins))
+	gt.setNbBins(options.nbBins)
+	gt.setBinSize(options.binSize)
 	gt.set2Strands(options.bothStrands)
 	gt.setRegion(options.chromosome, options.start, options.end)
 	gt.setNormalization(options.normalize)
@@ -355,8 +415,12 @@
 	gt.writeGff(options.gff)
 	gt.setImageSize(options.height, options.width)
 	gt.setNames(options.names.split(","))
+	gt.setThicknessCurve(options.lineThickness)
+	gt.setSizePoliceLegend(options.sizePoliceLegend)
 	gt.setColors(None if options.colors == None else options.colors.split(","))
+	gt.setDots(options.dots)
 	gt.setNormalization(options.normalize)
+	gt.setNormalizationFactors(None if options.normalizeFactors == None else [float(factor) for factor in options.normalizeFactors.split(",")])
 	gt.mergePlots(options.mergePlots)
 	gt.run()