diff SMART/Java/Python/getSizes.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 44d5973c188c
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/getSizes.py	Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,218 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import os, sys
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import FastaParser
+from commons.core.parsing.FastqParser import FastqParser
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from commons.core.parsing.GffParser import GffParser
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.RPlotter import RPlotter
+from SMART.Java.Python.misc import Utils
+
+from commons.core.LoggerFactory import LoggerFactory
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+
+LOG_DEPTH = "smart"
+
+class GetSizes(object):
+	
+	def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0):
+		self.inFileName = inFileName
+		self.inFormat= inFormat
+		self.outFileName = outFileName
+		self.query = query
+		self.xMax = xMax
+		self.xMin = xMin
+		self.xLab = "Size"
+		self.yLab = "# reads"
+		self.barplot = False
+		self._verbosity = verbosity
+		self.parser = None
+		self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
+		
+	def setAttributesFromCmdLine(self):
+		description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n"
+		epilog = ""
+		parser = RepetOptionParser(description = description, epilog = epilog)
+		parser.add_option("-i", "--input",	 dest="inputFileName", action="store",	    default=None,	   type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
+		parser.add_option("-f", "--format",	dest="format",		   action="store",	    default=None,	   type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
+		parser.add_option("-q", "--query",	 dest="query",		   action="store",	    default=None,	   type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]")	 
+		parser.add_option("-o", "--output",	dest="outputFileName", action="store",	    default=None,	   type="string", help="output file [format: output file in PNG format]")
+		parser.add_option("-x", "--xMax",	  dest="xMax",		   action="store",	    default=None,	   type="int",	  help="maximum value on the x-axis to plot [format: int]")
+		parser.add_option("-X", "--xMin",	  dest="xMin",		   action="store",	    default=None,	   type="int",	  help="minimum value on the x-axis to plot [format: int]")
+		parser.add_option("-v", "--verbosity", dest="verbosity",   action="store",	    default=1,		   type="int",	  help="trace level [format: int]")
+		parser.add_option("-a", "--xLabel",	dest="xLab",           action="store",	    default="Size",	   type="string", help="x absis label name [format: string] [default: Size]")
+		parser.add_option("-b", "--yLabel",	dest="yLab",           action="store",	    default="# reads", type="string", help="y absis label name [format: string] [default: Reads]")
+		parser.add_option("-B", "--barplot",   dest="barplot",     action="store_true", default=False,					  help="use barplot representation [format: bool] [default: false]")  
+		options = parser.parse_args()[0]
+		self._setAttributesFromOptions(options)
+		
+	def _setAttributesFromOptions(self, options):
+		self.setInFileName(options.inputFileName)
+		self.setInFormat(options.format)
+		self.setQuery(options.query)
+		self.setOutFileName(options.outputFileName)
+		self.setXMax(options.xMax)
+		self.setXMin(options.xMin)
+		self.setxLab(options.xLab)
+		self.setyLab(options.yLab)
+		self.setBarplot(options.barplot)
+		self.setVerbosity(options.verbosity)
+		
+	def setInFileName(self, inputFileName):
+		self.inFileName = inputFileName
+		
+	def setInFormat(self, inFormat):
+		self.inFormat = inFormat
+	
+	def setQuery(self, query):
+		self.query = query
+		
+	def setOutFileName(self, outFileName):
+		self.outFileName = outFileName
+	
+	def setXMax(self, xMax):
+		self.xMax = xMax
+		
+	def setXMin(self, xMin):
+		self.xMin = xMin
+	
+	def setxLab(self, xLab):
+		self.xLab = xLab
+		
+	def setyLab(self, yLab):
+		self.yLab = yLab
+		
+	def setBarplot(self, barplot):
+		self.barplot = barplot
+		
+	def setVerbosity(self, verbosity):
+		self._verbosity = verbosity
+		
+	def _checkOptions(self):
+		if self.inFileName == None:
+			self._logAndRaise("ERROR: Missing input file name")
+		if self.inFormat == "fasta":
+			self.parser = FastaParser(self.inFileName, self._verbosity)
+		elif self.inFormat == "fastq":
+			self.parser = FastqParser(self.inFileName, self._verbosity)
+		else:
+			self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity)
+			
+	def _logAndRaise(self, errorMsg):
+		self._log.error(errorMsg)
+		raise Exception(errorMsg)
+
+	def run(self):
+		LoggerFactory.setLevel(self._log, self._verbosity)
+		self._checkOptions()
+		self._log.info("START getsizes")
+		self._log.debug("Input file name: %s" % self.inFileName)
+
+		nbItems = self.parser.getNbItems()
+		self._log.info( "%i items found" % (nbItems))
+		
+		# treat items
+		progress   = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity)
+		sizes      = {}
+		minimum	   = 1000000000000
+		maximum	   = 0
+		sum		   = 0
+		number     = 0
+		nbSubItems = 0
+		for item in self.parser.getIterator():
+			items = []
+			if self.query == "exon":
+				items = item.getExons()
+			elif self.query == "exon1":
+				if len(item.getExons()) > 1:
+					item.sortExons()
+					items = [item.getExons()[0]]
+			elif self.query == "intron":
+				items = item.getIntrons()
+			else:
+				items = [item, ]
+	
+			for thisItem in items:
+				try:
+					nbElements = int(float(thisItem.getTagValue("nbElements")))
+					if nbElements == None:
+						nbElements = 1
+				except:
+					nbElements = 1
+				size	= thisItem.getSize()
+				minimum = min(minimum, size)
+				maximum = max(maximum, size)
+				
+				if size not in sizes:
+					sizes[size] = nbElements
+				else:
+					sizes[size] += nbElements
+				sum		+= size
+				nbSubItems += nbElements
+			number += 1
+			progress.inc()
+		progress.done()
+
+		if self.outFileName != None:
+			plotter = RPlotter(self.outFileName, self._verbosity)
+			plotter.setFill(0)
+			plotter.setMinimumX(self.xMin)
+			plotter.setMaximumX(self.xMax)
+			plotter.setXLabel(self.xLab)
+			plotter.setYLabel(self.yLab)
+			plotter.setBarplot(self.barplot)
+			plotter.addLine(sizes)
+			plotter.plot()
+			
+		if nbSubItems == 0:
+			self._logAndRaise("No item found")
+			
+		self.items = number	  
+		self.subItems = nbSubItems
+		self.nucleotides = sum
+		self.minAvgMedMax = Utils.getMinAvgMedMax(sizes)
+				  
+		print "%d items" % (number)
+		print "%d sub-items" % (nbSubItems)
+		print "%d nucleotides" % (sum)
+		print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes)
+
+		self._log.info("END getsizes")
+
+
+if __name__ == "__main__":
+	iGetSizes = GetSizes()
+	iGetSizes.setAttributesFromCmdLine()
+	iGetSizes.run()
+	
+#TODO: add two more options!!!!!!