diff SMART/Java/Python/getSizes.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- a/SMART/Java/Python/getSizes.py	Mon Apr 22 11:11:10 2013 -0400
+++ b/SMART/Java/Python/getSizes.py	Mon Apr 29 03:20:15 2013 -0400
@@ -44,195 +44,175 @@
 LOG_DEPTH = "smart"
 
 class GetSizes(object):
-    
-    def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, csv=False, verbosity = 0):
-        self.inFileName = inFileName
-        self.inFormat= inFormat
-        self.outFileName = outFileName
-        self.query = query
-        self.xMax = xMax
-        self.xMin = xMin
-        self.xLab = "Size"
-        self.yLab = "# reads"
-        self.barplot = False
-        self.csv = csv
-        self._verbosity = verbosity
-        self.parser = None
-        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
-        
-    def setAttributesFromCmdLine(self):
-        description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n"
-        epilog = ""
-        parser = RepetOptionParser(description = description, epilog = epilog)
-        parser.add_option("-i", "--input",     dest="inputFileName",  action="store",      default=None,      type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
-        parser.add_option("-f", "--format",    dest="format",         action="store",      default=None,      type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
-        parser.add_option("-q", "--query",     dest="query",          action="store",      default=None,      type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]")     
-        parser.add_option("-o", "--output",    dest="outputFileName", action="store",      default=None,      type="string", help="output file [format: output file in PNG format]")
-        parser.add_option("-x", "--xMax",      dest="xMax",           action="store",      default=None,      type="int",    help="maximum value on the x-axis to plot [format: int]")
-        parser.add_option("-X", "--xMin",      dest="xMin",           action="store",      default=None,      type="int",    help="minimum value on the x-axis to plot [format: int]")
-        parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,         type="int",    help="trace level [format: int]")
-        parser.add_option("-c", "--csv",       dest="csv",            action="store",                         type="string", help="write a .csv file [format: bool] [default: false]")
-        parser.add_option("-a", "--xLabel",    dest="xLab",           action="store",      default="Size",    type="string", help="x absis label name [format: string] [default: Size]")
-        parser.add_option("-b", "--yLabel",    dest="yLab",           action="store",      default="# reads", type="string", help="y absis label name [format: string] [default: Reads]")
-        parser.add_option("-B", "--barplot",   dest="barplot",        action="store_true", default=False,                    help="use barplot representation [format: bool] [default: false]")  
-        options = parser.parse_args()[0]
-        self._setAttributesFromOptions(options)
-        
-    def _setAttributesFromOptions(self, options):
-        self.setInFileName(options.inputFileName)
-        self.setInFormat(options.format)
-        self.setQuery(options.query)
-        self.setOutFileName(options.outputFileName)
-        self.setXMax(options.xMax)
-        self.setXMin(options.xMin)
-        self.setxLab(options.xLab)
-        self.setyLab(options.yLab)
-        self.setBarplot(options.barplot)
-        self.setVerbosity(options.verbosity)
-        
-    def setInFileName(self, inputFileName):
-        self.inFileName = inputFileName
-        
-    def setInFormat(self, inFormat):
-        self.inFormat = inFormat
-    
-    def setQuery(self, query):
-        self.query = query
-        
-    def setOutFileName(self, outFileName):
-        self.outFileName = outFileName
-    
-    def setXMax(self, xMax):
-        self.xMax = xMax
-        
-    def setXMin(self, xMin):
-        self.xMin = xMin
-    
-    def setxLab(self, xLab):
-        self.xLab = xLab
-        
-    def setyLab(self, yLab):
-        self.yLab = yLab
-        
-    def setBarplot(self, barplot):
-        self.barplot = barplot
-        
-    def setCsv(self, csv):
-        self.csv = csv
-        
-    def setVerbosity(self, verbosity):
-        self._verbosity = verbosity
-        
-    def _checkOptions(self):
-        if self.inFileName == None:
-            self._logAndRaise("ERROR: Missing input file name")
-        if self.inFormat == "fasta":
-            self.parser = FastaParser(self.inFileName, self._verbosity)
-        elif self.inFormat == "fastq":
-            self.parser = FastqParser(self.inFileName, self._verbosity)
-        else:
-            self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity)
-            
-    def _logAndRaise(self, errorMsg):
-        self._log.error(errorMsg)
-        raise Exception(errorMsg)
-                    
-    def run(self):
-        LoggerFactory.setLevel(self._log, self._verbosity)
-        self._checkOptions()
-        self._log.info("START getsizes")
-        self._log.debug("Input file name: %s" % self.inFileName)
+	
+	def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0):
+		self.inFileName = inFileName
+		self.inFormat= inFormat
+		self.outFileName = outFileName
+		self.query = query
+		self.xMax = xMax
+		self.xMin = xMin
+		self.xLab = "Size"
+		self.yLab = "# reads"
+		self.barplot = False
+		self._verbosity = verbosity
+		self.parser = None
+		self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
+		
+	def setAttributesFromCmdLine(self):
+		description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n"
+		epilog = ""
+		parser = RepetOptionParser(description = description, epilog = epilog)
+		parser.add_option("-i", "--input",	 dest="inputFileName", action="store",	    default=None,	   type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
+		parser.add_option("-f", "--format",	dest="format",		   action="store",	    default=None,	   type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
+		parser.add_option("-q", "--query",	 dest="query",		   action="store",	    default=None,	   type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]")	 
+		parser.add_option("-o", "--output",	dest="outputFileName", action="store",	    default=None,	   type="string", help="output file [format: output file in PNG format]")
+		parser.add_option("-x", "--xMax",	  dest="xMax",		   action="store",	    default=None,	   type="int",	  help="maximum value on the x-axis to plot [format: int]")
+		parser.add_option("-X", "--xMin",	  dest="xMin",		   action="store",	    default=None,	   type="int",	  help="minimum value on the x-axis to plot [format: int]")
+		parser.add_option("-v", "--verbosity", dest="verbosity",   action="store",	    default=1,		   type="int",	  help="trace level [format: int]")
+		parser.add_option("-a", "--xLabel",	dest="xLab",           action="store",	    default="Size",	   type="string", help="x absis label name [format: string] [default: Size]")
+		parser.add_option("-b", "--yLabel",	dest="yLab",           action="store",	    default="# reads", type="string", help="y absis label name [format: string] [default: Reads]")
+		parser.add_option("-B", "--barplot",   dest="barplot",     action="store_true", default=False,					  help="use barplot representation [format: bool] [default: false]")  
+		options = parser.parse_args()[0]
+		self._setAttributesFromOptions(options)
+		
+	def _setAttributesFromOptions(self, options):
+		self.setInFileName(options.inputFileName)
+		self.setInFormat(options.format)
+		self.setQuery(options.query)
+		self.setOutFileName(options.outputFileName)
+		self.setXMax(options.xMax)
+		self.setXMin(options.xMin)
+		self.setxLab(options.xLab)
+		self.setyLab(options.yLab)
+		self.setBarplot(options.barplot)
+		self.setVerbosity(options.verbosity)
+		
+	def setInFileName(self, inputFileName):
+		self.inFileName = inputFileName
+		
+	def setInFormat(self, inFormat):
+		self.inFormat = inFormat
+	
+	def setQuery(self, query):
+		self.query = query
+		
+	def setOutFileName(self, outFileName):
+		self.outFileName = outFileName
+	
+	def setXMax(self, xMax):
+		self.xMax = xMax
+		
+	def setXMin(self, xMin):
+		self.xMin = xMin
+	
+	def setxLab(self, xLab):
+		self.xLab = xLab
+		
+	def setyLab(self, yLab):
+		self.yLab = yLab
+		
+	def setBarplot(self, barplot):
+		self.barplot = barplot
+		
+	def setVerbosity(self, verbosity):
+		self._verbosity = verbosity
+		
+	def _checkOptions(self):
+		if self.inFileName == None:
+			self._logAndRaise("ERROR: Missing input file name")
+		if self.inFormat == "fasta":
+			self.parser = FastaParser(self.inFileName, self._verbosity)
+		elif self.inFormat == "fastq":
+			self.parser = FastqParser(self.inFileName, self._verbosity)
+		else:
+			self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity)
+			
+	def _logAndRaise(self, errorMsg):
+		self._log.error(errorMsg)
+		raise Exception(errorMsg)
 
-        nbItems = self.parser.getNbItems()
-        self._log.info( "%i items found" % (nbItems))
-        
-        # treat items
-        progress   = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity)
-        sizes      = {}
-        names      = {}
-        minimum    = 1000000000000
-        maximum    = 0
-        sum        = 0
-        number     = 0
-        nbSubItems = 0
-        for item in self.parser.getIterator():
-            items = []
-            if self.query == "exon":
-                items = item.getExons()
-            elif self.query == "exon1":
-                if len(item.getExons()) > 1:
-                    item.sortExons()
-                    items = [item.getExons()[0]]
-            elif self.query == "intron":
-                items = item.getIntrons()
-            else:
-                items = [item, ]
-    
-            for thisItem in items:
-                try:
-                    nbElements = int(float(thisItem.getTagValue("nbElements")))
-                    if nbElements == None:
-                        nbElements = 1
-                except:
-                    nbElements = 1
-                size    = thisItem.getSize()
-                minimum = min(minimum, size)
-                maximum = max(maximum, size)
-                name    = thisItem.name.split()[0]
-                
-                if size not in sizes:
-                    sizes[size] = nbElements
-                    if self.csv:
-                        names[size] = [name, ]
-                else:
-                    sizes[size] += nbElements
-                    if self.csv:
-                        names[size].append(name)
-                sum        += size
-                nbSubItems += nbElements
-            number += 1
-            progress.inc()
-        progress.done()
+	def run(self):
+		LoggerFactory.setLevel(self._log, self._verbosity)
+		self._checkOptions()
+		self._log.info("START getsizes")
+		self._log.debug("Input file name: %s" % self.inFileName)
 
-        if self.outFileName != None:
-            plotter = RPlotter(self.outFileName, self._verbosity)
-            plotter.setFill(0)
-            plotter.setMinimumX(self.xMin)
-            plotter.setMaximumX(self.xMax)
-            plotter.setXLabel(self.xLab)
-            plotter.setYLabel(self.yLab)
-            plotter.setBarplot(self.barplot)
-            plotter.addLine(sizes)
-            plotter.plot()
-            
-        if nbSubItems == 0:
-            self._logAndRaise("No item found")
-            
-        if self.csv:
-            csvHandle = open(self.csv, "w")
-            for size in range(min(sizes.keys()), max(sizes.keys())+1):
-                if size not in sizes:
-                    csvHandle.write("%d,0,\n" % (size))
-                else:
-                    csvHandle.write("%d,%d,%s\n" % (size, sizes[size], ";".join(names[size])))
-            csvHandle.close()
-        
-        self.items = number      
-        self.subItems = nbSubItems
-        self.nucleotides = sum
-        self.minAvgMedMax = Utils.getMinAvgMedMax(sizes)
-                  
-        print "%d items" % (number)
-        print "%d sub-items" % (nbSubItems)
-        print "%d nucleotides" % (sum)
-        print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes)
+		nbItems = self.parser.getNbItems()
+		self._log.info( "%i items found" % (nbItems))
+		
+		# treat items
+		progress   = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity)
+		sizes      = {}
+		minimum	   = 1000000000000
+		maximum	   = 0
+		sum		   = 0
+		number     = 0
+		nbSubItems = 0
+		for item in self.parser.getIterator():
+			items = []
+			if self.query == "exon":
+				items = item.getExons()
+			elif self.query == "exon1":
+				if len(item.getExons()) > 1:
+					item.sortExons()
+					items = [item.getExons()[0]]
+			elif self.query == "intron":
+				items = item.getIntrons()
+			else:
+				items = [item, ]
+	
+			for thisItem in items:
+				try:
+					nbElements = int(float(thisItem.getTagValue("nbElements")))
+					if nbElements == None:
+						nbElements = 1
+				except:
+					nbElements = 1
+				size	= thisItem.getSize()
+				minimum = min(minimum, size)
+				maximum = max(maximum, size)
+				
+				if size not in sizes:
+					sizes[size] = nbElements
+				else:
+					sizes[size] += nbElements
+				sum		+= size
+				nbSubItems += nbElements
+			number += 1
+			progress.inc()
+		progress.done()
 
-        self._log.info("END getsizes")
+		if self.outFileName != None:
+			plotter = RPlotter(self.outFileName, self._verbosity)
+			plotter.setFill(0)
+			plotter.setMinimumX(self.xMin)
+			plotter.setMaximumX(self.xMax)
+			plotter.setXLabel(self.xLab)
+			plotter.setYLabel(self.yLab)
+			plotter.setBarplot(self.barplot)
+			plotter.addLine(sizes)
+			plotter.plot()
+			
+		if nbSubItems == 0:
+			self._logAndRaise("No item found")
+			
+		self.items = number	  
+		self.subItems = nbSubItems
+		self.nucleotides = sum
+		self.minAvgMedMax = Utils.getMinAvgMedMax(sizes)
+				  
+		print "%d items" % (number)
+		print "%d sub-items" % (nbSubItems)
+		print "%d nucleotides" % (sum)
+		print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes)
+
+		self._log.info("END getsizes")
 
 
 if __name__ == "__main__":
-    iGetSizes = GetSizes()
-    iGetSizes.setAttributesFromCmdLine()
-    iGetSizes.run()
-    
+	iGetSizes = GetSizes()
+	iGetSizes.setAttributesFromCmdLine()
+	iGetSizes.run()
+	
 #TODO: add two more options!!!!!!