diff SMART/Java/Python/getSizes.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children 94ab73e8a190
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/getSizes.py	Fri Jan 18 04:54:14 2013 -0500
@@ -0,0 +1,238 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import os, sys
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import FastaParser
+from commons.core.parsing.FastqParser import FastqParser
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from commons.core.parsing.GffParser import GffParser
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.RPlotter import RPlotter
+from SMART.Java.Python.misc import Utils
+
+from commons.core.LoggerFactory import LoggerFactory
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+
+LOG_DEPTH = "smart"
+
+class GetSizes(object):
+    
+    def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, csv=False, verbosity = 0):
+        self.inFileName = inFileName
+        self.inFormat= inFormat
+        self.outFileName = outFileName
+        self.query = query
+        self.xMax = xMax
+        self.xMin = xMin
+        self.xLab = "Size"
+        self.yLab = "# reads"
+        self.barplot = False
+        self.csv = csv
+        self._verbosity = verbosity
+        self.parser = None
+        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
+        
+    def setAttributesFromCmdLine(self):
+        description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n"
+        epilog = ""
+        parser = RepetOptionParser(description = description, epilog = epilog)
+        parser.add_option("-i", "--input",     dest="inputFileName",  action="store",      default=None,      type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
+        parser.add_option("-f", "--format",    dest="format",         action="store",      default=None,      type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
+        parser.add_option("-q", "--query",     dest="query",          action="store",      default=None,      type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]")     
+        parser.add_option("-o", "--output",    dest="outputFileName", action="store",      default=None,      type="string", help="output file [format: output file in PNG format]")
+        parser.add_option("-x", "--xMax",      dest="xMax",           action="store",      default=None,      type="int",    help="maximum value on the x-axis to plot [format: int]")
+        parser.add_option("-X", "--xMin",      dest="xMin",           action="store",      default=None,      type="int",    help="minimum value on the x-axis to plot [format: int]")
+        parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,         type="int",    help="trace level [format: int]")
+        parser.add_option("-c", "--csv",       dest="csv",            action="store",                         type="string", help="write a .csv file [format: bool] [default: false]")
+        parser.add_option("-a", "--xLabel",    dest="xLab",           action="store",      default="Size",    type="string", help="x absis label name [format: string] [default: Size]")
+        parser.add_option("-b", "--yLabel",    dest="yLab",           action="store",      default="# reads", type="string", help="y absis label name [format: string] [default: Reads]")
+        parser.add_option("-B", "--barplot",   dest="barplot",        action="store_true", default=False,                    help="use barplot representation [format: bool] [default: false]")  
+        options = parser.parse_args()[0]
+        self._setAttributesFromOptions(options)
+        
+    def _setAttributesFromOptions(self, options):
+        self.setInFileName(options.inputFileName)
+        self.setInFormat(options.format)
+        self.setQuery(options.query)
+        self.setOutFileName(options.outputFileName)
+        self.setXMax(options.xMax)
+        self.setXMin(options.xMin)
+        self.setxLab(options.xLab)
+        self.setyLab(options.yLab)
+        self.setBarplot(options.barplot)
+        self.setVerbosity(options.verbosity)
+        
+    def setInFileName(self, inputFileName):
+        self.inFileName = inputFileName
+        
+    def setInFormat(self, inFormat):
+        self.inFormat = inFormat
+    
+    def setQuery(self, query):
+        self.query = query
+        
+    def setOutFileName(self, outFileName):
+        self.outFileName = outFileName
+    
+    def setXMax(self, xMax):
+        self.xMax = xMax
+        
+    def setXMin(self, xMin):
+        self.xMin = xMin
+    
+    def setxLab(self, xLab):
+        self.xLab = xLab
+        
+    def setyLab(self, yLab):
+        self.yLab = yLab
+        
+    def setBarplot(self, barplot):
+        self.barplot = barplot
+        
+    def setCsv(self, csv):
+        self.csv = csv
+        
+    def setVerbosity(self, verbosity):
+        self._verbosity = verbosity
+        
+    def _checkOptions(self):
+        if self.inFileName == None:
+            self._logAndRaise("ERROR: Missing input file name")
+        if self.inFormat == "fasta":
+            self.parser = FastaParser(self.inFileName, self._verbosity)
+        elif self.inFormat == "fastq":
+            self.parser = FastqParser(self.inFileName, self._verbosity)
+        else:
+            self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity)
+            
+    def _logAndRaise(self, errorMsg):
+        self._log.error(errorMsg)
+        raise Exception(errorMsg)
+                    
+    def run(self):
+        LoggerFactory.setLevel(self._log, self._verbosity)
+        self._checkOptions()
+        self._log.info("START getsizes")
+        self._log.debug("Input file name: %s" % self.inFileName)
+
+        nbItems = self.parser.getNbItems()
+        self._log.info( "%i items found" % (nbItems))
+        
+        # treat items
+        progress   = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity)
+        sizes      = {}
+        names      = {}
+        minimum    = 1000000000000
+        maximum    = 0
+        sum        = 0
+        number     = 0
+        nbSubItems = 0
+        for item in self.parser.getIterator():
+            items = []
+            if self.query == "exon":
+                items = item.getExons()
+            elif self.query == "exon1":
+                if len(item.getExons()) > 1:
+                    item.sortExons()
+                    items = [item.getExons()[0]]
+            elif self.query == "intron":
+                items = item.getIntrons()
+            else:
+                items = [item, ]
+    
+            for thisItem in items:
+                try:
+                    nbElements = int(float(thisItem.getTagValue("nbElements")))
+                    if nbElements == None:
+                        nbElements = 1
+                except:
+                    nbElements = 1
+                size    = thisItem.getSize()
+                minimum = min(minimum, size)
+                maximum = max(maximum, size)
+                name    = thisItem.name.split()[0]
+                
+                if size not in sizes:
+                    sizes[size] = nbElements
+                    if self.csv:
+                        names[size] = [name, ]
+                else:
+                    sizes[size] += nbElements
+                    if self.csv:
+                        names[size].append(name)
+                sum        += size
+                nbSubItems += nbElements
+            number += 1
+            progress.inc()
+        progress.done()
+
+        if self.outFileName != None:
+            plotter = RPlotter(self.outFileName, self._verbosity)
+            plotter.setFill(0)
+            plotter.setMinimumX(self.xMin)
+            plotter.setMaximumX(self.xMax)
+            plotter.setXLabel(self.xLab)
+            plotter.setYLabel(self.yLab)
+            plotter.setBarplot(self.barplot)
+            plotter.addLine(sizes)
+            plotter.plot()
+            
+        if nbSubItems == 0:
+            self._logAndRaise("No item found")
+            
+        if self.csv:
+            csvHandle = open(self.csv, "w")
+            for size in range(min(sizes.keys()), max(sizes.keys())+1):
+                if size not in sizes:
+                    csvHandle.write("%d,0,\n" % (size))
+                else:
+                    csvHandle.write("%d,%d,%s\n" % (size, sizes[size], ";".join(names[size])))
+            csvHandle.close()
+        
+        self.items = number      
+        self.subItems = nbSubItems
+        self.nucleotides = sum
+        self.minAvgMedMax = Utils.getMinAvgMedMax(sizes)
+                  
+        print "%d items" % (number)
+        print "%d sub-items" % (nbSubItems)
+        print "%d nucleotides" % (sum)
+        print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes)
+
+        self._log.info("END getsizes")
+
+
+if __name__ == "__main__":
+    iGetSizes = GetSizes()
+    iGetSizes.setAttributesFromCmdLine()
+    iGetSizes.run()
+    
+#TODO: add two more options!!!!!!