| 
6
 | 
     1 #! /usr/bin/env python
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # Copyright INRA-URGI 2009-2010
 | 
| 
 | 
     4 # 
 | 
| 
 | 
     5 # This software is governed by the CeCILL license under French law and
 | 
| 
 | 
     6 # abiding by the rules of distribution of free software. You can use,
 | 
| 
 | 
     7 # modify and/ or redistribute the software under the terms of the CeCILL
 | 
| 
 | 
     8 # license as circulated by CEA, CNRS and INRIA at the following URL
 | 
| 
 | 
     9 # "http://www.cecill.info".
 | 
| 
 | 
    10 # 
 | 
| 
 | 
    11 # As a counterpart to the access to the source code and rights to copy,
 | 
| 
 | 
    12 # modify and redistribute granted by the license, users are provided only
 | 
| 
 | 
    13 # with a limited warranty and the software's author, the holder of the
 | 
| 
 | 
    14 # economic rights, and the successive licensors have only limited
 | 
| 
 | 
    15 # liability.
 | 
| 
 | 
    16 # 
 | 
| 
 | 
    17 # In this respect, the user's attention is drawn to the risks associated
 | 
| 
 | 
    18 # with loading, using, modifying and/or developing or reproducing the
 | 
| 
 | 
    19 # software by the user in light of its specific status of free software,
 | 
| 
 | 
    20 # that may mean that it is complicated to manipulate, and that also
 | 
| 
 | 
    21 # therefore means that it is reserved for developers and experienced
 | 
| 
 | 
    22 # professionals having in-depth computer knowledge. Users are therefore
 | 
| 
 | 
    23 # encouraged to load and test the software's suitability as regards their
 | 
| 
 | 
    24 # requirements in conditions enabling the security of their systems and/or
 | 
| 
 | 
    25 # data to be ensured and, more generally, to use and operate it in the
 | 
| 
 | 
    26 # same conditions as regards security.
 | 
| 
 | 
    27 # 
 | 
| 
 | 
    28 # The fact that you are presently reading this means that you have had
 | 
| 
 | 
    29 # knowledge of the CeCILL license and that you accept its terms.
 | 
| 
 | 
    30 #
 | 
| 
 | 
    31 import os, sys
 | 
| 
 | 
    32 from optparse import OptionParser
 | 
| 
 | 
    33 from commons.core.parsing.FastaParser import FastaParser
 | 
| 
 | 
    34 from commons.core.parsing.FastqParser import FastqParser
 | 
| 
 | 
    35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
 | 
| 
 | 
    36 from commons.core.parsing.GffParser import GffParser
 | 
| 
 | 
    37 from SMART.Java.Python.misc.Progress import Progress
 | 
| 
 | 
    38 from SMART.Java.Python.misc.RPlotter import RPlotter
 | 
| 
 | 
    39 from SMART.Java.Python.misc import Utils
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 from commons.core.LoggerFactory import LoggerFactory
 | 
| 
 | 
    42 from commons.core.utils.RepetOptionParser import RepetOptionParser
 | 
| 
 | 
    43 
 | 
| 
 | 
    44 LOG_DEPTH = "smart"
 | 
| 
 | 
    45 
 | 
| 
 | 
    46 class GetSizes(object):
 | 
| 
18
 | 
    47 	
 | 
| 
 | 
    48 	def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0):
 | 
| 
 | 
    49 		self.inFileName = inFileName
 | 
| 
 | 
    50 		self.inFormat= inFormat
 | 
| 
 | 
    51 		self.outFileName = outFileName
 | 
| 
 | 
    52 		self.query = query
 | 
| 
 | 
    53 		self.xMax = xMax
 | 
| 
 | 
    54 		self.xMin = xMin
 | 
| 
 | 
    55 		self.xLab = "Size"
 | 
| 
 | 
    56 		self.yLab = "# reads"
 | 
| 
 | 
    57 		self.barplot = False
 | 
| 
 | 
    58 		self._verbosity = verbosity
 | 
| 
 | 
    59 		self.parser = None
 | 
| 
 | 
    60 		self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
 | 
| 
 | 
    61 		
 | 
| 
 | 
    62 	def setAttributesFromCmdLine(self):
 | 
| 
 | 
    63 		description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n"
 | 
| 
 | 
    64 		epilog = ""
 | 
| 
 | 
    65 		parser = RepetOptionParser(description = description, epilog = epilog)
 | 
| 
 | 
    66 		parser.add_option("-i", "--input",	 dest="inputFileName", action="store",	    default=None,	   type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
 | 
| 
 | 
    67 		parser.add_option("-f", "--format",	dest="format",		   action="store",	    default=None,	   type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
 | 
| 
 | 
    68 		parser.add_option("-q", "--query",	 dest="query",		   action="store",	    default=None,	   type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]")	 
 | 
| 
 | 
    69 		parser.add_option("-o", "--output",	dest="outputFileName", action="store",	    default=None,	   type="string", help="output file [format: output file in PNG format]")
 | 
| 
 | 
    70 		parser.add_option("-x", "--xMax",	  dest="xMax",		   action="store",	    default=None,	   type="int",	  help="maximum value on the x-axis to plot [format: int]")
 | 
| 
 | 
    71 		parser.add_option("-X", "--xMin",	  dest="xMin",		   action="store",	    default=None,	   type="int",	  help="minimum value on the x-axis to plot [format: int]")
 | 
| 
 | 
    72 		parser.add_option("-v", "--verbosity", dest="verbosity",   action="store",	    default=1,		   type="int",	  help="trace level [format: int]")
 | 
| 
 | 
    73 		parser.add_option("-a", "--xLabel",	dest="xLab",           action="store",	    default="Size",	   type="string", help="x absis label name [format: string] [default: Size]")
 | 
| 
 | 
    74 		parser.add_option("-b", "--yLabel",	dest="yLab",           action="store",	    default="# reads", type="string", help="y absis label name [format: string] [default: Reads]")
 | 
| 
 | 
    75 		parser.add_option("-B", "--barplot",   dest="barplot",     action="store_true", default=False,					  help="use barplot representation [format: bool] [default: false]")  
 | 
| 
 | 
    76 		options = parser.parse_args()[0]
 | 
| 
 | 
    77 		self._setAttributesFromOptions(options)
 | 
| 
 | 
    78 		
 | 
| 
 | 
    79 	def _setAttributesFromOptions(self, options):
 | 
| 
 | 
    80 		self.setInFileName(options.inputFileName)
 | 
| 
 | 
    81 		self.setInFormat(options.format)
 | 
| 
 | 
    82 		self.setQuery(options.query)
 | 
| 
 | 
    83 		self.setOutFileName(options.outputFileName)
 | 
| 
 | 
    84 		self.setXMax(options.xMax)
 | 
| 
 | 
    85 		self.setXMin(options.xMin)
 | 
| 
 | 
    86 		self.setxLab(options.xLab)
 | 
| 
 | 
    87 		self.setyLab(options.yLab)
 | 
| 
 | 
    88 		self.setBarplot(options.barplot)
 | 
| 
 | 
    89 		self.setVerbosity(options.verbosity)
 | 
| 
 | 
    90 		
 | 
| 
 | 
    91 	def setInFileName(self, inputFileName):
 | 
| 
 | 
    92 		self.inFileName = inputFileName
 | 
| 
 | 
    93 		
 | 
| 
 | 
    94 	def setInFormat(self, inFormat):
 | 
| 
 | 
    95 		self.inFormat = inFormat
 | 
| 
 | 
    96 	
 | 
| 
 | 
    97 	def setQuery(self, query):
 | 
| 
 | 
    98 		self.query = query
 | 
| 
 | 
    99 		
 | 
| 
 | 
   100 	def setOutFileName(self, outFileName):
 | 
| 
 | 
   101 		self.outFileName = outFileName
 | 
| 
 | 
   102 	
 | 
| 
 | 
   103 	def setXMax(self, xMax):
 | 
| 
 | 
   104 		self.xMax = xMax
 | 
| 
 | 
   105 		
 | 
| 
 | 
   106 	def setXMin(self, xMin):
 | 
| 
 | 
   107 		self.xMin = xMin
 | 
| 
 | 
   108 	
 | 
| 
 | 
   109 	def setxLab(self, xLab):
 | 
| 
 | 
   110 		self.xLab = xLab
 | 
| 
 | 
   111 		
 | 
| 
 | 
   112 	def setyLab(self, yLab):
 | 
| 
 | 
   113 		self.yLab = yLab
 | 
| 
 | 
   114 		
 | 
| 
 | 
   115 	def setBarplot(self, barplot):
 | 
| 
 | 
   116 		self.barplot = barplot
 | 
| 
 | 
   117 		
 | 
| 
 | 
   118 	def setVerbosity(self, verbosity):
 | 
| 
 | 
   119 		self._verbosity = verbosity
 | 
| 
 | 
   120 		
 | 
| 
 | 
   121 	def _checkOptions(self):
 | 
| 
 | 
   122 		if self.inFileName == None:
 | 
| 
 | 
   123 			self._logAndRaise("ERROR: Missing input file name")
 | 
| 
 | 
   124 		if self.inFormat == "fasta":
 | 
| 
 | 
   125 			self.parser = FastaParser(self.inFileName, self._verbosity)
 | 
| 
 | 
   126 		elif self.inFormat == "fastq":
 | 
| 
 | 
   127 			self.parser = FastqParser(self.inFileName, self._verbosity)
 | 
| 
 | 
   128 		else:
 | 
| 
 | 
   129 			self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity)
 | 
| 
 | 
   130 			
 | 
| 
 | 
   131 	def _logAndRaise(self, errorMsg):
 | 
| 
 | 
   132 		self._log.error(errorMsg)
 | 
| 
 | 
   133 		raise Exception(errorMsg)
 | 
| 
6
 | 
   134 
 | 
| 
18
 | 
   135 	def run(self):
 | 
| 
 | 
   136 		LoggerFactory.setLevel(self._log, self._verbosity)
 | 
| 
 | 
   137 		self._checkOptions()
 | 
| 
 | 
   138 		self._log.info("START getsizes")
 | 
| 
 | 
   139 		self._log.debug("Input file name: %s" % self.inFileName)
 | 
| 
6
 | 
   140 
 | 
| 
18
 | 
   141 		nbItems = self.parser.getNbItems()
 | 
| 
 | 
   142 		self._log.info( "%i items found" % (nbItems))
 | 
| 
 | 
   143 		
 | 
| 
 | 
   144 		# treat items
 | 
| 
 | 
   145 		progress   = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity)
 | 
| 
 | 
   146 		sizes      = {}
 | 
| 
 | 
   147 		minimum	   = 1000000000000
 | 
| 
 | 
   148 		maximum	   = 0
 | 
| 
 | 
   149 		sum		   = 0
 | 
| 
 | 
   150 		number     = 0
 | 
| 
 | 
   151 		nbSubItems = 0
 | 
| 
 | 
   152 		for item in self.parser.getIterator():
 | 
| 
 | 
   153 			items = []
 | 
| 
 | 
   154 			if self.query == "exon":
 | 
| 
 | 
   155 				items = item.getExons()
 | 
| 
 | 
   156 			elif self.query == "exon1":
 | 
| 
 | 
   157 				if len(item.getExons()) > 1:
 | 
| 
 | 
   158 					item.sortExons()
 | 
| 
 | 
   159 					items = [item.getExons()[0]]
 | 
| 
 | 
   160 			elif self.query == "intron":
 | 
| 
 | 
   161 				items = item.getIntrons()
 | 
| 
 | 
   162 			else:
 | 
| 
 | 
   163 				items = [item, ]
 | 
| 
 | 
   164 	
 | 
| 
 | 
   165 			for thisItem in items:
 | 
| 
 | 
   166 				try:
 | 
| 
 | 
   167 					nbElements = int(float(thisItem.getTagValue("nbElements")))
 | 
| 
 | 
   168 					if nbElements == None:
 | 
| 
 | 
   169 						nbElements = 1
 | 
| 
 | 
   170 				except:
 | 
| 
 | 
   171 					nbElements = 1
 | 
| 
 | 
   172 				size	= thisItem.getSize()
 | 
| 
 | 
   173 				minimum = min(minimum, size)
 | 
| 
 | 
   174 				maximum = max(maximum, size)
 | 
| 
 | 
   175 				
 | 
| 
 | 
   176 				if size not in sizes:
 | 
| 
 | 
   177 					sizes[size] = nbElements
 | 
| 
 | 
   178 				else:
 | 
| 
 | 
   179 					sizes[size] += nbElements
 | 
| 
 | 
   180 				sum		+= size
 | 
| 
 | 
   181 				nbSubItems += nbElements
 | 
| 
 | 
   182 			number += 1
 | 
| 
 | 
   183 			progress.inc()
 | 
| 
 | 
   184 		progress.done()
 | 
| 
6
 | 
   185 
 | 
| 
18
 | 
   186 		if self.outFileName != None:
 | 
| 
 | 
   187 			plotter = RPlotter(self.outFileName, self._verbosity)
 | 
| 
 | 
   188 			plotter.setFill(0)
 | 
| 
 | 
   189 			plotter.setMinimumX(self.xMin)
 | 
| 
 | 
   190 			plotter.setMaximumX(self.xMax)
 | 
| 
 | 
   191 			plotter.setXLabel(self.xLab)
 | 
| 
 | 
   192 			plotter.setYLabel(self.yLab)
 | 
| 
 | 
   193 			plotter.setBarplot(self.barplot)
 | 
| 
 | 
   194 			plotter.addLine(sizes)
 | 
| 
 | 
   195 			plotter.plot()
 | 
| 
 | 
   196 			
 | 
| 
 | 
   197 		if nbSubItems == 0:
 | 
| 
 | 
   198 			self._logAndRaise("No item found")
 | 
| 
 | 
   199 			
 | 
| 
 | 
   200 		self.items = number	  
 | 
| 
 | 
   201 		self.subItems = nbSubItems
 | 
| 
 | 
   202 		self.nucleotides = sum
 | 
| 
 | 
   203 		self.minAvgMedMax = Utils.getMinAvgMedMax(sizes)
 | 
| 
 | 
   204 				  
 | 
| 
 | 
   205 		print "%d items" % (number)
 | 
| 
 | 
   206 		print "%d sub-items" % (nbSubItems)
 | 
| 
 | 
   207 		print "%d nucleotides" % (sum)
 | 
| 
 | 
   208 		print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes)
 | 
| 
 | 
   209 
 | 
| 
 | 
   210 		self._log.info("END getsizes")
 | 
| 
6
 | 
   211 
 | 
| 
 | 
   212 
 | 
| 
 | 
   213 if __name__ == "__main__":
 | 
| 
18
 | 
   214 	iGetSizes = GetSizes()
 | 
| 
 | 
   215 	iGetSizes.setAttributesFromCmdLine()
 | 
| 
 | 
   216 	iGetSizes.run()
 | 
| 
 | 
   217 	
 | 
| 
6
 | 
   218 #TODO: add two more options!!!!!!
 |