| 
6
 | 
     1 #! /usr/bin/env python
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # Copyright INRA-URGI 2009-2010
 | 
| 
 | 
     4 # 
 | 
| 
 | 
     5 # This software is governed by the CeCILL license under French law and
 | 
| 
 | 
     6 # abiding by the rules of distribution of free software. You can use,
 | 
| 
 | 
     7 # modify and/ or redistribute the software under the terms of the CeCILL
 | 
| 
 | 
     8 # license as circulated by CEA, CNRS and INRIA at the following URL
 | 
| 
 | 
     9 # "http://www.cecill.info".
 | 
| 
 | 
    10 # 
 | 
| 
 | 
    11 # As a counterpart to the access to the source code and rights to copy,
 | 
| 
 | 
    12 # modify and redistribute granted by the license, users are provided only
 | 
| 
 | 
    13 # with a limited warranty and the software's author, the holder of the
 | 
| 
 | 
    14 # economic rights, and the successive licensors have only limited
 | 
| 
 | 
    15 # liability.
 | 
| 
 | 
    16 # 
 | 
| 
 | 
    17 # In this respect, the user's attention is drawn to the risks associated
 | 
| 
 | 
    18 # with loading, using, modifying and/or developing or reproducing the
 | 
| 
 | 
    19 # software by the user in light of its specific status of free software,
 | 
| 
 | 
    20 # that may mean that it is complicated to manipulate, and that also
 | 
| 
 | 
    21 # therefore means that it is reserved for developers and experienced
 | 
| 
 | 
    22 # professionals having in-depth computer knowledge. Users are therefore
 | 
| 
 | 
    23 # encouraged to load and test the software's suitability as regards their
 | 
| 
 | 
    24 # requirements in conditions enabling the security of their systems and/or
 | 
| 
 | 
    25 # data to be ensured and, more generally, to use and operate it in the
 | 
| 
 | 
    26 # same conditions as regards security.
 | 
| 
 | 
    27 # 
 | 
| 
 | 
    28 # The fact that you are presently reading this means that you have had
 | 
| 
 | 
    29 # knowledge of the CeCILL license and that you accept its terms.
 | 
| 
 | 
    30 #
 | 
| 
 | 
    31 """
 | 
| 
 | 
    32 Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks.
 | 
| 
 | 
    33 """
 | 
| 
 | 
    34 
 | 
| 
 | 
    35 import math
 | 
| 
 | 
    36 from optparse import OptionParser
 | 
| 
 | 
    37 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
 | 
| 
 | 
    38 from commons.core.parsing.WigParser import WigParser
 | 
| 
 | 
    39 from SMART.Java.Python.misc.Progress import Progress
 | 
| 
 | 
    40 from SMART.Java.Python.misc.RPlotter import RPlotter
 | 
| 
 | 
    41 
 | 
| 
 | 
    42 class GetWigProfile(object):
 | 
| 
 | 
    43 
 | 
| 
 | 
    44 	def __init__(self, verbosity):
 | 
| 
 | 
    45 		self.verbosity	= verbosity
 | 
| 
 | 
    46 		self.values		 = {}
 | 
| 
 | 
    47 		self.defaultValue = 0.0
 | 
| 
 | 
    48 
 | 
| 
 | 
    49 	def _iToJ(self, i, size):
 | 
| 
 | 
    50 		return min(self.nbPoints+1, int(math.floor(float(i - self.distance) / (size) * (self.nbPoints))))
 | 
| 
 | 
    51 
 | 
| 
 | 
    52 	def readTranscripts(self):
 | 
| 
 | 
    53 		self.strandNames = (1, -1) if self.strands else (1, )
 | 
| 
 | 
    54 		self.values		= dict([(strand, dict([(i, 0.0) for i in range(self.nbPoints + 2 * self.distance)])) for strand in self.strandNames])
 | 
| 
 | 
    55 		transcriptParser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity)
 | 
| 
 | 
    56 		wigParser		= WigParser(self.wig)
 | 
| 
 | 
    57 		nbValues		 = dict([(strand, dict([(i, 0.0) for i in range(self.nbPoints + 2 * self.distance)])) for strand in self.strandNames])
 | 
| 
 | 
    58 		wigParser.setStrands(self.strands)
 | 
| 
 | 
    59 		wigParser.setDefaultValue(self.defaultValue)
 | 
| 
 | 
    60 
 | 
| 
 | 
    61 		progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (self.inputFileName), self.verbosity)
 | 
| 
 | 
    62 		for transcript in transcriptParser.getIterator():
 | 
| 
 | 
    63 			transcriptSize = transcript.getSize()
 | 
| 
 | 
    64 			expectedSize   = transcriptSize + 2 * self.distance
 | 
| 
 | 
    65 			transcript.extendStart(self.distance)
 | 
| 
 | 
    66 			transcript.extendEnd(self.distance)
 | 
| 
 | 
    67 			theseValues = transcript.extractWigData(wigParser)
 | 
| 
 | 
    68 
 | 
| 
 | 
    69 			if len(self.strandNames) == 1:
 | 
| 
 | 
    70 				theseValues = {1: theseValues}
 | 
| 
 | 
    71 			for strand in self.strandNames:
 | 
| 
 | 
    72 				if len(theseValues[strand]) < expectedSize:
 | 
| 
 | 
    73 					theseValues[strand] = [self.defaultValue] * (expectedSize - len(theseValues[strand])) + theseValues[strand]
 | 
| 
 | 
    74 				if len(theseValues[strand]) != expectedSize:
 | 
| 
 | 
    75 					raise Exception("Got something wrong with the size of the WIG data concerning %s [%s]: %d found instead of %d" % (transcript, ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in transcript.getExons()]), len(theseValues[strand]), expectedSize))
 | 
| 
 | 
    76 				fivePValues = theseValues[strand][: self.distance]
 | 
| 
 | 
    77 				nbValues         = [0.0] * (self.nbPoints)
 | 
| 
 | 
    78 				transcriptValues = [0.0] * (self.nbPoints)
 | 
| 
 | 
    79 				for i in range(self.distance, len(theseValues[strand]) - self.distance):
 | 
| 
 | 
    80 					startJ = self._iToJ(i, transcriptSize)
 | 
| 
 | 
    81 					endJ   = max(startJ+1, self._iToJ(i+1, transcriptSize))
 | 
| 
 | 
    82 					for j in range(startJ, endJ):
 | 
| 
 | 
    83 						transcriptValues[j]	+= theseValues[strand][i]
 | 
| 
 | 
    84 						nbValues[j] += 1
 | 
| 
 | 
    85 				threePValues = theseValues[strand][-self.distance: ]
 | 
| 
 | 
    86 				values = fivePValues + [self.defaultValue if nbValue == 0 else transcriptValue / nbValue for transcriptValue, nbValue in zip(transcriptValues, nbValues)] + threePValues
 | 
| 
 | 
    87 				for i, value in enumerate(values):
 | 
| 
 | 
    88 					self.values[strand][i] += value
 | 
| 
 | 
    89 			progress.inc()
 | 
| 
 | 
    90 		progress.done()
 | 
| 
 | 
    91 
 | 
| 
 | 
    92 		for strand in self.strandNames:
 | 
| 
 | 
    93 			if strand == 0:
 | 
| 
 | 
    94 				strand = 1
 | 
| 
 | 
    95 			for i in range(self.nbPoints + 2 * self.distance):
 | 
| 
 | 
    96 				self.values[strand][i] /= transcriptParser.getNbTranscripts() * strand
 | 
| 
 | 
    97 
 | 
| 
 | 
    98 
 | 
| 
 | 
    99 	def smoothen(self):
 | 
| 
 | 
   100 		if self.smoothenForce == None:
 | 
| 
 | 
   101 			return
 | 
| 
 | 
   102 		for strand in self.strandNames:
 | 
| 
 | 
   103 			averageValues = {}
 | 
| 
 | 
   104 			for center in range(self.distance, self.distance + self.nbPoints):
 | 
| 
 | 
   105 				sum		= 0.0
 | 
| 
 | 
   106 				nbValues = 0.0
 | 
| 
 | 
   107 				for i in range(center - self.smoothenForce + 1, center + self.smoothenForce):
 | 
| 
 | 
   108 					if i > self.distance and i < self.distance + self.nbPoints:
 | 
| 
 | 
   109 						nbValues += 1
 | 
| 
 | 
   110 						sum		+= self.values[strand][i]
 | 
| 
 | 
   111 				averageValues[center] = sum / nbValues
 | 
| 
 | 
   112 			for position in range(self.distance, self.distance + self.nbPoints):
 | 
| 
 | 
   113 				self.values[strand][position] = averageValues[position]
 | 
| 
 | 
   114 		
 | 
| 
 | 
   115 
 | 
| 
 | 
   116 	def plot(self):
 | 
| 
 | 
   117 		plotter = RPlotter(self.outputFileName, self.verbosity)
 | 
| 
 | 
   118 		for strand in self.strandNames:
 | 
| 
 | 
   119 			plotter.addLine(self.values[strand])
 | 
| 
 | 
   120 		if self.log:
 | 
| 
 | 
   121 			plotter.setLog("y")
 | 
| 
 | 
   122 		plotter.setAxisLabel("x", {0: -self.distance, self.distance: "start", self.distance+self.nbPoints-1: "end", 2*self.distance+self.nbPoints-1: self.distance})
 | 
| 
 | 
   123 		plotter.plot()
 | 
| 
 | 
   124 
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 
 | 
| 
 | 
   127 if __name__ == "__main__":
 | 
| 
 | 
   128 	
 | 
| 
 | 
   129 	# parse command line
 | 
| 
 | 
   130 	description = "Get WIG Profile v1.0.1: Compute the average profile of some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]"
 | 
| 
 | 
   131 
 | 
| 
 | 
   132 	parser = OptionParser(description = description)
 | 
| 
 | 
   133 	parser.add_option("-i", "--input",			 dest="inputFileName",	action="store",											type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
 | 
| 
 | 
   134 	parser.add_option("-f", "--inputFormat", dest="inputFormat",		action="store",											type="string", help="format of the input file [compulsory] [format: transcript file format]")
 | 
| 
 | 
   135 	parser.add_option("-w", "--wig",				 dest="wig",						action="store",											type="string", help="wig file name [compulsory] [format: file in WIG format]")	
 | 
| 
 | 
   136 	parser.add_option("-p", "--nbPoints",		 dest="nbPoints",				action="store",			 default=1000,	type="int",		 help="number of points on the x-axis [compulsory] [format: int] [default: 1000]")	
 | 
| 
 | 
   137 	parser.add_option("-d", "--distance",		 dest="distance",				action="store",			 default=0,			type="int",		 help="distance around genomic coordinates [compulsory] [format: int] [default: 0]")	
 | 
| 
 | 
   138 	parser.add_option("-s", "--strands",		 dest="strands",				action="store_true", default=False,								 help="consider both strands separately [format: boolean] [default: False]")	
 | 
| 
 | 
   139 	parser.add_option("-m", "--smoothen",		 dest="smoothen",				action="store",			 default=None,	type="int",		 help="smoothen the curve [format: int] [default: None]")	
 | 
| 
 | 
   140 	parser.add_option("-a", "--default",		 dest="defaultValue",	 action="store",			 default=0.0,	 type="float",	help="default value (when value is NA) [default: 0.0] [format: float]")
 | 
| 
 | 
   141 	parser.add_option("-o", "--output",			 dest="outputFileName", action="store",											type="string", help="output file [compulsory] [format: output file in PNG format]")
 | 
| 
 | 
   142 	parser.add_option("-l", "--log",				 dest="log",						action="store_true", default=False,								 help="use log scale for y-axis	[format: boolean] [default: False]")
 | 
| 
 | 
   143 	parser.add_option("-v", "--verbosity",	 dest="verbosity",			action="store",			 default=1,			type="int",		 help="trace level [format: int]")
 | 
| 
 | 
   144 	(options, args) = parser.parse_args()
 | 
| 
 | 
   145 
 | 
| 
 | 
   146 	wigProfile								= GetWigProfile(options.verbosity)
 | 
| 
 | 
   147 	wigProfile.strands			 	= options.strands
 | 
| 
 | 
   148 	wigProfile.inputFileName	= options.inputFileName
 | 
| 
 | 
   149 	wigProfile.inputFormat		= options.inputFormat
 | 
| 
 | 
   150 	wigProfile.wig						= options.wig
 | 
| 
 | 
   151 	wigProfile.nbPoints				= options.nbPoints
 | 
| 
 | 
   152 	wigProfile.distance				= options.distance
 | 
| 
 | 
   153 	wigProfile.smoothenForce	= options.smoothen
 | 
| 
 | 
   154 	wigProfile.defaultValue	  = options.defaultValue
 | 
| 
 | 
   155 	wigProfile.outputFileName = options.outputFileName
 | 
| 
 | 
   156 	wigProfile.log						= options.log
 | 
| 
 | 
   157 
 | 
| 
 | 
   158 	wigProfile.readTranscripts()
 | 
| 
 | 
   159 	wigProfile.smoothen()
 | 
| 
 | 
   160 	wigProfile.plot()
 |