| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 """ | 
|  | 32 Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks. | 
|  | 33 """ | 
|  | 34 | 
|  | 35 import math | 
|  | 36 from optparse import OptionParser | 
|  | 37 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | 
|  | 38 from commons.core.parsing.WigParser import WigParser | 
|  | 39 from SMART.Java.Python.misc.Progress import Progress | 
|  | 40 from SMART.Java.Python.misc.RPlotter import RPlotter | 
|  | 41 | 
|  | 42 class GetWigProfile(object): | 
|  | 43 | 
|  | 44 	def __init__(self, verbosity): | 
|  | 45 		self.verbosity	= verbosity | 
|  | 46 		self.values		 = {} | 
|  | 47 		self.defaultValue = 0.0 | 
|  | 48 | 
|  | 49 	def _iToJ(self, i, size): | 
|  | 50 		return min(self.nbPoints+1, int(math.floor(float(i - self.distance) / (size) * (self.nbPoints)))) | 
|  | 51 | 
|  | 52 	def readTranscripts(self): | 
|  | 53 		self.strandNames = (1, -1) if self.strands else (1, ) | 
|  | 54 		self.values		= dict([(strand, dict([(i, 0.0) for i in range(self.nbPoints + 2 * self.distance)])) for strand in self.strandNames]) | 
|  | 55 		transcriptParser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity) | 
|  | 56 		wigParser		= WigParser(self.wig) | 
|  | 57 		nbValues		 = dict([(strand, dict([(i, 0.0) for i in range(self.nbPoints + 2 * self.distance)])) for strand in self.strandNames]) | 
|  | 58 		wigParser.setStrands(self.strands) | 
|  | 59 		wigParser.setDefaultValue(self.defaultValue) | 
|  | 60 | 
|  | 61 		progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (self.inputFileName), self.verbosity) | 
|  | 62 		for transcript in transcriptParser.getIterator(): | 
|  | 63 			transcriptSize = transcript.getSize() | 
|  | 64 			expectedSize   = transcriptSize + 2 * self.distance | 
|  | 65 			transcript.extendStart(self.distance) | 
|  | 66 			transcript.extendEnd(self.distance) | 
|  | 67 			theseValues = transcript.extractWigData(wigParser) | 
|  | 68 | 
|  | 69 			if len(self.strandNames) == 1: | 
|  | 70 				theseValues = {1: theseValues} | 
|  | 71 			for strand in self.strandNames: | 
|  | 72 				if len(theseValues[strand]) < expectedSize: | 
|  | 73 					theseValues[strand] = [self.defaultValue] * (expectedSize - len(theseValues[strand])) + theseValues[strand] | 
|  | 74 				if len(theseValues[strand]) != expectedSize: | 
|  | 75 					raise Exception("Got something wrong with the size of the WIG data concerning %s [%s]: %d found instead of %d" % (transcript, ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in transcript.getExons()]), len(theseValues[strand]), expectedSize)) | 
|  | 76 				fivePValues = theseValues[strand][: self.distance] | 
|  | 77 				nbValues         = [0.0] * (self.nbPoints) | 
|  | 78 				transcriptValues = [0.0] * (self.nbPoints) | 
|  | 79 				for i in range(self.distance, len(theseValues[strand]) - self.distance): | 
|  | 80 					startJ = self._iToJ(i, transcriptSize) | 
|  | 81 					endJ   = max(startJ+1, self._iToJ(i+1, transcriptSize)) | 
|  | 82 					for j in range(startJ, endJ): | 
|  | 83 						transcriptValues[j]	+= theseValues[strand][i] | 
|  | 84 						nbValues[j] += 1 | 
|  | 85 				threePValues = theseValues[strand][-self.distance: ] | 
|  | 86 				values = fivePValues + [self.defaultValue if nbValue == 0 else transcriptValue / nbValue for transcriptValue, nbValue in zip(transcriptValues, nbValues)] + threePValues | 
|  | 87 				for i, value in enumerate(values): | 
|  | 88 					self.values[strand][i] += value | 
|  | 89 			progress.inc() | 
|  | 90 		progress.done() | 
|  | 91 | 
|  | 92 		for strand in self.strandNames: | 
|  | 93 			if strand == 0: | 
|  | 94 				strand = 1 | 
|  | 95 			for i in range(self.nbPoints + 2 * self.distance): | 
|  | 96 				self.values[strand][i] /= transcriptParser.getNbTranscripts() * strand | 
|  | 97 | 
|  | 98 | 
|  | 99 	def smoothen(self): | 
|  | 100 		if self.smoothenForce == None: | 
|  | 101 			return | 
|  | 102 		for strand in self.strandNames: | 
|  | 103 			averageValues = {} | 
|  | 104 			for center in range(self.distance, self.distance + self.nbPoints): | 
|  | 105 				sum		= 0.0 | 
|  | 106 				nbValues = 0.0 | 
|  | 107 				for i in range(center - self.smoothenForce + 1, center + self.smoothenForce): | 
|  | 108 					if i > self.distance and i < self.distance + self.nbPoints: | 
|  | 109 						nbValues += 1 | 
|  | 110 						sum		+= self.values[strand][i] | 
|  | 111 				averageValues[center] = sum / nbValues | 
|  | 112 			for position in range(self.distance, self.distance + self.nbPoints): | 
|  | 113 				self.values[strand][position] = averageValues[position] | 
|  | 114 | 
|  | 115 | 
|  | 116 	def plot(self): | 
|  | 117 		plotter = RPlotter(self.outputFileName, self.verbosity) | 
|  | 118 		for strand in self.strandNames: | 
|  | 119 			plotter.addLine(self.values[strand]) | 
|  | 120 		if self.log: | 
|  | 121 			plotter.setLog("y") | 
|  | 122 		plotter.setAxisLabel("x", {0: -self.distance, self.distance: "start", self.distance+self.nbPoints-1: "end", 2*self.distance+self.nbPoints-1: self.distance}) | 
|  | 123 		plotter.plot() | 
|  | 124 | 
|  | 125 | 
|  | 126 | 
|  | 127 if __name__ == "__main__": | 
|  | 128 | 
|  | 129 	# parse command line | 
|  | 130 	description = "Get WIG Profile v1.0.1: Compute the average profile of some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]" | 
|  | 131 | 
|  | 132 	parser = OptionParser(description = description) | 
|  | 133 	parser.add_option("-i", "--input",			 dest="inputFileName",	action="store",											type="string", help="input file [compulsory] [format: file in transcript format given by -f]") | 
|  | 134 	parser.add_option("-f", "--inputFormat", dest="inputFormat",		action="store",											type="string", help="format of the input file [compulsory] [format: transcript file format]") | 
|  | 135 	parser.add_option("-w", "--wig",				 dest="wig",						action="store",											type="string", help="wig file name [compulsory] [format: file in WIG format]") | 
|  | 136 	parser.add_option("-p", "--nbPoints",		 dest="nbPoints",				action="store",			 default=1000,	type="int",		 help="number of points on the x-axis [compulsory] [format: int] [default: 1000]") | 
|  | 137 	parser.add_option("-d", "--distance",		 dest="distance",				action="store",			 default=0,			type="int",		 help="distance around genomic coordinates [compulsory] [format: int] [default: 0]") | 
|  | 138 	parser.add_option("-s", "--strands",		 dest="strands",				action="store_true", default=False,								 help="consider both strands separately [format: boolean] [default: False]") | 
|  | 139 	parser.add_option("-m", "--smoothen",		 dest="smoothen",				action="store",			 default=None,	type="int",		 help="smoothen the curve [format: int] [default: None]") | 
|  | 140 	parser.add_option("-a", "--default",		 dest="defaultValue",	 action="store",			 default=0.0,	 type="float",	help="default value (when value is NA) [default: 0.0] [format: float]") | 
|  | 141 	parser.add_option("-o", "--output",			 dest="outputFileName", action="store",											type="string", help="output file [compulsory] [format: output file in PNG format]") | 
|  | 142 	parser.add_option("-l", "--log",				 dest="log",						action="store_true", default=False,								 help="use log scale for y-axis	[format: boolean] [default: False]") | 
|  | 143 	parser.add_option("-v", "--verbosity",	 dest="verbosity",			action="store",			 default=1,			type="int",		 help="trace level [format: int]") | 
|  | 144 	(options, args) = parser.parse_args() | 
|  | 145 | 
|  | 146 	wigProfile								= GetWigProfile(options.verbosity) | 
|  | 147 	wigProfile.strands			 	= options.strands | 
|  | 148 	wigProfile.inputFileName	= options.inputFileName | 
|  | 149 	wigProfile.inputFormat		= options.inputFormat | 
|  | 150 	wigProfile.wig						= options.wig | 
|  | 151 	wigProfile.nbPoints				= options.nbPoints | 
|  | 152 	wigProfile.distance				= options.distance | 
|  | 153 	wigProfile.smoothenForce	= options.smoothen | 
|  | 154 	wigProfile.defaultValue	  = options.defaultValue | 
|  | 155 	wigProfile.outputFileName = options.outputFileName | 
|  | 156 	wigProfile.log						= options.log | 
|  | 157 | 
|  | 158 	wigProfile.readTranscripts() | 
|  | 159 	wigProfile.smoothen() | 
|  | 160 	wigProfile.plot() |