| 36 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 from commons.core.writer.WriterChooser import WriterChooser | 
|  | 32 """Clusterize a set of transcripts""" | 
|  | 33 | 
|  | 34 import os, os.path, random | 
|  | 35 from optparse import OptionParser | 
| 60 | 36 from heapq import heappush, heappop | 
| 36 | 37 from commons.core.parsing.ParserChooser import ParserChooser | 
|  | 38 from commons.core.writer.Gff3Writer import Gff3Writer | 
|  | 39 from SMART.Java.Python.structure.Transcript import Transcript | 
|  | 40 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle | 
|  | 41 from SMART.Java.Python.ncList.FileSorter import FileSorter | 
|  | 42 from SMART.Java.Python.misc.Progress import Progress | 
|  | 43 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress | 
|  | 44 | 
|  | 45 class Clusterize(object): | 
|  | 46 | 
|  | 47 	def __init__(self, verbosity): | 
| 60 | 48 		self.parsers           = {} | 
|  | 49 		self.sortedFileNames   = {} | 
|  | 50 		self.normalize         = False | 
|  | 51 		self.presorted         = False | 
|  | 52 		self.distance          = 1 | 
|  | 53 		self.collinear         = False | 
|  | 54 		self.nbWritten         = 0 | 
|  | 55 		self.nbMerges          = 0 | 
|  | 56 		self.verbosity         = verbosity | 
| 36 | 57 		self.splittedFileNames = {} | 
| 60 | 58 		self.chromosomes       = set() | 
| 36 | 59 | 
|  | 60 	def __del__(self): | 
| 60 | 61 		for fileName1 in self.splittedFileNames: | 
|  | 62 			for fileName2 in self.splittedFileNames[fileName1].values(): | 
|  | 63 				os.remove(fileName2) | 
| 36 | 64 | 
| 60 | 65 	def setInputFiles(self, fileNames, format): | 
| 36 | 66 		parserChooser = ParserChooser(self.verbosity) | 
|  | 67 		parserChooser.findFormat(format) | 
| 60 | 68 		for fileName in fileNames: | 
|  | 69 			self.parsers[fileName] = parserChooser.getParser(fileName) | 
|  | 70 			self.sortedFileNames[fileName] = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000)) | 
|  | 71 			if "SMARTTMPPATH" in os.environ: | 
|  | 72 				self.sortedFileNames[fileName] = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileNames[fileName])) | 
| 36 | 73 | 
|  | 74 	def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"): | 
|  | 75 		writerChooser = WriterChooser() | 
|  | 76 		writerChooser.findFormat(format) | 
|  | 77 		self.writer = writerChooser.getWriter(fileName) | 
|  | 78 		self.writer.setTitle(title) | 
|  | 79 		self.writer.setFeature(feature) | 
|  | 80 		self.writer.setFeaturePart(featurePart) | 
|  | 81 | 
|  | 82 	def setDistance(self, distance): | 
|  | 83 		self.distance = distance | 
|  | 84 | 
| 60 | 85 	def setColinear(self, collinear): | 
|  | 86 		self.collinear = collinear | 
| 36 | 87 | 
|  | 88 	def setNormalize(self, normalize): | 
|  | 89 		self.normalize = normalize | 
|  | 90 | 
|  | 91 	def setPresorted(self, presorted): | 
|  | 92 		self.presorted = presorted | 
|  | 93 | 
| 60 | 94 	def _sortFiles(self): | 
| 36 | 95 		if self.presorted: | 
|  | 96 			return | 
| 60 | 97 		for fileName, parser in self.parsers.iteritems(): | 
|  | 98 			fs = FileSorter(parser, self.verbosity-4) | 
|  | 99 			fs.perChromosome(True) | 
|  | 100 			fs.setPresorted(self.presorted) | 
|  | 101 			fs.setOutputFileName(self.sortedFileNames[fileName]) | 
|  | 102 			fs.sort() | 
|  | 103 			self.splittedFileNames[fileName] = fs.getOutputFileNames() | 
|  | 104 			self.nbElementsPerChromosome     = fs.getNbElementsPerChromosome() | 
|  | 105 			self.nbElements                  = fs.getNbElements() | 
|  | 106 			self.chromosomes.update(self.splittedFileNames[fileName].keys()) | 
| 36 | 107 | 
| 60 | 108 	def _iterate(self): | 
|  | 109 		progress = UnlimitedProgress(10000, "Reading input file", self.verbosity) | 
| 46 | 110 		transcripts = [] | 
| 60 | 111 		heap        = [] | 
|  | 112 		parsersSets = [] | 
|  | 113 		if self.chromosomes: | 
|  | 114 			parsersSets.append(self.parsers.values()) | 
|  | 115 		else: | 
|  | 116 			for chromosome in self.chromosomes: | 
|  | 117 				parsersSets.append([self.splittedFileNames[fileName][chromosome] for fileName in self.splittedFileNames if chromosome in self.splittedFileNames[fileName]]) | 
|  | 118 		for parsers in parsersSets: | 
|  | 119 			for parser in parsers: | 
|  | 120 				iterator = parser.getIterator() | 
|  | 121 				for transcript in iterator: | 
|  | 122 					if transcript.__class__.__name__ == "Mapping": | 
|  | 123 						transcript = transcript.getTranscript() | 
|  | 124 					heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator)) | 
|  | 125 					break | 
|  | 126 			while heap: | 
|  | 127 				chromosome, start, end, newTranscript, iterator = heappop(heap) | 
|  | 128 				for transcript in iterator: | 
|  | 129 					if transcript.__class__.__name__ == "Mapping": | 
|  | 130 						transcript = transcript.getTranscript() | 
|  | 131 					heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator)) | 
|  | 132 					break | 
|  | 133 				newTranscripts = [] | 
|  | 134 				if newTranscript.__class__.__name__ == "Mapping": | 
|  | 135 					newTranscript = newTranscript.getTranscript() | 
|  | 136 				for oldTranscript in transcripts: | 
|  | 137 					if self._checkOverlap(newTranscript, oldTranscript): | 
|  | 138 						self._merge(newTranscript, oldTranscript) | 
|  | 139 					elif self._checkPassed(newTranscript, oldTranscript): | 
|  | 140 						self._write(oldTranscript) | 
|  | 141 					else: | 
|  | 142 						newTranscripts.append(oldTranscript) | 
|  | 143 				newTranscripts.append(newTranscript) | 
|  | 144 				transcripts = newTranscripts | 
|  | 145 				progress.inc() | 
|  | 146 			for transcript in transcripts: | 
|  | 147 				self._write(transcript) | 
| 36 | 148 		progress.done() | 
|  | 149 | 
|  | 150 	def _merge(self, transcript1, transcript2): | 
|  | 151 		self.nbMerges += 1 | 
|  | 152 		transcript2.setDirection(transcript1.getDirection()) | 
|  | 153 		transcript1.merge(transcript2) | 
|  | 154 | 
|  | 155 	def _write(self, transcript): | 
|  | 156 		self.nbWritten += 1 | 
|  | 157 		self.writer.addTranscript(transcript) | 
|  | 158 | 
|  | 159 	def _checkOverlap(self, transcript1, transcript2): | 
|  | 160 		if transcript1.getChromosome() != transcript2.getChromosome(): | 
|  | 161 			return False | 
| 60 | 162 		if self.collinear and transcript1.getDirection() != transcript2.getDirection(): | 
| 36 | 163 			return False | 
|  | 164 		if transcript1.getDistance(transcript2) > self.distance: | 
|  | 165 			return False | 
|  | 166 		return True | 
|  | 167 | 
|  | 168 	def _checkPassed(self, transcript1, transcript2): | 
|  | 169 		return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance)) | 
|  | 170 | 
|  | 171 	def run(self): | 
| 60 | 172 		self._sortFiles() | 
|  | 173 		self._iterate() | 
| 36 | 174 		self.writer.close() | 
|  | 175 		if self.verbosity > 0: | 
|  | 176 			print "# input:   %d" % (self.nbElements) | 
|  | 177 			print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)) | 
|  | 178 			print "# merges:  %d" % (self.nbMerges) | 
|  | 179 | 
|  | 180 | 
|  | 181 if __name__ == "__main__": | 
|  | 182 	description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]" | 
|  | 183 | 
|  | 184 	parser = OptionParser(description = description) | 
| 60 | 185 	parser.add_option("-i", "--inputs",       dest="inputFileNames", action="store",				     type="string", help="input files (separated by commas) [compulsory] [format: string]") | 
|  | 186 	parser.add_option("-f", "--format",       dest="format",		 action="store",				     type="string", help="format of file [format: transcript file format]") | 
|  | 187 	parser.add_option("-o", "--output",       dest="outputFileName", action="store",				     type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") | 
|  | 188 	parser.add_option("-u", "--outputFormat", dest="outputFormat",   action="store",      default="gff",		        type="string", help="output file format [format: transcript file format]") | 
|  | 189 	parser.add_option("-c", "--collinear",    dest="collinear",      action="store_true", default=False,				help="merge collinear transcripts only [format: bool] [default: false]") | 
|  | 190 	parser.add_option("-d", "--distance",     dest="distance",       action="store",      default=0,     type="int",    help="max. distance between two transcripts to be merged [format: int] [default: 0]") | 
|  | 191 	parser.add_option("-n", "--normalize",    dest="normalize",      action="store_true", default=False,				help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") | 
|  | 192 	parser.add_option("-s", "--sorted",       dest="sorted",		 action="store_true", default=False,				help="input is already sorted [format: bool] [default: false]") | 
|  | 193 	parser.add_option("-v", "--verbosity",    dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]") | 
| 36 | 194 	(options, args) = parser.parse_args() | 
|  | 195 | 
|  | 196 	c = Clusterize(options.verbosity) | 
| 60 | 197 	c.setInputFiles(options.inputFileNames.split(","), options.format) | 
| 36 | 198 	c.setOutputFileName(options.outputFileName, options.outputFormat) | 
| 60 | 199 	c.setColinear(options.collinear) | 
| 36 | 200 	c.setDistance(options.distance) | 
|  | 201 	c.setNormalize(options.normalize) | 
|  | 202 	c.setPresorted(options.sorted) | 
|  | 203 	c.run() |