| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2011 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 from optparse import OptionParser | 
|  | 32 from commons.core.parsing.ParserChooser import ParserChooser | 
|  | 33 from commons.core.writer.TranscriptWriter import TranscriptWriter | 
|  | 34 from SMART.Java.Python.structure.Transcript import Transcript | 
|  | 35 from SMART.Java.Python.structure.Interval import Interval | 
|  | 36 from SMART.Java.Python.misc.Progress import Progress | 
|  | 37 | 
|  | 38 QUERY        = 0 | 
|  | 39 REFERENCE    = 1 | 
|  | 40 INPUTS       = (QUERY, REFERENCE) | 
|  | 41 STRANDS      = (-1, 1) | 
|  | 42 TAG_DISTANCE = "distance_" | 
|  | 43 TAG_SENSE    = "_sense" | 
|  | 44 TAG_REGION   = "_region" | 
|  | 45 TAGS_REGION  = {-1: "_upstream", 0: "", 1: "_downstream"} | 
|  | 46 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"} | 
| 18 | 47 TAGS_SENSE   = {-1: "antisense", 0: "", 1: "collinear"} | 
| 6 | 48 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"} | 
|  | 49 | 
|  | 50 | 
| 18 | 51 def getOrderKey(transcript, direction, input): | 
|  | 52 	if direction == 1: | 
|  | 53 		if input == QUERY: | 
|  | 54 			return (transcript.getEnd(), -transcript.getStart()) | 
|  | 55 		return (transcript.getStart(), -transcript.getEnd()) | 
|  | 56 	if input == QUERY: | 
|  | 57 		return (-transcript.getStart(), transcript.getEnd()) | 
|  | 58 	return (-transcript.getEnd(), transcript.getStart()) | 
| 6 | 59 | 
|  | 60 | 
|  | 61 class GetFlanking(object): | 
|  | 62 | 
| 18 | 63 	def __init__(self, verbosity): | 
|  | 64 		self.verbosity   = verbosity | 
|  | 65 		self.transcripts = dict([id, {}] for id in INPUTS) | 
|  | 66 		self.directions  = [] | 
|  | 67 		self.noOverlap   = False | 
|  | 68 		self.colinear    = False | 
|  | 69 		self.antisense   = False | 
|  | 70 		self.distance    = None | 
|  | 71 		self.minDistance = None | 
|  | 72 		self.maxDistance = None | 
|  | 73 		self.tagName     = "flanking" | 
| 6 | 74 | 
| 18 | 75 	def setInputFile(self, fileName, format, id): | 
|  | 76 		chooser = ParserChooser(self.verbosity) | 
|  | 77 		chooser.findFormat(format) | 
|  | 78 		parser = chooser.getParser(fileName) | 
|  | 79 		for transcript in parser.getIterator(): | 
|  | 80 			chromosome = transcript.getChromosome() | 
|  | 81 			if chromosome not in self.transcripts[id]: | 
|  | 82 				self.transcripts[id][chromosome] = [] | 
|  | 83 			self.transcripts[id][chromosome].append(transcript) | 
| 6 | 84 | 
| 18 | 85 	def setOutputFile(self, fileName): | 
|  | 86 		self.writer = TranscriptWriter(fileName, "gff3", self.verbosity) | 
| 6 | 87 | 
| 18 | 88 	def addUpstreamDirection(self, upstream): | 
|  | 89 		if upstream: | 
|  | 90 			self.directions.append(-1) | 
| 6 | 91 | 
| 18 | 92 	def addDownstreamDirection(self, downstream): | 
|  | 93 		if downstream: | 
|  | 94 			self.directions.append(1) | 
| 6 | 95 | 
| 18 | 96 	def setColinear(self, colinear): | 
|  | 97 		self.colinear = colinear | 
| 6 | 98 | 
| 18 | 99 	def setAntisense(self, antisense): | 
|  | 100 		self.antisense = antisense | 
|  | 101 | 
|  | 102 	def setNoOverlap(self, noOverlap): | 
|  | 103 		self.noOverlap = noOverlap | 
| 6 | 104 | 
| 18 | 105 	def setMinDistance(self, distance): | 
|  | 106 		self.minDistance = distance | 
|  | 107 | 
|  | 108 	def setMaxDistance(self, distance): | 
|  | 109 		self.maxDistance = distance | 
|  | 110 | 
|  | 111 	def setNewTagName(self, tagName): | 
|  | 112 		self.tagName = tagName | 
| 6 | 113 | 
| 18 | 114 	def match(self, transcriptQuery, transcriptRef, direction): | 
|  | 115 		#print "comparing", transcriptQuery, "with", transcriptRef, "on direction", direction | 
|  | 116 		if direction == 1 and transcriptRef.getEnd() < transcriptQuery.getStart(): | 
|  | 117 			return False | 
|  | 118 		if direction == -1 and transcriptQuery.getEnd() < transcriptRef.getStart(): | 
|  | 119 			return False | 
|  | 120 		if self.noOverlap and transcriptRef.overlapWith(transcriptQuery): | 
|  | 121 			return False | 
|  | 122 		if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection(): | 
|  | 123 			return False | 
|  | 124 		if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection(): | 
|  | 125 			return False | 
|  | 126 		if self.minDistance != None or self.maxDistance != None: | 
|  | 127 			distance = transcriptRef.getDistance(transcriptQuery) | 
|  | 128 			if self.minDistance != None and distance < self.minDistance: | 
|  | 129 				return False | 
|  | 130 			if self.maxDistance != None and distance > self.maxDistance: | 
|  | 131 				return False | 
|  | 132 		return True | 
|  | 133 | 
|  | 134 	def getFlanking(self, chromosome, direction): | 
|  | 135 		if chromosome not in self.transcripts[REFERENCE]: | 
|  | 136 			return | 
|  | 137 		sortedTranscripts = dict([id, {}] for id in INPUTS) | 
|  | 138 		for id in INPUTS: | 
|  | 139 			sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction, id)) | 
|  | 140 		refIndex = 0 | 
|  | 141 		progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity) | 
|  | 142 		for query in sortedTranscripts[QUERY]: | 
|  | 143 			#print "Q: ", query | 
|  | 144 			#print "R1: ", sortedTranscripts[REFERENCE][refIndex] | 
|  | 145 			while not self.match(query, sortedTranscripts[REFERENCE][refIndex], direction): | 
|  | 146 				refIndex += 1 | 
|  | 147 				if refIndex == len(sortedTranscripts[REFERENCE]): | 
|  | 148 					progress.done() | 
|  | 149 					#print "done" | 
|  | 150 					return | 
|  | 151 				#print "R2: ", sortedTranscripts[REFERENCE][refIndex] | 
|  | 152 			self.flankings[query][direction] = sortedTranscripts[REFERENCE][refIndex] | 
|  | 153 			progress.inc() | 
|  | 154 		progress.done() | 
| 6 | 155 | 
| 18 | 156 	def setTags(self, query, reference, direction): | 
|  | 157 		refName = reference.getTagValue("ID") | 
|  | 158 		if refName == None: | 
|  | 159 			refName = reference.getName() | 
|  | 160 		if refName == None: | 
|  | 161 			refName = reference.__str__() | 
|  | 162 		query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()]), refName) | 
|  | 163 		query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction*query.getDirection()]), query.getDistance(reference)) | 
|  | 164 		query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()]) | 
|  | 165 		if direction == 0: | 
|  | 166 			query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)]) | 
|  | 167 		for tag in reference.getTagNames(): | 
|  | 168 			if tag not in ("quality", "feature"): | 
|  | 169 				query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()], tag), reference.getTagValue(tag)) | 
|  | 170 		return query | 
| 6 | 171 | 
| 18 | 172 	def write(self): | 
|  | 173 		progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity) | 
|  | 174 		for transcriptQuery in self.flankings.keys(): | 
|  | 175 			if not self.flankings[transcriptQuery]: | 
|  | 176 				self.writer.addTranscript(transcriptQuery) | 
|  | 177 			elif self.directions: | 
|  | 178 				for direction in self.directions: | 
|  | 179 					#relativeDirection = direction if transcriptQuery.getDirection() == 1 else - direction | 
|  | 180 					relativeDirection = direction * transcriptQuery.getDirection() | 
|  | 181 					if relativeDirection in self.flankings[transcriptQuery]: | 
|  | 182 						transcriptRef = self.flankings[transcriptQuery][relativeDirection] | 
|  | 183 						transcriptQuery = self.setTags(transcriptQuery, transcriptRef, relativeDirection) | 
|  | 184 				self.writer.addTranscript(transcriptQuery) | 
|  | 185 			else: | 
|  | 186 				transcriptRef = sorted(self.flankings[transcriptQuery].values(), key = lambda transcriptRef: transcriptQuery.getDistance(transcriptRef))[0] | 
|  | 187 				self.writer.addTranscript(self.setTags(transcriptQuery, transcriptRef, 0)) | 
|  | 188 			progress.inc() | 
|  | 189 		progress.done() | 
| 6 | 190 | 
| 18 | 191 	def run(self): | 
|  | 192 		for chromosome in sorted(self.transcripts[QUERY].keys()): | 
|  | 193 			self.flankings = dict([query, {}] for query in self.transcripts[QUERY][chromosome]) | 
|  | 194 			for direction in STRANDS: | 
|  | 195 				#print "comparison", chromosome, direction | 
|  | 196 				self.getFlanking(chromosome, direction) | 
|  | 197 			self.write() | 
|  | 198 		self.writer.close() | 
| 6 | 199 | 
|  | 200 if __name__ == "__main__": | 
| 18 | 201 | 
|  | 202 	description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]" | 
| 6 | 203 | 
| 18 | 204 	parser = OptionParser(description = description) | 
|  | 205 	parser.add_option("-i", "--input1",      dest="inputFileName1", action="store",                          type="string", help="query input file [compulsory] [format: file in transcript format given by -f]") | 
|  | 206 	parser.add_option("-f", "--format1",     dest="format1",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]") | 
|  | 207 	parser.add_option("-j", "--input2",      dest="inputFileName2", action="store",                          type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]") | 
|  | 208 	parser.add_option("-g", "--format2",     dest="format2",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]") | 
|  | 209 	parser.add_option("-5", "--upstream",    dest="upstream",       action="store_true", default=False,                     help="output upstream elements [format: boolean] [default: False]") | 
|  | 210 	parser.add_option("-3", "--downstream",  dest="downstream",     action="store_true", default=False,                     help="output downstream elements [format: boolean] [default: False]") | 
|  | 211 	parser.add_option("-c", "--colinear",    dest="colinear",       action="store_true", default=False,                     help="find first colinear element [format: boolean] [default: False]") | 
|  | 212 	parser.add_option("-a", "--antisense",   dest="antisense",      action="store_true", default=False,                     help="find first anti-sense element [format: boolean] [default: False]") | 
|  | 213 	parser.add_option("-e", "--noOverlap",   dest="noOverlap",      action="store_true", default=False,                     help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]") | 
|  | 214 	parser.add_option("-d", "--minDistance", dest="minDistance",    action="store",      default=None,       type="int",    help="minimum distance between 2 elements [format: int]") | 
|  | 215 	parser.add_option("-D", "--maxDistance", dest="maxDistance",    action="store",      default=None,       type="int",    help="maximum distance between 2 elements [format: int]") | 
|  | 216 	parser.add_option("-t", "--tag",         dest="tagName",        action="store",      default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]") | 
|  | 217 	parser.add_option("-o", "--output",      dest="outputFileName", action="store",                          type="string", help="output file [format: output file in GFF3 format]") | 
|  | 218 	parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,          type="int",    help="trace level [format: int]") | 
|  | 219 	(options, args) = parser.parse_args() | 
| 6 | 220 | 
| 18 | 221 	gf = GetFlanking(options.verbosity) | 
|  | 222 	gf.setInputFile(options.inputFileName1, options.format1, QUERY) | 
|  | 223 	gf.setInputFile(options.inputFileName2, options.format2, REFERENCE) | 
|  | 224 	gf.setOutputFile(options.outputFileName) | 
|  | 225 	gf.addUpstreamDirection(options.upstream) | 
|  | 226 	gf.addDownstreamDirection(options.downstream) | 
|  | 227 	gf.setColinear(options.colinear) | 
|  | 228 	gf.setAntisense(options.antisense) | 
|  | 229 	gf.setNoOverlap(options.noOverlap) | 
|  | 230 	gf.setMinDistance(options.minDistance) | 
|  | 231 	gf.setMaxDistance(options.maxDistance) | 
|  | 232 	gf.setNewTagName(options.tagName) | 
|  | 233 	gf.run() |