| 
6
 | 
     1 #! /usr/bin/env python
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # Copyright INRA-URGI 2009-2011
 | 
| 
 | 
     4 # 
 | 
| 
 | 
     5 # This software is governed by the CeCILL license under French law and
 | 
| 
 | 
     6 # abiding by the rules of distribution of free software. You can use,
 | 
| 
 | 
     7 # modify and/ or redistribute the software under the terms of the CeCILL
 | 
| 
 | 
     8 # license as circulated by CEA, CNRS and INRIA at the following URL
 | 
| 
 | 
     9 # "http://www.cecill.info".
 | 
| 
 | 
    10 # 
 | 
| 
 | 
    11 # As a counterpart to the access to the source code and rights to copy,
 | 
| 
 | 
    12 # modify and redistribute granted by the license, users are provided only
 | 
| 
 | 
    13 # with a limited warranty and the software's author, the holder of the
 | 
| 
 | 
    14 # economic rights, and the successive licensors have only limited
 | 
| 
 | 
    15 # liability.
 | 
| 
 | 
    16 # 
 | 
| 
 | 
    17 # In this respect, the user's attention is drawn to the risks associated
 | 
| 
 | 
    18 # with loading, using, modifying and/or developing or reproducing the
 | 
| 
 | 
    19 # software by the user in light of its specific status of free software,
 | 
| 
 | 
    20 # that may mean that it is complicated to manipulate, and that also
 | 
| 
 | 
    21 # therefore means that it is reserved for developers and experienced
 | 
| 
 | 
    22 # professionals having in-depth computer knowledge. Users are therefore
 | 
| 
 | 
    23 # encouraged to load and test the software's suitability as regards their
 | 
| 
 | 
    24 # requirements in conditions enabling the security of their systems and/or
 | 
| 
 | 
    25 # data to be ensured and, more generally, to use and operate it in the
 | 
| 
 | 
    26 # same conditions as regards security.
 | 
| 
 | 
    27 # 
 | 
| 
 | 
    28 # The fact that you are presently reading this means that you have had
 | 
| 
 | 
    29 # knowledge of the CeCILL license and that you accept its terms.
 | 
| 
 | 
    30 #
 | 
| 
 | 
    31 from optparse import OptionParser
 | 
| 
 | 
    32 from commons.core.parsing.ParserChooser import ParserChooser
 | 
| 
 | 
    33 from commons.core.writer.TranscriptWriter import TranscriptWriter
 | 
| 
 | 
    34 from SMART.Java.Python.structure.Transcript import Transcript
 | 
| 
 | 
    35 from SMART.Java.Python.structure.Interval import Interval
 | 
| 
 | 
    36 from SMART.Java.Python.misc.Progress import Progress
 | 
| 
 | 
    37 
 | 
| 
 | 
    38 QUERY        = 0
 | 
| 
 | 
    39 REFERENCE    = 1
 | 
| 
 | 
    40 INPUTS       = (QUERY, REFERENCE)
 | 
| 
 | 
    41 STRANDS      = (-1, 1)
 | 
| 
 | 
    42 TAG_DISTANCE = "distance_"
 | 
| 
 | 
    43 TAG_SENSE    = "_sense"
 | 
| 
 | 
    44 TAG_REGION   = "_region"
 | 
| 
 | 
    45 TAGS_REGION  = {-1: "_upstream", 0: "", 1: "_downstream"}
 | 
| 
 | 
    46 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"}
 | 
| 
18
 | 
    47 TAGS_SENSE   = {-1: "antisense", 0: "", 1: "collinear"}
 | 
| 
6
 | 
    48 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"}
 | 
| 
 | 
    49 
 | 
| 
 | 
    50 
 | 
| 
18
 | 
    51 def getOrderKey(transcript, direction, input):
 | 
| 
 | 
    52 	if direction == 1:
 | 
| 
 | 
    53 		if input == QUERY:
 | 
| 
 | 
    54 			return (transcript.getEnd(), -transcript.getStart())
 | 
| 
 | 
    55 		return (transcript.getStart(), -transcript.getEnd())
 | 
| 
 | 
    56 	if input == QUERY:
 | 
| 
 | 
    57 		return (-transcript.getStart(), transcript.getEnd())
 | 
| 
 | 
    58 	return (-transcript.getEnd(), transcript.getStart())
 | 
| 
6
 | 
    59 
 | 
| 
 | 
    60 
 | 
| 
 | 
    61 class GetFlanking(object):
 | 
| 
 | 
    62 
 | 
| 
18
 | 
    63 	def __init__(self, verbosity):
 | 
| 
 | 
    64 		self.verbosity   = verbosity
 | 
| 
 | 
    65 		self.transcripts = dict([id, {}] for id in INPUTS)
 | 
| 
 | 
    66 		self.directions  = []
 | 
| 
 | 
    67 		self.noOverlap   = False
 | 
| 
 | 
    68 		self.colinear    = False
 | 
| 
 | 
    69 		self.antisense   = False
 | 
| 
 | 
    70 		self.distance    = None
 | 
| 
 | 
    71 		self.minDistance = None
 | 
| 
 | 
    72 		self.maxDistance = None
 | 
| 
 | 
    73 		self.tagName     = "flanking"
 | 
| 
6
 | 
    74 
 | 
| 
18
 | 
    75 	def setInputFile(self, fileName, format, id):
 | 
| 
 | 
    76 		chooser = ParserChooser(self.verbosity)
 | 
| 
 | 
    77 		chooser.findFormat(format)
 | 
| 
 | 
    78 		parser = chooser.getParser(fileName)
 | 
| 
 | 
    79 		for transcript in parser.getIterator():
 | 
| 
 | 
    80 			chromosome = transcript.getChromosome()
 | 
| 
 | 
    81 			if chromosome not in self.transcripts[id]:
 | 
| 
 | 
    82 				self.transcripts[id][chromosome] = []
 | 
| 
 | 
    83 			self.transcripts[id][chromosome].append(transcript)
 | 
| 
6
 | 
    84 
 | 
| 
18
 | 
    85 	def setOutputFile(self, fileName):
 | 
| 
 | 
    86 		self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
 | 
| 
6
 | 
    87 
 | 
| 
18
 | 
    88 	def addUpstreamDirection(self, upstream):
 | 
| 
 | 
    89 		if upstream:
 | 
| 
 | 
    90 			self.directions.append(-1)
 | 
| 
6
 | 
    91 
 | 
| 
18
 | 
    92 	def addDownstreamDirection(self, downstream):
 | 
| 
 | 
    93 		if downstream:
 | 
| 
 | 
    94 			self.directions.append(1)
 | 
| 
6
 | 
    95 
 | 
| 
18
 | 
    96 	def setColinear(self, colinear):
 | 
| 
 | 
    97 		self.colinear = colinear
 | 
| 
6
 | 
    98 
 | 
| 
18
 | 
    99 	def setAntisense(self, antisense):
 | 
| 
 | 
   100 		self.antisense = antisense
 | 
| 
 | 
   101 
 | 
| 
 | 
   102 	def setNoOverlap(self, noOverlap):
 | 
| 
 | 
   103 		self.noOverlap = noOverlap
 | 
| 
6
 | 
   104 
 | 
| 
18
 | 
   105 	def setMinDistance(self, distance):
 | 
| 
 | 
   106 		self.minDistance = distance
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 	def setMaxDistance(self, distance):
 | 
| 
 | 
   109 		self.maxDistance = distance
 | 
| 
 | 
   110 
 | 
| 
 | 
   111 	def setNewTagName(self, tagName):
 | 
| 
 | 
   112 		self.tagName = tagName
 | 
| 
6
 | 
   113 
 | 
| 
18
 | 
   114 	def match(self, transcriptQuery, transcriptRef, direction):
 | 
| 
 | 
   115 		#print "comparing", transcriptQuery, "with", transcriptRef, "on direction", direction
 | 
| 
 | 
   116 		if direction == 1 and transcriptRef.getEnd() < transcriptQuery.getStart():
 | 
| 
 | 
   117 			return False
 | 
| 
 | 
   118 		if direction == -1 and transcriptQuery.getEnd() < transcriptRef.getStart():
 | 
| 
 | 
   119 			return False
 | 
| 
 | 
   120 		if self.noOverlap and transcriptRef.overlapWith(transcriptQuery):
 | 
| 
 | 
   121 			return False
 | 
| 
 | 
   122 		if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection():
 | 
| 
 | 
   123 			return False
 | 
| 
 | 
   124 		if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection():
 | 
| 
 | 
   125 			return False
 | 
| 
 | 
   126 		if self.minDistance != None or self.maxDistance != None:
 | 
| 
 | 
   127 			distance = transcriptRef.getDistance(transcriptQuery)
 | 
| 
 | 
   128 			if self.minDistance != None and distance < self.minDistance:
 | 
| 
 | 
   129 				return False
 | 
| 
 | 
   130 			if self.maxDistance != None and distance > self.maxDistance:
 | 
| 
 | 
   131 				return False
 | 
| 
 | 
   132 		return True
 | 
| 
 | 
   133 
 | 
| 
 | 
   134 	def getFlanking(self, chromosome, direction):
 | 
| 
 | 
   135 		if chromosome not in self.transcripts[REFERENCE]:
 | 
| 
 | 
   136 			return
 | 
| 
 | 
   137 		sortedTranscripts = dict([id, {}] for id in INPUTS)
 | 
| 
 | 
   138 		for id in INPUTS:
 | 
| 
 | 
   139 			sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction, id))
 | 
| 
 | 
   140 		refIndex = 0
 | 
| 
 | 
   141 		progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity)
 | 
| 
 | 
   142 		for query in sortedTranscripts[QUERY]:
 | 
| 
 | 
   143 			#print "Q: ", query
 | 
| 
 | 
   144 			#print "R1: ", sortedTranscripts[REFERENCE][refIndex]
 | 
| 
 | 
   145 			while not self.match(query, sortedTranscripts[REFERENCE][refIndex], direction):
 | 
| 
 | 
   146 				refIndex += 1
 | 
| 
 | 
   147 				if refIndex == len(sortedTranscripts[REFERENCE]):
 | 
| 
 | 
   148 					progress.done()
 | 
| 
 | 
   149 					#print "done"
 | 
| 
 | 
   150 					return
 | 
| 
 | 
   151 				#print "R2: ", sortedTranscripts[REFERENCE][refIndex]
 | 
| 
 | 
   152 			self.flankings[query][direction] = sortedTranscripts[REFERENCE][refIndex]
 | 
| 
 | 
   153 			progress.inc()
 | 
| 
 | 
   154 		progress.done()
 | 
| 
6
 | 
   155 
 | 
| 
18
 | 
   156 	def setTags(self, query, reference, direction):
 | 
| 
 | 
   157 		refName = reference.getTagValue("ID")
 | 
| 
 | 
   158 		if refName == None:
 | 
| 
 | 
   159 			refName = reference.getName()
 | 
| 
 | 
   160 		if refName == None:
 | 
| 
 | 
   161 			refName = reference.__str__()
 | 
| 
 | 
   162 		query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()]), refName)
 | 
| 
 | 
   163 		query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction*query.getDirection()]), query.getDistance(reference))
 | 
| 
 | 
   164 		query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()])
 | 
| 
 | 
   165 		if direction == 0:
 | 
| 
 | 
   166 			query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)])
 | 
| 
 | 
   167 		for tag in reference.getTagNames():
 | 
| 
 | 
   168 			if tag not in ("quality", "feature"):
 | 
| 
 | 
   169 				query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()], tag), reference.getTagValue(tag))
 | 
| 
 | 
   170 		return query
 | 
| 
6
 | 
   171 
 | 
| 
18
 | 
   172 	def write(self):
 | 
| 
 | 
   173 		progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity)
 | 
| 
 | 
   174 		for transcriptQuery in self.flankings.keys():
 | 
| 
 | 
   175 			if not self.flankings[transcriptQuery]:
 | 
| 
 | 
   176 				self.writer.addTranscript(transcriptQuery)
 | 
| 
 | 
   177 			elif self.directions:
 | 
| 
 | 
   178 				for direction in self.directions:
 | 
| 
 | 
   179 					#relativeDirection = direction if transcriptQuery.getDirection() == 1 else - direction
 | 
| 
 | 
   180 					relativeDirection = direction * transcriptQuery.getDirection()
 | 
| 
 | 
   181 					if relativeDirection in self.flankings[transcriptQuery]:
 | 
| 
 | 
   182 						transcriptRef = self.flankings[transcriptQuery][relativeDirection]
 | 
| 
 | 
   183 						transcriptQuery = self.setTags(transcriptQuery, transcriptRef, relativeDirection)
 | 
| 
 | 
   184 				self.writer.addTranscript(transcriptQuery)
 | 
| 
 | 
   185 			else:
 | 
| 
 | 
   186 				transcriptRef = sorted(self.flankings[transcriptQuery].values(), key = lambda transcriptRef: transcriptQuery.getDistance(transcriptRef))[0]
 | 
| 
 | 
   187 				self.writer.addTranscript(self.setTags(transcriptQuery, transcriptRef, 0))
 | 
| 
 | 
   188 			progress.inc()
 | 
| 
 | 
   189 		progress.done()
 | 
| 
6
 | 
   190 
 | 
| 
18
 | 
   191 	def run(self):
 | 
| 
 | 
   192 		for chromosome in sorted(self.transcripts[QUERY].keys()):
 | 
| 
 | 
   193 			self.flankings = dict([query, {}] for query in self.transcripts[QUERY][chromosome])
 | 
| 
 | 
   194 			for direction in STRANDS:
 | 
| 
 | 
   195 				#print "comparison", chromosome, direction
 | 
| 
 | 
   196 				self.getFlanking(chromosome, direction)
 | 
| 
 | 
   197 			self.write()
 | 
| 
 | 
   198 		self.writer.close()
 | 
| 
6
 | 
   199 
 | 
| 
 | 
   200 if __name__ == "__main__":
 | 
| 
18
 | 
   201 	
 | 
| 
 | 
   202 	description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]"
 | 
| 
6
 | 
   203 
 | 
| 
18
 | 
   204 	parser = OptionParser(description = description)
 | 
| 
 | 
   205 	parser.add_option("-i", "--input1",      dest="inputFileName1", action="store",                          type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
 | 
| 
 | 
   206 	parser.add_option("-f", "--format1",     dest="format1",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]")
 | 
| 
 | 
   207 	parser.add_option("-j", "--input2",      dest="inputFileName2", action="store",                          type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
 | 
| 
 | 
   208 	parser.add_option("-g", "--format2",     dest="format2",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]")
 | 
| 
 | 
   209 	parser.add_option("-5", "--upstream",    dest="upstream",       action="store_true", default=False,                     help="output upstream elements [format: boolean] [default: False]")
 | 
| 
 | 
   210 	parser.add_option("-3", "--downstream",  dest="downstream",     action="store_true", default=False,                     help="output downstream elements [format: boolean] [default: False]")
 | 
| 
 | 
   211 	parser.add_option("-c", "--colinear",    dest="colinear",       action="store_true", default=False,                     help="find first colinear element [format: boolean] [default: False]")
 | 
| 
 | 
   212 	parser.add_option("-a", "--antisense",   dest="antisense",      action="store_true", default=False,                     help="find first anti-sense element [format: boolean] [default: False]")
 | 
| 
 | 
   213 	parser.add_option("-e", "--noOverlap",   dest="noOverlap",      action="store_true", default=False,                     help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")
 | 
| 
 | 
   214 	parser.add_option("-d", "--minDistance", dest="minDistance",    action="store",      default=None,       type="int",    help="minimum distance between 2 elements [format: int]")
 | 
| 
 | 
   215 	parser.add_option("-D", "--maxDistance", dest="maxDistance",    action="store",      default=None,       type="int",    help="maximum distance between 2 elements [format: int]")
 | 
| 
 | 
   216 	parser.add_option("-t", "--tag",         dest="tagName",        action="store",      default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")
 | 
| 
 | 
   217 	parser.add_option("-o", "--output",      dest="outputFileName", action="store",                          type="string", help="output file [format: output file in GFF3 format]")
 | 
| 
 | 
   218 	parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,          type="int",    help="trace level [format: int]")
 | 
| 
 | 
   219 	(options, args) = parser.parse_args()
 | 
| 
6
 | 
   220 
 | 
| 
18
 | 
   221 	gf = GetFlanking(options.verbosity)
 | 
| 
 | 
   222 	gf.setInputFile(options.inputFileName1, options.format1, QUERY)
 | 
| 
 | 
   223 	gf.setInputFile(options.inputFileName2, options.format2, REFERENCE)
 | 
| 
 | 
   224 	gf.setOutputFile(options.outputFileName)
 | 
| 
 | 
   225 	gf.addUpstreamDirection(options.upstream)
 | 
| 
 | 
   226 	gf.addDownstreamDirection(options.downstream)
 | 
| 
 | 
   227 	gf.setColinear(options.colinear)
 | 
| 
 | 
   228 	gf.setAntisense(options.antisense)
 | 
| 
 | 
   229 	gf.setNoOverlap(options.noOverlap)
 | 
| 
 | 
   230 	gf.setMinDistance(options.minDistance)
 | 
| 
 | 
   231 	gf.setMaxDistance(options.maxDistance)
 | 
| 
 | 
   232 	gf.setNewTagName(options.tagName)
 | 
| 
 | 
   233 	gf.run()
 |