comparison smart_toolShed/SMART/Java/Python/GetFlanking.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2011
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 from optparse import OptionParser
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.writer.TranscriptWriter import TranscriptWriter
34 from SMART.Java.Python.structure.Transcript import Transcript
35 from SMART.Java.Python.structure.Interval import Interval
36 from SMART.Java.Python.misc.Progress import Progress
37
38 QUERY = 0
39 REFERENCE = 1
40 INPUTS = (QUERY, REFERENCE)
41 STRANDS = (-1, 1)
42 TAG_DISTANCE = "distance_"
43 TAG_SENSE = "_sense"
44 TAG_REGION = "_region"
45 TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"}
46 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"}
47 TAGS_SENSE = {-1: "antisense", 0: "", 1: "colinear"}
48 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"}
49
50
51 def getOrderKey(transcript, direction):
52 if direction == 1:
53 return transcript.getEnd()
54 return - transcript.getStart()
55
56 def isInGoodRegion(transcriptRef, transcriptQuery, direction):
57 if direction == 1:
58 return transcriptQuery.getEnd() > transcriptRef.getEnd()
59 return transcriptQuery.getStart() < transcriptRef.getStart()
60
61
62 class GetFlanking(object):
63
64 def __init__(self, verbosity):
65 self.verbosity = verbosity
66 self.transcripts = dict([id, {}] for id in INPUTS)
67 self.directions = []
68 self.noOverlap = False
69 self.colinear = False
70 self.antisense = False
71 self.distance = None
72 self.minDistance = None
73 self.maxDistance = None
74 self.tagName = "flanking"
75
76 def setInputFile(self, fileName, format, id):
77 chooser = ParserChooser(self.verbosity)
78 chooser.findFormat(format)
79 parser = chooser.getParser(fileName)
80 for transcript in parser.getIterator():
81 chromosome = transcript.getChromosome()
82 if chromosome not in self.transcripts[id]:
83 self.transcripts[id][chromosome] = []
84 self.transcripts[id][chromosome].append(transcript)
85
86 def setOutputFile(self, fileName):
87 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
88
89 def addUpstreamDirection(self, upstream):
90 if upstream:
91 self.directions.append(-1)
92
93 def addDownstreamDirection(self, downstream):
94 if downstream:
95 self.directions.append(1)
96
97 def setColinear(self, colinear):
98 self.colinear = colinear
99
100 def setAntisense(self, antisense):
101 self.antisense = antisense
102
103 def setNoOverlap(self, noOverlap):
104 self.noOverlap = noOverlap
105
106 def setMinDistance(self, distance):
107 self.minDistance = distance
108
109 def setMaxDistance(self, distance):
110 self.maxDistance = distance
111
112 def setNewTagName(self, tagName):
113 self.tagName = tagName
114
115 def match(self, transcriptRef, transcriptQuery, direction):
116 if self.noOverlap and transcriptRef.overlapWith(transcriptQuery):
117 return False
118 if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection():
119 return False
120 if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection():
121 return False
122 if self.minDistance != None or self.maxDistance != None:
123 distance = transcriptRef.getDistance(transcriptQuery)
124 if self.minDistance != None and distance < self.minDistance:
125 return False
126 if self.maxDistance != None and distance > self.maxDistance:
127 return False
128 return True
129
130 def getFlanking(self, direction):
131 for chromosome in sorted(self.transcripts[REFERENCE].keys()):
132 if chromosome not in self.transcripts[QUERY]:
133 continue
134 sortedTranscripts = dict([id, {}] for id in INPUTS)
135 for id in INPUTS:
136 sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction))
137 refIndex = 0
138 currentRefs = []
139 outputs = set()
140 progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity)
141 for query in sortedTranscripts[QUERY]:
142 while refIndex < len(sortedTranscripts[REFERENCE]) and isInGoodRegion(sortedTranscripts[REFERENCE][refIndex], query, direction):
143 currentRefs.append(sortedTranscripts[REFERENCE][refIndex])
144 refIndex += 1
145 nextCurrentRefs = []
146 for currentRef in currentRefs:
147 if self.match(currentRef, query, direction):
148 if currentRef not in self.flankings:
149 self.flankings[currentRef] = {}
150 self.flankings[currentRef][direction * currentRef.getDirection()] = query
151 else:
152 nextCurrentRefs.append(currentRef)
153 currentRefs = nextCurrentRefs
154 progress.inc()
155 progress.done()
156
157 def setTags(self, query, reference, direction):
158 refName = reference.getTagValue("ID")
159 if refName == None:
160 refName = reference.getName()
161 if refName == None:
162 refName = reference.__str__()
163 query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction]), refName)
164 query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction]), query.getDistance(reference))
165 if direction == 0:
166 query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()])
167 query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)])
168 for tag in reference.getTagNames():
169 if tag not in ("quality", "feature"):
170 query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction], tag), reference.getTagValue(tag))
171 return query
172
173 def write(self):
174 outputs = set()
175 progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity)
176 for transcriptRef in self.flankings.keys():
177 if self.directions:
178 for direction in self.directions:
179 if direction in self.flankings[transcriptRef]:
180 query = self.flankings[transcriptRef][direction]
181 outputs.add(self.setTags(query, transcriptRef, direction))
182 else:
183 if self.flankings[transcriptRef]:
184 query = sorted(self.flankings[transcriptRef].values(), key = lambda query: query.getDistance(transcriptRef))[0]
185 outputs.add(self.setTags(query, transcriptRef, 0))
186 progress.inc()
187 for transcript in sorted(list(outputs), key = lambda flanking: (flanking.getChromosome(), flanking.getStart(), flanking.getEnd())):
188 self.writer.addTranscript(transcript)
189 self.writer.close()
190 progress.done()
191
192 def run(self):
193 self.flankings = {}
194 for direction in STRANDS:
195 self.getFlanking(direction)
196 self.write()
197
198 if __name__ == "__main__":
199
200 description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]"
201
202 parser = OptionParser(description = description)
203 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
204 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
205 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
206 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
207 parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]")
208 parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]")
209 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]")
210 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]")
211 parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")
212 parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]")
213 parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]")
214 parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")
215 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
216 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
217 (options, args) = parser.parse_args()
218
219 gf = GetFlanking(options.verbosity)
220 gf.setInputFile(options.inputFileName1, options.format1, QUERY)
221 gf.setInputFile(options.inputFileName2, options.format2, REFERENCE)
222 gf.setOutputFile(options.outputFileName)
223 gf.addUpstreamDirection(options.upstream)
224 gf.addDownstreamDirection(options.downstream)
225 gf.setColinear(options.colinear)
226 gf.setAntisense(options.antisense)
227 gf.setNoOverlap(options.noOverlap)
228 gf.setMinDistance(options.minDistance)
229 gf.setMaxDistance(options.maxDistance)
230 gf.setNewTagName(options.tagName)
231 gf.run()