comparison smart_toolShed/SMART/Java/Python/CompareOverlappingSmallRef.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2011
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 from optparse import OptionParser
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.writer.TranscriptWriter import TranscriptWriter
34 from SMART.Java.Python.structure.Interval import Interval
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from SMART.Java.Python.structure.Mapping import Mapping
37 from SMART.Java.Python.misc.Progress import Progress
38 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
39
40 MINBIN = 3
41 MAXBIN = 7
42 REFERENCE = 0
43 QUERY = 1
44
45 def getBin(start, end):
46 for i in range(MINBIN, MAXBIN + 1):
47 binLevel = 10 ** i
48 if int(start / binLevel) == int(end / binLevel):
49 return int(i * 10 ** (MAXBIN + 1) + int(start / binLevel))
50 return int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
51
52 def getOverlappingBins(start, end):
53 array = []
54 bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
55 for i in range(MINBIN, MAXBIN + 1):
56 binLevel = 10 ** i
57 array.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel))))
58 array.append((bigBin, bigBin))
59 return array
60
61
62 class CompareOverlappingSmallRef(object):
63
64 def __init__(self, verbosity):
65 self.verbosity = verbosity
66 self.tableNames = {}
67 self.nbQueries = 0
68 self.nbRefs = 0
69 self.nbWritten = 0
70 self.nbOverlaps = 0
71 self.invert = False
72 self.antisense = False
73 self.collinear = False
74 self.distance = None
75 self.bins = {}
76 self.notOverlapping = False
77
78 def setReferenceFile(self, fileName, format):
79 chooser = ParserChooser(self.verbosity)
80 chooser.findFormat(format)
81 self.refParser = chooser.getParser(fileName)
82
83 def setQueryFile(self, fileName, format):
84 chooser = ParserChooser(self.verbosity)
85 chooser.findFormat(format)
86 self.queryParser = chooser.getParser(fileName)
87
88 def setOutputFile(self, fileName):
89 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
90
91 def setDistance(self, distance):
92 self.distance = distance
93
94 def setCollinear(self, boolean):
95 self.collinear = boolean
96
97 def setAntisense(self, boolean):
98 self.antisense = boolean
99
100 def setInvert(self, boolean):
101 self.invert = boolean
102
103 def includeNotOverlapping(self, boolean):
104 self.notOverlapping = boolean
105
106 def loadRef(self):
107 progress = UnlimitedProgress(10000, "Reading references", self.verbosity)
108 for transcript in self.refParser.getIterator():
109 if transcript.__class__.__name__ == "Mapping":
110 transcript = transcript.getTranscript()
111 transcript = self._alterTranscript(transcript, REFERENCE)
112 chromosome = transcript.getChromosome()
113 bin = getBin(transcript.getStart(), transcript.getEnd())
114 if chromosome not in self.bins:
115 self.bins[chromosome] = {}
116 if bin not in self.bins[chromosome]:
117 self.bins[chromosome][bin] = []
118 self.bins[chromosome][bin].append(transcript)
119 self.nbRefs += 1
120 progress.inc()
121 progress.done()
122
123 def _alterTranscript(self, transcript, type):
124 if type == REFERENCE:
125 if self.distance != None:
126 transcript.extendExons(self.distance)
127 return transcript
128
129 def _compareTwoTranscripts(self, queryTranscript, refTranscript):
130 if not queryTranscript.overlapWithExon(refTranscript):
131 return False
132 if self.collinear and queryTranscript.getDirection() != refTranscript.getDirection():
133 return False
134 if self.antisense and queryTranscript.getDirection() == refTranscript.getDirection():
135 return False
136 return True
137
138 def _compareTranscript(self, queryTranscript):
139 queryChromosome = queryTranscript.getChromosome()
140 if queryChromosome not in self.bins:
141 return []
142 queryStart = queryTranscript.getStart()
143 queryEnd = queryTranscript.getEnd()
144 bins = getOverlappingBins(queryStart, queryEnd)
145 overlaps = {}
146 for binRange in bins:
147 for bin in range(binRange[0], binRange[1]+1):
148 if bin not in self.bins[queryChromosome]:
149 continue
150 for refTranscript in self.bins[queryChromosome][bin]:
151 if self._compareTwoTranscripts(queryTranscript, refTranscript):
152 nbElements = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1
153 overlaps[refTranscript.getName()] = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1
154 self.nbOverlaps += nbElements
155 return overlaps
156
157 def _updateTranscript(self, queryTranscript, overlaps):
158 queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values()))
159 if overlaps:
160 queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100])
161 return queryTranscript
162
163 def compare(self):
164 progress = UnlimitedProgress(10000, "Comparing queries", self.verbosity)
165 for queryTranscript in self.queryParser.getIterator():
166 if queryTranscript.__class__.__name__ == "Mapping":
167 queryTranscript = queryTranscript.getTranscript()
168 progress.inc()
169 self.nbQueries += 1
170 overlaps = self._compareTranscript(queryTranscript)
171 if self.notOverlapping or (overlaps and not self.invert) or (not overlaps and self.invert):
172 if not self.invert:
173 queryTranscript = self._updateTranscript(queryTranscript, overlaps)
174 self.writer.addTranscript(queryTranscript)
175 self.nbWritten += 1
176 progress.done()
177 self.writer.close()
178
179 def displayResults(self):
180 print "# queries: %d" % (self.nbQueries)
181 print "# refs: %d" % (self.nbRefs)
182 print "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps)
183
184 def run(self):
185 self.loadRef()
186 self.compare()
187 self.displayResults()
188
189 if __name__ == "__main__":
190
191 description = "Compare Overlapping Small Reference v1.0.1: Provide the queries that overlap with a reference, when the reference is small. [Category: Data Comparison]"
192
193 parser = OptionParser(description = description)
194 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
195 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
196 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
197 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
198 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
199 parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
200 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]")
201 parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]")
202 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]")
203 parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
204 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
205 (options, args) = parser.parse_args()
206
207 cosr = CompareOverlappingSmallRef(options.verbosity)
208 cosr.setQueryFile(options.inputFileName1, options.format1)
209 cosr.setReferenceFile(options.inputFileName2, options.format2)
210 cosr.setOutputFile(options.outputFileName)
211 cosr.includeNotOverlapping(options.notOverlapping)
212 cosr.setDistance(options.distance)
213 cosr.setAntisense(options.antisense)
214 cosr.setInvert(options.exclude)
215 cosr.setInvert(options.exclude)
216 cosr.run()
217