comparison SMART/Java/Python/CompareOverlappingSmallRef.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 44d5973c188c
children 169d364ddd91
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2011
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 from optparse import OptionParser
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.writer.TranscriptWriter import TranscriptWriter
34 from SMART.Java.Python.structure.Interval import Interval
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from SMART.Java.Python.structure.Mapping import Mapping
37 from SMART.Java.Python.misc.Progress import Progress
38 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
39
40 MINBIN = 3
41 MAXBIN = 7
42 REFERENCE = 0
43 QUERY = 1
44
45 def getBin(start, end):
46 for i in range(MINBIN, MAXBIN + 1):
47 binLevel = 10 ** i
48 if int(start / binLevel) == int(end / binLevel):
49 return int(i * 10 ** (MAXBIN + 1) + int(start / binLevel))
50 return int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
51
52 def getOverlappingBins(start, end):
53 array = []
54 bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
55 for i in range(MINBIN, MAXBIN + 1):
56 binLevel = 10 ** i
57 array.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel))))
58 array.append((bigBin, bigBin))
59 return array
60
61
62 class CompareOverlappingSmallRef(object):
63
64 def __init__(self, verbosity):
65 self.verbosity = verbosity
66 self.tableNames = {}
67 self.nbQueries = 0
68 self.nbRefs = 0
69 self.nbWritten = 0
70 self.nbOverlaps = 0
71 self.invert = False
72 self.antisense = False
73 self.collinear = False
74 self.distance = None
75 self.minOverlap = False
76 self.pcOverlapQuery = False
77 self.pcOverlapRef = False
78 self.included = False
79 self.including = False
80 self.bins = {}
81 self.notOverlapping = False
82
83 def setReferenceFile(self, fileName, format):
84 chooser = ParserChooser(self.verbosity)
85 chooser.findFormat(format)
86 self.refParser = chooser.getParser(fileName)
87
88 def setQueryFile(self, fileName, format):
89 chooser = ParserChooser(self.verbosity)
90 chooser.findFormat(format)
91 self.queryParser = chooser.getParser(fileName)
92
93 def setOutputFile(self, fileName):
94 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
95
96 def setDistance(self, distance):
97 self.distance = distance
98
99 def setCollinear(self, boolean):
100 self.collinear = boolean
101
102 def setAntisense(self, boolean):
103 self.antisense = boolean
104
105 def setInvert(self, boolean):
106 self.invert = boolean
107
108 def setMinPercentOverlap(self, pcOverlapQuery, pcOverlapRef):
109 self.pcOverlapQuery = pcOverlapQuery
110 self.pcOverlapRef = pcOverlapRef
111
112 def setInclude(self, included, including):
113 self.included = included
114 self.including = including
115
116 def includeNotOverlapping(self, boolean):
117 self.notOverlapping = boolean
118
119 def loadRef(self):
120 progress = UnlimitedProgress(10000, "Reading references", self.verbosity)
121 for transcript in self.refParser.getIterator():
122 if transcript.__class__.__name__ == "Mapping":
123 transcript = transcript.getTranscript()
124 transcript = self._alterTranscript(transcript, REFERENCE)
125 chromosome = transcript.getChromosome()
126 bin = getBin(transcript.getStart(), transcript.getEnd())
127 if chromosome not in self.bins:
128 self.bins[chromosome] = {}
129 if bin not in self.bins[chromosome]:
130 self.bins[chromosome][bin] = []
131 self.bins[chromosome][bin].append(transcript)
132 self.nbRefs += 1
133 progress.inc()
134 progress.done()
135
136 def _alterTranscript(self, transcript, type):
137 if type == REFERENCE:
138 if self.distance != None:
139 transcript.extendExons(self.distance)
140 return transcript
141
142 def _compareTwoTranscripts(self, queryTranscript, refTranscript):
143 if not queryTranscript.overlapWithExon(refTranscript):
144 return False
145 if self.collinear and queryTranscript.getDirection() != refTranscript.getDirection():
146 return False
147 if self.antisense and queryTranscript.getDirection() == refTranscript.getDirection():
148 return False
149 if self.included and not queryTranscript.isIncluded(refTranscript):
150 return False
151 if self.including and not refTranscript.isIncluded(queryTranscript):
152 return False
153 querySize = queryTranscript.getSize()
154 if self.pcOverlapQuery and not queryTranscript.overlapWithExon(refTranscript, int(querySize * self.pcOverlapQuery / 100.0)):
155 return False
156 refSize = refTranscript.getSize()
157 if self.pcOverlapRef and not queryTranscript.overlapWithExon(refTranscript, int(refSize * self.pcOverlapRef / 100.0)):
158 return False
159 if self.minOverlap and not queryTranscript.overlapWithExon(refTranscript, self.minOverlap):
160 return False
161 return True
162
163 def _compareTranscript(self, queryTranscript):
164 queryChromosome = queryTranscript.getChromosome()
165 if queryChromosome not in self.bins:
166 return []
167 queryStart = queryTranscript.getStart()
168 queryEnd = queryTranscript.getEnd()
169 bins = getOverlappingBins(queryStart, queryEnd)
170 overlaps = {}
171 for binRange in bins:
172 for bin in range(binRange[0], binRange[1]+1):
173 if bin not in self.bins[queryChromosome]:
174 continue
175 for refTranscript in self.bins[queryChromosome][bin]:
176 if self._compareTwoTranscripts(queryTranscript, refTranscript):
177 nbElements = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1
178 overlaps[refTranscript.getName()] = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1
179 self.nbOverlaps += nbElements
180 return overlaps
181
182 def _updateTranscript(self, queryTranscript, overlaps):
183 queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values()))
184 if overlaps:
185 queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100])
186 return queryTranscript
187
188 def compare(self):
189 progress = UnlimitedProgress(10000, "Comparing queries", self.verbosity)
190 for queryTranscript in self.queryParser.getIterator():
191 if queryTranscript.__class__.__name__ == "Mapping":
192 queryTranscript = queryTranscript.getTranscript()
193 progress.inc()
194 self.nbQueries += 1
195 overlaps = self._compareTranscript(queryTranscript)
196 if self.notOverlapping or (overlaps and not self.invert) or (not overlaps and self.invert):
197 if not self.invert:
198 queryTranscript = self._updateTranscript(queryTranscript, overlaps)
199 self.writer.addTranscript(queryTranscript)
200 self.nbWritten += 1
201 progress.done()
202 self.writer.close()
203
204 def displayResults(self):
205 if self.verbosity > 0:
206 print "# queries: %d" % (self.nbQueries)
207 print "# refs: %d" % (self.nbRefs)
208 print "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps)
209
210 def run(self):
211 self.loadRef()
212 self.compare()
213 self.displayResults()
214
215 if __name__ == "__main__":
216
217 description = "Compare Overlapping Small Reference v1.0.1: Provide the queries that overlap with a reference, when the reference is small. [Category: Data Comparison]"
218
219 parser = OptionParser(description = description)
220 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
221 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
222 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
223 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
224 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
225 parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
226 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]")
227 parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]")
228 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]")
229 parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=False, type="int", help="min. #nt overlap [format: bool] [default: false]")
230 parser.add_option("-p", "--pcOverlapQuery", dest="pcOverlapQuery", action="store", default=False, type="int", help="min. % overlap of the query [format: bool] [default: false]")
231 parser.add_option("-P", "--pcOverlapRef", dest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]")
232 parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="provide query elements which are nested in reference elements [format: bool] [default: false]")
233 parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="provide query elements in which reference elements are nested [format: bool] [default: false]")
234 parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
235 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
236 (options, args) = parser.parse_args()
237
238 cosr = CompareOverlappingSmallRef(options.verbosity)
239 cosr.setQueryFile(options.inputFileName1, options.format1)
240 cosr.setReferenceFile(options.inputFileName2, options.format2)
241 cosr.setOutputFile(options.outputFileName)
242 cosr.includeNotOverlapping(options.notOverlapping)
243 cosr.setDistance(options.distance)
244 cosr.setAntisense(options.antisense)
245 cosr.setInclude(options.included, options.including)
246 cosr.setInvert(options.exclude)
247 cosr.setMinOverlap(options.minOverlap)
248 cosr.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef)
249 cosr.run()
250