comparison smart_toolShed/SMART/Java/Python/CompareOverlapping.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os, struct, time, random
32 from optparse import OptionParser
33 from commons.core.parsing.ParserChooser import ParserChooser
34 from commons.core.writer.Gff3Writer import Gff3Writer
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from SMART.Java.Python.structure.Interval import Interval
37 from SMART.Java.Python.ncList.NCList import NCList
38 from SMART.Java.Python.ncList.NCListCursor import NCListCursor
39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
40 from SMART.Java.Python.ncList.NCListHandler import NCListHandler
41 from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList
42 from SMART.Java.Python.misc.Progress import Progress
43 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
44 from SMART.Java.Python.misc import Utils
45 try:
46 import cPickle as pickle
47 except:
48 import pickle
49
50 REFERENCE = 0
51 QUERY = 1
52 TYPES = (REFERENCE, QUERY)
53 TYPETOSTRING = {0: "reference", 1: "query"}
54
55 class CompareOverlapping(object):
56
57 def __init__(self, verbosity = 1):
58 self._outputFileName = "outputOverlaps.gff3"
59 self._iWriter = None
60 self._nbOverlappingQueries = 0
61 self._nbOverlaps = 0
62 self._nbLines = {REFERENCE: 0, QUERY: 0}
63 self._verbosity = verbosity
64 self._ncLists = {}
65 self._cursors = {}
66 self._splittedFileNames = {}
67 self._nbElements = {}
68 self._nbElementsPerChromosome = {}
69 self._inputFileNames = {REFERENCE: None, QUERY: None}
70 self._inputFileFormats = {REFERENCE: None, QUERY: None}
71 self._starts = {REFERENCE: None, QUERY: None}
72 self._ends = {REFERENCE: None, QUERY: None}
73 self._fivePrimes = {REFERENCE: None, QUERY: None}
74 self._threePrimes = {REFERENCE: None, QUERY: None}
75 self._ncListHandlers = {REFERENCE: None, QUERY: None}
76 self._convertedFileNames = {REFERENCE: False, QUERY: False}
77 self._sorted = False
78 self._index = False
79 self._introns = False
80 self._antisense = False
81 self._colinear = False
82 self._invert = False
83 self._distance = 0
84 self._minOverlap = 1
85 self._pcOverlap = None
86 self._included = False
87 self._including = False
88 self._outputNotOverlapping = False
89 self._tmpRefFileName = None
90 self._currentQueryTranscript = None
91 self._currentOrQueryTranscript = None
92 self._currentExQueryTranscript = None
93 self._randInt = random.randint(0, 100000)
94
95 def __del__(self):
96 for fileName in [self._tmpRefFileName] + self._convertedFileNames.values():
97 if fileName != None and os.path.exists(fileName):
98 os.remove(fileName)
99
100 def close(self):
101 self._iWriter.close()
102
103 def setInput(self, fileName, format, type):
104 chooser = ParserChooser(self._verbosity)
105 chooser.findFormat(format)
106 self._inputFileNames[type] = fileName
107 self._inputFileFormats[type] = format
108
109 def setOutput(self, outputFileName):
110 if outputFileName != '':
111 self._outputFileName = outputFileName
112 self._iWriter = Gff3Writer(self._outputFileName)
113
114 def setSorted(self, sorted):
115 self._sorted = sorted
116
117 def setIndex(self, index):
118 self._index = index
119
120 def restrictToStart(self, distance, type):
121 self._starts[type] = distance
122
123 def restrictToEnd(self, distance, type):
124 self._ends[type] = distance
125
126 def extendFivePrime(self, distance, type):
127 self._fivePrimes[type] = distance
128
129 def extendThreePrime(self, distance, type):
130 self._threePrimes[type] = distance
131
132 def acceptIntrons(self, boolean):
133 self._introns = boolean
134
135 def getAntisenseOnly(self, boolean):
136 self._antisense = boolean
137
138 def getColinearOnly(self, boolean):
139 self._colinear = boolean
140
141 def getInvert(self, boolean):
142 self._invert = boolean
143
144 def setMaxDistance(self, distance):
145 self._distance = distance
146
147 def setMinOverlap(self, overlap):
148 self._minOverlap = overlap
149
150 def setPcOverlap(self, overlap):
151 self._pcOverlap = overlap
152
153 def setIncludedOnly(self, boolean):
154 self._included = boolean
155
156 def setIncludingOnly(self, boolean):
157 self._including = boolean
158
159 def includeNotOverlapping(self, boolean):
160 self._outputNotOverlapping = boolean
161
162 def transformTranscript(self, transcript, type):
163 if self._starts[type] != None:
164 transcript.restrictStart(self._starts[type])
165 if self._ends[type] != None:
166 transcript.restrictEnd(self._ends[type])
167 if self._fivePrimes[type] != None:
168 transcript.extendStart(self._fivePrimes[type])
169 if self._threePrimes[type] != None:
170 transcript.extendEnd(self._threePrimes[type])
171 if self._introns:
172 transcript.exons = []
173 if type == REFERENCE and self._distance > 0:
174 transcript.extendExons(self._distance)
175 return transcript
176
177 def extendQueryTranscript(self, transcript):
178 self._currentExQueryTranscript = Transcript()
179 self._currentExQueryTranscript.copy(transcript)
180 if self._fivePrimes[QUERY] != None:
181 self._currentExQueryTranscript.extendStart(self._fivePrimes[QUERY])
182 if self._threePrimes[QUERY] != None:
183 self._currentExQueryTranscript.extendEnd(self._threePrimes[QUERY])
184 transcript.exons = []
185
186 def createTmpRefFile(self):
187 self._tmpRefFileName = "tmp_ref_%d.pkl" % (self._randInt)
188 if "SMARTTMPPATH" in os.environ:
189 self._tmpRefFileName = os.path.join(os.environ["SMARTTMPPATH"], self._tmpRefFileName)
190 chooser = ParserChooser(self._verbosity)
191 chooser.findFormat(self._inputFileFormats[REFERENCE])
192 parser = chooser.getParser(self._inputFileNames[REFERENCE])
193 writer = NCListFilePickle(self._tmpRefFileName, self._verbosity)
194 for transcript in parser.getIterator():
195 transcript = self.transformTranscript(transcript, REFERENCE)
196 writer.addTranscript(transcript)
197 writer.close()
198 self._inputFileNames[REFERENCE] = self._tmpRefFileName
199 self._inputFileFormats[REFERENCE] = "pkl"
200
201 def createNCLists(self):
202 self._ncLists = dict([type, {}] for type in TYPES)
203 self._indices = dict([type, {}] for type in TYPES)
204 self._cursors = dict([type, {}] for type in TYPES)
205 for type in TYPES:
206 if self._verbosity > 2:
207 print "Creating %s NC-list..." % (TYPETOSTRING[type])
208 self._convertedFileNames[type] = "%s_%d_%d.ncl" % (self._inputFileNames[type], self._randInt, type)
209 ncLists = ConvertToNCList(self._verbosity)
210 ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type])
211 ncLists.setOutputFileName(self._convertedFileNames[type])
212 ncLists.setSorted(self._sorted)
213 if type == REFERENCE and self._index:
214 ncLists.setIndex(True)
215 ncLists.run()
216 self._ncListHandlers[type] = NCListHandler(self._verbosity)
217 self._ncListHandlers[type].setFileName(self._convertedFileNames[type])
218 self._ncListHandlers[type].loadData()
219 self._nbLines[type] = self._ncListHandlers[type].getNbElements()
220 self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome()
221 self._ncLists[type] = self._ncListHandlers[type].getNCLists()
222 for chromosome, ncList in self._ncLists[type].iteritems():
223 self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity)
224 if type == REFERENCE and self._index:
225 self._indices[REFERENCE][chromosome] = ncList.getIndex()
226 if self._verbosity > 2:
227 print " ...done"
228
229 def compare(self):
230 nbSkips, nbMoves = 0, 0
231 previousChromosome = None
232 done = False
233 refNCList = None
234 queryNCList = None
235 startTime = time.time()
236 progress = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity)
237 for chromosome, queryNCList in self._ncLists[QUERY].iteritems():
238 queryParser = self._ncListHandlers[QUERY].getParser(chromosome)
239 queryNCList = self._ncLists[QUERY][chromosome]
240 queryCursor = self._cursors[QUERY][chromosome]
241 if chromosome != previousChromosome:
242 skipChromosome = False
243 previousChromosome = chromosome
244 if chromosome not in self._ncLists[REFERENCE]:
245 if self._outputNotOverlapping:
246 while not queryCursor.isOut():
247 self._currentQueryTranscript = queryCursor.getTranscript()
248 self._writeIntervalInNewGFF3({})
249 if queryCursor.hasChildren():
250 queryCursor.moveDown()
251 else:
252 queryCursor.moveNext()
253 progress.inc()
254 continue
255 refNCList = self._ncLists[REFERENCE][chromosome]
256 refCursor = self._cursors[REFERENCE][chromosome]
257 while True:
258 self._currentOrQueryTranscript = queryCursor.getTranscript()
259 self._currentQueryTranscript = Transcript()
260 self._currentQueryTranscript.copy(self._currentOrQueryTranscript)
261 self._currentQueryTranscript = self.transformTranscript(self._currentQueryTranscript, QUERY)
262 self.extendQueryTranscript(self._currentOrQueryTranscript)
263 newRefLaddr = self.checkIndex(refCursor)
264 if newRefLaddr != None:
265 nbMoves += 1
266 refCursor.setLIndex(newRefLaddr)
267 done = False
268 refCursor, done, unmatched = self.findOverlapIter(refCursor, done)
269 if refCursor.isOut():
270 if not self._invert and not self._outputNotOverlapping:
271 break
272 if (unmatched and not self._invert and not self._outputNotOverlapping) or not queryCursor.hasChildren():
273 queryCursor.moveNext()
274 nbSkips += 1
275 else:
276 queryCursor.moveDown()
277 if queryCursor.isOut():
278 break
279 progress.inc()
280 progress.done()
281 endTime = time.time()
282 self._timeSpent = endTime - startTime
283 if self._verbosity >= 10:
284 print "# skips: %d" % (nbSkips)
285 print "# moves: %d" % (nbMoves)
286
287 def findOverlapIter(self, cursor, done):
288 chromosome = self._currentQueryTranscript.getChromosome()
289 matched = False
290 if chromosome not in self._ncLists[REFERENCE]:
291 return None, False, True
292 ncList = self._ncLists[REFERENCE][chromosome]
293 overlappingNames = {}
294 nextDone = False
295 firstOverlapLAddr = NCListCursor(cursor)
296 firstOverlapLAddr.setLIndex(-1)
297 if cursor.isOut():
298 self._writeIntervalInNewGFF3(overlappingNames)
299 return firstOverlapLAddr, False, True
300 parentCursor = NCListCursor(cursor)
301 parentCursor.moveUp()
302 firstParentAfter = False
303 while not parentCursor.isOut():
304 if self.isOverlapping(parentCursor) == 0:
305 matched = True
306 if self._checkOverlap(parentCursor.getTranscript()):
307 overlappingNames.update(self._extractID(parentCursor.getTranscript()))
308 if firstOverlapLAddr.isOut():
309 firstOverlapLAddr.copy(parentCursor)
310 nextDone = True
311 elif self.isOverlapping(parentCursor) == 1:
312 firstParentAfter = NCListCursor(parentCursor)
313 parentCursor.moveUp()
314 if firstParentAfter:
315 written = self._writeIntervalInNewGFF3(overlappingNames)
316 return firstParentAfter, False, not written if self._invert else not matched
317 #This loop finds the overlaps with currentRefLAddr.#
318 while True:
319 parentCursor = NCListCursor(cursor)
320 parentCursor.moveUp()
321 #In case: Query is on the right of the RefInterval and does not overlap.
322 overlap = self.isOverlapping(cursor)
323 if overlap == -1:
324 cursor.moveNext()
325 #In case: Query overlaps with RefInterval.
326 elif overlap == 0:
327 matched = True
328 if self._checkOverlap(cursor.getTranscript()):
329 overlappingNames.update(self._extractID(cursor.getTranscript()))
330 if firstOverlapLAddr.compare(parentCursor):
331 firstOverlapLAddr.copy(cursor)
332 nextDone = True
333 if done:
334 cursor.moveNext()
335 else:
336 if not cursor.hasChildren():
337 cursor.moveNext()
338 if cursor.isOut():
339 break
340 else:
341 cursor.moveDown()
342 #In case: Query is on the left of the RefInterval and does not overlap.
343 else:
344 if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor):
345 firstOverlapLAddr.copy(cursor)
346 nextDone = False # new
347 break
348
349 done = False
350 if cursor.isOut():
351 break
352 written = self._writeIntervalInNewGFF3(overlappingNames)
353 return firstOverlapLAddr, nextDone, not written if self._invert else not matched
354
355 def isOverlapping(self, refTranscript):
356 if (self._currentExQueryTranscript.getStart() <= refTranscript.getEnd() and self._currentExQueryTranscript.getEnd() >= refTranscript.getStart()):
357 return 0
358 if self._currentExQueryTranscript.getEnd() < refTranscript.getStart():
359 return 1
360 return -1
361
362 def checkIndex(self, cursor):
363 if not self._index:
364 return None
365 if cursor.isOut():
366 return None
367 chromosome = self._currentExQueryTranscript.getChromosome()
368 nextLIndex = self._indices[REFERENCE][chromosome].getIndex(self._currentExQueryTranscript)
369 if nextLIndex == None:
370 return None
371 ncList = self._ncLists[REFERENCE][chromosome]
372 nextGffAddress = ncList.getRefGffAddr(nextLIndex)
373 thisGffAddress = cursor.getGffAddress()
374 if nextGffAddress > thisGffAddress:
375 return nextLIndex
376 return None
377
378 def _writeIntervalInNewGFF3(self, names):
379 nbOverlaps = 0
380 for cpt in names.values():
381 nbOverlaps += cpt
382 self._nbOverlappingQueries += 1 if Utils.xor(names, self._invert) else 0
383 self._nbOverlaps += nbOverlaps if Utils.xor(names, self._invert) else 0
384 if names:
385 self._currentQueryTranscript.setTagValue("overlapWith", ",".join(names))
386 self._currentQueryTranscript.setTagValue("nbOverlaps", nbOverlaps)
387 if self._invert:
388 return False
389 else:
390 if self._outputNotOverlapping:
391 self._currentQueryTranscript.setTagValue("nbOverlaps", 0)
392 elif not self._invert:
393 return False
394 self._iWriter.addTranscript(self._currentQueryTranscript)
395 self._iWriter.write()
396 return True
397
398 def _extractID(self, transcript):
399 id = transcript.getTagValue("ID") if "ID" in transcript.getTagNames() else transcript.getUniqueName()
400 nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1
401 return {id: float(nbElements)}
402
403 def _checkOverlap(self, refTranscript):
404 if self._currentQueryTranscript.getDistance(refTranscript) > self._distance:
405 return False
406 minOverlap = self._minOverlap
407 if self._pcOverlap != None:
408 minOverlap = max(self._minOverlap, self._currentQueryTranscript.getSize() / 100.0 * self._pcOverlap)
409 if not self._currentQueryTranscript.overlapWith(refTranscript, minOverlap):
410 return False
411 if self._antisense and self._currentQueryTranscript.getDirection() == refTranscript.getDirection():
412 return False
413 if self._colinear and self._currentQueryTranscript.getDirection() != refTranscript.getDirection():
414 return False
415 if self._included and not refTranscript.include(self._currentQueryTranscript):
416 return False
417 if self._including and not self._currentQueryTranscript.include(refTranscript):
418 return False
419 if self._introns:
420 return True
421 return self._currentQueryTranscript.overlapWithExon(refTranscript, minOverlap)
422
423 def run(self):
424 self.createTmpRefFile()
425 self.createNCLists()
426 self.compare()
427 self.close()
428 if self._verbosity > 0:
429 print "# queries: %d" % (self._nbLines[QUERY])
430 print "# refs: %d" % (self._nbLines[REFERENCE])
431 print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)
432 print "time: %ds" % (self._timeSpent)
433
434
435 if __name__ == "__main__":
436 description = "Compare Overlapping v1.0.4: Get the data which overlap with a reference set. [Category: Data Comparison]"
437
438 parser = OptionParser(description = description)
439 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
440 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
441 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
442 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
443 parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
444 parser.add_option("-D", "--index", dest="index", action="store_true", default=False, help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]")
445 parser.add_option("-r", "--sorted", dest="sorted", action="store_true", default=False, help="input files are already sorted [format: boolean] [default: False]")
446 parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
447 parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
448 parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
449 parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
450 parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]")
451 parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]")
452 parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]")
453 parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]")
454 parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]")
455 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]")
456 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]")
457 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]")
458 parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
459 parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
460 parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=1, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
461 parser.add_option("-p", "--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
462 parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
463 parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
464 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
465 (options, args) = parser.parse_args()
466
467 co = CompareOverlapping(options.verbosity)
468 co.setInput(options.inputFileName1, options.format1, QUERY)
469 co.setInput(options.inputFileName2, options.format2, REFERENCE)
470 co.setOutput(options.output)
471 co.setSorted(options.sorted)
472 co.setIndex(options.index)
473 co.restrictToStart(options.start1, QUERY)
474 co.restrictToStart(options.start2, REFERENCE)
475 co.restrictToEnd(options.end1, QUERY)
476 co.restrictToEnd(options.end2, REFERENCE)
477 co.extendFivePrime(options.fivePrime1, QUERY)
478 co.extendFivePrime(options.fivePrime2, REFERENCE)
479 co.extendThreePrime(options.threePrime1, QUERY)
480 co.extendThreePrime(options.threePrime2, REFERENCE)
481 co.acceptIntrons(options.introns)
482 co.getAntisenseOnly(options.antisense)
483 co.getColinearOnly(options.colinear)
484 co.getInvert(options.exclude)
485 co.setMaxDistance(options.distance)
486 co.setMinOverlap(options.minOverlap)
487 co.setPcOverlap(options.pcOverlap)
488 co.setIncludedOnly(options.included)
489 co.setIncludingOnly(options.including)
490 co.includeNotOverlapping(options.notOverlapping)
491 co.run()