comparison smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31
32 import struct
33 import math
34 import os
35 from optparse import OptionParser
36 from commons.core.writer.Gff3Writer import Gff3Writer
37 from SMART.Java.Python.ncList.NCList import NCList
38 from SMART.Java.Python.ncList.FileSorter import FileSorter
39 from commons.core.parsing.ParserChooser import ParserChooser
40 from SMART.Java.Python.ncList.NCListCursor import NCListCursor
41 from SMART.Java.Python.structure.Transcript import Transcript
42
43 LONGSIZE = struct.calcsize('l')
44
45 class FindOverlapsWithOneInterval(object):
46
47 def __init__(self, verbosity):
48 self._sortedFileName = None
49 self._verbosity = verbosity
50 self._overlappingNames = []
51 self._nbOverlaps = 0
52 self._nbWritten = 0
53
54 def __del__(self):
55 if self._sortedFileName and os.path.exists(self._sortedFileName):
56 os.remove(self._sortedFileName)
57
58 def close(self):
59 self._iWriter.close()
60
61 def setOutputFileName(self, fileName):
62 self._iWriter = Gff3Writer(fileName)
63
64 def setFileName(self, fileName, format):
65 chooser = ParserChooser(self._verbosity)
66 chooser.findFormat(format)
67 self._parser = chooser.getParser(fileName)
68 self._sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])
69
70 def setInterval(self, chromosome, start, end):
71 self._chromosome = chromosome
72 self._start = start
73 self._end = end
74 self._transcript = Transcript()
75 self._transcript.setChromosome(chromosome)
76 self._transcript.setStart(start)
77 self._transcript.setEnd(end)
78 self._transcript.setDirection("+")
79
80 def setTranscript(self, transcript):
81 if transcript.__class__.__name__ == "Mapping":
82 transcript = transcript.getTranscript()
83 self._chromosome = transcript.getChromosome()
84 self._start = transcript.getStart()
85 self._end = transcript.getEnd()
86 self._transcript = transcript
87
88 def prepareIntermediateFiles(self):
89 fs = FileSorter(self._parser, self._verbosity-4)
90 fs.selectChromosome(self._chromosome)
91 fs.perChromosome(False)
92 fs.setOutputFileName(self._sortedFileName)
93 fs.sort()
94 self._nbTotalLines = fs.getNbElements()
95 self._nbLines = fs.getNbElementsPerChromosome()[self._chromosome]
96
97 def createNCList(self):
98 if self._verbosity > 2:
99 print "Creating NC-list..."
100 ncList = NCList(self._verbosity)
101 ncList.createIndex(True)
102 ncList.setChromosome(self._chromosome)
103 ncList.setFileName(self._sortedFileName)
104 ncList.setNbElements(self._nbTotalLines)
105 ncList.buildLists()
106 self.setNCList(ncList, ncList.getIndex())
107 if self._verbosity > 2:
108 print " ...done (%ds)" % (endTime - startTime)
109
110 def setNCList(self, ncList, index):
111 self._ncList = ncList
112 self._indix = index
113
114 def binarySearch(self, cursor, startL, endL):
115 if startL > endL:
116 return None
117 middleL = (startL + endL) / 2
118 cursor.moveSibling(middleL)
119 overlap = self.isOverlapping(cursor)
120 if overlap == 0:
121 if middleL == startL:
122 return cursor
123 else:
124 return self.binarySearch(cursor, startL, middleL)
125 if overlap == -1:
126 return self.binarySearch(cursor, middleL + 1, endL)
127 return self.binarySearch(cursor, startL, middleL - 1)
128
129 def compare(self, cursor = None):
130 self._ncList.openFiles()
131 if cursor == None:
132 dump = True
133 cursor = NCListCursor(None, self._ncList, 0, self._verbosity)
134 cursor._getSiblingData()
135 cursor = self.binarySearch(cursor, cursor._firstSiblingLIndex, cursor._lastSiblingLIndex)
136 if cursor == None:
137 return
138 while not cursor.isOut() and self.isOverlapping(cursor) == 0:
139 self.write(cursor)
140 newCursor = NCListCursor(cursor)
141 if newCursor.hasChildren():
142 newCursor.moveDown()
143 self.compare(newCursor)
144 if cursor.isLast():
145 return
146 cursor.moveRight()
147
148 def isOverlapping(self, cursor):
149 if self._end < cursor.getStart():
150 return 1
151 if self._start > cursor.getEnd():
152 return -1
153 return 0
154
155 def write(self, cursor):
156 self._nbOverlaps += 1
157 refTranscript = cursor.getTranscript()
158 self._overlappingNames.append(refTranscript.getName())
159
160 def dumpWriter(self):
161 if (not self._overlappingNames) or self._transcript == None:
162 return
163 self._transcript.setTagValue("nbOverlaps", len(self._overlappingNames))
164 self._transcript.setTagValue("overlapsWith", "--".join(self._overlappingNames))
165 self._iWriter.addTranscript(self._transcript)
166 self._nbWritten += 1
167 self._overlappingNames = []
168
169 def run(self):
170 self.prepareIntermediateFiles()
171 self.createNCList()
172 self.compare()
173 self.dumpWriter()
174 self.close()
175 if self._verbosity > 0:
176 print "# refs: %d" % (self._nbLines)
177 print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)
178
179
180 if __name__ == "__main__":
181 description = "FindOverlapsWithOneInterval: Finds overlaps with one query interval."
182
183 parser = OptionParser(description = description)
184 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="Input file [compulsory] [format: file in transcript format given by -f]")
185 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="Format of previous file [compulsory] [format: transcript file format]")
186 parser.add_option("-s", "--start", dest="start", action="store", type="int", help="The start of the query interval [compulsory] [format: int]")
187 parser.add_option("-e", "--end", dest="end", action="store", type="int", help="The end of the query interval [compulsory] [format: int]")
188 parser.add_option("-c", "--chromosome", dest="chromosome", action="store", type="string", help="Chromosome of the query interval [compulsory] [format: string]")
189 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in GFF3 format]")
190 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
191 (options, args) = parser.parse_args()
192
193 iFOWOI = FindOverlapsWithOneInterval(options.verbosity)
194 iFOWOI.setFileName(options.inputFileName, options.format)
195 iFOWOI.setInterval(options.chromosome, options.start, options.end)
196 iFOWOI.setOutputFileName(options.outputFileName)
197 iFOWOI.run()