6
|
1 #! /usr/bin/env python
|
|
2 #
|
|
3 # Copyright INRA-URGI 2009-2010
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30 #
|
|
31
|
|
32 import struct
|
|
33 import math
|
|
34 import os
|
|
35 from optparse import OptionParser
|
|
36 from commons.core.writer.Gff3Writer import Gff3Writer
|
|
37 from SMART.Java.Python.ncList.NCList import NCList
|
|
38 from SMART.Java.Python.ncList.FileSorter import FileSorter
|
|
39 from commons.core.parsing.ParserChooser import ParserChooser
|
|
40 from SMART.Java.Python.ncList.NCListCursor import NCListCursor
|
|
41 from SMART.Java.Python.structure.Transcript import Transcript
|
|
42
|
|
43 LONGSIZE = struct.calcsize('l')
|
|
44
|
|
45 class FindOverlapsWithOneInterval(object):
|
|
46
|
|
47 def __init__(self, verbosity):
|
|
48 self._sortedFileName = None
|
|
49 self._verbosity = verbosity
|
|
50 self._overlappingNames = []
|
|
51 self._nbOverlaps = 0
|
|
52 self._nbWritten = 0
|
|
53
|
|
54 def __del__(self):
|
|
55 if self._sortedFileName and os.path.exists(self._sortedFileName):
|
|
56 os.remove(self._sortedFileName)
|
|
57
|
|
58 def close(self):
|
|
59 self._iWriter.close()
|
|
60
|
|
61 def setOutputFileName(self, fileName):
|
|
62 self._iWriter = Gff3Writer(fileName)
|
|
63
|
|
64 def setFileName(self, fileName, format):
|
|
65 chooser = ParserChooser(self._verbosity)
|
|
66 chooser.findFormat(format)
|
|
67 self._parser = chooser.getParser(fileName)
|
|
68 self._sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])
|
|
69
|
|
70 def setInterval(self, chromosome, start, end):
|
|
71 self._chromosome = chromosome
|
|
72 self._start = start
|
|
73 self._end = end
|
|
74 self._transcript = Transcript()
|
|
75 self._transcript.setChromosome(chromosome)
|
|
76 self._transcript.setStart(start)
|
|
77 self._transcript.setEnd(end)
|
|
78 self._transcript.setDirection("+")
|
|
79
|
|
80 def setTranscript(self, transcript):
|
|
81 if transcript.__class__.__name__ == "Mapping":
|
|
82 transcript = transcript.getTranscript()
|
|
83 self._chromosome = transcript.getChromosome()
|
|
84 self._start = transcript.getStart()
|
|
85 self._end = transcript.getEnd()
|
|
86 self._transcript = transcript
|
|
87
|
|
88 def prepareIntermediateFiles(self):
|
|
89 fs = FileSorter(self._parser, self._verbosity-4)
|
|
90 fs.selectChromosome(self._chromosome)
|
|
91 fs.perChromosome(False)
|
|
92 fs.setOutputFileName(self._sortedFileName)
|
|
93 fs.sort()
|
|
94 self._nbTotalLines = fs.getNbElements()
|
|
95 self._nbLines = fs.getNbElementsPerChromosome()[self._chromosome]
|
|
96
|
|
97 def createNCList(self):
|
|
98 if self._verbosity > 2:
|
|
99 print "Creating NC-list..."
|
|
100 ncList = NCList(self._verbosity)
|
|
101 ncList.createIndex(True)
|
|
102 ncList.setChromosome(self._chromosome)
|
|
103 ncList.setFileName(self._sortedFileName)
|
|
104 ncList.setNbElements(self._nbTotalLines)
|
|
105 ncList.buildLists()
|
|
106 self.setNCList(ncList, ncList.getIndex())
|
|
107 if self._verbosity > 2:
|
|
108 print " ...done (%ds)" % (endTime - startTime)
|
|
109
|
|
110 def setNCList(self, ncList, index):
|
|
111 self._ncList = ncList
|
|
112 self._indix = index
|
|
113
|
|
114 def binarySearch(self, cursor, startL, endL):
|
|
115 if startL > endL:
|
|
116 return None
|
|
117 middleL = (startL + endL) / 2
|
|
118 cursor.moveSibling(middleL)
|
|
119 overlap = self.isOverlapping(cursor)
|
|
120 if overlap == 0:
|
|
121 if middleL == startL:
|
|
122 return cursor
|
|
123 else:
|
|
124 return self.binarySearch(cursor, startL, middleL)
|
|
125 if overlap == -1:
|
|
126 return self.binarySearch(cursor, middleL + 1, endL)
|
|
127 return self.binarySearch(cursor, startL, middleL - 1)
|
|
128
|
|
129 def compare(self, cursor = None):
|
|
130 self._ncList.openFiles()
|
|
131 if cursor == None:
|
|
132 dump = True
|
|
133 cursor = NCListCursor(None, self._ncList, 0, self._verbosity)
|
|
134 cursor._getSiblingData()
|
|
135 cursor = self.binarySearch(cursor, cursor._firstSiblingLIndex, cursor._lastSiblingLIndex)
|
|
136 if cursor == None:
|
|
137 return
|
|
138 while not cursor.isOut() and self.isOverlapping(cursor) == 0:
|
|
139 self.write(cursor)
|
|
140 newCursor = NCListCursor(cursor)
|
|
141 if newCursor.hasChildren():
|
|
142 newCursor.moveDown()
|
|
143 self.compare(newCursor)
|
|
144 if cursor.isLast():
|
|
145 return
|
|
146 cursor.moveRight()
|
|
147
|
|
148 def isOverlapping(self, cursor):
|
|
149 if self._end < cursor.getStart():
|
|
150 return 1
|
|
151 if self._start > cursor.getEnd():
|
|
152 return -1
|
|
153 return 0
|
|
154
|
|
155 def write(self, cursor):
|
|
156 self._nbOverlaps += 1
|
|
157 refTranscript = cursor.getTranscript()
|
|
158 self._overlappingNames.append(refTranscript.getName())
|
|
159
|
|
160 def dumpWriter(self):
|
|
161 if (not self._overlappingNames) or self._transcript == None:
|
|
162 return
|
|
163 self._transcript.setTagValue("nbOverlaps", len(self._overlappingNames))
|
|
164 self._transcript.setTagValue("overlapsWith", "--".join(self._overlappingNames))
|
|
165 self._iWriter.addTranscript(self._transcript)
|
|
166 self._nbWritten += 1
|
|
167 self._overlappingNames = []
|
|
168
|
|
169 def run(self):
|
|
170 self.prepareIntermediateFiles()
|
|
171 self.createNCList()
|
|
172 self.compare()
|
|
173 self.dumpWriter()
|
|
174 self.close()
|
|
175 if self._verbosity > 0:
|
|
176 print "# refs: %d" % (self._nbLines)
|
|
177 print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)
|
|
178
|
|
179
|
|
180 if __name__ == "__main__":
|
|
181 description = "FindOverlapsWithOneInterval: Finds overlaps with one query interval."
|
|
182
|
|
183 parser = OptionParser(description = description)
|
|
184 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="Input file [compulsory] [format: file in transcript format given by -f]")
|
|
185 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="Format of previous file [compulsory] [format: transcript file format]")
|
|
186 parser.add_option("-s", "--start", dest="start", action="store", type="int", help="The start of the query interval [compulsory] [format: int]")
|
|
187 parser.add_option("-e", "--end", dest="end", action="store", type="int", help="The end of the query interval [compulsory] [format: int]")
|
|
188 parser.add_option("-c", "--chromosome", dest="chromosome", action="store", type="string", help="Chromosome of the query interval [compulsory] [format: string]")
|
|
189 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in GFF3 format]")
|
|
190 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
|
|
191 (options, args) = parser.parse_args()
|
|
192
|
|
193 iFOWOI = FindOverlapsWithOneInterval(options.verbosity)
|
|
194 iFOWOI.setFileName(options.inputFileName, options.format)
|
|
195 iFOWOI.setInterval(options.chromosome, options.start, options.end)
|
|
196 iFOWOI.setOutputFileName(options.outputFileName)
|
|
197 iFOWOI.run()
|