comparison SMART/Java/Python/fo.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents
children
comparison
equal deleted inserted replaced
30:5677346472b5 31:0ab839023fe4
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2012
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31
32 import os, struct, time, shutil
33 from optparse import OptionParser
34 from pyRepetUnit.commons.parsing.ParserChooser import ParserChooser
35 from pyRepetUnit.commons.writer.Gff3Writer import Gff3Writer
36 from SMART.Java.Python.structure.Transcript import Transcript
37 from SMART.Java.Python.structure.Interval import Interval
38 from SMART.Java.Python.ncList.NCList import NCList
39 from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList
40 from SMART.Java.Python.ncList.NCListParser import NCListParser
41 from SMART.Java.Python.ncList.NCListCursor import NCListCursor
42 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
43 from SMART.Java.Python.ncList.FileSorter import FileSorter
44 from SMART.Java.Python.ncList.NCListHandler import NCListHandler
45 from SMART.Java.Python.misc.Progress import Progress
46 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
47 try:
48 import cPickle as pickle
49 except:
50 import pickle
51
52 REFERENCE = 0
53 QUERY = 1
54 TYPES = (REFERENCE, QUERY)
55 TYPETOSTRING = {0: "reference", 1: "query"}
56
57 class FindOverlapsOptim(object):
58
59 def __init__(self, verbosity = 1):
60 self._parsers = {}
61 self._sortedFileNames = {}
62 self._outputFileName = "outputOverlaps.gff3"
63 self._iWriter = None
64 self._inputFileNames = {REFERENCE: None, QUERY: None}
65 self._convertedFileNames = {REFERENCE: False, QUERY: False}
66 self._inputFileFormats = {REFERENCE: None, QUERY: None}
67 self._converted = {REFERENCE: False, QUERY: False}
68 self._ncListHandlers = {REFERENCE: None, QUERY: None}
69 self._splittedFileNames = {REFERENCE: {}, QUERY: {}}
70 self._nbOverlappingQueries = 0
71 self._nbOverlaps = 0
72 self._nbLines = {REFERENCE: 0, QUERY: 0}
73 self._verbosity = verbosity
74 self._ncLists = {}
75 self._cursors = {}
76 self._nbElementsPerChromosome = {}
77 self._tmpDirectories = {REFERENCE: False, QUERY: False}
78
79 def close(self):
80 self._iWriter.close()
81 for fileName in (self._sortedFileNames.values()):
82 if os.path.exists(fileName):
83 os.remove(fileName)
84 for fileName in self._convertedFileNames.values():
85 if fileName:
86 os.remove(fileName)
87
88 def setRefFileName(self, fileName, format):
89 self.setFileName(fileName, format, REFERENCE)
90
91 def setQueryFileName(self, fileName, format):
92 self.setFileName(fileName, format, QUERY)
93
94 def setFileName(self, fileName, format, type):
95 self._inputFileNames[type] = fileName
96 self._inputFileFormats[type] = format
97 if format.lower() != "nclist":
98 self._converted[type] = True
99
100 def setOutputFileName(self, outputFileName):
101 self._outputFileName = outputFileName
102 self._iWriter = Gff3Writer(self._outputFileName)
103
104 def createNCLists(self):
105 startTime = time.time()
106 if self._verbosity > 1:
107 print "Building database"
108 self._ncLists = dict([type, {}] for type in TYPES)
109 self._indices = dict([type, {}] for type in TYPES)
110 self._cursors = dict([type, {}] for type in TYPES)
111 for type in TYPES:
112 self._ncListHandlers[type] = NCListHandler(self._verbosity-3)
113 if self._converted[type]:
114 self._convertedFileNames[type] = "%s_%d.ncl" % (os.path.splitext(self._inputFileNames[type])[0], type)
115 ncLists = ConvertToNCList(self._verbosity-3)
116 ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type])
117 ncLists.setOutputFileName(self._convertedFileNames[type])
118 if type == REFERENCE:
119 ncLists.setIndex(True)
120 ncLists.run()
121 self._ncListHandlers[type].setFileName(self._convertedFileNames[type])
122 else:
123 self._ncListHandlers[type].setFileName(self._inputFileNames[type])
124 self._ncListHandlers[type].loadData()
125 self._nbLines[type] = self._ncListHandlers[type].getNbElements()
126 self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome()
127 self._ncLists[type] = self._ncListHandlers[type].getNCLists()
128 for chromosome, ncList in self._ncLists[type].iteritems():
129 self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity)
130 if type == REFERENCE:
131 self._indices[REFERENCE][chromosome] = ncList.getIndex()
132 endTime = time.time()
133 if self._verbosity > 1:
134 print "done (%.2gs)" % (endTime - startTime)
135
136 def compare(self):
137 nbSkips, nbMoves = 0, 0
138 previousChromosome = None
139 done = False
140 startTime = time.time()
141 progress = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity)
142 #print "query:", self._ncLists[QUERY].keys()
143 #print "reference:", self._ncLists[REFERENCE].keys()
144 for chromosome, queryNCList in self._ncLists[QUERY].iteritems():
145 queryParser = self._ncListHandlers[QUERY].getParser(chromosome)
146 queryCursor = self._cursors[QUERY][chromosome]
147 if chromosome != previousChromosome:
148 skipChromosome = False
149 previousChromosome = chromosome
150 if chromosome not in self._ncLists[REFERENCE]:
151 #print "out ", chromosome
152 continue
153 refNCList = self._ncLists[REFERENCE][chromosome]
154 refCursor = self._cursors[REFERENCE][chromosome]
155 #print "starting", chromosome
156 while True:
157 queryTranscript = queryCursor.getTranscript()
158 #print queryTranscript
159 newRefLaddr = self.checkIndex(queryTranscript, refCursor)
160 #print "query is", queryTranscript
161 if newRefLaddr != None:
162 nbMoves += 1
163 refCursor.setLIndex(newRefLaddr)
164 #print "skipping to", refCursor
165 done = False
166 refCursor, done, unmatched = self.findOverlapIter(queryTranscript, refCursor, done)
167 #print "completed with", refCursor, done, unmatched
168 if refCursor.isOut():
169 #print "exiting 1", chromosome
170 break
171 if unmatched or not queryCursor.hasChildren():
172 queryCursor.moveNext()
173 #print "moving next to", queryCursor
174 nbSkips += 1
175 else:
176 queryCursor.moveDown()
177 #print "moving down to", queryCursor
178 if queryCursor.isOut():
179 #print "exiting 2", chromosome
180 break
181 progress.inc()
182 progress.done()
183 endTime = time.time()
184 self._timeSpent = endTime - startTime
185 if self._verbosity >= 10:
186 print "# skips: %d" % (nbSkips)
187 print "# moves: %d" % (nbMoves)
188
189 def findOverlapIter(self, queryTranscript, cursor, done):
190 chromosome = queryTranscript.getChromosome()
191 if chromosome not in self._ncLists[REFERENCE]:
192 return False, None
193 ncList = self._ncLists[REFERENCE][chromosome]
194 overlappingNames = {}
195 nextDone = False
196 firstOverlapLAddr = NCListCursor(cursor)
197 firstOverlapLAddr.setLIndex(-1)
198 if cursor.isOut():
199 return firstOverlapLAddr, False
200 parentCursor = NCListCursor(cursor)
201 parentCursor.moveUp()
202 firstParentAfter = False
203 #print "query transcript 1", queryTranscript
204 #print "cursor 1", cursor
205 #print "parent 1", parentCursor
206 while not parentCursor.isOut():
207 if self.isOverlapping(queryTranscript, parentCursor) == 0:
208 #print "overlap parent choice 0"
209 overlappingNames.update(self._extractID(parentCursor.getTranscript()))
210 if firstOverlapLAddr.isOut():
211 #print "overlap parent 2"
212 firstOverlapLAddr.copy(parentCursor)
213 nextDone = True # new
214 # lastCursor = NCListCursor(parentCursor)
215 # lastCursor.moveDown()
216 # lastCursor.moveLastSibling()
217 # if self.isOverlapping(queryTranscript, lastCursor) == -1:
218 # #print "next done 1"
219 # nextDone = True
220 elif self.isOverlapping(queryTranscript, parentCursor) == 1:
221 #print "overlap parent choice 1"
222 firstParentAfter = NCListCursor(parentCursor)
223 parentCursor.moveUp()
224 #print "parent 2", parentCursor
225 if firstParentAfter:
226 #print "exit parent", firstParentAfter, overlappingNames
227 self._writeIntervalInNewGFF3(queryTranscript, overlappingNames)
228 return firstParentAfter, False, not overlappingNames
229 #This loop finds the overlaps with currentRefLAddr.#
230 while True:
231 #print "ref cursor now is", cursor
232 parentCursor = NCListCursor(cursor)
233 parentCursor.moveUp()
234 #In case: Query is on the right of the RefInterval and does not overlap.
235 overlap = self.isOverlapping(queryTranscript, cursor)
236 if overlap == -1:
237 #print "choice 1"
238 # if cursor.isLast() and not firstOverlapLAddr.isOut():
239 # if firstOverlapLAddr.compare(parentCursor):
240 # #print "next done 2"
241 # nextDone = True
242 cursor.moveNext()
243 #In case: Query overlaps with RefInterval.
244 elif overlap == 0:
245 #print "choice 2"
246 overlappingNames.update(self._extractID(cursor.getTranscript()))
247 if firstOverlapLAddr.compare(parentCursor):
248 firstOverlapLAddr.copy(cursor)
249 nextDone = True # new
250 if done:
251 cursor.moveNext()
252 else:
253 if not cursor.hasChildren():
254 cursor.moveNext()
255 if cursor.isOut():
256 #print "break 1"
257 break
258 else:
259 cursor.moveDown()
260 #In case: Query is on the left of the RefInterval and does not overlap.
261 else:
262 #print "choice 3"
263 if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor):
264 #print "changing nfo 2"
265 firstOverlapLAddr.copy(cursor)
266 nextDone = False # new
267 #print "break 2"
268 break
269
270 done = False
271 if cursor.isOut():
272 #print "break 3"
273 break
274 self._writeIntervalInNewGFF3(queryTranscript, overlappingNames)
275 return firstOverlapLAddr, nextDone, not overlappingNames
276
277 def isOverlapping(self, queryTranscript, refTranscript):
278 if (queryTranscript.getStart() <= refTranscript.getEnd() and queryTranscript.getEnd() >= refTranscript.getStart()):
279 return 0
280 if queryTranscript.getEnd() < refTranscript.getStart():
281 return 1
282 return -1
283
284 def checkIndex(self, transcript, cursor):
285 chromosome = transcript.getChromosome()
286 nextLIndex = self._indices[REFERENCE][chromosome].getIndex(transcript)
287 if nextLIndex == None:
288 return None
289 ncList = self._ncLists[REFERENCE][chromosome]
290 nextGffAddress = ncList.getRefGffAddr(nextLIndex)
291 thisGffAddress = cursor.getGffAddress()
292 if nextGffAddress > thisGffAddress:
293 return nextLIndex
294 return None
295
296 def _writeIntervalInNewGFF3(self, transcript, names):
297 nbOverlaps = 0
298 for cpt in names.values():
299 nbOverlaps += cpt
300 if not names:
301 return
302 transcript.setTagValue("overlapsWith", "--".join(sorted(names.keys())))
303 transcript.setTagValue("nbOverlaps", nbOverlaps)
304 self._iWriter.addTranscript(transcript)
305 self._iWriter.write()
306 self._nbOverlappingQueries += 1
307 self._nbOverlaps += nbOverlaps
308
309 def _extractID(self, transcript):
310 nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
311 id = transcript.getTagValue("ID") if "ID" in transcript.getTagNames() else transcript.getUniqueName()
312 return {id: nbElements}
313
314 def run(self):
315 self.createNCLists()
316 self.compare()
317 self.close()
318 if self._verbosity > 0:
319 print "# queries: %d" % (self._nbLines[QUERY])
320 print "# refs: %d" % (self._nbLines[REFERENCE])
321 print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)
322 print "time: %.2gs" % (self._timeSpent)
323
324
325 if __name__ == "__main__":
326 description = "Find Overlaps Optim v1.0.0: Finds overlaps with several query intervals. [Category: Data Comparison]"
327
328 parser = OptionParser(description = description)
329 parser.add_option("-i", "--query", dest="inputQueryFileName", action="store", type="string", help="Query input file [compulsory] [format: file in transcript or other format given by -f]")
330 parser.add_option("-f", "--queryFormat", dest="queryFormat", action="store", type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
331 parser.add_option("-j", "--ref", dest="inputRefFileName", action="store", type="string", help="Reference input file [compulsory] [format: file in transcript or other format given by -g]")
332 parser.add_option("-g", "--refFormat", dest="refFormat", action="store", type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
333 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in GFF3 format]")
334 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
335 (options, args) = parser.parse_args()
336
337 iFOO = FindOverlapsOptim(options.verbosity)
338 iFOO.setRefFileName(options.inputRefFileName, options.refFormat)
339 iFOO.setQueryFileName(options.inputQueryFileName, options.queryFormat)
340 iFOO.setOutputFileName(options.outputFileName)
341 iFOO.run()