comparison SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsBin.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2011
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import random, os, os.path, time, sqlite3
32 from optparse import OptionParser
33 from commons.core.parsing.ParserChooser import ParserChooser
34 from commons.core.writer.TranscriptWriter import TranscriptWriter
35 from SMART.Java.Python.structure.Interval import Interval
36 from SMART.Java.Python.structure.Transcript import Transcript
37 from SMART.Java.Python.structure.Mapping import Mapping
38 from SMART.Java.Python.misc.Progress import Progress
39 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
40 try:
41 import cPickle as pickle
42 except:
43 import pickle
44
45 MINBIN = 3
46 MAXBIN = 7
47
48
49 def getBin(start, end):
50 for i in range(MINBIN, MAXBIN + 1):
51 binLevel = 10 ** i
52 if int(start / binLevel) == int(end / binLevel):
53 return int(i * 10 ** (MAXBIN + 1) + int(start / binLevel))
54 return int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
55
56 def getOverlappingBins(start, end):
57 array = []
58 bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
59 for i in range(MINBIN, MAXBIN + 1):
60 binLevel = 10 ** i
61 array.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel))))
62 array.append((bigBin, bigBin))
63 return array
64
65
66 class FindOverlapsWithSeveralIntervalsBin(object):
67
68 def __init__(self, verbosity):
69 self.verbosity = verbosity
70 self.randomNumber = random.randint(0, 10000)
71 self.dbName = "smartdb%d" % (self.randomNumber)
72 if "SMARTTMPPATH" in os.environ:
73 self.dbName = os.join(os.environ["SMARTTMPPATH"], self.dbName)
74 self.connection = sqlite3.connect(self.dbName)
75 self.tableNames = {}
76 self.nbQueries = 0
77 self.nbRefs = 0
78 self.nbWritten = 0
79 self.nbOverlaps = 0
80 cursor = self.connection.cursor()
81 cursor.execute("PRAGMA journal_mode = OFF")
82 cursor.execute("PRAGMA synchronous = 0")
83 cursor.execute("PRAGMA locking_mode = EXCLUSIVE")
84 cursor.execute("PRAGMA count_change = OFF")
85 cursor.execute("PRAGMA temp_store = 2")
86
87 def __del__(self):
88 cursor = self.connection.cursor()
89 for tableName in self.tableNames.values():
90 cursor.execute("DROP TABLE IF EXISTS %s" % (tableName))
91 if os.path.exists(self.dbName):
92 os.remove(self.dbName)
93
94 def createTable(self, chromosome):
95 cursor = self.connection.cursor()
96 tableName = "tmpTable_%s_%d" % (chromosome.replace("-", "_"), self.randomNumber)
97 cursor.execute("CREATE TABLE %s (start INT, end INT, transcript BLOB, bin INT)" % (tableName))
98 cursor.execute("CREATE INDEX index_%s ON %s (bin)" % (tableName, tableName))
99 self.tableNames[chromosome] = tableName
100
101 def setReferenceFile(self, fileName, format):
102 chooser = ParserChooser(self.verbosity)
103 chooser.findFormat(format)
104 parser = chooser.getParser(fileName)
105 startTime = time.time()
106 if self.verbosity > 2:
107 print "Storing into table"
108 for transcript in parser.getIterator():
109 if transcript.__class__.__name__ == "Mapping":
110 transcript = transcript.getTranscript()
111 transcriptString = pickle.dumps(transcript)
112 chromosome = transcript.getChromosome()
113 if chromosome not in self.tableNames:
114 self.createTable(chromosome)
115 start = transcript.getStart()
116 end = transcript.getEnd()
117 bin = getBin(start, end)
118 cursor = self.connection.cursor()
119 cursor.execute("INSERT INTO %s (start, end, transcript, bin) VALUES (?, ?, ?, ?)" % (self.tableNames[chromosome]), (start, end, sqlite3.Binary(transcriptString), bin))
120 self.nbRefs += 1
121 self.connection.commit()
122 endTime = time.time()
123 if self.verbosity > 2:
124 print " ...done (%.2gs)" % (endTime - startTime)
125
126 def setQueryFile(self, fileName, format):
127 chooser = ParserChooser(self.verbosity)
128 chooser.findFormat(format)
129 self.queryParser = chooser.getParser(fileName)
130 self.nbQueries = self.queryParser.getNbItems()
131
132 def setOutputFile(self, fileName):
133 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
134
135 def compare(self):
136 progress = Progress(self.nbQueries, "Reading queries", self.verbosity)
137 startTime = time.time()
138 for queryTranscript in self.queryParser.getIterator():
139 if queryTranscript.__class__.__name__ == "Mapping":
140 queryTranscript = queryTranscript.getTranscript()
141 progress.inc()
142 queryChromosome = queryTranscript.getChromosome()
143 if queryChromosome not in self.tableNames:
144 continue
145 queryStart = queryTranscript.getStart()
146 queryEnd = queryTranscript.getEnd()
147 bins = getOverlappingBins(queryStart, queryEnd)
148 commands = []
149 for bin in bins:
150 command = "SELECT * FROM %s WHERE bin " % (self.tableNames[queryChromosome])
151 if bin[0] == bin[1]:
152 command += "= %d" % (bin[0])
153 else:
154 command += "BETWEEN %d AND %d" % (bin[0], bin[1])
155 commands.append(command)
156 command = " UNION ".join(commands)
157 cursor = self.connection.cursor()
158 cursor.execute(command)
159 overlap = False
160 line = cursor.fetchone()
161 while line:
162 refStart, refEnd, refTranscriptString, refBin = line
163 if refStart <= queryEnd and refEnd >= queryStart:
164 refTranscript = pickle.loads(str(refTranscriptString))
165 if refTranscript.overlapWith(queryTranscript):
166 overlap = True
167 self.nbOverlaps += 1
168 line = cursor.fetchone()
169 if overlap:
170 self.writer.addTranscript(queryTranscript)
171 self.nbWritten += 1
172 progress.done()
173 endTime = time.time()
174 self.timeSpent = endTime - startTime
175
176 def displayResults(self):
177 print "# queries: %d" % (self.nbQueries)
178 print "# refs: %d" % (self.nbRefs)
179 print "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps)
180 print "time: %.2gs" % (self.timeSpent)
181
182 def run(self):
183 self.compare()
184 self.displayResults()
185
186 if __name__ == "__main__":
187
188 description = "Find Overlaps With Several Intervals Using Bin v1.0.1: Use MySQL binning to compare intervals. [Category: Personal]"
189
190 parser = OptionParser(description = description)
191 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
192 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
193 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
194 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
195 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
196 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
197 (options, args) = parser.parse_args()
198
199 fowsib = FindOverlapsWithSeveralIntervalsBin(options.verbosity)
200 fowsib.setQueryFile(options.inputFileName1, options.format1)
201 fowsib.setReferenceFile(options.inputFileName2, options.format2)
202 fowsib.setOutputFile(options.outputFileName)
203 fowsib.run()
204