comparison smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsIndex.py @ 0:e0f8dcca02ed

Uploaded S-MART tool: a toolbox that manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2011
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import random, os, time, MySQLdb
32 from optparse import OptionParser
33 from commons.core.parsing.ParserChooser import ParserChooser
34 from commons.core.writer.TranscriptWriter import TranscriptWriter
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from SMART.Java.Python.misc.Progress import Progress
37 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
38
39
class FindOverlapsWithSeveralIntervalsIndex(object):
    """Report the query transcripts that overlap at least one reference interval.

    The reference intervals are loaded into a temporary MySQL table with an
    index on (start, end).  Each query transcript is then tested against that
    table with one SQL range query, and the queries that hit at least one
    row are written to the output file.

    Only the start and end coordinates are stored and compared by the SQL
    issued in this class.
    """

    def __init__(self, verbosity):
        """Connect to the database and choose a temporary table name.

        verbosity -- trace level handed to the parsers, the writer and the
                     progress reporters
        """
        self.verbosity = verbosity
        # Set before connecting so that __del__ stays safe if connect() raises.
        self.db = None
        self.tableName = "table_%s" % (random.randint(0, 10000))
        self.nbQueries = 0
        self.nbRefs = 0
        self.nbOverlaps = 0
        # Defined up front so displayResults() works even before compare().
        self.timeSpent = 0.0
        self.dbName = self._resolveDbName()
        self.db = MySQLdb.connect(db=self.dbName)

    @staticmethod
    def _resolveDbName(baseName="smartdb"):
        """Return the database name, prefixed with $SMARTTMPPATH when it is set."""
        # NOTE(review): a filesystem-style path is an unusual MySQL database
        # name -- confirm this prefixing is really wanted for MySQL.
        tmpPath = os.environ.get("SMARTTMPPATH")
        if tmpPath is None:
            return baseName
        # The original called the non-existent os.join() here.
        return os.path.join(tmpPath, baseName)

    def __del__(self):
        """Drop the temporary table when the object is destroyed."""
        db = getattr(self, "db", None)
        tableName = getattr(self, "tableName", None)
        # Nothing to clean up if the connection was never established.
        if db is None or tableName is None:
            return
        cursor = db.cursor()
        try:
            cursor.execute("DROP TABLE IF EXISTS %s" % (tableName))
        finally:
            cursor.close()

    def setReferenceFile(self, fileName, format):
        """Create the indexed table and fill it with the reference intervals.

        fileName -- path of the reference file
        format   -- format name used to select the parser

        Increments self.nbRefs once per inserted interval and commits once
        after all rows have been inserted.
        """
        cursor = self.db.cursor()
        try:
            cursor.execute("CREATE TABLE %s (start INT, end INT)" % (self.tableName))
            cursor.execute("CREATE INDEX index_%s ON %s (start, end)" % (self.tableName, self.tableName))
            chooser = ParserChooser(self.verbosity)
            chooser.findFormat(format)
            parser = chooser.getParser(fileName)
            # Only the table name is fixed here; %d keeps the coordinates
            # formatted as integer literals, exactly as before.
            insertCommand = "INSERT INTO %s (start, end) VALUES (%%d, %%d)" % (self.tableName)
            progress = UnlimitedProgress(1000, "Reading references", self.verbosity)
            for transcript in parser.getIterator():
                start = transcript.getStart()
                end = transcript.getEnd()
                # One cursor is reused for every row instead of a new one each time.
                cursor.execute(insertCommand % (start, end))
                self.nbRefs += 1
                progress.inc()
            self.db.commit()
            progress.done()
        finally:
            cursor.close()

    def setQueryFile(self, fileName, format):
        """Select the parser for the query file and record its transcript count.

        fileName -- path of the query file
        format   -- format name used to select the parser
        """
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.queryParser = chooser.getParser(fileName)
        self.nbQueries = self.queryParser.getNbTranscripts()

    def setOutputFile(self, fileName):
        """Create the GFF3 writer that receives the overlapping queries.

        fileName -- path of the output file
        """
        # NOTE(review): the writer is never explicitly closed or flushed in
        # this class -- confirm TranscriptWriter takes care of it itself.
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def compare(self):
        """Write every query that overlaps a reference and time the whole scan.

        A query overlaps a reference row when row.start <= query end and
        row.end >= query start.  Updates self.nbOverlaps and stores the
        elapsed wall-clock seconds in self.timeSpent.
        """
        progress = Progress(self.nbQueries, "Reading queries", self.verbosity)
        # LIMIT 1: a single matching row is enough to decide, so there is no
        # need to retrieve (and then discard) every overlapping reference.
        command = "SELECT 1 FROM %s WHERE start <= %%d and end >= %%d LIMIT 1" % (self.tableName)
        startTime = time.time()
        cursor = self.db.cursor()
        try:
            for queryTranscript in self.queryParser.getIterator():
                queryStart = queryTranscript.getStart()
                queryEnd = queryTranscript.getEnd()
                cursor.execute(command % (queryEnd, queryStart))
                if cursor.fetchone() is not None:
                    self.writer.addTranscript(queryTranscript)
                    self.nbOverlaps += 1
                progress.inc()
        finally:
            cursor.close()
        progress.done()
        endTime = time.time()
        self.timeSpent = endTime - startTime

    def displayResults(self):
        """Print the query, reference and overlap counts and the time spent."""
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("# queries: %d" % (self.nbQueries))
        print("# refs: %d" % (self.nbRefs))
        print("# overlaps: %d" % (self.nbOverlaps))
        print("time: %.2gs" % (self.timeSpent))

    def run(self):
        """Run the comparison, then print the summary."""
        self.compare()
        self.displayResults()
117
# Command-line entry point: parse the options, then load the query file,
# load the reference file, set the output file and run the comparison.
if __name__ == "__main__":

    description = "Find Overlaps With Several Intervals Using Indices v1.0.1: Use MySQL to compare intervals. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Stop with a usage message when an option documented as compulsory is
    # missing, instead of handing None to the parsers.
    # NOTE(review): -o is not marked compulsory in its help text, so it is
    # not checked here -- confirm whether an output file is really optional.
    compulsoryOptions = (
        ("inputFileName1", "-i/--input1"),
        ("format1", "-f/--format1"),
        ("inputFileName2", "-j/--input2"),
        ("format2", "-g/--format2"),
    )
    for dest, flags in compulsoryOptions:
        if getattr(options, dest) is None:
            parser.error("option %s is compulsory" % (flags))

    fowsii = FindOverlapsWithSeveralIntervalsIndex(options.verbosity)
    fowsii.setQueryFile(options.inputFileName1, options.format1)
    fowsii.setReferenceFile(options.inputFileName2, options.format2)
    fowsii.setOutputFile(options.outputFileName)
    fowsii.run()
136
137