comparison commons/tools/srptTableOverlap.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
comparison
equal deleted inserted replaced
30:5677346472b5 31:0ab839023fe4
1 #!/usr/bin/env python
2
3 import os
4 import sys
5 import getopt
6 import logging
7 import string
8 import ConfigParser
9
10 from pyRepet.sql.TableAdaptator import *
11 import pyRepet.sql.RepetDBMySQL
12 import pyRepet.coord.Map
13 import pyRepet.coord.Path
14 import pyRepet.coord.Set
15
16
17 def help():
18 print
19 print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] )
20 print "options:"
21 print " -h: this help"
22 print " -q: query table"
23 print " -s: subject table"
24 print " -p: by path"
25 print " -t: table type comparison: qtype/stype where qtype=[map,set,path] and stype=[path,set,map]"
26 print " -c: configuration file from TEdenovo or TEannot pipeline"
27 print " -H: MySQL host (if no configuration file)"
28 print " -U: MySQL user (if no configuration file)"
29 print " -P: MySQL password (if no configuration file)"
30 print " -D: MySQL database (if no configuration file)"
31 print
32
33
34 def pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose=0 ):
35
36 if qtype == "path":
37 db.create_path_index( qtable )
38 qtablePathAdaptator = TablePathAdaptator( db, qtable )
39 path_num_list = qtablePathAdaptator.getPath_num()
40 elif qtype == "set":
41 db.create_set_index( qtable )
42 qtableSetAdaptator = TableSetAdaptator( db, qtable )
43 path_num_list = qtableSetAdaptator.getSet_num()
44 else:
45 string = "unknown query table type: %s" % ( qtype )
46 if verbose > 0:
47 print string
48 logging.error( string )
49 sys.exit(1)
50 string = "nb of paths in query table: %i" % (len(path_num_list) )
51 if verbose > 0:
52 print string
53 logging.info( string )
54
55 if stype == "path":
56 stablePathAdaptator = TableBinPathAdaptator( db, stable )
57 # stablePathAdaptator=TablePathAdaptator(db,stable)
58 elif stype == "set":
59 stableSetAdaptator = TableBinSetAdaptator( db, stable )
60 # stableSetAdaptator=TableSetAdaptator(db,stable)
61 else:
62 string = "unknown subject table type: %s" % ( stype )
63 if verbose > 0:
64 print string
65 logging.error( string )
66 sys.exit(1)
67
68 count = 0
69 for path_num in path_num_list:
70 if qtype == "path":
71 qlist = qtablePathAdaptator.getPathList_from_num( path_num )
72 qlist = pyRepet.coord.Path.path_list_rangeQ2Set( qlist )
73 elif qtype == "set":
74 qlist = qtableSetAdaptator.getSetList_from_num( path_num )
75
76 qlist.sort()
77 qmin, qmax = pyRepet.coord.Set.set_list_boundaries( qlist )
78
79 qmin = qmin - 1
80 qmax = qmax + 1
81 if stype == "path":
82 slist = stablePathAdaptator.getPathList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax)
83 slist = pyRepet.coord.Path.path_list_rangeQ2Set( slist )
84 elif stype == "set":
85 slist = stableSetAdaptator.getSetList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax)
86
87 if len(slist) > 0:
88 print "----------------------------------------"
89 print "query:"
90 pyRepet.coord.Set.set_list_show( qlist )
91 qlist=pyRepet.coord.Set.set_list_merge( qlist )
92 qsize=pyRepet.coord.Set.set_list_size( qlist )
93 print "query size=",qsize
94
95 slist_dict = pyRepet.coord.Set.set_list_split( slist )
96 subj_names = ""
97 for i,l in slist_dict.items():
98 if subj_names != "":
99 subj_names += "|"
100 subj_names += "%d:%s" % (i,l[0].name)
101 subj_count = len(slist_dict.keys())
102
103 print "subject:"
104 pyRepet.coord.Set.set_list_show( slist )
105 slist = pyRepet.coord.Set.set_list_merge( slist )
106 ssize = pyRepet.coord.Set.set_list_size( slist )
107 osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist )
108
109 print "subject size=",ssize
110 print "overlap size=",osize
111
112 fout.write("%d\t%s\t%d\t%s\t%d\t%d\t%d\t%f\t%f\n"\
113 %(path_num,qlist[0].name,\
114 qsize,\
115 subj_names,\
116 subj_count,\
117 ssize,\
118 osize,\
119 float(osize)/qsize,\
120 float(osize)/ssize))
121 else:
122 print "----------------------------------------"
123 print "query:"
124 pyRepet.coord.Set.set_list_show( qlist )
125 qlist = pyRepet.coord.Set.set_list_merge( qlist )
126 qsize = pyRepet.coord.Set.set_list_size( qlist )
127 print "query size=",qsize
128 print "No match!"
129 fout.write("%d\t%s\t%d\t-\t0\t0\t0\t0.0\t0.0\n"\
130 %(path_num,qlist[0].name,qsize))
131
132
133 def getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose=0 ):
134 """
135 For each query in qtable, compute the overlap between its matches and the matches in stable.
136 """
137 if qtype =="map":
138 qtableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, qtable )
139 elif qtype == "path":
140 qtableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, qtable )
141 elif qtype == "set":
142 qtableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, qtable )
143 else:
144 string = "unknown query table type: %s" % ( qtype )
145 if verbose > 0:
146 print string
147 logging.error( string )
148 sys.exit(1)
149
150 string = "fetching query table data..."
151 contigs = qtableAdaptator.getContig_name()
152 string += " %i contig(s)" % ( len(contigs) )
153 if verbose > 0:
154 print string; sys.stdout.flush()
155 logging.info( string )
156
157 if stype == "map":
158 stableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, stable )
159 elif stype == "path":
160 stableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, stable )
161 elif stype == "set":
162 stableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, stable )
163 else:
164 string = "unknown subject table type: %s" % ( stype )
165 if verbose > 0:
166 print string
167 logging.error( string )
168 sys.exit(1)
169
170 string = "looking for overlaps with subject data..."
171 if verbose > 0:
172 print string; sys.stdout.flush()
173 logging.info( string )
174 sum_qsize = 0
175 sum_osize = 0
176 sum_non_osize = 0
177 for c in contigs:
178 string = "contig '%s': "%(c)
179 qlist = qtableAdaptator.getSetList_from_contig( c )
180 qlist = pyRepet.coord.Set.set_list_merge( qlist )
181 slist = stableAdaptator.getSetList_from_contig( c )
182 slist = pyRepet.coord.Set.set_list_merge( slist )
183 qsize = pyRepet.coord.Set.set_list_size( qlist )
184 osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist )
185 sum_osize += osize
186 sum_qsize += qsize
187 sum_non_osize += qsize - osize
188 string += "qsize=%d osize=%d" % ( qsize, osize )
189 logging.debug( string )
190 if verbose > 0:
191 print string; sys.stdout.flush()
192
193 string = "summary:"
194 string += "\ncumulative coverage of the query table: %i nt" % ( sum_qsize )
195 string += "\nsize of overlaps with the subject table: %i nt" % ( sum_osize )
196 string += "\n proportion of query: %.3f" % ( float(sum_osize)/sum_qsize )
197 string += "\nsize of non-overlaps with the subject table: %i nt" % ( sum_non_osize )
198 string += "\n proportion of query: %.3f" % ( float(sum_non_osize)/sum_qsize )
199 if verbose > 0:
200 print string; sys.stdout.flush()
201 logging.info( string )
202
203 return sum_osize, sum_non_osize, sum_qsize
204
205
206 def main ():
207 """
208 This program computes the overlaps between two tables recording spatial coordinates.
209 """
210 qtable = ""
211 stable = ""
212 type = ""
213 by_path = False
214 configFileName = ""
215 host = ""
216 user = ""
217 passwd = ""
218 db = ""
219 verbose = 0
220 try:
221 opts, args = getopt.getopt( sys.argv[1:], "hq:s:t:pc:H:U:P:D:v:" )
222 except getopt.GetoptError:
223 help()
224 sys.exit(1)
225 if len(args) != 0:
226 help()
227 sys.exit(1)
228 for o,a in opts:
229 if o == "-h":
230 help()
231 sys.exit(0)
232 elif o == "-q":
233 qtable = a
234 elif o == "-s":
235 stable = a
236 elif o == "-t":
237 type = a
238 elif o == "-p":
239 by_path = True
240 elif o == "-c":
241 configFileName = a
242 elif o == "-H":
243 host = a
244 elif o == "-U":
245 user = a
246 elif o == "-P":
247 passwd = a
248 elif o == "-D":
249 db = a
250 elif o == "-v":
251 verbose = int(a)
252 if qtable=="" or stable=="" or \
253 (configFileName== "" and (host=="" or \
254 user=="" or passwd=="" or db=="")):
255 print "ERROR: missing compulsory options"
256 help()
257 sys.exit(1)
258 if verbose > 0:
259 print "START %s" % (sys.argv[0].split("/")[-1])
260 sys.stdout.flush()
261
262 if configFileName != "":
263 config = ConfigParser.ConfigParser()
264 config.readfp( open(configFileName) )
265 host = config.get("repet_env","repet_host")
266 user = config.get("repet_env","repet_user")
267 passwd = config.get("repet_env","repet_pw")
268 dbname = config.get("repet_env","repet_db")
269
270 logfilename = qtable + "-" + stable + "-" + str(os.getpid()) + ".log"
271 handler = logging.FileHandler( logfilename )
272 formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
273 handler.setFormatter( formatter )
274 logging.getLogger('').addHandler(handler)
275 logging.getLogger('').setLevel(logging.DEBUG)
276 logging.info("started")
277
278 db = pyRepet.sql.RepetDBMySQL.RepetDB( user, host, passwd, dbname )
279
280 qtype, stype = type.split("/")
281
282 if not db.exist( qtable ):
283 if not os.path.exists( qtable ):
284 msg = "ERROR: neither table nor file '%s'" % ( qtable )
285 sys.stderr.write( "%s\n" % msg )
286 sys.exit(1)
287 tmp = qtable.replace(".","_")
288 db.create_table( db, tmp, qtable, qtype )
289 qtable = tmp
290 if not db.exist( stable ):
291 if not os.path.exists( stable ):
292 msg = "ERROR: neither table nor file '%s'" % ( stable )
293 sys.stderr.write( "%s\n" % msg )
294 sys.exit(1)
295 tmp = stable.replace(".","_")
296 db.create_table( db, tmp, stable, qtype )
297 stable = tmp
298
299 string = "input tables:"
300 string += "\nquery table: %s ('%s' format)" % ( qtable, qtype )
301 string += "\nsubject table: %s ('%s' format)" % ( stable, stype )
302 logging.info( string )
303
304 if by_path:
305 fout = open(qtable+"_vs_"+stable+".dat","w")
306 pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose )
307 fout.close()
308 else:
309 getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose )
310
311 logging.info("finished")
312
313 if verbose > 0:
314 print "END %s" % (sys.argv[0].split("/")[-1])
315 sys.stdout.flush()
316
317
318 if __name__ == "__main__":
319 main()