diff commons/tools/srptTableOverlap.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/tools/srptTableOverlap.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,319 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import getopt
+import logging
+import string
+import ConfigParser
+
+from pyRepet.sql.TableAdaptator import *
+import pyRepet.sql.RepetDBMySQL
+import pyRepet.coord.Map
+import pyRepet.coord.Path
+import pyRepet.coord.Set
+
+
+def help():
+    print
+    print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] )
+    print "options:"
+    print "     -h: this help"
+    print "     -q: query table"
+    print "     -s: subject table"
+    print "     -p: by path"
+    print "     -t: table type comparison: qtype/stype where qtype=[map,set,path] and stype=[path,set,map]"
+    print "     -c: configuration file from TEdenovo or TEannot pipeline"
+    print "     -H: MySQL host (if no configuration file)"
+    print "     -U: MySQL user (if no configuration file)"
+    print "     -P: MySQL password (if no configuration file)"
+    print "     -D: MySQL database (if no configuration file)"
+    print
+    
+    
+def pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose=0 ):
+    
+    if qtype == "path":
+        db.create_path_index( qtable )
+        qtablePathAdaptator = TablePathAdaptator( db, qtable )
+        path_num_list = qtablePathAdaptator.getPath_num()
+    elif qtype == "set":
+        db.create_set_index( qtable )
+        qtableSetAdaptator = TableSetAdaptator( db, qtable )
+        path_num_list = qtableSetAdaptator.getSet_num()
+    else:
+        string = "unknown query table type: %s" % ( qtype )
+        if verbose > 0:
+            print string
+        logging.error( string )
+        sys.exit(1)
+    string = "nb of paths in query table: %i" % (len(path_num_list) )
+    if verbose > 0:
+        print string
+    logging.info( string )
+    
+    if stype == "path":
+        stablePathAdaptator = TableBinPathAdaptator( db, stable )
+#        stablePathAdaptator=TablePathAdaptator(db,stable)
+    elif stype == "set":
+        stableSetAdaptator = TableBinSetAdaptator( db, stable )
+#        stableSetAdaptator=TableSetAdaptator(db,stable)
+    else:
+        string = "unknown subject table type: %s" % ( stype )
+        if verbose > 0:
+            print string
+        logging.error( string )
+        sys.exit(1)
+        
+    count = 0
+    for path_num in path_num_list:
+        if qtype == "path":
+            qlist = qtablePathAdaptator.getPathList_from_num( path_num )
+            qlist = pyRepet.coord.Path.path_list_rangeQ2Set( qlist )
+        elif qtype == "set":
+            qlist = qtableSetAdaptator.getSetList_from_num( path_num )
+            
+        qlist.sort()
+        qmin, qmax = pyRepet.coord.Set.set_list_boundaries( qlist )
+        
+        qmin = qmin - 1
+        qmax = qmax + 1
+        if stype == "path":
+            slist = stablePathAdaptator.getPathList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax)
+            slist = pyRepet.coord.Path.path_list_rangeQ2Set( slist )
+        elif stype == "set":
+            slist = stableSetAdaptator.getSetList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax)
+            
+        if len(slist) > 0:
+            print "----------------------------------------"
+            print "query:"
+            pyRepet.coord.Set.set_list_show( qlist )
+            qlist=pyRepet.coord.Set.set_list_merge( qlist )
+            qsize=pyRepet.coord.Set.set_list_size( qlist )
+            print "query size=",qsize
+            
+            slist_dict = pyRepet.coord.Set.set_list_split( slist )
+            subj_names = ""
+            for i,l in slist_dict.items():
+                if subj_names != "":
+                    subj_names += "|"
+                subj_names += "%d:%s" % (i,l[0].name)
+            subj_count = len(slist_dict.keys())
+            
+            print "subject:"
+            pyRepet.coord.Set.set_list_show( slist )
+            slist = pyRepet.coord.Set.set_list_merge( slist )
+            ssize = pyRepet.coord.Set.set_list_size( slist )
+            osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist )
+            
+            print "subject size=",ssize
+            print "overlap size=",osize
+            
+            fout.write("%d\t%s\t%d\t%s\t%d\t%d\t%d\t%f\t%f\n"\
+                       %(path_num,qlist[0].name,\
+                         qsize,\
+                         subj_names,\
+                         subj_count,\
+                         ssize,\
+                         osize,\
+                         float(osize)/qsize,\
+                         float(osize)/ssize))
+        else:
+            print "----------------------------------------"
+            print "query:"
+            pyRepet.coord.Set.set_list_show( qlist )
+            qlist = pyRepet.coord.Set.set_list_merge( qlist )
+            qsize = pyRepet.coord.Set.set_list_size( qlist )
+            print "query size=",qsize
+            print "No match!"
+            fout.write("%d\t%s\t%d\t-\t0\t0\t0\t0.0\t0.0\n"\
+                       %(path_num,qlist[0].name,qsize))
+            
+            
+def getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose=0 ):
+    """
+    For each query in qtable, compute the overlap between its matches and the matches in stable.
+    """
+    if qtype =="map":
+        qtableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, qtable )
+    elif qtype == "path":
+        qtableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, qtable )
+    elif qtype == "set":
+        qtableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, qtable )
+    else:
+        string = "unknown query table type: %s" % ( qtype )
+        if verbose > 0:
+            print string
+        logging.error( string )
+        sys.exit(1)
+        
+    string = "fetching query table data..."
+    contigs = qtableAdaptator.getContig_name()
+    string += " %i contig(s)" % ( len(contigs) )
+    if verbose > 0:
+        print string; sys.stdout.flush()
+    logging.info( string )
+    
+    if stype == "map":
+        stableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, stable )
+    elif stype == "path":
+        stableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, stable )
+    elif stype == "set":
+        stableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, stable )
+    else:
+        string = "unknown subject table type: %s" % ( stype )
+        if verbose > 0:
+            print string
+        logging.error( string )
+        sys.exit(1)
+        
+    string = "looking for overlaps with subject data..."
+    if verbose > 0:
+        print string; sys.stdout.flush()
+    logging.info( string )
+    sum_qsize = 0
+    sum_osize = 0
+    sum_non_osize = 0
+    for c in contigs:
+        string = "contig '%s': "%(c)
+        qlist = qtableAdaptator.getSetList_from_contig( c )
+        qlist = pyRepet.coord.Set.set_list_merge( qlist )
+        slist = stableAdaptator.getSetList_from_contig( c )
+        slist = pyRepet.coord.Set.set_list_merge( slist )
+        qsize = pyRepet.coord.Set.set_list_size( qlist )
+        osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist )
+        sum_osize += osize
+        sum_qsize += qsize
+        sum_non_osize += qsize - osize
+        string += "qsize=%d osize=%d" % ( qsize, osize )
+        logging.debug( string )
+        if verbose > 0:
+            print string; sys.stdout.flush()
+            
+    string = "summary:"
+    string += "\ncumulative coverage of the query table: %i nt" % ( sum_qsize )
+    string += "\nsize of overlaps with the subject table: %i nt" % ( sum_osize )
+    string += "\n proportion of query: %.3f" % ( float(sum_osize)/sum_qsize )
+    string += "\nsize of non-overlaps with the subject table: %i nt" % ( sum_non_osize )
+    string += "\n proportion of query: %.3f" % ( float(sum_non_osize)/sum_qsize )
+    if verbose > 0:
+        print string; sys.stdout.flush()
+    logging.info( string )
+    
+    return sum_osize, sum_non_osize, sum_qsize
+
+
+def main ():
+    """
+    This program computes the overlaps between two tables recording spatial coordinates.
+    """
+    qtable = ""
+    stable = ""
+    type = ""
+    by_path = False
+    configFileName = ""
+    host = ""
+    user = ""
+    passwd = ""
+    db = ""
+    verbose = 0
+    try:
+        opts, args = getopt.getopt( sys.argv[1:], "hq:s:t:pc:H:U:P:D:v:" )
+    except getopt.GetoptError:
+        help()
+        sys.exit(1)
+    if len(args) != 0:
+        help()
+        sys.exit(1)
+    for o,a in opts:
+        if o == "-h":
+            help()
+            sys.exit(0)
+        elif o == "-q":
+            qtable = a
+        elif o == "-s":
+            stable = a
+        elif o == "-t":
+            type = a
+        elif o == "-p":
+            by_path = True
+        elif o == "-c":
+            configFileName = a
+        elif o == "-H":
+            host = a
+        elif o == "-U":
+            user = a 
+        elif o == "-P":
+            passwd = a
+        elif o == "-D":
+            db = a
+        elif o == "-v":
+            verbose = int(a)
+    if qtable=="" or stable=="" or \
+    (configFileName== "" and (host=="" or \
+                              user=="" or passwd=="" or db=="")):
+        print "ERROR: missing compulsory options"
+        help()
+        sys.exit(1)
+    if verbose > 0:
+        print "START %s" % (sys.argv[0].split("/")[-1])
+        sys.stdout.flush()
+        
+    if configFileName != "":
+        config = ConfigParser.ConfigParser()
+        config.readfp( open(configFileName) )
+        host = config.get("repet_env","repet_host")
+        user = config.get("repet_env","repet_user")
+        passwd = config.get("repet_env","repet_pw")
+        dbname = config.get("repet_env","repet_db")
+        
+    logfilename = qtable + "-" + stable + "-" + str(os.getpid()) + ".log"
+    handler = logging.FileHandler( logfilename )
+    formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
+    handler.setFormatter( formatter )
+    logging.getLogger('').addHandler(handler)
+    logging.getLogger('').setLevel(logging.DEBUG)
+    logging.info("started")
+    
+    db = pyRepet.sql.RepetDBMySQL.RepetDB( user, host, passwd, dbname )
+    
+    qtype, stype = type.split("/")
+    
+    if not db.exist( qtable ):
+        if not os.path.exists( qtable ):
+            msg = "ERROR: neither table nor file '%s'" % ( qtable )
+            sys.stderr.write( "%s\n" % msg )
+            sys.exit(1)
+        tmp = qtable.replace(".","_")
+        db.create_table( db, tmp, qtable, qtype )
+        qtable = tmp
+    if not db.exist( stable ):
+        if not os.path.exists( stable ):
+            msg = "ERROR: neither table nor file '%s'" % ( stable )
+            sys.stderr.write( "%s\n" % msg )
+            sys.exit(1)
+        tmp = stable.replace(".","_")
+        db.create_table( db, tmp, stable, qtype )
+        stable = tmp
+        
+    string = "input tables:"
+    string += "\nquery table: %s ('%s' format)" % ( qtable, qtype )
+    string += "\nsubject table: %s ('%s' format)" % ( stable, stype )
+    logging.info( string )
+    
+    if by_path:
+        fout = open(qtable+"_vs_"+stable+".dat","w")
+        pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose )
+        fout.close()
+    else:
+        getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose )
+        
+    logging.info("finished")
+    
+    if verbose > 0:
+        print "END %s" % (sys.argv[0].split("/")[-1])
+        sys.stdout.flush()
+        
+        
+if __name__ == "__main__":
+    main()