view commons/tools/srptTableOverlap.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
line wrap: on
line source

#!/usr/bin/env python

import os
import sys
import getopt
import logging
import string
import ConfigParser

from pyRepet.sql.TableAdaptator import *
import pyRepet.sql.RepetDBMySQL
import pyRepet.coord.Map
import pyRepet.coord.Path
import pyRepet.coord.Set


def help():
    print
    print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] )
    print "options:"
    print "     -h: this help"
    print "     -q: query table"
    print "     -s: subject table"
    print "     -p: by path"
    print "     -t: table type comparison: qtype/stype where qtype=[map,set,path] and stype=[path,set,map]"
    print "     -c: configuration file from TEdenovo or TEannot pipeline"
    print "     -H: MySQL host (if no configuration file)"
    print "     -U: MySQL user (if no configuration file)"
    print "     -P: MySQL password (if no configuration file)"
    print "     -D: MySQL database (if no configuration file)"
    print
    
    
def pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose=0 ):
    
    if qtype == "path":
        db.create_path_index( qtable )
        qtablePathAdaptator = TablePathAdaptator( db, qtable )
        path_num_list = qtablePathAdaptator.getPath_num()
    elif qtype == "set":
        db.create_set_index( qtable )
        qtableSetAdaptator = TableSetAdaptator( db, qtable )
        path_num_list = qtableSetAdaptator.getSet_num()
    else:
        string = "unknown query table type: %s" % ( qtype )
        if verbose > 0:
            print string
        logging.error( string )
        sys.exit(1)
    string = "nb of paths in query table: %i" % (len(path_num_list) )
    if verbose > 0:
        print string
    logging.info( string )
    
    if stype == "path":
        stablePathAdaptator = TableBinPathAdaptator( db, stable )
#        stablePathAdaptator=TablePathAdaptator(db,stable)
    elif stype == "set":
        stableSetAdaptator = TableBinSetAdaptator( db, stable )
#        stableSetAdaptator=TableSetAdaptator(db,stable)
    else:
        string = "unknown subject table type: %s" % ( stype )
        if verbose > 0:
            print string
        logging.error( string )
        sys.exit(1)
        
    count = 0
    for path_num in path_num_list:
        if qtype == "path":
            qlist = qtablePathAdaptator.getPathList_from_num( path_num )
            qlist = pyRepet.coord.Path.path_list_rangeQ2Set( qlist )
        elif qtype == "set":
            qlist = qtableSetAdaptator.getSetList_from_num( path_num )
            
        qlist.sort()
        qmin, qmax = pyRepet.coord.Set.set_list_boundaries( qlist )
        
        qmin = qmin - 1
        qmax = qmax + 1
        if stype == "path":
            slist = stablePathAdaptator.getPathList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax)
            slist = pyRepet.coord.Path.path_list_rangeQ2Set( slist )
        elif stype == "set":
            slist = stableSetAdaptator.getSetList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax)
            
        if len(slist) > 0:
            print "----------------------------------------"
            print "query:"
            pyRepet.coord.Set.set_list_show( qlist )
            qlist=pyRepet.coord.Set.set_list_merge( qlist )
            qsize=pyRepet.coord.Set.set_list_size( qlist )
            print "query size=",qsize
            
            slist_dict = pyRepet.coord.Set.set_list_split( slist )
            subj_names = ""
            for i,l in slist_dict.items():
                if subj_names != "":
                    subj_names += "|"
                subj_names += "%d:%s" % (i,l[0].name)
            subj_count = len(slist_dict.keys())
            
            print "subject:"
            pyRepet.coord.Set.set_list_show( slist )
            slist = pyRepet.coord.Set.set_list_merge( slist )
            ssize = pyRepet.coord.Set.set_list_size( slist )
            osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist )
            
            print "subject size=",ssize
            print "overlap size=",osize
            
            fout.write("%d\t%s\t%d\t%s\t%d\t%d\t%d\t%f\t%f\n"\
                       %(path_num,qlist[0].name,\
                         qsize,\
                         subj_names,\
                         subj_count,\
                         ssize,\
                         osize,\
                         float(osize)/qsize,\
                         float(osize)/ssize))
        else:
            print "----------------------------------------"
            print "query:"
            pyRepet.coord.Set.set_list_show( qlist )
            qlist = pyRepet.coord.Set.set_list_merge( qlist )
            qsize = pyRepet.coord.Set.set_list_size( qlist )
            print "query size=",qsize
            print "No match!"
            fout.write("%d\t%s\t%d\t-\t0\t0\t0\t0.0\t0.0\n"\
                       %(path_num,qlist[0].name,qsize))
            
            
def getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose=0 ):
    """
    For each query in qtable, compute the overlap between its matches and the matches in stable.
    """
    if qtype =="map":
        qtableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, qtable )
    elif qtype == "path":
        qtableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, qtable )
    elif qtype == "set":
        qtableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, qtable )
    else:
        string = "unknown query table type: %s" % ( qtype )
        if verbose > 0:
            print string
        logging.error( string )
        sys.exit(1)
        
    string = "fetching query table data..."
    contigs = qtableAdaptator.getContig_name()
    string += " %i contig(s)" % ( len(contigs) )
    if verbose > 0:
        print string; sys.stdout.flush()
    logging.info( string )
    
    if stype == "map":
        stableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, stable )
    elif stype == "path":
        stableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, stable )
    elif stype == "set":
        stableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, stable )
    else:
        string = "unknown subject table type: %s" % ( stype )
        if verbose > 0:
            print string
        logging.error( string )
        sys.exit(1)
        
    string = "looking for overlaps with subject data..."
    if verbose > 0:
        print string; sys.stdout.flush()
    logging.info( string )
    sum_qsize = 0
    sum_osize = 0
    sum_non_osize = 0
    for c in contigs:
        string = "contig '%s': "%(c)
        qlist = qtableAdaptator.getSetList_from_contig( c )
        qlist = pyRepet.coord.Set.set_list_merge( qlist )
        slist = stableAdaptator.getSetList_from_contig( c )
        slist = pyRepet.coord.Set.set_list_merge( slist )
        qsize = pyRepet.coord.Set.set_list_size( qlist )
        osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist )
        sum_osize += osize
        sum_qsize += qsize
        sum_non_osize += qsize - osize
        string += "qsize=%d osize=%d" % ( qsize, osize )
        logging.debug( string )
        if verbose > 0:
            print string; sys.stdout.flush()
            
    string = "summary:"
    string += "\ncumulative coverage of the query table: %i nt" % ( sum_qsize )
    string += "\nsize of overlaps with the subject table: %i nt" % ( sum_osize )
    string += "\n proportion of query: %.3f" % ( float(sum_osize)/sum_qsize )
    string += "\nsize of non-overlaps with the subject table: %i nt" % ( sum_non_osize )
    string += "\n proportion of query: %.3f" % ( float(sum_non_osize)/sum_qsize )
    if verbose > 0:
        print string; sys.stdout.flush()
    logging.info( string )
    
    return sum_osize, sum_non_osize, sum_qsize


def main ():
    """
    This program computes the overlaps between two tables recording spatial coordinates.
    """
    qtable = ""
    stable = ""
    type = ""
    by_path = False
    configFileName = ""
    host = ""
    user = ""
    passwd = ""
    db = ""
    verbose = 0
    try:
        opts, args = getopt.getopt( sys.argv[1:], "hq:s:t:pc:H:U:P:D:v:" )
    except getopt.GetoptError:
        help()
        sys.exit(1)
    if len(args) != 0:
        help()
        sys.exit(1)
    for o,a in opts:
        if o == "-h":
            help()
            sys.exit(0)
        elif o == "-q":
            qtable = a
        elif o == "-s":
            stable = a
        elif o == "-t":
            type = a
        elif o == "-p":
            by_path = True
        elif o == "-c":
            configFileName = a
        elif o == "-H":
            host = a
        elif o == "-U":
            user = a 
        elif o == "-P":
            passwd = a
        elif o == "-D":
            db = a
        elif o == "-v":
            verbose = int(a)
    if qtable=="" or stable=="" or \
    (configFileName== "" and (host=="" or \
                              user=="" or passwd=="" or db=="")):
        print "ERROR: missing compulsory options"
        help()
        sys.exit(1)
    if verbose > 0:
        print "START %s" % (sys.argv[0].split("/")[-1])
        sys.stdout.flush()
        
    if configFileName != "":
        config = ConfigParser.ConfigParser()
        config.readfp( open(configFileName) )
        host = config.get("repet_env","repet_host")
        user = config.get("repet_env","repet_user")
        passwd = config.get("repet_env","repet_pw")
        dbname = config.get("repet_env","repet_db")
        
    logfilename = qtable + "-" + stable + "-" + str(os.getpid()) + ".log"
    handler = logging.FileHandler( logfilename )
    formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
    handler.setFormatter( formatter )
    logging.getLogger('').addHandler(handler)
    logging.getLogger('').setLevel(logging.DEBUG)
    logging.info("started")
    
    db = pyRepet.sql.RepetDBMySQL.RepetDB( user, host, passwd, dbname )
    
    qtype, stype = type.split("/")
    
    if not db.exist( qtable ):
        if not os.path.exists( qtable ):
            msg = "ERROR: neither table nor file '%s'" % ( qtable )
            sys.stderr.write( "%s\n" % msg )
            sys.exit(1)
        tmp = qtable.replace(".","_")
        db.create_table( db, tmp, qtable, qtype )
        qtable = tmp
    if not db.exist( stable ):
        if not os.path.exists( stable ):
            msg = "ERROR: neither table nor file '%s'" % ( stable )
            sys.stderr.write( "%s\n" % msg )
            sys.exit(1)
        tmp = stable.replace(".","_")
        db.create_table( db, tmp, stable, qtype )
        stable = tmp
        
    string = "input tables:"
    string += "\nquery table: %s ('%s' format)" % ( qtable, qtype )
    string += "\nsubject table: %s ('%s' format)" % ( stable, stype )
    logging.info( string )
    
    if by_path:
        fout = open(qtable+"_vs_"+stable+".dat","w")
        pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose )
        fout.close()
    else:
        getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose )
        
    logging.info("finished")
    
    if verbose > 0:
        print "END %s" % (sys.argv[0].split("/")[-1])
        sys.stdout.flush()
        
        
if __name__ == "__main__":
    main()