# Mercurial > repos > yufei-luo > s_mart

#!/usr/bin/env python

###@file
# Read a file recording matches in the 'tab' format (output from Matcher) and return the number of matches between queries and subjects being CC, CI, IC and II.
# A match is said to be CC (for complete-complete) when both query and subject match over x% of their entire respective length. By default, x=95.
#
# usage: tabFileReader.py [ options ]
# options:
#      -h: this help
#      -m: name of the file recording the matches (format='tab', output from Matcher)
#      -q: name of the fasta file recording the queries
#      -s: name of the fasta file recording the subjects
#      -t: coverage threshold over which the match is 'complete', in % of the seq length (default=95)
#      -i: identity below which matches are ignored (default=0)
#      -l: length below which matches are ignored (default=0)
#      -o: overlap on query and subject below which matches are ignored (default=0)
#      -I: identity threshold for 'CC' matches (default=90)
#      -E: E-value threshold for 'CC' matches (default=1e-10)
#      -T: coverage threshold for match length on query compared to subject length (default=90)
#      -v: verbose (default=0/1)

import sys
import getopt
from string import *

import pyRepet.seq.BioseqDB
import pyRepet.util.Stat

#TODO: remove case changes in headers (4 lower() method calls in this script)

#----------------------------------------------------------------------------

def help():
    """
    Print the command-line usage of this script on the standard output.

    The program name shown in the usage line is the last '/'-separated
    component of sys.argv[0].
    """
    lUsage = [
        "",
        "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ),
        "options:",
        "     -h: this help",
        "     -m: name of the file recording the matches (format='tab', output from Matcher)",
        "     -q: name of the fasta file recording the queries",
        "     -s: name of the fasta file recording the subjects",
        # this line is not %-formatted, hence a single '%' (a doubled one would be shown as is)
        "     -t: coverage threshold over which the match is 'complete' (in % of the seq length, default=95)",
        "     -i: identity below which matches are ignored (default=0)",
        "     -l: length below which matches are ignored (default=0)",
        "     -o: overlap on query and subject below which matches are ignored (default=0)",
        "     -I: identity threshold for 'CC' matches (default=90)",
        "     -E: E-value threshold for 'CC' matches (default=1e-10)",
        "     -T: coverage threshold for match length on query compare to subject length (default=90)",
        "     -v: verbose (default=0/1)",
        "",
    ]
    # a single write behaves the same under Python 2 and Python 3
    sys.stdout.write( "\n".join( lUsage ) + "\n" )

#----------------------------------------------------------------------------

#here are the fields of a '.tab' file:
#[0]: query sequence name
#[1]: whole match start coordinate on the query sequence
#[2]: whole match end coordinate on the query sequence
#[3]: length on the query sequence
#[4]: length as a fraction of the query sequence length (the parser below multiplies it by 100 to get a percentage)
#[5]: length on the query relative to the subject length, as a fraction (also multiplied by 100 by the parser)
#[6]: subject sequence name
#[7]: whole match start coordinate on the subject sequence
#[8]: whole match end coordinate on the subject sequence
#[9]: length on the subject sequence
#[10]: length as a fraction of the subject sequence length (also multiplied by 100 by the parser)
#[11]: BLAST E-value
#[12]: BLAST score
#[13]: identity percentage
#[14]: path

class tabFileReader( object ):
    """
    One match parsed from a line of a 'tab' file (output from Matcher).

    Columns 4, 5 and 10 are read as fractions and multiplied by 100, so the
    'prct_*' attributes are percentages. Start and end coordinates are stored
    so that start <= end, whatever their order in the file.
    """

    def __init__( self, line ):
        """
        Parse one line of the 'tab' file.

        @param line: tab-separated record; the fields at indexes 0 to 13 are read
        @type line: string

        Raises IndexError if the line has too few fields, ValueError if a numeric
        field cannot be converted, and ZeroDivisionError if column 4 or 10 is zero.
        """

        columns = line.split("\t")

        # --- subject side ---
        self.name_sbj = columns[6]
        self.sbj_dist_ends = int(columns[9])                    #length on the subject that matches with the query
        # total subject length = matched length / matched fraction
        self.length_sbj = int(round(self.sbj_dist_ends/float(columns[10]),0))
        self.prct_sbj = float(columns[10]) * 100                #matched length on the subject, in % of the subject length
        self.start_sbj, self.end_sbj = sorted( ( int(columns[7]), int(columns[8]) ) )

        # --- query side ---
        self.name_qry = columns[0]
        self.qry_dist_ends = int(columns[3])                    #length on the query that matches with the subject
        # total query length = matched length / matched fraction
        self.length_qry = int(round(self.qry_dist_ends/float(columns[4]),0))
        self.prct_qry = float(columns[4]) * 100                 #matched length on the query, in % of the query length
        self.start_qry, self.end_qry = sorted( ( int(columns[1]), int(columns[2]) ) )

        # --- match as a whole ---
        self.length_match = int(columns[3])
        self.prct_matchQryOverSbj = float(columns[5]) * 100     #length on the query relative to the subject length, in %
        self.identity = float(columns[13])
        self.score = int(columns[12])
        self.evalue = float(columns[11])

        # flat summaries of the match seen from each side; both have the same layout:
        # [own length, own %, own start, own end, partner name, partner length, partner %, partner start, partner end, identity, score]
        # (the partner length in 'sbj2qry' is the query length, mirroring 'qry2sbj')
        self.sbj2qry = [self.length_sbj,self.prct_sbj,self.start_sbj,self.end_sbj,self.name_qry,self.length_qry,self.prct_qry,self.start_qry,self.end_qry,self.identity,self.score]

        self.qry2sbj = [self.length_qry,self.prct_qry,self.start_qry,self.end_qry,self.name_sbj,self.length_sbj,self.prct_sbj,self.start_sbj,self.end_sbj,self.identity,self.score]

#----------------------------------------------------------------------------

def make_dico( lMatches ):
    """
    Record the matches in two dictionaries which keys are the subjects or the queries.

    @param lMatches: matches, each with the attributes 'name_sbj' and 'name_qry'
    @type lMatches: list

    @return: [ Sbj2Qry, Qry2Sbj ] where Sbj2Qry maps each subject name to the list of
             its matches and Qry2Sbj maps each query name to the list of its matches
             (matches are kept in input order)
    @rtype: list of two dictionaries
    """

    Sbj2Qry = {}
    Qry2Sbj = {}

    for match in lMatches:
        # setdefault() creates the list on first sight of a name (dict.has_key() no longer exists in Python 3)
        Sbj2Qry.setdefault( match.name_sbj, [] ).append( match )
        Qry2Sbj.setdefault( match.name_qry, [] ).append( match )

    return [ Sbj2Qry, Qry2Sbj ]

#----------------------------------------------------------------------------

def _splitUniqRedun( lNames ):
    """
    Split names into those seen exactly once and those seen several times.

    @return: ( names occurring once, names occurring more than once ), both in order
             of first appearance, the second one without duplicates
    """
    dName2Count = {}
    for name in lNames:
        dName2Count[ name ] = dName2Count.get( name, 0 ) + 1

    lUniq = []
    lRedun = []
    alreadyRedun = set()
    for name in lNames:
        if dName2Count[ name ] == 1:
            lUniq.append( name )
        elif name not in alreadyRedun:
            alreadyRedun.add( name )
            lRedun.append( name )
    return lUniq, lRedun


def find_UniqRedun( list_matchs ):
    """
    Find the subjects and queries involved in exactly one match or in several matches.

    @param list_matchs: matches, each with the attributes 'name_sbj' and 'name_qry'
    @type list_matchs: list

    @return: [ list_uniq_sbj, list_redun_sbj, list_uniq_qry, list_redun_qry ]; each list
             keeps the order of first appearance in 'list_matchs'
    @rtype: list of four lists
    """

    # names are counted once in a dictionary instead of calling list.count() for every match
    list_uniq_sbj, list_redun_sbj = _splitUniqRedun( [ match.name_sbj for match in list_matchs ] )
    list_uniq_qry, list_redun_qry = _splitUniqRedun( [ match.name_qry for match in list_matchs ] )

    return [ list_uniq_sbj, list_redun_sbj, list_uniq_qry, list_redun_qry ]

#----------------------------------------------------------------------------

def remove( all, sup_sbjqry, sup_sbj, sup_qry, inf_sbjqry ):
    """
    Keep each name only in the first category where it is found.

    The categories are ranked sup_sbjqry > sup_sbj > sup_qry > inf_sbjqry. For every
    name that is a key of 'all', the entry is deleted from each dictionary ranked
    below the first one containing it. 'sup_sbjqry' is never modified; the three
    other dictionaries are modified in place. Names absent from 'all' are left untouched.

    @param all: dictionary whose keys are the names to examine
    @param sup_sbjqry: dictionary of the first-ranked category
    @param sup_sbj: dictionary of the second-ranked category
    @param sup_qry: dictionary of the third-ranked category
    @param inf_sbjqry: dictionary of the last-ranked category

    @return: [ sup_sbj, sup_qry, inf_sbjqry ] (the same objects as given in input)
    @rtype: list of three dictionaries
    """

    lRanked = ( sup_sbjqry, sup_sbj, sup_qry, inf_sbjqry )

    # list() takes a snapshot of the keys, so the loop is safe even if 'all' is one of the dictionaries being pruned
    for name in list( all ):
        alreadyFound = False
        for dCategory in lRanked:
            if name in dCategory:
                if alreadyFound:
                    del dCategory[ name ]
                else:
                    alreadyFound = True

    return [ sup_sbj, sup_qry, inf_sbjqry ]

#----------------------------------------------------------------------------

def write_output( outFile, match_type, Sbj2Qry, dSbj2Cat, Qry2Sbj, dQry2Cat ):
    """
    Save the results (subjects in each category and its matches) in a human-readable way.

    Reads the module-level variables 'thresholdCoverage' and 'verbose' (set by main()).

    @param outFile: object with a write() method receiving the report
    @param match_type: category being written, one of 'CC', 'CI', 'IC', 'II'
    @type match_type: string
    @param Sbj2Qry: dictionary which keys are subject names and values the list of their matches
    @type Sbj2Qry: dictionary
    @param dSbj2Cat: dictionary updated in place: lower-cased first word of each subject name -> match_type
    @type dSbj2Cat: dictionary
    @param Qry2Sbj: dictionary which keys are query names (only the keys are used)
    @type Qry2Sbj: dictionary
    @param dQry2Cat: dictionary updated in place: lower-cased first word of each query name -> match_type
    @type dQry2Cat: dictionary

    Raises ValueError if 'match_type' is not one of the four categories.
    """

    if match_type == 'CC':
        msg = "Matches with L >= %i%% for subject and query (CC)" % ( thresholdCoverage )
    elif match_type == 'CI':
        msg = "Matches with L >= %i%% for subject and L < %i%% for query (CI)" % ( thresholdCoverage, thresholdCoverage )
    elif match_type == 'IC':
        msg = "Matches with L < %i%% for subject and L >= %i%% for query (IC)" % ( thresholdCoverage, thresholdCoverage )
    elif match_type == 'II':
        msg ="Matches with L < %i%% for subject and query (II)" % ( thresholdCoverage )
    else:
        # fail with an explicit message rather than on the unbound 'msg' below
        raise ValueError( "unknown match type '%s' (expected 'CC', 'CI', 'IC' or 'II')" % ( match_type ) )
    if verbose > 1:
        # print with a single parenthesised argument gives the same output under Python 2 and 3
        print( "%s: %i subjects" % ( msg, len(Sbj2Qry) ) )
    outFile.write("\n%s\n" % ( msg ) )

    # one paragraph per subject, listing the queries it matches with
    for name_sbj in Sbj2Qry:
        matchs = Sbj2Qry[name_sbj]
        first = matchs[0]
        if len(matchs) == 1:
            outFile.write("-> subject %s (%s: %s,%s) matches with query %s (%s: %s,%s): prct_sbj %.3f & prct_qry %.3f (id=%.3f,Eval=%g)\n" % (name_sbj,first.length_sbj,first.start_sbj,first.end_sbj,first.name_qry,first.length_qry,first.start_qry,first.end_qry,first.prct_sbj,first.prct_qry,first.identity,first.evalue))
        else:
            # the subject coordinates shown in this header are those of the first match only
            outFile.write("-> subject %s (%s: %s,%s) matches with %s queries:\n" % (name_sbj,first.length_sbj,first.start_sbj,first.end_sbj,len(matchs)))
            for match in matchs:
                outFile.write("%s versus %s (%s: %s,%s): prct_sbj %.3f & prct_qry %.3f (id=%.3f,Eval=%g)\n"%(name_sbj,match.name_qry,match.length_qry,match.start_qry,match.end_qry,match.prct_sbj,match.prct_qry,match.identity,match.evalue))

    # sorted list of the subjects of this category; names are cut at the first space and lower-cased
    for name_sbj in sorted( [ name.split(" ")[0].lower() for name in Sbj2Qry ] ):
        outFile.write( name_sbj+"\n" )
        dSbj2Cat[ name_sbj ] = match_type

    # same for the queries
    for name_qry in sorted( [ name.split(" ")[0].lower() for name in Qry2Sbj ] ):
        outFile.write( name_qry+"\n" )
        dQry2Cat[ name_qry ] = match_type

#----------------------------------------------------------------------------

def writeSubjectCategory( dSbj2Cat ):
    """
    Save the category (CC/CI/IC/II/NA) in which each subject has been found.

    The file is named after the module-level variable 'tabFileName' with the suffix
    '_sbjCategories.txt'; it has one line 'name<TAB>category' per subject, the
    subjects being sorted in alphabetical order.

    @param dSbj2Cat: dictionary which keys are subject names and values the category of that subject
    @type dSbj2Cat: dictionary
    """

    # 'with' closes the file even if a write fails
    with open( tabFileName + "_sbjCategories.txt", "w" ) as catFile:
        # sorted() works on the key view of Python 3 as well as on the key list of Python 2
        for sbj in sorted( dSbj2Cat ):
            catFile.write( "%s\t%s\n" % ( sbj, dSbj2Cat[ sbj ] ) )

#----------------------------------------------------------------------------

def writeQueryCategory( dQry2Cat ):
    """
    Save the category (CC/CI/IC/II/NA) in which each query has been found.

    The file is named after the module-level variable 'tabFileName' with the suffix
    '_qryCategories.txt'; it has one line 'name<TAB>category' per query, the
    queries being sorted in alphabetical order.

    @param dQry2Cat: dictionary which keys are query names and values the category of that query
    @type dQry2Cat: dictionary
    """

    # 'with' closes the file even if a write fails
    with open( tabFileName + "_qryCategories.txt", "w" ) as catFile:
        # sorted() works on the key view of Python 3 as well as on the key list of Python 2
        for qry in sorted( dQry2Cat ):
            catFile.write( "%s\t%s\n" % ( qry, dQry2Cat[ qry ] ) )

#----------------------------------------------------------------------------

def _writeCaseCounts( outFile, dSbj2Qry, lUniqSbj, lRedunSbj, dQry2Sbj, lUniqQry, lRedunQry ):
    """
    Write the six count lines shared by the 'CI', 'IC' and 'II' sections of the report.
    """
    outFile.write( "    Number of different subjects in that case: %s\n" % ( len(dSbj2Qry) ) )
    outFile.write( "        Among them, number of different subjects having exactly one match: %s\n" % ( len(lUniqSbj) ) )
    outFile.write( "        Among them, number of different subjects having more than one match: %s\n" % ( len(lRedunSbj) ) )
    outFile.write( "    Number of different queries in that case: %s\n" % ( len(dQry2Sbj) ) )
    outFile.write( "        Among them, number of different queries having exactly one match: %s\n" % ( len(lUniqQry) ) )
    outFile.write( "        Among them, number of different queries having more than one match: %s\n" % ( len(lRedunQry) ) )


def main():
    """
    Classify the matches of a 'tab' file into the CC, CI, IC and II categories and write the reports.

    Steps:
      1. parse the command line (see help()) and set the module-level variables used by
         the other functions (tabFileName, thresholdCoverage, thresholdIdentity,
         thresholdEvalue, thresholdCoverageMatch, verbose);
      2. load the query and subject banks to get their sizes and initialise every
         sequence to the category 'NA';
      3. read the 'tab' file, discard the matches below the -i/-l/-o filters and sort the
         others into the four categories;
      4. write '<tab file>_tabFileReader.txt', then the per-subject and per-query
         category files through writeSubjectCategory() and writeQueryCategory().

    Exits with status 1 on a command-line error and with status 0 (after printing
    'nothing to do') when no match is left to report.

    @return: 0 on success
    @rtype: integer
    """

    global tabFileName, thresholdCoverage, thresholdIdentity, thresholdEvalue, thresholdCoverageMatch, verbose
    tabFileName = ""
    qryFileName = ""
    sbjFileName = ""
    thresholdCoverage = 95
    minIdentity = 0
    minLength = 0
    minOverlap = 0
    thresholdIdentity = 90
    thresholdEvalue = 1e-10
    thresholdCoverageMatch = 90
    verbose = 0

    # NB: print is always called with one parenthesised argument, which gives the same output under Python 2 and 3
    try:
        opts, args = getopt.getopt( sys.argv[1:], "hm:q:s:t:i:l:I:E:T:o:v:" )
    except getopt.GetoptError as err:
        print( str(err) )
        help()
        sys.exit(1)
    for o, a in opts:
        if o == "-h":
            help()
            sys.exit(0)
        elif o == "-m":
            tabFileName = a
        elif o == "-q":
            qryFileName = a
        elif o == "-s":
            sbjFileName = a
        elif o == "-t":
            thresholdCoverage = int(a)
        elif o == "-i":
            minIdentity = float(a)
        elif o == "-l":
            minLength = int(a)
        elif o == "-o":
            minOverlap = float(a)
        elif o == "-I":
            thresholdIdentity = int(a)
        elif o == "-E":
            thresholdEvalue = float(a)
        elif o == "-T":
            thresholdCoverageMatch = int(a)
        elif o == "-v":
            verbose = int(a)

    if tabFileName == "":
        msg = "ERROR: missing 'tab' file (-m)"
        sys.stderr.write( "%s\n" % msg )
        help()
        sys.exit(1)
    if qryFileName == "" or sbjFileName == "":
        msg = "ERROR: missing 'fasta' files (-q or -s)"
        sys.stderr.write( "%s\n" % msg )
        help()
        sys.exit(1)

    if verbose > 0:
        print( "START %s" % (sys.argv[0].split("/")[-1]) )
        sys.stdout.flush()

    # 4 categories of matchs (with the default threshold of 95%):
    # type 1 (CC): the length of the match on the subject is >= 95% of the total length of the subject, idem for the query
    # type 2 (CI): sbj >= 95% & qry < 95%
    # type 3 (IC): sbj < 95% & qry >= 95%
    # type 4 (II): sbj & qry < 95%
    ListMatches_all = []
    ListMatches_sup_sbjqry = []
    ListMatches_sup_sbj = []
    ListMatches_sup_qry = []
    ListMatches_inf_sbjqry = []

    # every query and subject starts in the category 'NA'; names are cut at the first space and lower-cased
    qryDB = pyRepet.seq.BioseqDB.BioseqDB( qryFileName )
    nbQry = qryDB.getSize()
    if verbose > 0:
        print( "nb of queries in '%s': %i" % ( qryFileName, nbQry ) )
    dQry2Cat = {}
    for bs in qryDB.db:
        dQry2Cat[ bs.header.split(" ")[0].lower() ] = "NA"

    sbjDB = pyRepet.seq.BioseqDB.BioseqDB( sbjFileName )
    nbSbj = sbjDB.getSize()
    if verbose > 0:
        print( "nb of subjects in '%s': %i" % ( sbjFileName, nbSbj ) )
    dSbj2Cat = {}
    for bs in sbjDB.db:
        dSbj2Cat[ bs.header.split(" ")[0].lower() ] = "NA"

    nbMatchesInTab = 0
    dSubject2DistinctQueries = {}   # subject name -> set of the query names it matches with
    dQuery2DistinctSubjects = {}    # query name -> set of the subject names it matches with

    # For each match, create a 'tabFileReader' object and record it in a list according to the type of the match
    if verbose > 0:
        print( "parse the 'tab' file..." )
        sys.stdout.flush()
    with open( tabFileName ) as tabFile:
        for line in tabFile:
            if line.startswith( "query.name" ):   # header line
                continue
            if line.strip() == "":                # blank line: nothing to parse
                continue
            nbMatchesInTab += 1

            match = tabFileReader( line )
            # a filtered match is simply skipped: reading one more line here would
            # silently discard the record following it
            if match.identity < minIdentity:
                continue
            if match.length_match < minLength:
                continue
            if match.prct_qry < minOverlap or match.prct_sbj < minOverlap:
                continue
            ListMatches_all.append( match )

            # type 1: sbj C & qry C
            if match.prct_sbj >= thresholdCoverage and match.prct_qry >= thresholdCoverage:
                # besides the coverage, a 'CC' match needs enough identity, a low E-value and
                # sequences of similar lengths; otherwise it is counted as 'II'
                qsLengthRatio = 100 * match.length_qry / float(match.length_sbj)
                if match.identity >= thresholdIdentity \
                and match.evalue <= thresholdEvalue \
                and qsLengthRatio >= thresholdCoverage - 2 \
                and qsLengthRatio <= 100 + (100-thresholdCoverage) + 2 \
                and match.prct_matchQryOverSbj >= thresholdCoverageMatch:
                    ListMatches_sup_sbjqry.append( match )
                else:
                    ListMatches_inf_sbjqry.append( match )

            # type 2: sbj C & qry I
            elif match.prct_sbj >= thresholdCoverage and match.prct_qry < thresholdCoverage:
                ListMatches_sup_sbj.append( match )

            # type 3: sbj I & qry C
            elif match.prct_qry >= thresholdCoverage and match.prct_sbj < thresholdCoverage:
                ListMatches_sup_qry.append( match )

            # type 4: sbj I & qry I
            elif match.prct_qry < thresholdCoverage and match.prct_sbj < thresholdCoverage:
                ListMatches_inf_sbjqry.append( match )

            dSubject2DistinctQueries.setdefault( match.name_sbj, set() ).add( match.name_qry )
            dQuery2DistinctSubjects.setdefault( match.name_qry, set() ).add( match.name_sbj )

    if verbose > 0:
        print( "parsing done !" )
        sys.stdout.flush()
        print( "nb matches in '%s': %i" % ( tabFileName, nbMatchesInTab ) )
        print( "nb matches 'CC': %i" % ( len(ListMatches_sup_sbjqry) ) )
        if verbose > 1:
            for match in ListMatches_sup_sbjqry:
                print( "\t%s (%.2f%%) - %s (%.2f%%) id=%.2f" % ( match.name_sbj, match.prct_sbj, match.name_qry, match.prct_qry, match.identity ) )
        print( "nb matches 'CI': %i" % ( len(ListMatches_sup_sbj) ) )
        if verbose > 1:
            for match in ListMatches_sup_sbj:
                print( "\t%s (%.2f%%) - %s (%.2f%%) id=%.2f" % ( match.name_sbj, match.prct_sbj, match.name_qry, match.prct_qry, match.identity ) )
        print( "nb matches 'IC': %i" % ( len(ListMatches_sup_qry) ) )
        print( "nb matches 'II': %i" % ( len(ListMatches_inf_sbjqry) ) )

    # also stop when every match was filtered out: the percentages below would divide by zero
    if nbMatchesInTab == 0 or len(ListMatches_all) == 0:
        print( "nothing to do" )
        sys.exit(0)

    # For each type of matchs, record them in 2 dictionaries: Sbj2Qry and Qry2Sbj
    Sbj2Qry_all, Qry2Sbj_all = make_dico( ListMatches_all )
    Sbj2Qry_sup_sbjqry, Qry2Sbj_sup_sbjqry = make_dico( ListMatches_sup_sbjqry )
    Sbj2Qry_sup_sbj, Qry2Sbj_sup_sbj = make_dico( ListMatches_sup_sbj )
    Sbj2Qry_sup_qry, Qry2Sbj_sup_qry = make_dico( ListMatches_sup_qry )
    Sbj2Qry_inf_sbjqry, Qry2Sbj_inf_sbjqry = make_dico( ListMatches_inf_sbjqry )

    # For each type of matches, find the subjects/queries that are involved in one or several matches
    UniqSbj_all, RedunSbj_all, UniqQry_all, RedunQry_all = find_UniqRedun( ListMatches_all )
    UniqSbj_sup_sbjqry, RedunSbj_sup_sbjqry, UniqQry_sup_sbjqry, RedunQry_sup_sbjqry = find_UniqRedun( ListMatches_sup_sbjqry )
    UniqSbj_sup_sbj, RedunSbj_sup_sbj, UniqQry_sup_sbj, RedunQry_sup_sbj = find_UniqRedun( ListMatches_sup_sbj )
    UniqSbj_sup_qry, RedunSbj_sup_qry, UniqQry_sup_qry, RedunQry_sup_qry = find_UniqRedun( ListMatches_sup_qry )
    UniqSbj_inf_sbjqry, RedunSbj_inf_sbjqry, UniqQry_inf_sbjqry, RedunQry_inf_sbjqry = find_UniqRedun( ListMatches_inf_sbjqry )

    # statistics on the number of distinct partners per subject and per query
    iStatSbj = pyRepet.util.Stat.Stat()
    for sQueries in dSubject2DistinctQueries.values():
        iStatSbj.add( len( sQueries ) )
    iStatQry = pyRepet.util.Stat.Stat()
    for sSubjects in dQuery2DistinctSubjects.values():
        iStatQry.add( len( sSubjects ) )

    separator = "\n==========================================================================\n"

    # Write the review of the '.tab' file
    with open( tabFileName + "_tabFileReader.txt", "w" ) as outFile:
        outFile.write( "Input: %s\n" % ( tabFileName ) )

        outFile.write( "\n# Number of subjects in '%s': %i\n" % ( sbjFileName, nbSbj ) )
        outFile.write( "# Number of queries in '%s': %i\n" % ( qryFileName, nbQry ) )

        outFile.write( "\nNumber of matches: %s\n" % (len(ListMatches_all)))
        outFile.write( "    # Number of different subjects that match: %s (Sn*=%.2f%%)\n" % ( len(Sbj2Qry_all), 100 * len(Sbj2Qry_all) / float(nbSbj) ) )
        outFile.write( "        Among them, number of different subjects having exactly one match: %s (%.2f%%)\n" % ( len(UniqSbj_all), 100 * len(UniqSbj_all) / float(len(Sbj2Qry_all)) ) )
        outFile.write( "        Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_all)))
        outFile.write( "        Different queries per subject: mean=%.2f sd=%.2f min=%.2f q25=%.2f med=%.2f q75=%.2f max=%.2f\n" % ( iStatSbj.mean(), iStatSbj.sd(), iStatSbj.min, iStatSbj.quantile(0.25), iStatSbj.median(), iStatSbj.quantile(0.75), iStatSbj.max ) )
        outFile.write( "    # Number of different queries that match: %s (Sp*=%.2f%%)\n" % ( len(Qry2Sbj_all), 100 * len(Qry2Sbj_all) / float(nbQry) ) )
        outFile.write( "        Among them, number of different queries having exactly one match: %s (%.2f%%)\n" % ( len(UniqQry_all), 100 * len(UniqQry_all) / float(len(Qry2Sbj_all)) ) )
        outFile.write( "        Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_all)) )
        outFile.write( "        Different subjects per query: mean=%.2f sd=%.2f min=%.2f q25=%.2f med=%.2f q75=%.2f max=%.2f\n" % ( iStatQry.mean(), iStatQry.sd(), iStatQry.min, iStatQry.quantile(0.25), iStatQry.median(), iStatQry.quantile(0.75), iStatQry.max ) )

        outFile.write( "\nNumber of matches with L >= %i%% for subject & query: %i\n" % ( thresholdCoverage, len(ListMatches_sup_sbjqry) ) )
        outFile.write( "    # Number of different subjects in the 'CC' case: %s (%.2f%%)\n" % ( len(Sbj2Qry_sup_sbjqry), 100 *  len(Sbj2Qry_sup_sbjqry) / float(nbSbj) ) )
        outFile.write( "        Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_sbjqry)))
        outFile.write( "        Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_sbjqry)))
        outFile.write( "    # Number of different queries in the 'CC' case: %s (%.2f%%)\n" % ( len(Qry2Sbj_sup_sbjqry), 100 * len(Qry2Sbj_sup_sbjqry) / float(nbQry) ) )
        outFile.write( "        Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_sbjqry)))
        outFile.write( "        Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_sbjqry)))

        outFile.write( "\nNumber of matches with L >= %i%% for subject and L < %i%% for query: %i\n" % ( thresholdCoverage, thresholdCoverage, len(ListMatches_sup_sbj) ) )
        _writeCaseCounts( outFile, Sbj2Qry_sup_sbj, UniqSbj_sup_sbj, RedunSbj_sup_sbj, Qry2Sbj_sup_sbj, UniqQry_sup_sbj, RedunQry_sup_sbj )

        outFile.write( "\nNumber of matches with L < %i%% for subject and L >= %i%% for query: %i\n" % ( thresholdCoverage, thresholdCoverage, len(ListMatches_sup_qry) ) )
        _writeCaseCounts( outFile, Sbj2Qry_sup_qry, UniqSbj_sup_qry, RedunSbj_sup_qry, Qry2Sbj_sup_qry, UniqQry_sup_qry, RedunQry_sup_qry )

        outFile.write( "\nNumber of matches with L < %i%% for subject & query: %i\n" % ( thresholdCoverage, len(ListMatches_inf_sbjqry) ) )
        _writeCaseCounts( outFile, Sbj2Qry_inf_sbjqry, UniqSbj_inf_sbjqry, RedunSbj_inf_sbjqry, Qry2Sbj_inf_sbjqry, UniqQry_inf_sbjqry, RedunQry_inf_sbjqry )

        # For the elements already counted in a better category, remove them from the other dictionaries.
        # This must come after the counts above: remove() modifies the 'CI', 'IC' and 'II' dictionaries in place.
        rmv_Sbj2Qry = remove( Sbj2Qry_all, Sbj2Qry_sup_sbjqry, Sbj2Qry_sup_sbj, Sbj2Qry_sup_qry, Sbj2Qry_inf_sbjqry )
        rmv_Qry2Sbj = remove( Qry2Sbj_all, Qry2Sbj_sup_sbjqry, Qry2Sbj_sup_sbj, Qry2Sbj_sup_qry, Qry2Sbj_inf_sbjqry )

        outFile.write("\n\nAfter removal of the subjects/queries already counted in the matches with L >= %i%% for them:\n" % ( thresholdCoverage ) )

        outFile.write( "\nMatches with L >= %i%% for subject and L < %i%% for query:\n" % ( thresholdCoverage, thresholdCoverage ) )
        outFile.write( "    # Number of different subjects in the 'CI' case: %s (%.2f%%)\n" % ( len(rmv_Sbj2Qry[0]), 100*len(rmv_Sbj2Qry[0])/float(nbSbj) ) )
        outFile.write( "    # Number of different queries in the 'CI' case: %s (%.2f%%)\n" % ( len(rmv_Qry2Sbj[0]), 100*len(rmv_Qry2Sbj[0])/float(nbQry) ) )

        outFile.write( "\nMatches with L < %i%% for subject and L >= %i%% for query:\n" % ( thresholdCoverage, thresholdCoverage ) )
        outFile.write( "    # Number of different subjects in the 'IC' case: %s (%.2f%%)\n" % (len(rmv_Sbj2Qry[1]), 100*len(rmv_Sbj2Qry[1])/float(nbSbj) ) )
        outFile.write( "    # Number of different queries in the 'IC' case: %s (%.2f%%)\n" % (len(rmv_Qry2Sbj[1]), 100*len(rmv_Qry2Sbj[1])/float(nbQry) ) )

        outFile.write( "\nMatches with L < %i%% for subject & query:\n" % ( thresholdCoverage ) )
        outFile.write( "    # Number of different subjects in the 'II' case: %s (%.2f%%)\n" % (len(rmv_Sbj2Qry[2]), 100*len(rmv_Sbj2Qry[2])/float(nbSbj) ) )
        outFile.write( "    # Number of different queries in the 'II' case: %s (%.2f%%)\n" % (len(rmv_Qry2Sbj[2]), 100*len(rmv_Qry2Sbj[2])/float(nbQry) ) )

        # detailed listing of each category, each one preceded by a separator line
        lCases = [ ( 'CC', Sbj2Qry_sup_sbjqry, Qry2Sbj_sup_sbjqry ),
                   ( 'CI', rmv_Sbj2Qry[0], rmv_Qry2Sbj[0] ),
                   ( 'IC', rmv_Sbj2Qry[1], rmv_Qry2Sbj[1] ),
                   ( 'II', rmv_Sbj2Qry[2], rmv_Qry2Sbj[2] ) ]
        for match_type, dSbj2Qry, dQry2Sbj in lCases:
            outFile.write( separator )
            write_output( outFile, match_type, dSbj2Qry, dSbj2Cat, dQry2Sbj, dQry2Cat )
        outFile.write( separator )

    writeSubjectCategory( dSbj2Cat )
    writeQueryCategory( dQry2Cat )

    if verbose > 0:
        print( "END %s" % (sys.argv[0].split("/")[-1]) )
        sys.stdout.flush()

    return 0

#-----------------------------------------------------------------------------------------------------

if __name__ == "__main__":
    # run only when executed as a script; the value returned by main() becomes the exit status
    sys.exit( main() )
# author	m-zytnicki
# date	Mon, 29 Apr 2013 03:20:15 -0400
# parents
# children