annotate commons/tools/filterOutMatcher.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #!/usr/bin/env python
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 import sys
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 import getopt
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6 import logging
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8 if not os.environ.has_key( "REPET_PATH" ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9 print "ERROR: no environment variable REPET_PATH"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11 sys.path.append( os.environ["REPET_PATH"] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
12 import pyRepet.coord.MatchDB
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
13 import pyRepet.seq.BioseqDB
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
14
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
15
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
16 def help():
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
17 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
18 Give the list of the command-line options.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
19 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
20 print
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
21 print "usage: %s [ options ]" % ( sys.argv[0] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22 print "options:"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23 print " -h: this help"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
24 print " -q: fasta filename of the queries"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
25 print " -s: fasta filename of the subjects (same as queries if left blank)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
26 print " -m: output file from Matcher (format='tab')"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
27 print " -o: name of the output query file (format=fasta, default=qryFileName+'.filtered')"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
28 print " -i: identity threshold (default=0.95)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
29 print " -l: length threshold (default=0.98)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
30 print " -L: name of a 'log' file (usually from 'rmvRedundancy.py')"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
31 print " -v: verbose (default=0/1)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
32 print
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
33
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
34
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
35 def writeOutQuery( qryDB, outFileName, lQryToKeep ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
36 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
37 Write in a fasta file the queries than haven't been filtered (i.e. they are not included in any subject).
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
38 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
39 outFile = open( outFileName, "w" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
40 nbRmvSeq = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
41 for bs in qryDB.db:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
42 if bs.header in lQryToKeep:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
43 bs.write( outFile )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
44 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
45 nbRmvSeq += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
46 outFile.close()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
47 if verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
48 print "%i removed queries out of %i" % ( nbRmvSeq, qryDB.getSize() ); sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
49
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
50
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
51 def main():
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
52 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
53 This program filters the ouput from Matcher by removing queries 'included' in subjects.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
54 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
55 qryFileName = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
56 sbjFileName = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
57 tabFileName = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
58 outFileName = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
59 thresIdentity = 0.95 # remove the seq if it is identical to 95% of another seq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
60 thresLength = 0.98 # and if its length is 98% of that seq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
61 logFileName = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
62 global verbose
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
63 verbose = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
64 try:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
65 opts, args = getopt.getopt(sys.argv[1:],"h:q:s:m:o:i:l:L:v:")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
66 except getopt.GetoptError:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
67 help()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
68 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
69 for o,a in opts:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
70 if o == "-h":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
71 help()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
72 sys.exit(0)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
73 elif o == "-q":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
74 qryFileName = a
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
75 elif o == "-s":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
76 sbjFileName = a
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
77 elif o == "-m":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
78 tabFileName = a
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
79 elif o == "-o":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
80 outFileName = a
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
81 elif o == "-i":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
82 thresIdentity = float(a)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
83 elif o == "-l":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
84 thresLength = float(a)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
85 elif o == "-L":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
86 logFileName = a
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
87 elif o == "-v":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
88 verbose = int(a)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
89 if qryFileName == "" or tabFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
90 print "ERROR: missing compulsory options"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
91 help()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
92 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
93 if verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
94 print "START %s" % (sys.argv[0].split("/")[-1])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
95 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
96
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
97 # prepare the 'log' file
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
98 handler = logging.FileHandler( logFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
99 formatter = logging.Formatter( "%(asctime)s %(levelname)s: %(message)s" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
100 handler.setFormatter( formatter )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
101 logging.getLogger('').addHandler( handler )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
102 logging.getLogger('').setLevel( logging.DEBUG )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
103 logging.info( "use '%s' on '%s'" % ( sys.argv[0].split("/")[-1], tabFileName ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
104
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
105 if sbjFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
106 sbjFileName = qryFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
107 if outFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
108 outFileName = "%s.filtered" % ( qryFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
109
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
110 # load the input fasta file corresponding to the queries
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
111 qryDB = pyRepet.seq.BioseqDB.BioseqDB( qryFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
112 if sbjFileName != qryFileName:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
113 string = "nb of input sequences (as query only): %i" % ( qryDB.getSize() ); sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
114 logging.info( string )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
115 if verbose > 0: print string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
116 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
117 string = "nb of input sequences (as query and subject): %i" % ( qryDB.getSize() ); sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
118 logging.info( string )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
119 if verbose > 0: print string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
120
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
121 # load the input 'tab' file
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
122 matchDB = pyRepet.coord.MatchDB.MatchDB()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
123 tabFile = open( tabFileName, "r" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
124 matchDB.read( tabFile, thresIdentity, thresLength, verbose )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
125 tabFile.close()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
126 longString = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
127 string = "nb of matches (id>=%.2f,qlgth>=%.2f): %i" % ( thresIdentity, thresLength, matchDB.getNbMatchesWithThres( thresIdentity, thresLength ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
128 longString += "\n%s" % ( string )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
129 if verbose > 0: print string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
130 string = "nb of distinct queries having matches (id>=%.2f,qlgth>=%.2f): %i" % ( thresIdentity, thresLength, matchDB.getNbDistinctQryWithThres( thresIdentity, thresLength ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
131 longString += "\n%s" % ( string )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
132 if verbose > 0: print string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
133 logging.info( longString )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
134 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
135
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
136 lQryToKeep = matchDB.filterDiffQrySbj( qryDB, thresIdentity, thresLength, verbose - 1 )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
137
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
138 # here, possibility to save the information about by which match a specific query has been removed
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
139
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
140 string = "%i queries to be kept" % ( len(lQryToKeep) ); sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
141 logging.info( string )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
142 if verbose > 0: print string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
143
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
144 # write the output fasta file without the included queries
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
145 writeOutQuery( qryDB, outFileName, lQryToKeep )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
146
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
147 if verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
148 print "END %s" % (sys.argv[0].split("/")[-1])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
149 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
150
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
151 return 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
152
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
153
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
154 if __name__ == "__main__":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
155 main ()