Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/filterOutMatcher.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os | |
4 import sys | |
5 import getopt | |
6 import logging | |
7 | |
8 if not os.environ.has_key( "REPET_PATH" ): | |
9 print "ERROR: no environment variable REPET_PATH" | |
10 sys.exit(1) | |
11 sys.path.append( os.environ["REPET_PATH"] ) | |
12 import pyRepet.coord.MatchDB | |
13 import pyRepet.seq.BioseqDB | |
14 | |
15 | |
16 def help(): | |
17 """ | |
18 Give the list of the command-line options. | |
19 """ | |
20 print | |
21 print "usage: %s [ options ]" % ( sys.argv[0] ) | |
22 print "options:" | |
23 print " -h: this help" | |
24 print " -q: fasta filename of the queries" | |
25 print " -s: fasta filename of the subjects (same as queries if left blank)" | |
26 print " -m: output file from Matcher (format='tab')" | |
27 print " -o: name of the output query file (format=fasta, default=qryFileName+'.filtered')" | |
28 print " -i: identity threshold (default=0.95)" | |
29 print " -l: length threshold (default=0.98)" | |
30 print " -L: name of a 'log' file (usually from 'rmvRedundancy.py')" | |
31 print " -v: verbose (default=0/1)" | |
32 print | |
33 | |
34 | |
35 def writeOutQuery( qryDB, outFileName, lQryToKeep ): | |
36 """ | |
37 Write in a fasta file the queries than haven't been filtered (i.e. they are not included in any subject). | |
38 """ | |
39 outFile = open( outFileName, "w" ) | |
40 nbRmvSeq = 0 | |
41 for bs in qryDB.db: | |
42 if bs.header in lQryToKeep: | |
43 bs.write( outFile ) | |
44 else: | |
45 nbRmvSeq += 1 | |
46 outFile.close() | |
47 if verbose > 0: | |
48 print "%i removed queries out of %i" % ( nbRmvSeq, qryDB.getSize() ); sys.stdout.flush() | |
49 | |
50 | |
51 def main(): | |
52 """ | |
53 This program filters the ouput from Matcher by removing queries 'included' in subjects. | |
54 """ | |
55 qryFileName = "" | |
56 sbjFileName = "" | |
57 tabFileName = "" | |
58 outFileName = "" | |
59 thresIdentity = 0.95 # remove the seq if it is identical to 95% of another seq | |
60 thresLength = 0.98 # and if its length is 98% of that seq | |
61 logFileName = "" | |
62 global verbose | |
63 verbose = 0 | |
64 try: | |
65 opts, args = getopt.getopt(sys.argv[1:],"h:q:s:m:o:i:l:L:v:") | |
66 except getopt.GetoptError: | |
67 help() | |
68 sys.exit(1) | |
69 for o,a in opts: | |
70 if o == "-h": | |
71 help() | |
72 sys.exit(0) | |
73 elif o == "-q": | |
74 qryFileName = a | |
75 elif o == "-s": | |
76 sbjFileName = a | |
77 elif o == "-m": | |
78 tabFileName = a | |
79 elif o == "-o": | |
80 outFileName = a | |
81 elif o == "-i": | |
82 thresIdentity = float(a) | |
83 elif o == "-l": | |
84 thresLength = float(a) | |
85 elif o == "-L": | |
86 logFileName = a | |
87 elif o == "-v": | |
88 verbose = int(a) | |
89 if qryFileName == "" or tabFileName == "": | |
90 print "ERROR: missing compulsory options" | |
91 help() | |
92 sys.exit(1) | |
93 if verbose > 0: | |
94 print "START %s" % (sys.argv[0].split("/")[-1]) | |
95 sys.stdout.flush() | |
96 | |
97 # prepare the 'log' file | |
98 handler = logging.FileHandler( logFileName ) | |
99 formatter = logging.Formatter( "%(asctime)s %(levelname)s: %(message)s" ) | |
100 handler.setFormatter( formatter ) | |
101 logging.getLogger('').addHandler( handler ) | |
102 logging.getLogger('').setLevel( logging.DEBUG ) | |
103 logging.info( "use '%s' on '%s'" % ( sys.argv[0].split("/")[-1], tabFileName ) ) | |
104 | |
105 if sbjFileName == "": | |
106 sbjFileName = qryFileName | |
107 if outFileName == "": | |
108 outFileName = "%s.filtered" % ( qryFileName ) | |
109 | |
110 # load the input fasta file corresponding to the queries | |
111 qryDB = pyRepet.seq.BioseqDB.BioseqDB( qryFileName ) | |
112 if sbjFileName != qryFileName: | |
113 string = "nb of input sequences (as query only): %i" % ( qryDB.getSize() ); sys.stdout.flush() | |
114 logging.info( string ) | |
115 if verbose > 0: print string | |
116 else: | |
117 string = "nb of input sequences (as query and subject): %i" % ( qryDB.getSize() ); sys.stdout.flush() | |
118 logging.info( string ) | |
119 if verbose > 0: print string | |
120 | |
121 # load the input 'tab' file | |
122 matchDB = pyRepet.coord.MatchDB.MatchDB() | |
123 tabFile = open( tabFileName, "r" ) | |
124 matchDB.read( tabFile, thresIdentity, thresLength, verbose ) | |
125 tabFile.close() | |
126 longString = "" | |
127 string = "nb of matches (id>=%.2f,qlgth>=%.2f): %i" % ( thresIdentity, thresLength, matchDB.getNbMatchesWithThres( thresIdentity, thresLength ) ) | |
128 longString += "\n%s" % ( string ) | |
129 if verbose > 0: print string | |
130 string = "nb of distinct queries having matches (id>=%.2f,qlgth>=%.2f): %i" % ( thresIdentity, thresLength, matchDB.getNbDistinctQryWithThres( thresIdentity, thresLength ) ) | |
131 longString += "\n%s" % ( string ) | |
132 if verbose > 0: print string | |
133 logging.info( longString ) | |
134 sys.stdout.flush() | |
135 | |
136 lQryToKeep = matchDB.filterDiffQrySbj( qryDB, thresIdentity, thresLength, verbose - 1 ) | |
137 | |
138 # here, possibility to save the information about by which match a specific query has been removed | |
139 | |
140 string = "%i queries to be kept" % ( len(lQryToKeep) ); sys.stdout.flush() | |
141 logging.info( string ) | |
142 if verbose > 0: print string | |
143 | |
144 # write the output fasta file without the included queries | |
145 writeOutQuery( qryDB, outFileName, lQryToKeep ) | |
146 | |
147 if verbose > 0: | |
148 print "END %s" % (sys.argv[0].split("/")[-1]) | |
149 sys.stdout.flush() | |
150 | |
151 return 0 | |
152 | |
153 | |
154 if __name__ == "__main__": | |
155 main () |