18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import os
|
|
4 import sys
|
|
5 import getopt
|
|
6 import logging
|
|
7
|
|
8 if not os.environ.has_key( "REPET_PATH" ):
|
|
9 print "ERROR: no environment variable REPET_PATH"
|
|
10 sys.exit(1)
|
|
11 sys.path.append( os.environ["REPET_PATH"] )
|
|
12 import pyRepet.coord.MatchDB
|
|
13 import pyRepet.seq.BioseqDB
|
|
14
|
|
15
|
|
16 def help():
|
|
17 """
|
|
18 Give the list of the command-line options.
|
|
19 """
|
|
20 print
|
|
21 print "usage: %s [ options ]" % ( sys.argv[0] )
|
|
22 print "options:"
|
|
23 print " -h: this help"
|
|
24 print " -q: fasta filename of the queries"
|
|
25 print " -s: fasta filename of the subjects (same as queries if left blank)"
|
|
26 print " -m: output file from Matcher (format='tab')"
|
|
27 print " -o: name of the output query file (format=fasta, default=qryFileName+'.filtered')"
|
|
28 print " -i: identity threshold (default=0.95)"
|
|
29 print " -l: length threshold (default=0.98)"
|
|
30 print " -L: name of a 'log' file (usually from 'rmvRedundancy.py')"
|
|
31 print " -v: verbose (default=0/1)"
|
|
32 print
|
|
33
|
|
34
|
|
35 def writeOutQuery( qryDB, outFileName, lQryToKeep ):
|
|
36 """
|
|
37 Write in a fasta file the queries than haven't been filtered (i.e. they are not included in any subject).
|
|
38 """
|
|
39 outFile = open( outFileName, "w" )
|
|
40 nbRmvSeq = 0
|
|
41 for bs in qryDB.db:
|
|
42 if bs.header in lQryToKeep:
|
|
43 bs.write( outFile )
|
|
44 else:
|
|
45 nbRmvSeq += 1
|
|
46 outFile.close()
|
|
47 if verbose > 0:
|
|
48 print "%i removed queries out of %i" % ( nbRmvSeq, qryDB.getSize() ); sys.stdout.flush()
|
|
49
|
|
50
|
|
51 def main():
|
|
52 """
|
|
53 This program filters the ouput from Matcher by removing queries 'included' in subjects.
|
|
54 """
|
|
55 qryFileName = ""
|
|
56 sbjFileName = ""
|
|
57 tabFileName = ""
|
|
58 outFileName = ""
|
|
59 thresIdentity = 0.95 # remove the seq if it is identical to 95% of another seq
|
|
60 thresLength = 0.98 # and if its length is 98% of that seq
|
|
61 logFileName = ""
|
|
62 global verbose
|
|
63 verbose = 0
|
|
64 try:
|
|
65 opts, args = getopt.getopt(sys.argv[1:],"h:q:s:m:o:i:l:L:v:")
|
|
66 except getopt.GetoptError:
|
|
67 help()
|
|
68 sys.exit(1)
|
|
69 for o,a in opts:
|
|
70 if o == "-h":
|
|
71 help()
|
|
72 sys.exit(0)
|
|
73 elif o == "-q":
|
|
74 qryFileName = a
|
|
75 elif o == "-s":
|
|
76 sbjFileName = a
|
|
77 elif o == "-m":
|
|
78 tabFileName = a
|
|
79 elif o == "-o":
|
|
80 outFileName = a
|
|
81 elif o == "-i":
|
|
82 thresIdentity = float(a)
|
|
83 elif o == "-l":
|
|
84 thresLength = float(a)
|
|
85 elif o == "-L":
|
|
86 logFileName = a
|
|
87 elif o == "-v":
|
|
88 verbose = int(a)
|
|
89 if qryFileName == "" or tabFileName == "":
|
|
90 print "ERROR: missing compulsory options"
|
|
91 help()
|
|
92 sys.exit(1)
|
|
93 if verbose > 0:
|
|
94 print "START %s" % (sys.argv[0].split("/")[-1])
|
|
95 sys.stdout.flush()
|
|
96
|
|
97 # prepare the 'log' file
|
|
98 handler = logging.FileHandler( logFileName )
|
|
99 formatter = logging.Formatter( "%(asctime)s %(levelname)s: %(message)s" )
|
|
100 handler.setFormatter( formatter )
|
|
101 logging.getLogger('').addHandler( handler )
|
|
102 logging.getLogger('').setLevel( logging.DEBUG )
|
|
103 logging.info( "use '%s' on '%s'" % ( sys.argv[0].split("/")[-1], tabFileName ) )
|
|
104
|
|
105 if sbjFileName == "":
|
|
106 sbjFileName = qryFileName
|
|
107 if outFileName == "":
|
|
108 outFileName = "%s.filtered" % ( qryFileName )
|
|
109
|
|
110 # load the input fasta file corresponding to the queries
|
|
111 qryDB = pyRepet.seq.BioseqDB.BioseqDB( qryFileName )
|
|
112 if sbjFileName != qryFileName:
|
|
113 string = "nb of input sequences (as query only): %i" % ( qryDB.getSize() ); sys.stdout.flush()
|
|
114 logging.info( string )
|
|
115 if verbose > 0: print string
|
|
116 else:
|
|
117 string = "nb of input sequences (as query and subject): %i" % ( qryDB.getSize() ); sys.stdout.flush()
|
|
118 logging.info( string )
|
|
119 if verbose > 0: print string
|
|
120
|
|
121 # load the input 'tab' file
|
|
122 matchDB = pyRepet.coord.MatchDB.MatchDB()
|
|
123 tabFile = open( tabFileName, "r" )
|
|
124 matchDB.read( tabFile, thresIdentity, thresLength, verbose )
|
|
125 tabFile.close()
|
|
126 longString = ""
|
|
127 string = "nb of matches (id>=%.2f,qlgth>=%.2f): %i" % ( thresIdentity, thresLength, matchDB.getNbMatchesWithThres( thresIdentity, thresLength ) )
|
|
128 longString += "\n%s" % ( string )
|
|
129 if verbose > 0: print string
|
|
130 string = "nb of distinct queries having matches (id>=%.2f,qlgth>=%.2f): %i" % ( thresIdentity, thresLength, matchDB.getNbDistinctQryWithThres( thresIdentity, thresLength ) )
|
|
131 longString += "\n%s" % ( string )
|
|
132 if verbose > 0: print string
|
|
133 logging.info( longString )
|
|
134 sys.stdout.flush()
|
|
135
|
|
136 lQryToKeep = matchDB.filterDiffQrySbj( qryDB, thresIdentity, thresLength, verbose - 1 )
|
|
137
|
|
138 # here, possibility to save the information about by which match a specific query has been removed
|
|
139
|
|
140 string = "%i queries to be kept" % ( len(lQryToKeep) ); sys.stdout.flush()
|
|
141 logging.info( string )
|
|
142 if verbose > 0: print string
|
|
143
|
|
144 # write the output fasta file without the included queries
|
|
145 writeOutQuery( qryDB, outFileName, lQryToKeep )
|
|
146
|
|
147 if verbose > 0:
|
|
148 print "END %s" % (sys.argv[0].split("/")[-1])
|
|
149 sys.stdout.flush()
|
|
150
|
|
151 return 0
|
|
152
|
|
153
|
|
154 if __name__ == "__main__":
|
|
155 main ()
|