annotate commons/tools/FilterAlign.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #!/usr/bin/env python
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 import sys
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 import getopt
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def help():
    """
    Print the command-line usage message on standard output.

    The program name shown is the last path component of sys.argv[0].
    The message is framed by one empty line before and one after.
    """
    progName = os.path.basename(sys.argv[0])
    usageLines = [
        "",
        # two spaces after the colon: the historical output of
        # `print "usage: ", name, ...`
        "usage:  %s [ options ]" % progName,
        "options:",
        " -h: this help",
        " -i: name of the input file (format='align')",
        " -E: maximum E-value (default=100)",
        " -S: minimum score (default=0)",
        " -I: minimum identity (default=0)",
        " -l: minimum length (default=0)",
        " -L: maximum length (default=1000000000)",
        " -o: name of the output file (default=inFileName+'.filtered')",
        " -v: verbose (default=0/1)",
        "",
    ]
    # A single-argument print() call behaves the same on Python 2 and 3.
    print("\n".join(usageLines))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def main():
    """
    This program filters the output from BLASTER ('align' file recording HSPs).

    Options are read from sys.argv (see help()). Each input line holds at
    least nine tab-separated fields, in this order: query name, query start,
    query end, subject name, subject start, subject end, E-value, score,
    identity. A match is kept when its E-value is <= the -E threshold, its
    score and identity are >= the -S and -I thresholds, and its length on the
    query (|end - start| + 1) lies within [-l, -L]. Kept matches are written
    to the output file as their first nine fields, unchanged.

    If the REPET_PATH environment variable is set and
    $REPET_PATH/bin/filterAlign exists, the filtering is delegated to that
    program and its os.system() status is returned. Otherwise the filtering
    is done here and 0 is returned.

    Exits with status 1 on a bad option or when no input file is given.
    """
    inFileName = ""
    outFileName = ""
    maxEValue = 100
    minIdentity = 0
    minLength = 0
    maxLength = 1000000000
    minScore = 0
    verbose = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hi:E:S:I:l:L:o:v:")
    except getopt.GetoptError as err:
        print(str(err))
        help()
        sys.exit(1)
    for o, a in opts:
        if o == "-h":
            help()
            sys.exit()
        elif o == "-i":
            inFileName = a
        elif o == "-E":
            maxEValue = float(a)
        elif o == "-S":
            minScore = int(float(a))
        elif o == "-I":
            # Keep the fractional part: identities are compared as floats and
            # the threshold is forwarded with "%f", so truncating "98.5" to 98
            # would silently loosen the filter.
            minIdentity = float(a)
        elif o == "-l":
            minLength = int(a)
        elif o == "-L":
            maxLength = int(a)
        elif o == "-o":
            outFileName = a
        elif o == "-v":
            verbose = int(a)

    if inFileName == "":
        print("ERROR: missing input file name")
        help()
        sys.exit(1)

    if outFileName == "":
        outFileName = "%s.filtered" % (inFileName)

    # Prefer the external filterAlign program when it is installed. A missing
    # REPET_PATH simply means "not installed", not an error.
    repetPath = os.environ.get("REPET_PATH")
    if repetPath is not None:
        prg = repetPath + "/bin/filterAlign"
        if os.path.exists(prg):
            try:
                from shlex import quote  # Python 3
            except ImportError:
                from pipes import quote  # Python 2
            # Quote every path so that spaces or shell metacharacters in file
            # names cannot break (or inject into) the command line.
            cmd = quote(prg)
            cmd += " -i %s" % (quote(inFileName))
            cmd += " -E %g" % (maxEValue)
            cmd += " -S %i" % (minScore)
            cmd += " -I %f" % (minIdentity)
            cmd += " -l %i" % (minLength)
            cmd += " -L %i" % (maxLength)
            cmd += " -o %s" % (quote(outFileName))
            cmd += " -v %i" % (verbose)
            return os.system(cmd)

    progName = os.path.basename(sys.argv[0])
    if verbose > 0:
        print("START %s" % (progName))
        sys.stdout.flush()

    nbMatches = 0
    nbFiltered = 0

    # 'with' guarantees both files are closed even if a line fails to parse.
    with open(inFileName, "r") as inFile:
        with open(outFileName, "w") as outFile:
            for lineNumber, line in enumerate(inFile, 1):
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing empty line)
                    continue
                nbMatches += 1
                data = line.split("\t")
                if len(data) < 9:
                    raise IndexError(
                        "line %i of '%s': expected 9 tab-separated fields, got %i"
                        % (lineNumber, inFileName, len(data)))
                qryName, qryStart, qryEnd = data[0], data[1], data[2]
                sbjName, sbjStart, sbjEnd = data[3], data[4], data[5]
                Evalue, score, identity = data[6], data[7], data[8]

                # abs() covers both strands and the start == end case
                # (a match of length 1).
                matchLength = abs(int(qryEnd) - int(qryStart)) + 1

                isKept = (float(Evalue) <= maxEValue
                          and minLength <= matchLength <= maxLength
                          and float(identity) >= minIdentity
                          and int(score) >= minScore)
                if isKept:
                    # The ninth field carries the line terminator when the
                    # line has exactly nine fields, so none is added here.
                    outFile.write("\t".join(data[:9]))
                else:
                    nbFiltered += 1
                    if verbose > 1:
                        report = "qry %s (%s-%s) vs subj %s (%s-%s): Eval=%s identity=%s matchLength=%s score=%s" %\
                                 (qryName, qryStart, qryEnd, sbjName, sbjStart, sbjEnd,
                                  Evalue, identity.split("\n")[0], matchLength, score)
                        print(report)
                        sys.stdout.flush()

    if verbose > 0:
        msg = "total number of matches: %i" % (nbMatches)
        msg += "\nnumber of filtered matches: %i" % (nbFiltered)
        print(msg)
        sys.stdout.flush()
        print("END %s" % (progName))
        sys.stdout.flush()

    return 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
146
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
147
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
if __name__ == "__main__":
    # main() returns 0 on success, or the raw os.system() status when the
    # external filterAlign program was run. Collapse it to a conventional 0/1
    # exit code so that a failed run is visible to the calling shell (a raw
    # wait status such as 256 would wrap around to exit code 0).
    sys.exit(1 if main() else 0)