annotate tools/filters/gff/gff_filter_by_attribute.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 # This tool takes a gff file as input and creates filters on attributes based on certain properties.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 # The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 # TODO: much of this code is copied from the Filter1 tool (filtering.py in tools/stats/). The commonalities should be
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 # abstracted and leveraged in each filtering tool.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 from __future__ import division
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 from galaxy import eggs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 from galaxy.util.json import to_json_string, from_json_string
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Older py compatibility: interpreters without a builtin ``set`` raise
# NameError on the probe below, in which case the ``sets`` module's Set class
# is bound to the same name. Only NameError is caught so that any other
# failure is not silently masked.
try:
    set()
except NameError:
    from sets import Set as set

# Refuse to run on interpreters older than 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 # Helper functions.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def get_operands( filter_condition ):
    """Return the set of operand tokens found in a filter condition.

    Every operator listed in ``items_to_strip`` is replaced by a space and
    the remaining text is split on whitespace.

    filter_condition -- the condition text, e.g. "c1 > 5 and c2 == 'x'".

    Returns a set of non-empty strings. Parentheses, commas and dots are not
    treated as separators, so they stay attached to the neighbouring token.
    """
    # Note that the order of items_to_strip is important: multi-character
    # operators must be replaced before their single-character prefixes
    # (e.g. '**' before '*', '<=' before '<').
    items_to_strip = ['+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~', '<=', '<', '>=', '>', '==', '!=', '<>', ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in ']
    for item in items_to_strip:
        filter_condition = filter_condition.replace( item, ' ' )
    # split() with no argument collapses runs of whitespace, so the blanks
    # left behind by the replacements above do not show up as '' operands.
    return set( filter_condition.split() )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def stop_err( msg ):
    """Write ``msg`` to standard error and terminate the script.

    msg -- the error text; it is written as-is, without a trailing newline.

    Raises SystemExit with exit status 1.
    """
    sys.stderr.write( msg )
    # sys.exit() without an argument means exit status 0 (success); an error
    # exit must report a non-zero status to the calling process.
    sys.exit( 1 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def check_for_executable( text, description='' ):
    """Exit via stop_err() if an operand of ``text`` is a secured name.

    text        -- the expression to inspect (a type name or the filter
                   condition).
    description -- label for ``text`` used in the error message.

    Operands are obtained from get_operands(); operands that parse as
    integers are ignored, any other operand is looked up in ``secured``.
    """
    # Attempt to determine if the condition includes executable stuff and, if so, exit.
    # NOTE(review): dir() without arguments lists the names of the current
    # *local* scope. Called here, that is only this function's parameters
    # ('text' and 'description'), not the module's globals, so this check
    # rejects very little. Widening it would change which conditions are
    # accepted and needs a deliberate decision - confirm intent.
    secured = dir()
    for operand in get_operands( text ):
        try:
            int( operand )
        except ValueError:
            # Not an integer literal, so it may be a name.
            if operand in secured:
                stop_err( "Illegal value '%s' in %s '%s'" % ( operand, description, text ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 # Process inputs.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Positional arguments: input path, output path, filter condition text and a
# JSON-encoded mapping of attribute name -> type expression.
in_fname = sys.argv[1]
out_fname = sys.argv[2]
cond_text = sys.argv[3]
attribute_types = from_json_string( sys.argv[4] )

# Convert types from str to type objects.
# NOTE(review): eval() is applied to text that comes from the command line;
# check_for_executable() is the only screening done beforehand.
for name in attribute_types.keys():
    a_type = attribute_types[ name ]
    check_for_executable(a_type)
    attribute_types[ name ] = eval( a_type )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 # Unescape if input has been escaped
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Escaped token -> literal operator or quote character it stands for.
mapped_str = {
    '__lt__': '<',
    '__le__': '<=',
    '__eq__': '==',
    '__ne__': '!=',
    '__gt__': '>',
    '__ge__': '>=',
    '__sq__': "'",
    '__dq__': '"',
}
# Substitute every escaped token in the condition with its literal form.
for key in mapped_str:
    value = mapped_str[ key ]
    cond_text = cond_text.replace( key, value )

# Screen the unescaped condition before it is used; exits on an illegal operand.
check_for_executable( cond_text, 'condition')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Prepare the statement that binds each attribute listed in attribute_types
# to a variable of the same name. Values are fetched and type cast through
# get_value(), which yields None for attributes missing from a line.
attrs, type_casts = [], []
for name, attr_type in attribute_types.items():
    attrs.append( name )
    type_cast = "get_value('%(name)s', attribute_types['%(name)s'], attribute_values)" % ( {'name': name} )
    type_casts.append( type_cast )

attr_str = ', '.join( attrs ) # e.g. 'a, b'
type_cast_str = ', '.join( type_casts ) # e.g. "get_value('a', ...), get_value('b', ...)"
if attrs:
    wrap = "%s = %s" % ( attr_str, type_cast_str )
else:
    # Without attributes the template above would produce ' = ', which is
    # not a valid statement and would be reported as an invalid filter
    # condition; emit a no-op statement instead.
    wrap = "pass"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 # Stats
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Counters updated by the exec'd filter loop and read by the final report.
skipped_lines = first_invalid_line = lines_kept = total_lines = 0
# Text of the first skipped line; stays None while nothing has been skipped.
invalid_line = None
# Output file for the lines that pass the filter.
out = open( out_fname, 'wt' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 # Helper function to safely get and type cast a value in a dict.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def get_value(name, a_type, values_dict):
    """Look up ``name`` in ``values_dict`` and convert it with ``a_type``.

    name        -- key to look up.
    a_type      -- callable applied to the stored value (e.g. int, str).
    values_dict -- mapping of names to raw values.

    Returns the converted value, or None when ``name`` is not a key of
    ``values_dict``. Any exception raised by ``a_type`` propagates.
    """
    if name not in values_dict:
        return None
    raw_value = values_dict[ name ]
    return a_type( raw_value )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 # Read and filter input file, skipping invalid lines
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Source of the filter loop; it is completed with the attribute-binding
# statement (wrap) and the user's condition (cond_text) and exec'd later.
# For every input line it:
#   * counts blank lines and lines starting with '#' as skipped,
#   * parses column 9 as ';'-separated 'name value' pairs, using the first
#     space-separated word after the name, stripped of spaces and double
#     quotes, as the value,
#   * binds the attributes to variables and writes the line to ``out`` when
#     the condition holds,
#   * prints the exception and counts the line as skipped when anything in
#     that processing fails.
# The first skipped line is remembered with ``invalid_line is None`` rather
# than a truth test, so a blank first skipped line (the empty string) is not
# overwritten by a later one.
code = '''
for i, line in enumerate( open( in_fname ) ):
    total_lines += 1
    line = line.rstrip( '\\r\\n' )
    if not line or line.startswith( '#' ):
        skipped_lines += 1
        if invalid_line is None:
            first_invalid_line = i + 1
            invalid_line = line
        continue
    try:
        # Place attribute values into variables with attribute
        # name; type casting is done as well.
        elems = line.split( '\t' )
        attribute_values = {}
        for name_value_pair in elems[8].split(";"):
            pair = name_value_pair.strip().split(" ")
            name = pair[0].strip()
            if name == '':
                continue
            # Need to strip double quote from value and typecast.
            attribute_values[name] = pair[1].strip(" \\"")
        %s
        if %s:
            lines_kept += 1
            print >> out, line
    except Exception, e:
        print e
        skipped_lines += 1
        if invalid_line is None:
            first_invalid_line = i + 1
            invalid_line = line
''' % ( wrap, cond_text )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 valid_filter = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 exec code
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 out.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 if str( e ).startswith( 'invalid syntax' ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 valid_filter = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 stop_err( str( e ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 if valid_filter:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 out.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 valid_lines = total_lines - skipped_lines
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157 print 'Filtering with %s, ' % ( cond_text )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 if valid_lines > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161 print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 if skipped_lines > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )