0
|
1 #!/usr/bin/env python
|
|
2 # This tool takes a gff file as input and creates filters on attributes based on certain properties.
|
|
3 # The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
|
|
4 # TODO: much of this code is copied from the Filter1 tool (filtering.py in tools/stats/). The commonalities should be
|
|
5 # abstracted and leveraged in each filtering tool.
|
|
6
|
|
7 from __future__ import division
|
|
8 import sys
|
|
9 from galaxy import eggs
|
|
10 from galaxy.util.json import to_json_string, from_json_string
|
|
11
|
|
12 # Older py compatibility
|
|
# Fall back to sets.Set on interpreters that have no builtin set type.
# Only NameError is caught: that is what a missing builtin raises, and a
# bare except would also hide unrelated failures.
try:
    set()
except NameError:
    from sets import Set as set

# The rest of the script requires Python 2.4 or newer.
assert sys.version_info[:2] >= ( 2, 4 )
|
|
19
|
|
20 #
|
|
21 # Helper functions.
|
|
22 #
|
|
23
|
|
def get_operands( filter_condition ):
    """Return the set of operand tokens found in a filter condition.

    Every operator or keyword listed in items_to_strip is replaced by a
    space, and what remains is split on whitespace.

    filter_condition -- the condition text to tokenize.

    Returns a set of non-empty strings; an empty or operator-only
    condition yields an empty set.
    """
    # Note that the order of items_to_strip is important: a multi-character
    # operator must be removed before its single-character prefix (e.g. '<='
    # before '<'), otherwise a stray '=' would stay attached to an operand.
    items_to_strip = ['+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~', '<=', '<', '>=', '>', '==', '!=', '<>', ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in ']
    for item in items_to_strip:
        filter_condition = filter_condition.replace( item, ' ' )
    # Split on any whitespace run so that tabs/newlines also separate
    # operands and no empty-string pseudo-operands are produced.
    return set( filter_condition.split() )
|
|
32
|
|
def stop_err( msg ):
    """Write msg to standard error and terminate the script.

    msg -- text written to stderr unchanged (no newline is appended).

    Raises SystemExit with status 1, so the failure is visible through the
    process exit code as well as through stderr.
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
|
|
36
|
|
def check_for_executable( text, description='' ):
    """Exit through stop_err if text contains an operand on the deny list.

    text        -- expression text to inspect (a type name or the condition).
    description -- short label for text, used only in the error message.

    Operands that parse as integers are accepted without further checks.
    """
    # NOTE(review): dir() with no argument lists the names of the current
    # local scope; called here it yields only this function's parameters
    # ('description' and 'text'), so very little is actually rejected. The
    # text is later passed to eval/exec by the caller; this check should not
    # be treated as a security boundary. Confirm the intended deny list.
    secured = dir()
    for operand in get_operands( text ):
        try:
            int( operand )
        except ValueError:
            # Not a number: reject it only if it names a secured identifier.
            if operand in secured:
                stop_err( "Illegal value '%s' in %s '%s'" % ( operand, description, text ) )
|
|
47
|
|
48 #
|
|
49 # Process inputs.
|
|
50 #
|
|
51
|
|
# Fail with a readable message instead of an IndexError traceback when the
# script is called with too few arguments.
if len( sys.argv ) < 5:
    stop_err( 'Usage: <script> <input gff> <output file> <condition> <attribute types as JSON>' )

in_fname = sys.argv[1]
out_fname = sys.argv[2]
cond_text = sys.argv[3]
# JSON object mapping attribute name -> type name given as a string.
attribute_types = from_json_string( sys.argv[4] )

# Convert types from str to type objects.
# NOTE(review): eval() runs on text taken from the command line, and
# check_for_executable() is only a light screen. Consider replacing this with
# an explicit lookup table of the supported type names.
for name, a_type in attribute_types.items():
    check_for_executable(a_type)
    attribute_types[ name ] = eval( a_type )

# Unescape if input has been escaped
mapped_str = {
    '__lt__': '<',
    '__le__': '<=',
    '__eq__': '==',
    '__ne__': '!=',
    '__gt__': '>',
    '__ge__': '>=',
    '__sq__': '\'',
    '__dq__': '"',
}
for key, value in mapped_str.items():
    cond_text = cond_text.replace( key, value )

# Attempt to determine if the condition includes executable stuff and, if so, exit.
check_for_executable( cond_text, 'condition')
|
|
78
|
|
# Prepare the statement that, for every input line, binds each declared
# attribute name to its type-cast value (or None when the line lacks it).
# The attribute names become assignment targets in the generated code, so
# they have to be valid Python identifiers.
attrs, type_casts = [], []
for name, attr_type in attribute_types.items():
    attrs.append( name )
    type_cast = "get_value('%(name)s', attribute_types['%(name)s'], attribute_values)" % ( {'name': name} )
    type_casts.append( type_cast )

attr_str = ', '.join( attrs ) # e.g. 'gene_id, score'
type_cast_str = ', '.join( type_casts ) # e.g. "get_value('gene_id', ...), get_value('score', ...)"
if attrs:
    wrap = "%s = %s" % ( attr_str, type_cast_str )
else:
    # With no declared attributes the assignment would read ' = ', which is
    # a syntax error in the generated code; emit a no-op statement instead.
    wrap = "pass"
|
|
90
|
|
# Counters updated by the filtering loop and read by the final report.
total_lines = lines_kept = skipped_lines = first_invalid_line = 0
# Text of the first line that was skipped; None until one is seen.
invalid_line = None
# Destination for the lines that pass the filter.
out = open( out_fname, 'wt' )
|
|
98
|
|
99 # Helper function to safely get and type cast a value in a dict.
|
|
def get_value(name, a_type, values_dict):
    """Look up name in values_dict and return its value converted by a_type.

    name        -- key to look up.
    a_type      -- callable applied to the stored value (e.g. int, str).
    values_dict -- mapping of attribute names to raw values.

    Returns None when name is absent; any error raised by a_type propagates.
    """
    if name not in values_dict:
        return None
    raw_value = values_dict[ name ]
    return a_type( raw_value )
|
|
105
|
|
# Read and filter input file, skipping invalid lines. The loop is kept as
# source text because the attribute-binding statement (wrap) and the user's
# condition (cond_text) are only known at run time; they are substituted into
# the two %s slots and the result is executed later.
# NOTE(review): cond_text is user-supplied and becomes part of executed code;
# the earlier check_for_executable() call is only a light screen.
code = '''
for i, line in enumerate( file( in_fname ) ):
    total_lines += 1
    line = line.rstrip( '\\r\\n' )
    if not line or line.startswith( '#' ):
        skipped_lines += 1
        # Compare with None so that a blank first invalid line is remembered
        # and not overwritten by a later one.
        if invalid_line is None:
            first_invalid_line = i + 1
            invalid_line = line
        continue
    try:
        # Place attribute values into variables with attribute
        # name; type casting is done as well.
        elems = line.split( '\t' )
        attribute_values = {}
        for name_value_pair in elems[8].split(";"):
            # Split on the first run of whitespace only, so a quoted value
            # that contains spaces is kept whole and extra spaces between
            # name and value do not produce an empty value.
            pair = name_value_pair.split( None, 1 )
            if not pair:
                # Empty piece, e.g. after the trailing semicolon.
                continue
            name = pair[0]
            # Need to strip double quote from value and typecast.
            attribute_values[name] = pair[1].strip(" \\"")
        %s
        if %s:
            lines_kept += 1
            print >> out, line
    except Exception, e:
        print e
        skipped_lines += 1
        if invalid_line is None:
            first_invalid_line = i + 1
            invalid_line = line
''' % ( wrap, cond_text )
|
|
142
|
|
143 valid_filter = True
|
|
144 try:
|
|
145 exec code
|
|
146 except Exception, e:
|
|
147 out.close()
|
|
148 if str( e ).startswith( 'invalid syntax' ):
|
|
149 valid_filter = False
|
|
150 stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )
|
|
151 else:
|
|
152 stop_err( str( e ) )
|
|
153
|
|
154 if valid_filter:
|
|
155 out.close()
|
|
156 valid_lines = total_lines - skipped_lines
|
|
157 print 'Filtering with %s, ' % ( cond_text )
|
|
158 if valid_lines > 0:
|
|
159 print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
|
|
160 else:
|
|
161 print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
|
|
162 if skipped_lines > 0:
|
|
163 print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
|