0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
|
|
5
|
|
6 usage: %prog [options]
|
|
7 -i, --input=i: tabular input file
|
|
8 -o, --output=o: filtered output file
|
|
9 -c, --cond=c: conditions to filter on
|
|
10 -n, --n_handling=n: how to handle N and X
|
|
11 -l, --columns=l: columns
|
|
12 -t, --col_types=t: column types
|
|
13
|
|
14 """
|
|
15
|
|
16 #from __future__ import division
|
|
17 import os.path, re, string, string, sys
|
|
18 from galaxy import eggs
|
|
19 import pkg_resources; pkg_resources.require( "bx-python" )
|
|
20 from bx.cookbook import doc_optparse
|
|
21
|
|
22 # Older py compatibility
|
|
23 try:
|
|
24 set()
|
|
25 except:
|
|
26 from sets import Set as set
|
|
27
|
|
28 #assert sys.version_info[:2] >= ( 2, 4 )
|
|
29
|
|
30 def get_operands( filter_condition ):
|
|
31 # Note that the order of all_operators is important
|
|
32 items_to_strip = [ '==', '!=', ' and ', ' or ' ]
|
|
33 for item in items_to_strip:
|
|
34 if filter_condition.find( item ) >= 0:
|
|
35 filter_condition = filter_condition.replace( item, ' ' )
|
|
36 operands = set( filter_condition.split( ' ' ) )
|
|
37 return operands
|
|
38
|
|
39 def stop_err( msg ):
|
|
40 sys.stderr.write( msg )
|
|
41 sys.exit()
|
|
42
|
|
43 def __main__():
|
|
44 #Parse Command Line
|
|
45 options, args = doc_optparse.parse( __doc__ )
|
|
46 input = options.input
|
|
47 output = options.output
|
|
48 cond = options.cond
|
|
49 n_handling = options.n_handling
|
|
50 columns = options.columns
|
|
51 col_types = options.col_types
|
|
52
|
|
53 try:
|
|
54 in_columns = int( columns )
|
|
55 assert col_types #check to see that the column types variable isn't null
|
|
56 in_column_types = col_types.split( ',' )
|
|
57 except:
|
|
58 stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )
|
|
59
|
|
60 # Unescape if input has been escaped
|
|
61 cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
|
|
62 orig_cond_text = cond_text
|
|
63 # Expand to allow for DNA codes
|
|
64 dot_letters = [ letter for letter in string.uppercase if letter not in \
|
|
65 [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
|
|
66 dot_letters.append( '.' )
|
|
67 codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
|
|
68 'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
|
|
69 'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
|
|
70 'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
|
|
71 'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
|
|
72 'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
|
|
73 'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
|
|
74 'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
|
|
75 'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
|
|
76 'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
|
|
77 'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
|
|
78 'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
|
|
79 'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
|
|
80 'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
|
|
81 'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
|
|
82 '.': dot_letters,
|
|
83 '-': [ '-' ]}
|
|
84 # Add handling for N and X
|
|
85 if n_handling == "all":
|
|
86 codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
|
|
87 codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
|
|
88 for code in codes.keys():
|
|
89 if code != '.' and code != '-':
|
|
90 codes[code].append( 'N' )
|
|
91 codes[code].append( 'X' )
|
|
92 else:
|
|
93 codes[ 'N' ] = dot_letters
|
|
94 codes[ 'X' ] = dot_letters
|
|
95 codes[ '.' ].extend( [ 'N', 'X' ] )
|
|
96 # Expand conditions to allow for DNA codes
|
|
97 try:
|
|
98 match_replace = {}
|
|
99 pat = re.compile( 'c\d+\s*[!=]=\s*[\w\d"\'+-.]+' )
|
|
100 matches = pat.findall( cond_text )
|
|
101 for match in matches:
|
|
102 if match.find( 'chr' ) >= 0 or match.find( 'scaffold' ) >= 0 or match.find( '+' ) >= 0:
|
|
103 if match.find( '==' ) >= 0:
|
|
104 match_parts = match.split( '==' )
|
|
105 elif match.find( '!=' ) >= 0:
|
|
106 match_parts = match.split( '!=' )
|
|
107 else:
|
|
108 raise Exception, "The operators '==' and '!=' were not found."
|
|
109 left = match_parts[0].strip()
|
|
110 right = match_parts[1].strip()
|
|
111 new_match = "(%s)" % ( match )
|
|
112 elif match.find( '==' ) > 0:
|
|
113 match_parts = match.split( '==' )
|
|
114 left = match_parts[0].strip()
|
|
115 right = match_parts[1].strip()
|
|
116 new_match = '(%s in codes[%s] and %s in codes[%s])' % ( left, right, right, left )
|
|
117 elif match.find( '!=' ) > 0 :
|
|
118 match_parts = match.split( '!=' )
|
|
119 left = match_parts[0].strip()
|
|
120 right = match_parts[1].strip()
|
|
121 new_match = '(%s not in codes[%s] or %s not in codes[%s])' % ( left, right, right, left )
|
|
122 else:
|
|
123 raise Exception, "The operators '==' and '!=' were not found."
|
|
124 assert left.startswith( 'c' ), 'The column names should start with c (lowercase)'
|
|
125 if right.find( "'" ) >= 0 or right.find( '"' ) >= 0:
|
|
126 test = right.replace( "'", '' ).replace( '"', '' )
|
|
127 assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0\
|
|
128 or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
|
|
129 'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
|
|
130 'Use the general filter tool to filter on anything else first'
|
|
131 else:
|
|
132 assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
|
|
133 match_replace[match] = new_match
|
|
134 if len( match_replace.keys() ) == 0:
|
|
135 raise Exception, 'There do not appear to be any valid conditions'
|
|
136 for match in match_replace.keys():
|
|
137 cond_text = cond_text.replace( match, match_replace[match] )
|
|
138 except Exception, e:
|
|
139 stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str(e) )
|
|
140
|
|
141 # Attempt to determine if the condition includes executable stuff and, if so, exit
|
|
142 secured = dir()
|
|
143 operands = get_operands( cond_text )
|
|
144 for operand in operands:
|
|
145 try:
|
|
146 check = int( operand )
|
|
147 except:
|
|
148 if operand in secured:
|
|
149 stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
|
|
150
|
|
151 # Prepare the column variable names and wrappers for column data types
|
|
152 cols, type_casts = [], []
|
|
153 for col in range( 1, in_columns + 1 ):
|
|
154 col_name = "c%d" % col
|
|
155 cols.append( col_name )
|
|
156 col_type = in_column_types[ col - 1 ]
|
|
157 type_cast = "%s(%s)" % ( col_type, col_name )
|
|
158 type_casts.append( type_cast )
|
|
159
|
|
160 col_str = ', '.join( cols ) # 'c1, c2, c3, c4'
|
|
161 type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)'
|
|
162 assign = "%s = line.split( '\\t' )" % col_str
|
|
163 wrap = "%s = %s" % ( col_str, type_cast_str )
|
|
164 skipped_lines = 0
|
|
165 first_invalid_line = 0
|
|
166 invalid_line = None
|
|
167 lines_kept = 0
|
|
168 total_lines = 0
|
|
169 out = open( output, 'wt' )
|
|
170 # Read and filter input file, skipping invalid lines
|
|
171 code = '''
|
|
172 for i, line in enumerate( file( input ) ):
|
|
173 total_lines += 1
|
|
174 line = line.rstrip( '\\r\\n' )
|
|
175 if not line or line.startswith( '#' ):
|
|
176 skipped_lines += 1
|
|
177 if not invalid_line:
|
|
178 first_invalid_line = i + 1
|
|
179 invalid_line = line
|
|
180 continue
|
|
181 try:
|
|
182 %s = line.split( '\\t' )
|
|
183 %s = %s
|
|
184 if %s:
|
|
185 lines_kept += 1
|
|
186 print >> out, line
|
|
187 except Exception, e:
|
|
188 skipped_lines += 1
|
|
189 if not invalid_line:
|
|
190 first_invalid_line = i + 1
|
|
191 invalid_line = line
|
|
192 ''' % ( col_str, col_str, type_cast_str, cond_text )
|
|
193
|
|
194 valid_filter = True
|
|
195 try:
|
|
196 exec code
|
|
197 except Exception, e:
|
|
198 out.close()
|
|
199 if str( e ).startswith( 'invalid syntax' ):
|
|
200 valid_filter = False
|
|
201 stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' '+str(e))
|
|
202 else:
|
|
203 stop_err( str( e ) )
|
|
204
|
|
205 if valid_filter:
|
|
206 out.close()
|
|
207 valid_lines = total_lines - skipped_lines
|
|
208 print 'Filtering with %s, ' % orig_cond_text
|
|
209 if valid_lines > 0:
|
|
210 print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
|
|
211 else:
|
|
212 print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
|
|
213 if skipped_lines > 0:
|
|
214 print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
|
|
215
|
|
216 if __name__ == "__main__" : __main__()
|