comparison column_join.py @ 0:6bb6c0a30c67 draft default tip

Uploaded
author jjohnson
date Tue, 01 Apr 2014 09:30:45 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6bb6c0a30c67
1 #!/usr/bin/env python
2
3 """
4 This tool joins two or more tab-delimited text files on their leading "hinge" columns, writing the selected columns of every file side by side for each hinge value. Columns that a file does not supply for a hinge are left empty or, when fill options are given, filled with the configured fill value.
5
6 usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge [-f <fill_options_file>] [other_input1 [other_input2 [other_input3 ...]]]
7 -o, output=0: the output pileup
8 -1, input1=1: the pileup file to start with
9 -2, input2=2: the second pileup file to join
10 -g, hinge=h: the number of leading columns to be used for matching (a single integer)
11 -c, columns=c: the columns that should appear in the output
12 -f, fill_options_file=f: the file specifying the fill value to use
13 other_inputs: the other input files to join
14 """
15
16 import optparse, os, re, struct, sys, tempfile
17 from galaxy.util.bunch import Bunch
18 from galaxy.util import stringify_dictionary_keys
19 import json
20
def stop_err( msg ):
    """
    Report a fatal problem: write msg to standard error, then end the script
    by raising SystemExit with no exit code.
    """
    sys.stderr.write( msg )
    raise SystemExit()
24
def split_nums( text ):
    """
    Break a string into its alternating runs of digits and non-digits, in
    order, turning every digit run into an int,
    like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]
    """
    pieces = []
    # each match is either a run of digits (group 1) or a run of anything else (group 2)
    for digit_run, other_run in re.findall( r'(\d+)|(\D+)', text ):
        if digit_run:
            pieces.append( int( digit_run ) )
        else:
            pieces.append( other_run )
    return pieces
49
def hinge_compare( hinge1, hinge2 ):
    """
    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so that
    text parts are compared as text and numeric parts as numbers.

    hinge1, hinge2 -- hinge strings; a multi-column hinge is tab-separated

    Returns 1, 0 or -1 (cmp-style) when hinge1 sorts after, equal to or before
    hinge2.  Rules, applied in order:
      * a hinge whose columns are all empty sorts before any non-empty hinge;
      * columns are compared left to right, each split by split_nums into
        text/number pieces; the first differing piece decides, with numbers
        sorting before text;
      * if one column's pieces are a prefix of the other's, the shorter one
        sorts first;
      * otherwise the two hinges are compared as plain strings.
    """
    split_hinge1 = hinge1.split( '\t' )
    split_hinge2 = hinge2.split( '\t' )
    # quick check if either hinge is empty
    hinge1_empty = not ''.join( split_hinge1 )
    hinge2_empty = not ''.join( split_hinge2 )
    if hinge1_empty and hinge2_empty:
        return 0
    if hinge2_empty:
        return 1
    if hinge1_empty:
        return -1
    # go through the columns both hinges have (zip stops at the shorter hinge,
    # so a hinge with fewer columns can never be indexed out of range)
    for segment1, segment2 in zip( split_hinge1, split_hinge2 ):
        # if these hinge segments are the same, just move on to the next ones
        if segment1 == segment2:
            continue
        pieces1 = split_nums( segment1 )
        pieces2 = split_nums( segment2 )
        for piece1, piece2 in zip( pieces1, pieces2 ):
            # if these two parts are the same, move on to next
            if piece1 == piece2:
                continue
            piece1_is_number = isinstance( piece1, int )
            piece2_is_number = isinstance( piece2, int )
            if piece1_is_number != piece2_is_number:
                # numbers are less than letters
                if piece1_is_number:
                    return -1
                return 1
            # same kind of piece and not equal: ordinary comparison decides
            if piece1 > piece2:
                return 1
            return -1
        # every shared piece matched; the segment that runs out of pieces
        # first (including an empty segment) is considered smaller
        if len( pieces1 ) > len( pieces2 ):
            return 1
        if len( pieces1 ) < len( pieces2 ):
            return -1
    # if all else has failed, just do basic string comparison
    if hinge1 > hinge2:
        return 1
    if hinge1 < hinge2:
        return -1
    return 0
107
def hinge_sort( infile, outfile, hinge ):
    """
    Given input file name, sorts logically (text vs. numeric) into provided output file name.

    infile  -- path of the tab-delimited file to read
    outfile -- path the sorted copy is written to (created or overwritten)
    hinge   -- number of leading columns that make up the sort key

    Lines are ordered with hinge_compare on their first `hinge` columns; lines
    sharing a key keep their original relative order.  Reading stops at the
    first blank line (or at end of file), so nothing after a blank line is
    copied.  Only the key -> file offsets index is held in memory; the lines
    themselves are re-read from infile while writing.
    """
    hinge_locs = {}
    with open( infile, 'rb' ) as fin:
        # pass 1: remember where every line of each hinge key starts
        offset = fin.tell()
        line = fin.readline()
        while line.strip():
            # strip the line terminator first so a line with no columns beyond
            # the hinge gets the same key the merge step computes for it
            key = '\t'.join( line.rstrip( '\r\n' ).split( '\t' )[ :hinge ] )
            hinge_locs.setdefault( key, [] ).append( offset )
            offset = fin.tell()
            line = fin.readline()
        hinge_locs_sorted = list( hinge_locs.keys() )
        hinge_locs_sorted.sort( hinge_compare )
        # pass 2: copy the lines out in sorted key order
        with open( outfile, 'wb' ) as fout:
            for hinge_loc in hinge_locs_sorted:
                for loc in hinge_locs[ hinge_loc ]:
                    fin.seek( loc )
                    line = fin.readline()
                    # a final line without a newline must not be glued to
                    # whichever line happens to be written after it
                    if not line.endswith( '\n' ):
                        line += '\n'
                    fout.write( line )
136
def __main__():
    """
    Command-line entry point: join tab-delimited files side by side on their hinge columns.

    Options (see the add_option calls below): -o output file, -1/-2 the first
    two input files, -g the number of leading hinge columns, -c the
    comma-separated columns to output (only those beyond the hinge are kept),
    -f an optional JSON fill-options file.  Extra positional arguments are
    further input files.

    Every input is first sorted with hinge_sort into a temporary file.  The
    sorted files are then merged: for each hinge value, in hinge_compare
    order, one output row is written holding the hinge followed by the
    selected columns of each input in input order.  Columns for inputs that
    have no line for a hinge are left empty, or take the values from the fill
    options' `file1_columns` list when that is supplied.

    Raises AssertionError when no column beyond the hinge was selected.
    """
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    (options, args) = parser.parse_args()
    hinge = int( options.hinge )
    # only columns to the right of the hinge are output as data columns
    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
    inputs = [ options.input1, options.input2 ]
    # any positional arguments are additional files to join
    inputs.extend( args )
    fill_options = None
    # the literal string 'None' is accepted as "no fill options file"
    if options.fill_options_file != 'None' and options.fill_options_file is not None:
        try:
            fill_file = open( options.fill_options_file )
            try:
                fill_options = Bunch( **stringify_dictionary_keys( json.load( fill_file ) ) )
            finally:
                fill_file.close()
        except Exception as e:
            # best effort: unreadable fill options are ignored, not fatal
            print( 'Warning: Ignoring fill options due to json error (%s).' % e )
    if fill_options is None:
        fill_options = Bunch()
    if 'file1_columns' not in fill_options:
        fill_options.file1_columns = None
    if fill_options and fill_options.file1_columns:
        # map 1-based column number -> fill value for that column
        fill_empty = dict( ( col, fill_options.file1_columns[ col - 1 ] ) for col in cols )
    else:
        fill_empty = None
    assert len( cols ) > 0, 'You need to select at least one column in addition to the hinge'
    delimiter = '\t'

    def blank_columns( file_count ):
        # values standing in for `file_count` inputs that have no line for a hinge
        if not fill_empty:
            return [ '' ] * ( file_count * len( cols ) )
        return [ fill_empty[ col ] for col in cols ] * file_count

    tmp_file_names = []
    tmp_input_files = []
    fout = None
    try:
        # make sure all files are sorted in same way, ascending
        for in_file in inputs:
            # mkstemp creates the file itself, so the name cannot be claimed
            # by another process before hinge_sort writes to it
            tmp_fd, tmp_file_name = tempfile.mkstemp()
            os.close( tmp_fd )
            tmp_file_names.append( tmp_file_name )
            hinge_sort( in_file, tmp_file_name, hinge )
            tmp_input_files.append( open( tmp_file_name, 'rb' ) )
        # cycle through files, getting smallest line of all files one at a time
        # also have to keep track of vertical position of extra columns
        fout = open( options.output, 'w' )
        old_current = ''
        first_line = True
        current_lines = [ f.readline().rstrip( '\r\n' ) for f in tmp_input_files ]
        last_lines = ''.join( current_lines )
        last_loc = -1
        while last_lines:
            # get the "minimum" hinge, which should come first, and the file location in list
            hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ]
            current = None
            for candidate in hinges:
                # an empty hinge marks an exhausted file
                if candidate and ( current is None or hinge_compare( candidate, current ) < 0 ):
                    current = candidate
            if current is None:
                # only lines with an empty hinge are left; nothing more can be joined
                break
            # first input (lowest index) currently positioned on that hinge
            loc = hinges.index( current )
            # first output empty columns for vertical alignment (account for "missing" files)
            # columns missing from actual file handled further below
            current_data = []
            if current != old_current:
                if not first_line:
                    # fill trailing empty columns of the row just finished
                    if last_loc < len( inputs ) - 1:
                        filler = blank_columns( len( inputs ) - last_loc - 1 )
                        fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
                    # insert line break before current line
                    fout.write( '\n' )
                # fill leading empty columns with appropriate fill value
                if loc > 0:
                    current_data = blank_columns( loc )
            elif loc - last_loc > 1:
                # same row: pad for the inputs skipped since the last one written
                current_data = blank_columns( loc - last_loc - 1 )
            # now output actual data
            split_line = current_lines[ loc ].split( delimiter )
            # fill empties within actual line if appropriate
            if fill_empty:
                split_line = [ item or fill_empty.get( i + 1, item ) for i, item in enumerate( split_line ) ]
            # add actual data to be output below
            if ''.join( split_line ):
                for col in cols:
                    # if this column doesn't exist, add the appropriate filler or empty column
                    try:
                        new_item = split_line[ col - 1 ]
                    except IndexError:
                        if fill_empty:
                            new_item = fill_empty[ col ]
                        else:
                            new_item = ''
                    current_data.append( new_item )
                # grab next line for selected file
                current_lines[ loc ] = tmp_input_files[ loc ].readline().rstrip( '\r\n' )
                # write relevant data to file
                if current == old_current and current_data:
                    fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) )
                elif current_data:
                    fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) )
                last_lines = ''.join( current_lines )
            else:
                last_lines = None
            last_loc = loc
            old_current = current
            first_line = False
        # fill trailing empty columns for final line
        if last_loc < len( inputs ) - 1:
            filler = blank_columns( len( inputs ) - last_loc - 1 )
            fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
        fout.write( '\n' )
    finally:
        # release handles and remove the sorted temporary copies, even on error
        if fout is not None:
            fout.close()
        for f in tmp_input_files:
            f.close()
        for tmp_file_name in tmp_file_names:
            try:
                os.unlink( tmp_file_name )
            except OSError:
                pass
278
# run the tool only when executed as a script, not when imported
if __name__ == "__main__":
    __main__()