annotate tools/new_operations/column_join.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
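4 This tool joins two or more tab-delimited text files on a shared set of leading "hinge" columns, writing the selected columns of every input side by side on one output line per hinge value. Each input is first sorted by its hinge columns; columns that an input does not provide are left empty or replaced with the fill values from the fill options file.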
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f <fill_options_file> [other_input1 [other_input2 [other_input3 ...]]]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 -o, output=0: the output pileup
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 -1, input1=1: the pileup file to start with
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 -2, input2=2: the second pileup file to join
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 -g, hinge=h: the columns to be used for matching
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 -c, columns=c: the columns that should appear in the output
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 -f, fill_options_file=f: the file specifying the fill value to use
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 other_inputs: the other input files to join
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 import optparse, os, re, struct, sys, tempfile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Galaxy helpers and simplejson are optional at import time. When any of
# these imports fails, simplejson is set to None and the triggering exception
# is kept in simplejson_exception so it can be reported later.
# The name is initialised up front (and spelled the same on both paths) so it
# is always defined, whether or not the imports succeed.
simplejson_exception = None
try:
    # NOTE(review): `eggs` is not referenced by name anywhere in view; it is
    # presumably imported for its side effects before pkg_resources is used
    # -- confirm.
    from galaxy import eggs
    from galaxy.util.bunch import Bunch
    from galaxy.util import stringify_dictionary_keys
    import pkg_resources
    pkg_resources.require( "simplejson" )
    import simplejson
except Exception as e:
    simplejson_exception = e
    simplejson = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def stop_err( msg ):
    """
    Report msg on standard error and terminate the program.

    msg is written exactly as given (no newline is appended). The exit is
    requested without an explicit status code.
    """
    err_stream = sys.stderr
    err_stream.write( msg )
    # equivalent to sys.exit() with no argument
    raise SystemExit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def split_nums( text ):
    """
    Splits a string into pieces of numbers and non-numbers, like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]

    Each run of decimal digits is returned as an integer and each run of
    other characters as a string, in the order they occur in text. An empty
    string yields an empty list.
    """
    pieces = []
    # one alternative matches per run, so exactly one group is set per match
    for match in re.finditer( r'(\d+)|(\D+)', text ):
        digits, other = match.groups()
        if digits is None:
            pieces.append( other )
        else:
            pieces.append( int( digits ) )
    return pieces

def hinge_compare( hinge1, hinge2 ):
    """
    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so that
    first part handled as text but last part as number

    hinge1, hinge2 -- hinge strings; multi-column hinges are tab-separated.

    Returns 1 if hinge1 sorts after hinge2, -1 if it sorts before, and 0 if
    they are equal (cmp-style). Rules, applied column by column:
      * a hinge whose columns are all empty sorts before any other hinge
      * columns are split with split_nums() and compared piece by piece;
        numbers compare numerically, text compares as text, and a number is
        less than text
      * if all shared pieces are equal, the column with fewer pieces is less
      * if no column decides, the raw hinge strings are compared
    """
    split_hinge1 = hinge1.split( '\t' )
    split_hinge2 = hinge2.split( '\t' )
    # quick check if either hinge is empty
    empty1 = not ''.join( split_hinge1 )
    empty2 = not ''.join( split_hinge2 )
    if empty1 or empty2:
        if empty1 and empty2:
            return 0
        if empty1:
            return -1
        return 1
    # Anything split_nums() returns that is not text is a number; checking
    # for text (rather than for int) also covers Python 2 longs.
    text_types = ( str, type( u'' ) )
    # go through all parts of the hinges and compare; zip() stops at the
    # shorter hinge, so hinges with different column counts cannot raise
    for sh1, sh2 in zip( split_hinge1, split_hinge2 ):
        # if these hinge segments are the same, just move on to the next ones
        if sh1 == sh2:
            continue
        h1 = split_nums( sh1 )
        h2 = split_nums( sh2 )
        for piece1, piece2 in zip( h1, h2 ):
            # if these two parts are the same, move on to next
            if piece1 == piece2:
                continue
            is_text1 = isinstance( piece1, text_types )
            is_text2 = isinstance( piece2, text_types )
            if is_text1 != is_text2:
                # numbers are less than letters
                if is_text1:
                    return 1
                return -1
            # same kind on both sides: plain comparison
            if piece1 > piece2:
                return 1
            return -1
        # every shared piece matched: the column with pieces left over is
        # larger (this also covers an empty column against a non-empty one)
        if len( h1 ) > len( h2 ):
            return 1
        if len( h1 ) < len( h2 ):
            return -1
        # same pieces but different text (e.g. '07' vs '7'): let a later
        # column, or the raw string comparison below, decide
    # if all else has failed, just do basic string comparison
    if hinge1 > hinge2:
        return 1
    if hinge1 < hinge2:
        return -1
    return 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def hinge_sort( infile, outfile, hinge ):
    """
    Given input file name, sorts logically (text vs. numeric) into provided output file name.

    infile  -- path of the tab-delimited file to read
    outfile -- path the sorted lines are written to (opened for writing, so
               any existing content is replaced)
    hinge   -- number of leading columns that make up the sort key

    Lines are ordered by hinge_compare() applied to their hinge; lines that
    share a hinge keep their input order. Only byte offsets are held in
    memory, the lines themselves are re-read from infile when writing.

    NOTE: reading stops at the first blank (whitespace-only) line or at end
    of file, whichever comes first; anything after a blank line is not
    copied to outfile.
    """
    hinge_locs = {}
    with open( infile, 'rb' ) as fin:
        # first pass: remember where each line starts, grouped by hinge
        line = fin.readline()
        while line.strip():
            hinge_key = '\t'.join( line.split( '\t' )[ :hinge ] )
            line_start = fin.tell() - len( line )
            hinge_locs.setdefault( hinge_key, [] ).append( line_start )
            line = fin.readline()
        # hinge_compare is a cmp-style function (Python 2 `cmp` argument)
        hinge_locs_sorted = sorted( hinge_locs, cmp=hinge_compare )
        # second pass: copy the lines out in hinge order
        with open( outfile, 'wb' ) as fout:
            for hinge_loc in hinge_locs_sorted:
                for loc in hinge_locs[ hinge_loc ]:
                    fin.seek( loc )
                    line = fin.readline()
                    # the last input line may lack a newline; terminate it
                    # so it is not fused with the line written after it
                    if not line.endswith( '\n' ):
                        line += '\n'
                    fout.write( line )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def __main__():
    """
    Command-line entry point: join the selected columns of two or more
    tab-delimited files on their leading hinge columns.

    Steps:
      1. Parse the options. Only columns numbered above the hinge are kept as
         data columns; every positional argument is an additional input file.
      2. Optionally read fill values from the JSON fill options file (its
         'file1_columns' list, indexed by column number - 1). If the file
         cannot be used, a warning is printed and no fill values are applied.
      3. Sort every input by its hinge with hinge_sort() into a temporary
         file.
      4. Merge the sorted files: repeatedly take the input whose current line
         has the smallest hinge (per hinge_compare) and write its data
         columns. All lines sharing a hinge end up on one output line that
         starts with the hinge; inputs without that hinge get empty columns
         (or the fill values) so every input keeps its column position.

    Raises AssertionError if no column beyond the hinge was selected and
    ValueError if a non-blank input line has an empty hinge.
    """
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    ( options, args ) = parser.parse_args()
    hinge = int( options.hinge )
    # columns up to and including the hinge are never repeated as data
    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
    inputs = [ options.input1, options.input2 ]
    inputs.extend( args )
    # the fill file name may arrive as the literal string 'None'
    file1_columns = None
    if options.fill_options_file is not None and options.fill_options_file != 'None':
        try:
            if simplejson is None:
                raise simplejson_exception
            with open( options.fill_options_file ) as fill_file:
                file1_columns = simplejson.load( fill_file ).get( 'file1_columns' )
        except Exception as e:
            print( 'Warning: Ignoring fill options due to simplejson error (%s).' % e )
    if file1_columns:
        # fill value per selected column, keyed by 1-based column number
        fill_empty = dict( ( col, file1_columns[ col - 1 ] ) for col in cols )
    else:
        fill_empty = None
    assert len( cols ) > 0, 'You need to select at least one column in addition to the hinge'
    delimiter = '\t'
    n_cols = len( cols )
    n_inputs = len( inputs )

    def make_filler( n_files ):
        """Placeholder values standing in for the data columns of n_files inputs."""
        if not fill_empty:
            return [ '' ] * ( n_files * n_cols )
        return [ fill_empty[ cols[ k % n_cols ] ] for k in range( n_files * n_cols ) ]

    tmp_names = []
    tmp_input_files = []
    try:
        # make sure all files are sorted in same way, ascending
        for in_file in inputs:
            tmp_fd, tmp_name = tempfile.mkstemp()
            os.close( tmp_fd )
            tmp_names.append( tmp_name )
            hinge_sort( in_file, tmp_name, hinge )
            tmp_input_files.append( open( tmp_name, 'rb' ) )
        # cycle through files, getting smallest line of all files one at a time
        # also have to keep track of vertical position of extra columns
        fout = open( options.output, 'w' )
        try:
            old_current = ''
            first_line = True
            # an empty string marks an input that has no more lines
            current_lines = [ f.readline().rstrip( '\r\n' ) for f in tmp_input_files ]
            last_lines = ''.join( current_lines )
            last_loc = -1
            while last_lines:
                # get the "minimum" hinge, which should come first, and the
                # file location in list (first input wins on equal hinges)
                hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ]
                current, loc = None, None
                for i, candidate in enumerate( hinges ):
                    if not candidate:
                        continue
                    if current is None or hinge_compare( candidate, current ) < 0:
                        current, loc = candidate, i
                if current is None:
                    # lines remain, but none of them has a usable hinge
                    raise ValueError( 'Unable to join: found a line whose hinge column(s) are empty' )
                # first output empty columns for vertical alignment (account for "missing" files)
                # columns missing from actual file handled further below
                current_data = []
                if current != old_current:
                    if not first_line:
                        # fill trailing empty columns of the previous line
                        if last_loc < n_inputs - 1:
                            fout.write( '%s%s' % ( delimiter, delimiter.join( make_filler( n_inputs - last_loc - 1 ) ) ) )
                        # insert line break before current line
                        fout.write( '\n' )
                    # fill leading empty columns with appropriate fill value
                    if loc > 0:
                        current_data = make_filler( loc )
                elif loc - last_loc > 1:
                    # same hinge, but one or more inputs in between lack it
                    current_data = make_filler( loc - last_loc - 1 )
                # now output actual data
                split_line = current_lines[ loc ].split( delimiter )
                # fill empties within actual line if appropriate
                if fill_empty:
                    split_line = [ item or fill_empty.get( i + 1, item ) for i, item in enumerate( split_line ) ]
                # add actual data to be output below
                if ''.join( split_line ):
                    for col in cols:
                        # if this column doesn't exist, add the appropriate filler or empty column
                        try:
                            new_item = split_line[ col - 1 ]
                        except IndexError:
                            if fill_empty:
                                new_item = fill_empty[ col ]
                            else:
                                new_item = ''
                        current_data.append( new_item )
                    # grab next line for selected file
                    current_lines[ loc ] = tmp_input_files[ loc ].readline().rstrip( '\r\n' )
                    # write relevant data to file; the hinge itself is only
                    # written once, at the start of its output line
                    if current_data:
                        if current == old_current:
                            fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) )
                        else:
                            fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) )
                    last_lines = ''.join( current_lines )
                else:
                    last_lines = None
                last_loc = loc
                old_current = current
                first_line = False
            # fill trailing empty columns for final line
            if last_loc < n_inputs - 1:
                fout.write( '%s%s' % ( delimiter, delimiter.join( make_filler( n_inputs - last_loc - 1 ) ) ) )
            fout.write( '\n' )
        finally:
            fout.close()
    finally:
        # always release and remove the sorted temporary copies
        for f in tmp_input_files:
            f.close()
        for tmp_name in tmp_names:
            if os.path.exists( tmp_name ):
                os.unlink( tmp_name )

if __name__ == "__main__" : __main__()