+ − 1 #!/usr/bin/env python
+ − 2 #Guruprasad Ananda
"""
This tool computes microsatellite mutability for the orthologous microsatellites produced by the 'Extract Orthologous Microsatellites from pair-wise alignments' tool.
"""
+ − 6 from galaxy import eggs
+ − 7 import sys, string, re, commands, tempfile, os, fileinput
+ − 8 from galaxy.tools.util.galaxyops import *
+ − 9 from bx.intervals.io import *
+ − 10 from bx.intervals.operations import quicksect
+ − 11
# ---- Command-line configuration, read once when the script is loaded ----
fout = open(sys.argv[2],'w')        # output dataset
p_group = int(sys.argv[3])          # primary "group-by" feature (column index)
p_bin_size = int(sys.argv[4])       # bin width used when the primary feature is binned
s_group = int(sys.argv[5])          # sub-group-by feature (column index, -1 means none)
s_bin_size = int(sys.argv[6])       # bin width used when the sub-group feature is binned
mono_threshold = 9                  # threshold handed to fetch_weight() for mononucleotide repeats
non_mono_threshold = 4              # threshold handed to fetch_weight() for every other repeat class
# Each feature has one column per species; the species 2 column sits 7 columns
# to the right of the species 1 column.
p_group_cols = [p_group, p_group+7]
s_group_cols = [s_group, s_group+7]
num_generations = int(sys.argv[7])  # divisor applied to every mutability estimate
region = sys.argv[8]                # 'align' or 'win'
int_file = sys.argv[9]
if int_file != "None": #User has specified an interval file
    try:
        fint = open(int_file, 'r')
        dbkey_i = sys.argv[10]
        chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] )
    except Exception:
        # The error is reported inline: the stop_err() helper is defined further
        # down the file, so its name is not bound yet while this code runs.
        sys.stderr.write("Unable to open input Interval file")
        sys.exit()
+ − 31
def stop_err(msg):
    """Write *msg* to standard error (no newline is added) and end the script."""
    sys.stderr.write(msg)
    raise SystemExit()
+ − 35
# Complement of each unambiguous DNA base; built once instead of on every call.
_DNA_COMPLEMENT = {
    "A": "T", "C": "G", "G": "C", "T": "A",
    "a": "t", "c": "g", "g": "c", "t": "a",
}


def reverse_complement(text):
    """Return the reverse complement of the DNA string *text*.

    A/C/G/T are complemented in either case; every other character is copied
    unchanged.  The lookup table is a plain dict so the function does not
    depend on ``string.maketrans``, which exists only on Python 2.
    """
    return "".join([_DNA_COMPLEMENT.get(base, base) for base in reversed(text)])
+ − 41
def get_unique_elems(elems):
    """Return the distinct items of *elems* as a list, in first-seen order."""
    already_seen = set()
    ordered = []
    for candidate in elems:
        if candidate in already_seen:
            continue
        already_seen.add(candidate)
        ordered.append(candidate)
    return ordered
+ − 45
def get_binned_lists(uniqlist, binsize):
    """Group integer values into bins of width *binsize*.

    The values are converted to int and sorted numerically.  Bins are anchored
    at the smallest value: a bin covers ``[start, start + binsize)`` with
    ``start = smallest + k * binsize``.  Only non-empty bins are returned, in
    ascending order, each as a list of the values that fell into it.

    uniqlist -- iterable of ints (or strings that int() accepts); not modified.
    binsize  -- positive integer bin width.

    Returns a list of lists; an empty input gives an empty list.
    Raises ValueError when *binsize* is smaller than 1.
    """
    if binsize < 1:
        raise ValueError("bin size must be a positive integer, got %r" % (binsize,))
    values = sorted([int(value) for value in uniqlist])
    if not values:
        return []
    bin_start = values[0]
    bins = [[]]
    for value in values:
        if value >= bin_start + binsize:
            # Jump straight to the bin that contains this value.  Advancing by
            # a single bin width would leave bin_start behind after a gap and
            # put every following value into a bin of its own.
            bin_start += ((value - bin_start) // binsize) * binsize
            bins.append([])
        bins[-1].append(value)
    return bins
+ − 64
def fetch_weight(H,C,t):
    """Return the weight of one (H, C) observation pair: 2.0 or 1.0.

    The weight is 2.0 when ``H - (C - H)`` is below the threshold *t* and 1.0
    otherwise.
    """
    mirrored = H - (C - H)
    if mirrored < t:
        return 2.0
    return 1.0
+ − 70
def mutabilityEstimator(repeats1,repeats2,thresholds):
    """Accumulate the weighted squared differences between two repeat lists.

    For every position of *repeats1* the matching entries of *repeats2* and
    *thresholds* are looked up by index, a weight is obtained from
    fetch_weight(), and the squared difference times that weight is summed.

    Returns ``[numerator, denominator]`` where the denominator is the sum of
    the weights; both are floats and both are 0.0 for empty input.
    """
    numerator = 0.0
    denominator = 0.0
    for position, own_count in enumerate(repeats1):
        other_count = repeats2[position]
        weight = fetch_weight(own_count, other_count, thresholds[position])
        difference = own_count - other_count
        numerator += difference * difference * weight
        denominator += weight
    return [numerator, denominator]
+ − 81
def output_writer(blk, blk_lines):
    """Estimate mutability per group for one block of records and write the rows to fout.

    blk       -- label written in the first output column; main() passes the
                 alignment block number ('align') or the interval line ('win').
    blk_lines -- tab-separated records.  This function reads columns 1 and 8
                 (sequence names), 4 (compared with 'mononucleotide'), 6 and
                 13 (repeat numbers, converted to int) and the configured
                 group-by columns p_group_cols / s_group_cols.

    The records are grouped by the primary (and, if configured, secondary)
    feature twice: once on the species 1 columns (mut1/count1) and once on the
    species 2 columns (mut2/count2).  Matching keys are merged and one row per
    key is written to fout.  Nothing is returned.

    NOTE(review): the nesting below was reconstructed from a copy of this file
    that had lost its leading whitespace -- confirm it against the original.
    """
    global winspecies, speciesind
    all_elems_1=[]
    all_elems_2=[]
    all_s_elems_1=[]
    all_s_elems_2=[]
    # Pass 1: collect the group-by values present in this block.
    for bline in blk_lines:
        if not(bline):
            continue
        items = bline.split('\t')
        seq1 = items[1]
        start1 = items[2]   # start1/end1/start2/end2 are not used further down
        end1 = items[3]
        seq2 = items[8]
        start2 = items[9]
        end2 = items[10]
        # Column 6 (repeat number) is binned numerically, so it is converted to int.
        if p_group_cols[0] == 6:
            items[p_group_cols[0]] = int(items[p_group_cols[0]])
            items[p_group_cols[1]] = int(items[p_group_cols[1]])
        if s_group_cols[0] == 6:
            items[s_group_cols[0]] = int(items[s_group_cols[0]])
            items[s_group_cols[1]] = int(items[s_group_cols[1]])
        all_elems_1.append(items[p_group_cols[0]]) #primary col elements for species 1
        all_elems_2.append(items[p_group_cols[1]]) #primary col elements for species 2
        if s_group_cols[0] != -1: #sub-group is not None
            all_s_elems_1.append(items[s_group_cols[0]]) #secondary col elements for species 1
            all_s_elems_2.append(items[s_group_cols[1]]) #secondary col elements for species 2
    uniq_elems_1 = get_unique_elems(all_elems_1)
    uniq_elems_2 = get_unique_elems(all_elems_2)
    if s_group_cols[0] != -1:
        uniq_s_elems_1 = get_unique_elems(all_s_elems_1)
        uniq_s_elems_2 = get_unique_elems(all_s_elems_2)
    # mutN maps a group key to [numerator, denominator]; countN maps it to a count.
    mut1={}
    mut2={}
    count1 = {}
    count2 = {}
    """
    if p_group_cols[0] == 7: #i.e. the option chosen is group-by unit(AG, GTC, etc)
    uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y)))
    """
    # After binning, each group item is a list of ints instead of a single value;
    # the loops below test type(...) == list to tell the two cases apart.
    if p_group_cols[0] == 6: #i.e. the option chosen is group-by repeat number.
        uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size)
        uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size)

    if s_group_cols[0] == 6: #i.e. the option chosen is subgroup-by repeat number.
        uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size)
        uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size)

    # Pass 2a: group on the species 1 columns.
    for pitem1 in uniq_elems_1:
        #repeats1 = []
        #repeats2 = []
        # NOTE(review): thresholds is reset once per primary group while the
        # repeat lists are reset per sub-group, so from the second sub-group on
        # thresholds[i] no longer lines up with repeats1[i] -- confirm intent.
        thresholds = []
        if s_group_cols[0] != -1: #Sub-group by feature is not None
            for sitem1 in uniq_s_elems_1:
                repeats1 = []
                repeats2 = []
                if type(sitem1) == type(''):
                    sitem1 = sitem1.strip()
                for bline in blk_lines:
                    belems = bline.split('\t')
                    if type(pitem1) == list:
                        # Primary feature is binned: match by membership in the bin.
                        if p_group_cols[0] == 6:
                            belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
                        if belems[p_group_cols[0]] in pitem1:
                            if belems[s_group_cols[0]]==sitem1:
                                repeats1.append(int(belems[6]))
                                repeats2.append(int(belems[13]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                # Recomputed on every match; the last match leaves the final value.
                                mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
                                if region == 'align':
                                    count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
                                else:
                                    if winspecies == 1:
                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
                                    elif winspecies == 2:
                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
                    else:
                        if type(sitem1) == list:
                            # Sub-group feature is binned: match by membership in the bin.
                            if s_group_cols[0] == 6:
                                belems[s_group_cols[0]] = int(belems[s_group_cols[0]])
                            if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1:
                                repeats1.append(int(belems[6]))
                                repeats2.append(int(belems[13]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
                                if region == 'align':
                                    count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
                                else:
                                    if winspecies == 1:
                                        count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1)
                                    elif winspecies == 2:
                                        count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2)
                        else:
                            # Neither feature is binned: exact match on both columns.
                            if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1:
                                repeats1.append(int(belems[6]))
                                repeats2.append(int(belems[13]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
                                if region == 'align':
                                    count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
                                else:
                                    if winspecies == 1:
                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
                                    elif winspecies == 2:
                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
        else: #Sub-group by feature is None
            # NOTE(review): repeats1/repeats2 are only assigned in the sub-group
            # branch above (their initialisation here is commented out), so the
            # first append below looks like it raises UnboundLocalError -- verify.
            for bline in blk_lines:
                belems = bline.split('\t')
                if type(pitem1) == list:
                    #print >>sys.stderr, "item: " + str(item1)
                    if p_group_cols[0] == 6:
                        belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
                    if belems[p_group_cols[0]] in pitem1:
                        repeats1.append(int(belems[6]))
                        repeats2.append(int(belems[13]))
                        if belems[4] == 'mononucleotide':
                            thresholds.append(mono_threshold)
                        else:
                            thresholds.append(non_mono_threshold)
                else:
                    if belems[p_group_cols[0]]==pitem1:
                        repeats1.append(int(belems[6]))
                        repeats2.append(int(belems[13]))
                        if belems[4] == 'mononucleotide':
                            thresholds.append(mono_threshold)
                        else:
                            thresholds.append(non_mono_threshold)
            mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
            if region == 'align':
                count1["%s" %(pitem1)]=min(sum(repeats1),sum(repeats2))
            else:
                if winspecies == 1:
                    count1[str(pitem1)]=sum(repeats1)
                elif winspecies == 2:
                    count1[str(pitem1)]=sum(repeats2)

    # Pass 2b: same grouping on the species 2 columns; the estimator is called
    # with the species 2 repeats first.
    # NOTE(review): in 'win' mode the sub-group branches below store
    # len(repeats...) in count2 whereas every count1 entry stores sum(...) --
    # confirm which one is intended.
    for pitem2 in uniq_elems_2:
        #repeats1 = []
        #repeats2 = []
        thresholds = []
        if s_group_cols[0] != -1: #Sub-group by feature is not None
            for sitem2 in uniq_s_elems_2:
                repeats1 = []
                repeats2 = []
                if type(sitem2)==type(''):
                    sitem2 = sitem2.strip()
                for bline in blk_lines:
                    belems = bline.split('\t')
                    if type(pitem2) == list:
                        if p_group_cols[0] == 6:
                            belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
                        if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2:
                            repeats2.append(int(belems[13]))
                            repeats1.append(int(belems[6]))
                            if belems[4] == 'mononucleotide':
                                thresholds.append(mono_threshold)
                            else:
                                thresholds.append(non_mono_threshold)
                            mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
                            #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2)
                            if region == 'align':
                                count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
                            else:
                                if winspecies == 1:
                                    count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
                                elif winspecies == 2:
                                    count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
                    else:
                        if type(sitem2) == list:
                            if s_group_cols[0] == 6:
                                belems[s_group_cols[1]] = int(belems[s_group_cols[1]])
                            if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2:
                                repeats2.append(int(belems[13]))
                                repeats1.append(int(belems[6]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
                                if region == 'align':
                                    count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
                                else:
                                    if winspecies == 1:
                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
                                    elif winspecies == 2:
                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
                        else:
                            if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2:
                                # NOTE(review): columns 13 and 6 go into repeats1/repeats2
                                # here, the reverse of every sibling branch -- this looks
                                # like a copy/paste slip that changes the weights; verify.
                                repeats1.append(int(belems[13]))
                                repeats2.append(int(belems[6]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
                                if region == 'align':
                                    count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
                                else:
                                    if winspecies == 1:
                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
                                    elif winspecies == 2:
                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
        else: #Sub-group by feature is None
            # NOTE(review): same uninitialised repeats1/repeats2 concern as in
            # the species 1 loop above.
            for bline in blk_lines:
                belems = bline.split('\t')
                if type(pitem2) == list:
                    if p_group_cols[0] == 6:
                        belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
                    if belems[p_group_cols[1]] in pitem2:
                        repeats2.append(int(belems[13]))
                        repeats1.append(int(belems[6]))
                        if belems[4] == 'mononucleotide':
                            thresholds.append(mono_threshold)
                        else:
                            thresholds.append(non_mono_threshold)
                else:
                    if belems[p_group_cols[1]]==pitem2:
                        repeats2.append(int(belems[13]))
                        repeats1.append(int(belems[6]))
                        if belems[4] == 'mononucleotide':
                            thresholds.append(mono_threshold)
                        else:
                            thresholds.append(non_mono_threshold)
            mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
            if region == 'align':
                count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2))
            else:
                if winspecies == 1:
                    count2["%s" %(pitem2)]=sum(repeats2)
                elif winspecies == 2:
                    count2["%s" %(pitem2)]=sum(repeats1)
    # Pass 3: merge the two per-species estimates and write one row per key.
    for key in mut1.keys():
        if key in mut2.keys():
            # Same key on both sides: pool numerators and denominators.
            mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1])
            count = count1[key]
            del mut2[key]
        else:
            unit_found = False
            if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too
                # 'this' indexes the repeat-unit part of the key, 'other' the remaining part.
                if p_group_cols[0] == 7:
                    this,other = 0,1
                else:
                    this,other = 1,0
                groups1 = key.split('\t')
                mutn = mut1[key][0]
                mutd = mut1[key][1]
                count = 0
                # Entries are deleted from mut2 inside this loop; that relies on
                # keys() returning a list copy (Python 2 behaviour).
                for key2 in mut2.keys():
                    groups2 = key2.split('\t')
                    if groups1[other] == groups2[other]:
                        # Searching in the doubled unit matches any rotation of it.
                        if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2:
                            #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1])
                            mutn += mut2[key2][0]
                            mutd += mut2[key2][1]
                            count += int(count2[key2])
                            unit_found = True
                            del mut2[key2]
                            #break
            if unit_found:
                mut = mutn/mutd
            else:
                mut = mut1[key][0]/mut1[key][1]
                count = count1[key]
        mut = "%.2e" %(mut/num_generations)
        if region == 'align':
            print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count)
        elif region == 'win':
            fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count))
            fout.flush()

    #catch any remaining repeats, for instance if the orthologous position contained different repeat units
    for remaining_key in mut2.keys():
        mut = mut2[remaining_key][0]/mut2[remaining_key][1]
        mut = "%.2e" %(mut/num_generations)
        count = count2[remaining_key]
        if region == 'align':
            print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
        elif region == 'win':
            fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count))
            fout.flush()
            #print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
+ − 372
def counter(node, start, end, report_func):
    """Report every interval below *node* that lies inside the window.

    node        -- tree node exposing start, end, left, right and maxend.
    start, end  -- window bounds; a node qualifies when
                   ``start <= node.start < end`` and ``start < node.end <= end``.
    report_func -- callable invoked with each qualifying node (the node itself
                   first, then its right subtree, then its left subtree).

    The pruning follows the ordering the original traversal already relied on:
    nodes with larger start coordinates hang off ``right``, the others off
    ``left``, and ``maxend`` bounds the end coordinates found in a subtree.
    Both subtrees are now searched whenever they can still hold a qualifying
    interval; previously a node that started inside the window but ended
    beyond it stopped the descent and hid everything below it.
    """
    node_start = node.start
    if start <= node_start < end and start < node.end <= end:
        report_func(node)
    # Right subtree: only useful while starts larger than ours can be < end.
    if node.right and node_start < end:
        counter(node.right, start, end, report_func)
    # Left subtree: only useful while starts no larger than ours can be >= start
    # and at least one interval in it ends after the window start.
    if node.left and node_start >= start and node.left.maxend > start:
        counter(node.left, start, end, report_func)
+ − 384
+ − 385
def main():
    """Validate the input dataset and write mutability rows for every block or window.

    Reads the microsatellite table named by ``sys.argv[1]`` and, depending on
    the module-level ``region`` setting:

    * 'win'   -- loads the table into an interval tree, then for each line of
                 the interval file collects the microsatellites lying inside
                 that interval and passes them to output_writer();
    * 'align' -- writes a header line and passes each run of consecutive rows
                 sharing the same first-column block number to output_writer().

    The script stops through stop_err() when the first data line does not have
    15 tab-separated columns, or when (in 'win' mode) the interval file's
    build is found in neither species column.  A summary of skipped input is
    printed to standard output.
    """
    global winspecies, speciesind
    infile = sys.argv[1]

    # Sniff the first non-comment line (looking at no more than 31 lines) to
    # check the column count.  elems starts empty so that a file without any
    # data line reaches the format error below instead of a NameError.
    elems = []
    with open(infile) as sniff:
        for i, line in enumerate(sniff):
            line = line.rstrip('\r\n')
            if line and not line.startswith('#'):
                elems = line.split('\t')
                break
            if i == 30:
                break

    if len(elems) != 15:
        stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." )

    if region == 'win':
        # Find out which species column carries the build of the interval file.
        if dbkey_i in elems[1]:
            winspecies = 1
            speciesind = 1
        elif dbkey_i in elems[8]:
            winspecies = 2
            speciesind = 8
        else:
            stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.")

    skipped = 0

    if region == 'win':
        msats = NiceReaderWrapper( fileinput.FileInput( infile ),
                                   chrom_col = speciesind,
                                   start_col = speciesind+1,
                                   end_col = speciesind+2,
                                   strand_col = -1,
                                   fix_strand = True)
        msatTree = quicksect.IntervalTree()
        for item in msats:
            if type( item ) is GenomicInterval:
                msatTree.insert( item, msats.linenum, item.fields )

        for iline in fint:
            try:
                iline = iline.rstrip('\r\n')
                if not iline:
                    continue
                ielems = iline.strip("\r\n").split('\t')
                ichr = ielems[chr_col_i]
                istart = int(ielems[start_col_i])
                iend = int(ielems[end_col_i])
                isrc = "%s.%s" %(dbkey_i,ichr)
                if isrc not in msatTree.chroms:
                    continue
                result = []
                root = msatTree.chroms[isrc] #root node for the chrom
                counter(root, istart, iend, result.append)
                if not result:
                    continue
                # Build the record lines directly; writing them to a temporary
                # file only to read them straight back added nothing.
                window_lines = ["%s\n" % "\t".join( node.other ) for node in result]
                output_writer(iline, window_lines)
            except Exception:
                # Best effort per interval: count it and carry on.  Exception
                # (rather than a bare except) lets Ctrl-C and sys.exit() through.
                skipped+=1
        if skipped:
            sys.stdout.write("Skipped %d intervals as invalid.\n" %(skipped))
    elif region == 'align':
        # NOTE(review): the second header lists Window_Start/Window_End columns
        # that the rows written by output_writer() do not seem to contain --
        # confirm before relying on it.
        if s_group_cols[0] != -1:
            fout.write("#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount\n")
        else:
            fout.write("#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount\n")
        prev_bnum = None    # block number of the rows collected so far
        block_lines = []
        try:
            with open(infile, 'r') as fin:
                for line in fin:
                    line = line.strip("\r\n")
                    if not line:
                        continue
                    elems = line.split('\t')
                    if len(elems) != 15:
                        continue
                    # Explicit check instead of `assert int(...)`: an assert
                    # vanishes under -O and also rejected block number 0.
                    try:
                        new_bnum = int(elems[0])
                    except ValueError:
                        continue
                    if new_bnum != prev_bnum:
                        if prev_bnum is not None:
                            output_writer(prev_bnum, block_lines)
                        block_lines = []
                    block_lines.append(line)
                    prev_bnum = new_bnum
            # Flush the last block, but only if at least one valid row was read.
            if prev_bnum is not None:
                output_writer(prev_bnum, block_lines)
        except Exception as ea:
            sys.stderr.write("%s\n" % (ea,))
            skipped += 1
        if skipped:
            sys.stdout.write("Skipped %d lines as invalid.\n" %(skipped))


if __name__ == "__main__":
    main()