comparison tools/regVariation/microsats_mutability.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2 #Guruprasad Ananda
3 """
4 This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool.
5 """
6 from galaxy import eggs
7 import sys, string, re, commands, tempfile, os, fileinput
8 from galaxy.tools.util.galaxyops import *
9 from bx.intervals.io import *
10 from bx.intervals.operations import quicksect
11
12 fout = open(sys.argv[2],'w')
13 p_group = int(sys.argv[3]) #primary "group-by" feature
14 p_bin_size = int(sys.argv[4])
15 s_group = int(sys.argv[5]) #sub-group by feature
16 s_bin_size = int(sys.argv[6])
17 mono_threshold = 9
18 non_mono_threshold = 4
19 p_group_cols = [p_group, p_group+7]
20 s_group_cols = [s_group, s_group+7]
21 num_generations = int(sys.argv[7])
22 region = sys.argv[8]
23 int_file = sys.argv[9]
24 if int_file != "None": #User has specified an interval file
25 try:
26 fint = open(int_file, 'r')
27 dbkey_i = sys.argv[10]
28 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] )
29 except:
30 stop_err("Unable to open input Interval file")
31
32 def stop_err(msg):
33 sys.stderr.write(msg)
34 sys.exit()
35
36 def reverse_complement(text):
37 DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
38 comp = [ch for ch in text.translate(DNA_COMP)]
39 comp.reverse()
40 return "".join(comp)
41
42 def get_unique_elems(elems):
43 seen=set()
44 return[x for x in elems if x not in seen and not seen.add(x)]
45
46 def get_binned_lists(uniqlist, binsize):
47 binnedlist=[]
48 uniqlist.sort()
49 start = int(uniqlist[0])
50 bin_ind=0
51 l_ind=0
52 binnedlist.append([])
53 while l_ind < len(uniqlist):
54 elem = int(uniqlist[l_ind])
55 if elem in range(start,start+binsize):
56 binnedlist[bin_ind].append(elem)
57 else:
58 start += binsize
59 bin_ind += 1
60 binnedlist.append([])
61 binnedlist[bin_ind].append(elem)
62 l_ind += 1
63 return binnedlist
64
65 def fetch_weight(H,C,t):
66 if (H-(C-H)) < t:
67 return 2.0
68 else:
69 return 1.0
70
71 def mutabilityEstimator(repeats1,repeats2,thresholds):
72 mut_num = 0.0 #Mutability Numerator
73 mut_den = 0.0 #Mutability denominator
74 for ind,H in enumerate(repeats1):
75 C = repeats2[ind]
76 t = thresholds[ind]
77 w = fetch_weight(H,C,t)
78 mut_num += ((H-C)*(H-C)*w)
79 mut_den += w
80 return [mut_num, mut_den]
81
82 def output_writer(blk, blk_lines):
83 global winspecies, speciesind
84 all_elems_1=[]
85 all_elems_2=[]
86 all_s_elems_1=[]
87 all_s_elems_2=[]
88 for bline in blk_lines:
89 if not(bline):
90 continue
91 items = bline.split('\t')
92 seq1 = items[1]
93 start1 = items[2]
94 end1 = items[3]
95 seq2 = items[8]
96 start2 = items[9]
97 end2 = items[10]
98 if p_group_cols[0] == 6:
99 items[p_group_cols[0]] = int(items[p_group_cols[0]])
100 items[p_group_cols[1]] = int(items[p_group_cols[1]])
101 if s_group_cols[0] == 6:
102 items[s_group_cols[0]] = int(items[s_group_cols[0]])
103 items[s_group_cols[1]] = int(items[s_group_cols[1]])
104 all_elems_1.append(items[p_group_cols[0]]) #primary col elements for species 1
105 all_elems_2.append(items[p_group_cols[1]]) #primary col elements for species 2
106 if s_group_cols[0] != -1: #sub-group is not None
107 all_s_elems_1.append(items[s_group_cols[0]]) #secondary col elements for species 1
108 all_s_elems_2.append(items[s_group_cols[1]]) #secondary col elements for species 2
109 uniq_elems_1 = get_unique_elems(all_elems_1)
110 uniq_elems_2 = get_unique_elems(all_elems_2)
111 if s_group_cols[0] != -1:
112 uniq_s_elems_1 = get_unique_elems(all_s_elems_1)
113 uniq_s_elems_2 = get_unique_elems(all_s_elems_2)
114 mut1={}
115 mut2={}
116 count1 = {}
117 count2 = {}
118 """
119 if p_group_cols[0] == 7: #i.e. the option chosen is group-by unit(AG, GTC, etc)
120 uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y)))
121 """
122 if p_group_cols[0] == 6: #i.e. the option chosen is group-by repeat number.
123 uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size)
124 uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size)
125
126 if s_group_cols[0] == 6: #i.e. the option chosen is subgroup-by repeat number.
127 uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size)
128 uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size)
129
130 for pitem1 in uniq_elems_1:
131 #repeats1 = []
132 #repeats2 = []
133 thresholds = []
134 if s_group_cols[0] != -1: #Sub-group by feature is not None
135 for sitem1 in uniq_s_elems_1:
136 repeats1 = []
137 repeats2 = []
138 if type(sitem1) == type(''):
139 sitem1 = sitem1.strip()
140 for bline in blk_lines:
141 belems = bline.split('\t')
142 if type(pitem1) == list:
143 if p_group_cols[0] == 6:
144 belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
145 if belems[p_group_cols[0]] in pitem1:
146 if belems[s_group_cols[0]]==sitem1:
147 repeats1.append(int(belems[6]))
148 repeats2.append(int(belems[13]))
149 if belems[4] == 'mononucleotide':
150 thresholds.append(mono_threshold)
151 else:
152 thresholds.append(non_mono_threshold)
153 mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
154 if region == 'align':
155 count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
156 else:
157 if winspecies == 1:
158 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
159 elif winspecies == 2:
160 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
161 else:
162 if type(sitem1) == list:
163 if s_group_cols[0] == 6:
164 belems[s_group_cols[0]] = int(belems[s_group_cols[0]])
165 if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1:
166 repeats1.append(int(belems[6]))
167 repeats2.append(int(belems[13]))
168 if belems[4] == 'mononucleotide':
169 thresholds.append(mono_threshold)
170 else:
171 thresholds.append(non_mono_threshold)
172 mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
173 if region == 'align':
174 count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
175 else:
176 if winspecies == 1:
177 count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1)
178 elif winspecies == 2:
179 count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2)
180 else:
181 if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1:
182 repeats1.append(int(belems[6]))
183 repeats2.append(int(belems[13]))
184 if belems[4] == 'mononucleotide':
185 thresholds.append(mono_threshold)
186 else:
187 thresholds.append(non_mono_threshold)
188 mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
189 if region == 'align':
190 count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
191 else:
192 if winspecies == 1:
193 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
194 elif winspecies == 2:
195 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
196 else: #Sub-group by feature is None
197 for bline in blk_lines:
198 belems = bline.split('\t')
199 if type(pitem1) == list:
200 #print >>sys.stderr, "item: " + str(item1)
201 if p_group_cols[0] == 6:
202 belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
203 if belems[p_group_cols[0]] in pitem1:
204 repeats1.append(int(belems[6]))
205 repeats2.append(int(belems[13]))
206 if belems[4] == 'mononucleotide':
207 thresholds.append(mono_threshold)
208 else:
209 thresholds.append(non_mono_threshold)
210 else:
211 if belems[p_group_cols[0]]==pitem1:
212 repeats1.append(int(belems[6]))
213 repeats2.append(int(belems[13]))
214 if belems[4] == 'mononucleotide':
215 thresholds.append(mono_threshold)
216 else:
217 thresholds.append(non_mono_threshold)
218 mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
219 if region == 'align':
220 count1["%s" %(pitem1)]=min(sum(repeats1),sum(repeats2))
221 else:
222 if winspecies == 1:
223 count1[str(pitem1)]=sum(repeats1)
224 elif winspecies == 2:
225 count1[str(pitem1)]=sum(repeats2)
226
227 for pitem2 in uniq_elems_2:
228 #repeats1 = []
229 #repeats2 = []
230 thresholds = []
231 if s_group_cols[0] != -1: #Sub-group by feature is not None
232 for sitem2 in uniq_s_elems_2:
233 repeats1 = []
234 repeats2 = []
235 if type(sitem2)==type(''):
236 sitem2 = sitem2.strip()
237 for bline in blk_lines:
238 belems = bline.split('\t')
239 if type(pitem2) == list:
240 if p_group_cols[0] == 6:
241 belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
242 if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2:
243 repeats2.append(int(belems[13]))
244 repeats1.append(int(belems[6]))
245 if belems[4] == 'mononucleotide':
246 thresholds.append(mono_threshold)
247 else:
248 thresholds.append(non_mono_threshold)
249 mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
250 #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2)
251 if region == 'align':
252 count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
253 else:
254 if winspecies == 1:
255 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
256 elif winspecies == 2:
257 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
258 else:
259 if type(sitem2) == list:
260 if s_group_cols[0] == 6:
261 belems[s_group_cols[1]] = int(belems[s_group_cols[1]])
262 if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2:
263 repeats2.append(int(belems[13]))
264 repeats1.append(int(belems[6]))
265 if belems[4] == 'mononucleotide':
266 thresholds.append(mono_threshold)
267 else:
268 thresholds.append(non_mono_threshold)
269 mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
270 if region == 'align':
271 count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
272 else:
273 if winspecies == 1:
274 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
275 elif winspecies == 2:
276 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
277 else:
278 if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2:
279 repeats1.append(int(belems[13]))
280 repeats2.append(int(belems[6]))
281 if belems[4] == 'mononucleotide':
282 thresholds.append(mono_threshold)
283 else:
284 thresholds.append(non_mono_threshold)
285 mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
286 if region == 'align':
287 count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
288 else:
289 if winspecies == 1:
290 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
291 elif winspecies == 2:
292 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
293 else: #Sub-group by feature is None
294 for bline in blk_lines:
295 belems = bline.split('\t')
296 if type(pitem2) == list:
297 if p_group_cols[0] == 6:
298 belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
299 if belems[p_group_cols[1]] in pitem2:
300 repeats2.append(int(belems[13]))
301 repeats1.append(int(belems[6]))
302 if belems[4] == 'mononucleotide':
303 thresholds.append(mono_threshold)
304 else:
305 thresholds.append(non_mono_threshold)
306 else:
307 if belems[p_group_cols[1]]==pitem2:
308 repeats2.append(int(belems[13]))
309 repeats1.append(int(belems[6]))
310 if belems[4] == 'mononucleotide':
311 thresholds.append(mono_threshold)
312 else:
313 thresholds.append(non_mono_threshold)
314 mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
315 if region == 'align':
316 count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2))
317 else:
318 if winspecies == 1:
319 count2["%s" %(pitem2)]=sum(repeats2)
320 elif winspecies == 2:
321 count2["%s" %(pitem2)]=sum(repeats1)
322 for key in mut1.keys():
323 if key in mut2.keys():
324 mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1])
325 count = count1[key]
326 del mut2[key]
327 else:
328 unit_found = False
329 if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too
330 if p_group_cols[0] == 7:
331 this,other = 0,1
332 else:
333 this,other = 1,0
334 groups1 = key.split('\t')
335 mutn = mut1[key][0]
336 mutd = mut1[key][1]
337 count = 0
338 for key2 in mut2.keys():
339 groups2 = key2.split('\t')
340 if groups1[other] == groups2[other]:
341 if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2:
342 #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1])
343 mutn += mut2[key2][0]
344 mutd += mut2[key2][1]
345 count += int(count2[key2])
346 unit_found = True
347 del mut2[key2]
348 #break
349 if unit_found:
350 mut = mutn/mutd
351 else:
352 mut = mut1[key][0]/mut1[key][1]
353 count = count1[key]
354 mut = "%.2e" %(mut/num_generations)
355 if region == 'align':
356 print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count)
357 elif region == 'win':
358 fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count))
359 fout.flush()
360
361 #catch any remaining repeats, for instance if the orthologous position contained different repeat units
362 for remaining_key in mut2.keys():
363 mut = mut2[remaining_key][0]/mut2[remaining_key][1]
364 mut = "%.2e" %(mut/num_generations)
365 count = count2[remaining_key]
366 if region == 'align':
367 print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
368 elif region == 'win':
369 fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count))
370 fout.flush()
371 #print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
372
373 def counter(node, start, end, report_func):
374 if start <= node.start < end and start < node.end <= end:
375 report_func(node)
376 if node.right:
377 counter(node.right, start, end, report_func)
378 if node.left:
379 counter(node.left, start, end, report_func)
380 elif node.start < start and node.right:
381 counter(node.right, start, end, report_func)
382 elif node.start >= end and node.left and node.left.maxend > start:
383 counter(node.left, start, end, report_func)
384
385
386 def main():
387 infile = sys.argv[1]
388
389 for i, line in enumerate( file ( infile )):
390 line = line.rstrip('\r\n')
391 if len( line )>0 and not line.startswith( '#' ):
392 elems = line.split( '\t' )
393 break
394 if i == 30:
395 break # Hopefully we'll never get here...
396
397 if len( elems ) != 15:
398 stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." )
399 global winspecies, speciesind
400 if region == 'win':
401 if dbkey_i in elems[1]:
402 winspecies = 1
403 speciesind = 1
404 elif dbkey_i in elems[8]:
405 winspecies = 2
406 speciesind = 8
407 else:
408 stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.")
409
410 fin = open(infile, 'r')
411 skipped = 0
412 blk=0
413 win=0
414 linestr=""
415
416 if region == 'win':
417
418 msats = NiceReaderWrapper( fileinput.FileInput( infile ),
419 chrom_col = speciesind,
420 start_col = speciesind+1,
421 end_col = speciesind+2,
422 strand_col = -1,
423 fix_strand = True)
424 msatTree = quicksect.IntervalTree()
425 for item in msats:
426 if type( item ) is GenomicInterval:
427 msatTree.insert( item, msats.linenum, item.fields )
428
429 for iline in fint:
430 try:
431 iline = iline.rstrip('\r\n')
432 if not(iline) or iline == "":
433 continue
434 ielems = iline.strip("\r\n").split('\t')
435 ichr = ielems[chr_col_i]
436 istart = int(ielems[start_col_i])
437 iend = int(ielems[end_col_i])
438 isrc = "%s.%s" %(dbkey_i,ichr)
439 if isrc not in msatTree.chroms:
440 continue
441 result = []
442 root = msatTree.chroms[isrc] #root node for the chrom
443 counter(root, istart, iend, lambda node: result.append( node ))
444 if not(result):
445 continue
446 tmpfile1 = tempfile.NamedTemporaryFile('wb+')
447 for node in result:
448 tmpfile1.write("%s\n" % "\t".join( node.other ))
449
450 tmpfile1.seek(0)
451 output_writer(iline, tmpfile1.readlines())
452 except:
453 skipped+=1
454 if skipped:
455 print "Skipped %d intervals as invalid." %(skipped)
456 elif region == 'align':
457 if s_group_cols[0] != -1:
458 print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount"
459 else:
460 print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount"
461 prev_bnum = -1
462 try:
463 for line in fin:
464 line = line.strip("\r\n")
465 if not(line) or line == "":
466 continue
467 elems = line.split('\t')
468 try:
469 assert int(elems[0])
470 assert len(elems) == 15
471 except:
472 continue
473 new_bnum = int(elems[0])
474 if new_bnum != prev_bnum:
475 if prev_bnum != -1:
476 output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
477 linestr = line + "\n"
478 else:
479 linestr += line
480 linestr += "\n"
481 prev_bnum = new_bnum
482 output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
483 except Exception, ea:
484 print >>sys.stderr, ea
485 skipped += 1
486 if skipped:
487 print "Skipped %d lines as invalid." %(skipped)
488 if __name__ == "__main__":
489 main()