sharplabtool: tools/regVariation/microsats

comparison tools/regVariation/microsats_mutability.py @ 0:9071e359b9a3

Uploaded

author	xuebing
date	Fri, 09 Mar 2012 19:37:19 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:9071e359b9a3
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+This tool computes microsatellite mutability for the orthologous microsatellites fetched from  'Extract Orthologous Microsatellites from pair-wise alignments' tool.
+"""
+from galaxy import eggs
+import sys, string, re, commands, tempfile, os, fileinput
+from galaxy.tools.util.galaxyops import *
+from bx.intervals.io import *
+from bx.intervals.operations import quicksect
+fout = open(sys.argv[2],'w')
+p_group = int(sys.argv[3])        #primary "group-by" feature
+p_bin_size = int(sys.argv[4])
+s_group = int(sys.argv[5])        #sub-group by feature
+s_bin_size = int(sys.argv[6])
+mono_threshold = 9
+non_mono_threshold = 4
+p_group_cols = [p_group, p_group+7]
+s_group_cols = [s_group, s_group+7]
+num_generations = int(sys.argv[7])
+region = sys.argv[8]
+int_file = sys.argv[9]
+if int_file != "None": #User has specified an interval file
+try:
+fint = open(int_file, 'r')
+dbkey_i = sys.argv[10]
+chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] )
+except:
+stop_err("Unable to open input Interval file")
+def stop_err(msg):
+sys.stderr.write(msg)
+sys.exit()
+def reverse_complement(text):
+DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
+comp = [ch for ch in text.translate(DNA_COMP)]
+comp.reverse()
+return "".join(comp)
+def get_unique_elems(elems):
+seen=set()
+return[x for x in elems if x not in seen and not seen.add(x)]
+def get_binned_lists(uniqlist, binsize):
+binnedlist=[]
+uniqlist.sort()
+start = int(uniqlist[0])
+bin_ind=0
+l_ind=0
+binnedlist.append([])
+while l_ind < len(uniqlist):
+elem = int(uniqlist[l_ind])
+if elem in range(start,start+binsize):
+binnedlist[bin_ind].append(elem)
+else:
+start += binsize
+bin_ind += 1
+binnedlist.append([])
+binnedlist[bin_ind].append(elem)
+l_ind += 1
+return binnedlist
+def fetch_weight(H,C,t):
+if (H-(C-H)) < t:
+return 2.0
+else:
+return 1.0
+def mutabilityEstimator(repeats1,repeats2,thresholds):
+mut_num = 0.0    #Mutability Numerator
+mut_den = 0.0    #Mutability denominator
+for ind,H in enumerate(repeats1):
+C = repeats2[ind]
+t = thresholds[ind]
+w = fetch_weight(H,C,t)
+mut_num += ((H-C)*(H-C)*w)
+mut_den += w
+return [mut_num, mut_den]
+def output_writer(blk, blk_lines):
+global winspecies, speciesind
+all_elems_1=[]
+all_elems_2=[]
+all_s_elems_1=[]
+all_s_elems_2=[]
+for bline in blk_lines:
+if not(bline):
+continue
+items = bline.split('\t')
+seq1 = items[1]
+start1 = items[2]
+end1 = items[3]
+seq2 = items[8]
+start2 = items[9]
+end2 = items[10]
+if p_group_cols[0] == 6:
+items[p_group_cols[0]] = int(items[p_group_cols[0]])
+items[p_group_cols[1]] = int(items[p_group_cols[1]])
+if s_group_cols[0] == 6:
+items[s_group_cols[0]] = int(items[s_group_cols[0]])
+items[s_group_cols[1]] = int(items[s_group_cols[1]])
+all_elems_1.append(items[p_group_cols[0]])    #primary col elements for species 1
+all_elems_2.append(items[p_group_cols[1]])    #primary col elements for species 2
+if s_group_cols[0] != -1:    #sub-group is not None
+all_s_elems_1.append(items[s_group_cols[0]])    #secondary col elements for species 1
+all_s_elems_2.append(items[s_group_cols[1]])    #secondary col elements for species 2
+uniq_elems_1 = get_unique_elems(all_elems_1)
+uniq_elems_2 = get_unique_elems(all_elems_2)
+if s_group_cols[0] != -1:
+uniq_s_elems_1 = get_unique_elems(all_s_elems_1)
+uniq_s_elems_2 = get_unique_elems(all_s_elems_2)
+mut1={}
+mut2={}
+count1 = {}
+count2 = {}
+"""
+if p_group_cols[0] == 7:    #i.e. the option chosen is group-by unit(AG, GTC, etc)
+uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y)))
+"""
+if p_group_cols[0] == 6:    #i.e. the option chosen is group-by repeat number.
+uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size)
+uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size)
+if s_group_cols[0] == 6:    #i.e. the option chosen is subgroup-by repeat number.
+uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size)
+uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size)
+for pitem1 in uniq_elems_1:
+#repeats1 = []
+#repeats2 = []
+thresholds = []
+if s_group_cols[0] != -1:    #Sub-group by feature is not None
+for sitem1 in uniq_s_elems_1:
+repeats1 = []
+repeats2 = []
+if type(sitem1) == type(''):
+sitem1 = sitem1.strip()
+for bline in blk_lines:
+belems = bline.split('\t')
+if type(pitem1) == list:
+if p_group_cols[0] == 6:
+belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
+if belems[p_group_cols[0]] in pitem1:
+if belems[s_group_cols[0]]==sitem1:
+repeats1.append(int(belems[6]))
+repeats2.append(int(belems[13]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
+if region == 'align':
+count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
+else:
+if winspecies == 1:
+count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
+elif winspecies == 2:
+count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
+else:
+if type(sitem1) == list:
+if s_group_cols[0] == 6:
+belems[s_group_cols[0]] = int(belems[s_group_cols[0]])
+if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1:
+repeats1.append(int(belems[6]))
+repeats2.append(int(belems[13]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
+if region == 'align':
+count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
+else:
+if winspecies == 1:
+count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1)
+elif winspecies == 2:
+count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2)
+else:
+if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1:
+repeats1.append(int(belems[6]))
+repeats2.append(int(belems[13]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
+if region == 'align':
+count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
+else:
+if winspecies == 1:
+count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
+elif winspecies == 2:
+count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
+else:   #Sub-group by feature is None
+for bline in blk_lines:
+belems = bline.split('\t')
+if type(pitem1) == list:
+#print >>sys.stderr, "item: " + str(item1)
+if p_group_cols[0] == 6:
+belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
+if belems[p_group_cols[0]] in pitem1:
+repeats1.append(int(belems[6]))
+repeats2.append(int(belems[13]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+else:
+if belems[p_group_cols[0]]==pitem1:
+repeats1.append(int(belems[6]))
+repeats2.append(int(belems[13]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
+if region == 'align':
+count1["%s" %(pitem1)]=min(sum(repeats1),sum(repeats2))
+else:
+if winspecies == 1:
+count1[str(pitem1)]=sum(repeats1)
+elif winspecies == 2:
+count1[str(pitem1)]=sum(repeats2)
+for pitem2 in uniq_elems_2:
+#repeats1 = []
+#repeats2 = []
+thresholds = []
+if s_group_cols[0] != -1:    #Sub-group by feature is not None
+for sitem2 in uniq_s_elems_2:
+repeats1 = []
+repeats2 = []
+if type(sitem2)==type(''):
+sitem2 = sitem2.strip()
+for bline in blk_lines:
+belems = bline.split('\t')
+if type(pitem2) == list:
+if p_group_cols[0] == 6:
+belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
+if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2:
+repeats2.append(int(belems[13]))
+repeats1.append(int(belems[6]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
+#count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2)
+if region == 'align':
+count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
+else:
+if winspecies == 1:
+count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
+elif winspecies == 2:
+count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
+else:
+if type(sitem2) == list:
+if s_group_cols[0] == 6:
+belems[s_group_cols[1]] = int(belems[s_group_cols[1]])
+if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2:
+repeats2.append(int(belems[13]))
+repeats1.append(int(belems[6]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
+if region == 'align':
+count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
+else:
+if winspecies == 1:
+count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
+elif winspecies == 2:
+count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
+else:
+if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2:
+repeats1.append(int(belems[13]))
+repeats2.append(int(belems[6]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
+if region == 'align':
+count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
+else:
+if winspecies == 1:
+count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
+elif winspecies == 2:
+count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
+else:   #Sub-group by feature is None
+for bline in blk_lines:
+belems = bline.split('\t')
+if type(pitem2) == list:
+if p_group_cols[0] == 6:
+belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
+if belems[p_group_cols[1]] in pitem2:
+repeats2.append(int(belems[13]))
+repeats1.append(int(belems[6]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+else:
+if belems[p_group_cols[1]]==pitem2:
+repeats2.append(int(belems[13]))
+repeats1.append(int(belems[6]))
+if belems[4] == 'mononucleotide':
+thresholds.append(mono_threshold)
+else:
+thresholds.append(non_mono_threshold)
+mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
+if region == 'align':
+count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2))
+else:
+if winspecies == 1:
+count2["%s" %(pitem2)]=sum(repeats2)
+elif winspecies == 2:
+count2["%s" %(pitem2)]=sum(repeats1)
+for key in mut1.keys():
+if key in mut2.keys():
+mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1])
+count = count1[key]
+del mut2[key]
+else:
+unit_found = False
+if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too
+if p_group_cols[0] == 7:
+this,other = 0,1
+else:
+this,other = 1,0
+groups1 = key.split('\t')
+mutn = mut1[key][0]
+mutd = mut1[key][1]
+count = 0
+for key2 in mut2.keys():
+groups2 = key2.split('\t')
+if groups1[other] == groups2[other]:
+if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2:
+#mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1])
+mutn += mut2[key2][0]
+mutd += mut2[key2][1]
+count += int(count2[key2])
+unit_found = True
+del mut2[key2]
+#break
+if unit_found:
+mut = mutn/mutd
+else:
+mut = mut1[key][0]/mut1[key][1]
+count = count1[key]
+mut = "%.2e" %(mut/num_generations)
+if region == 'align':
+print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count)
+elif region == 'win':
+fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count))
+fout.flush()
+#catch any remaining repeats, for instance if the orthologous position contained different repeat units
+for remaining_key in mut2.keys():
+mut = mut2[remaining_key][0]/mut2[remaining_key][1]
+mut = "%.2e" %(mut/num_generations)
+count = count2[remaining_key]
+if region == 'align':
+print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
+elif region == 'win':
+fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count))
+fout.flush()
+#print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
+def counter(node, start, end, report_func):
+if start <= node.start < end and start < node.end <= end:
+report_func(node)
+if node.right:
+counter(node.right, start, end, report_func)
+if node.left:
+counter(node.left, start, end, report_func)
+elif node.start < start and node.right:
+counter(node.right, start, end, report_func)
+elif node.start >= end and node.left and node.left.maxend > start:
+counter(node.left, start, end, report_func)
+def main():
+infile = sys.argv[1]
+for i, line in enumerate( file ( infile )):
+line = line.rstrip('\r\n')
+if len( line )>0 and not line.startswith( '#' ):
+elems = line.split( '\t' )
+break
+if i == 30:
+break # Hopefully we'll never get here...
+if len( elems ) != 15:
+stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." )
+global winspecies, speciesind
+if region == 'win':
+if dbkey_i in elems[1]:
+winspecies = 1
+speciesind = 1
+elif dbkey_i in elems[8]:
+winspecies = 2
+speciesind = 8
+else:
+stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.")
+fin = open(infile, 'r')
+skipped = 0
+blk=0
+win=0
+linestr=""
+if region == 'win':
+msats = NiceReaderWrapper( fileinput.FileInput( infile ),
+chrom_col = speciesind,
+start_col = speciesind+1,
+end_col = speciesind+2,
+strand_col = -1,
+fix_strand = True)
+msatTree = quicksect.IntervalTree()
+for item in msats:
+if type( item ) is GenomicInterval:
+msatTree.insert( item, msats.linenum, item.fields )
+for iline in fint:
+try:
+iline = iline.rstrip('\r\n')
+if not(iline) or iline == "":
+continue
+ielems = iline.strip("\r\n").split('\t')
+ichr = ielems[chr_col_i]
+istart = int(ielems[start_col_i])
+iend = int(ielems[end_col_i])
+isrc = "%s.%s" %(dbkey_i,ichr)
+if isrc not in msatTree.chroms:
+continue
+result = []
+root = msatTree.chroms[isrc]    #root node for the chrom
+counter(root, istart, iend, lambda node: result.append( node ))
+if not(result):
+continue
+tmpfile1 = tempfile.NamedTemporaryFile('wb+')
+for node in result:
+tmpfile1.write("%s\n" % "\t".join( node.other ))
+tmpfile1.seek(0)
+output_writer(iline, tmpfile1.readlines())
+except:
+skipped+=1
+if skipped:
+print "Skipped %d intervals as invalid." %(skipped)
+elif region == 'align':
+if s_group_cols[0] != -1:
+print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount"
+else:
+print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount"
+prev_bnum = -1
+try:
+for line in fin:
+line = line.strip("\r\n")
+if not(line) or line == "":
+continue
+elems = line.split('\t')
+try:
+assert int(elems[0])
+assert len(elems) == 15
+except:
+continue
+new_bnum = int(elems[0])
+if new_bnum != prev_bnum:
+if prev_bnum != -1:
+output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
+linestr = line + "\n"
+else:
+linestr += line
+linestr += "\n"
+prev_bnum = new_bnum
+output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
+except Exception, ea:
+print >>sys.stderr, ea
+skipped += 1
+if skipped:
+print "Skipped %d lines as invalid." %(skipped)
+if __name__ == "__main__":
+main()

Mercurial > repos > xuebing > sharplabtool

comparison tools/regVariation/microsats_mutability.py @ 0:9071e359b9a3