0
|
1 ### import libraries ###
|
|
2 import sys
|
|
3 import collections, math
|
|
4 import heapq
|
|
5 import itertools
|
|
6
|
|
7
|
|
8
|
|
9 ### basic function ###
|
|
10 def permuterepeat(n,rlist):
|
|
11 f = math.factorial
|
|
12 nfac=f(n)
|
|
13 rfaclist=[f(i) for i in rlist]
|
|
14 for rfac in rfaclist:
|
|
15 nfac=nfac/rfac
|
|
16 return nfac
|
|
17
|
|
18 def nCr(n,r):
|
|
19 f = math.factorial
|
|
20 return f(n) / f(r) / f(n-r)
|
|
21
|
|
22 def averagelist(a,b,expectedlevelofminor):
|
|
23 product=[]
|
|
24 for i in range(len(a)):
|
|
25 product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i])
|
|
26
|
|
27 return product
|
|
28
|
|
29 def complement_base(read):
|
|
30 collect=''
|
|
31 for i in read:
|
|
32 if i.upper()=='A':
|
|
33 collect+='T'
|
|
34 elif i.upper()=='T':
|
|
35 collect+='A'
|
|
36 elif i.upper()=='C':
|
|
37 collect+='G'
|
|
38 elif i.upper()=='G':
|
|
39 collect+='C'
|
|
40 return collect
|
|
41 def makeallpossible(read):
|
|
42 collect=[]
|
|
43 for i in range(len(read)):
|
|
44 tmp= read[i:]+read[:i]
|
|
45 collect.append(tmp)
|
|
46 collect.append(complement_base(tmp))
|
|
47 return collect
|
|
48
|
|
49 def motifsimplify(base):
|
|
50 '''str--> str
|
|
51 '''
|
|
52 motiflength=len(base)
|
|
53 temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base))))
|
|
54
|
|
55 return temp[0]
|
|
56
|
|
57 def majorallele(seq):
|
|
58 binseq=list(set(seq))
|
|
59 binseq.sort(reverse=True) # highly mutate mode
|
|
60 #binseq.sort() # majority mode
|
|
61 storeform=''
|
|
62 storevalue=0
|
|
63 for i in binseq:
|
|
64 if seq.count(i)>storevalue:
|
|
65 storeform=i
|
|
66 storevalue=seq.count(i)
|
|
67
|
|
68 return int(storeform)
|
|
69
|
|
70 ### decide global parameter ###
|
|
71 COORDINATECOLUMN=1
|
|
72 ALLELECOLUMN=2
|
|
73 MOTIFCOLUMN=3
|
|
74 inputname=sys.argv[1]
|
|
75 errorprofile=sys.argv[2]
|
|
76 EXPECTEDLEVELOFMINOR=float(sys.argv[3])
|
|
77 if EXPECTEDLEVELOFMINOR >0.5:
|
|
78 try:
|
|
79 errorexpectcontribution=int('a')
|
|
80 except Exception, eee:
|
|
81 print eee
|
|
82 stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5")
|
|
83 MINIMUMMUTABLE=0 ###1.2*(1.0/(10**8)) #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012
|
|
84
|
|
85
|
|
86 ## Fixed global variable
|
|
87 ALLREPEATTYPE=[1,2,3,4]
|
|
88 ALLREPEATTYPENAME=['mono','di','tri','tetra']
|
|
89 monomotif=['A','C']
|
|
90 dimotif=['AC','AG','AT','CG']
|
|
91 trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG']
|
|
92 tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\
|
|
93 'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\
|
|
94 'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC']
|
|
95 ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}
|
|
96 monorange=range(5,60)
|
|
97 dirange=range(6,60)
|
|
98 trirange=range(9,60)
|
|
99 tetrarange=range(12,80)
|
|
100 ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}
|
|
101
|
|
102 #########################################
|
|
103 ######## Prob calculation sector ########
|
|
104 #########################################
|
|
105 def multinomial_prob(majorallele,STRlength,motif,probdatabase):
|
|
106 '''int,int,str,dict-->int
|
|
107 ### get prob for each STRlength to be generated from major allele
|
|
108 '''
|
|
109 #print (majorallele,STRlength,motif)
|
|
110 prob=probdatabase[len(motif)][motif][majorallele][STRlength]
|
|
111 return prob
|
|
112
|
|
113 ################################################
|
|
114 ######## error model database sector ###########
|
|
115 ################################################
|
|
116
|
|
117 ## structure generator
|
|
118 errormodeldatabase={1:{},2:{},3:{},4:{}}
|
|
119 sumbymajoralleledatabase={1:{},2:{},3:{},4:{}}
|
|
120 for repeattype in ALLREPEATTYPE:
|
|
121 for motif in ALLMOTIF[repeattype]:
|
|
122 errormodeldatabase[repeattype][motif]={}
|
|
123 sumbymajoralleledatabase[repeattype][motif]={}
|
|
124 for motifsize1 in ALLRANGE[repeattype]:
|
|
125 errormodeldatabase[repeattype][motif][motifsize1]={}
|
|
126 sumbymajoralleledatabase[repeattype][motif][motifsize1]=0
|
|
127 for motifsize2 in ALLRANGE[repeattype]:
|
|
128 errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE
|
|
129 #print errormodeldatabase
|
|
130 ## read database
|
|
131
|
|
132 ## get read count for each major allele
|
|
133 fd=open(errorprofile)
|
|
134 lines=fd.readlines()
|
|
135 for line in lines:
|
|
136 temp=line.strip().split('\t')
|
|
137 t_major=int(temp[0])
|
|
138 t_count=int(temp[2])
|
|
139 motif=temp[3]
|
|
140 sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count
|
|
141 fd.close()
|
|
142 ##print sumbymajoralleledatabase
|
|
143
|
|
144 ## get probability
|
|
145 fd=open(errorprofile)
|
|
146 lines=fd.readlines()
|
|
147 for line in lines:
|
|
148 temp=line.strip().split('\t')
|
|
149 t_major=int(temp[0])
|
|
150 t_read=int(temp[1])
|
|
151 t_count=int(temp[2])
|
|
152 motif=temp[3]
|
|
153 if sumbymajoralleledatabase[len(motif)][motif][t_major]>0:
|
|
154 errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0)
|
|
155 #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0))
|
|
156
|
|
157 #else:
|
|
158 # errormodeldatabase[repeattype][motif][t_major][t_read]=0
|
|
159 fd.close()
|
|
160 #print errormodeldatabase
|
|
161 #print math.log(100,10)
|
|
162 #########################################
|
|
163 ######## input reading sector ###########
|
|
164 #########################################
|
|
165
|
|
166
|
|
167
|
|
168 fd = open(inputname)
|
|
169 ##fd=open('sampleinput_C.txt')
|
|
170 lines=fd.xreadlines()
|
|
171 for line in lines:
|
|
172 i_read=[]
|
|
173 i2_read=[]
|
|
174 temp=line.strip().split('\t')
|
|
175 i_coordinate=temp[COORDINATECOLUMN-1]
|
|
176 i_motif=motifsimplify(temp[MOTIFCOLUMN-1])
|
|
177 i_read=temp[ALLELECOLUMN-1].split(',')
|
|
178 i_read=map(int,i_read)
|
|
179 depth=len(i_read)
|
|
180 heteromajor1=int(temp[6])
|
|
181 heteromajor2=int(temp[7])
|
|
182
|
|
183 ### calculate the change to detect combination (using error profile)
|
|
184 heterozygous_collector=0
|
|
185 alist=[multinomial_prob(heteromajor1,x,i_motif,errormodeldatabase)for x in i_read]
|
|
186 blist=[multinomial_prob(heteromajor2,x,i_motif,errormodeldatabase)for x in i_read]
|
|
187
|
|
188 ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR)
|
|
189
|
|
190 if 0 in ablist:
|
|
191 continue
|
|
192 heterozygous_collector=reduce(lambda y, z: y*z,ablist )
|
|
193
|
|
194 ### prob of combination (using multinomial distribution)
|
|
195 frequency_distribution=[len(list(group)) for key, group in itertools.groupby(i_read)]
|
|
196 ## print frequency_distribution
|
|
197 expandbypermutation=permuterepeat(depth,frequency_distribution)
|
|
198
|
|
199 print line.strip()+'\t'+str(heterozygous_collector)+'\t'+str(expandbypermutation)+'\t'+str(expandbypermutation*heterozygous_collector)+'\t'+str(depth)
|