0
|
1 ### import libraries ###
|
|
2 import sys
|
|
3 import collections, math
|
|
4 import heapq
|
|
5
|
|
6
|
|
7
|
|
8
|
|
9
|
|
10 ### basic function ###
|
|
11 def stop_err(msg):
|
|
12 sys.stderr.write(msg)
|
|
13 sys.exit()
|
|
14
|
|
15 def averagelist(a,b,expectedlevelofminor):
|
|
16 product=[]
|
|
17 for i in range(len(a)):
|
|
18 product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i])
|
|
19
|
|
20 return product
|
|
21
|
|
22 def complement_base(read):
|
|
23 collect=''
|
|
24 for i in read:
|
|
25 if i.upper()=='A':
|
|
26 collect+='T'
|
|
27 elif i.upper()=='T':
|
|
28 collect+='A'
|
|
29 elif i.upper()=='C':
|
|
30 collect+='G'
|
|
31 elif i.upper()=='G':
|
|
32 collect+='C'
|
|
33 return collect
|
|
34 def makeallpossible(read):
|
|
35 collect=[]
|
|
36 for i in range(len(read)):
|
|
37 tmp= read[i:]+read[:i]
|
|
38 collect.append(tmp)
|
|
39 collect.append(complement_base(tmp))
|
|
40 return collect
|
|
41
|
|
42 def motifsimplify(base):
|
|
43 '''str--> str
|
|
44 '''
|
|
45 motiflength=len(base)
|
|
46 temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base))))
|
|
47
|
|
48 return temp[0]
|
|
49
|
|
50 def majorallele(seq):
|
|
51 binseq=list(set(seq))
|
|
52 binseq.sort(reverse=True) # highly mutate mode
|
|
53 #binseq.sort() # majority mode
|
|
54 storeform=''
|
|
55 storevalue=0
|
|
56 for i in binseq:
|
|
57 if seq.count(i)>storevalue:
|
|
58 storeform=i
|
|
59 storevalue=seq.count(i)
|
|
60
|
|
61 return int(storeform)
|
|
62
|
|
63 ### decide global parameter ###
|
|
64 COORDINATECOLUMN=1
|
|
65 ALLELECOLUMN=2
|
|
66 MOTIFCOLUMN=3
|
|
67 ##(0.01-0.5)
|
|
68 MINIMUMMUTABLE=1.2*(1.0/(10**8)) #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012
|
|
69
|
|
70
|
|
71 ## Fixed global variable
|
|
72 inputname=sys.argv[1]
|
|
73 errorprofile=sys.argv[2]
|
|
74 Genotypingcorrected=sys.argv[3]
|
|
75 EXPECTEDLEVELOFMINOR=float(sys.argv[4])
|
|
76 if EXPECTEDLEVELOFMINOR >0.5:
|
|
77 try:
|
|
78 expected_contribution_of_minor_allele=int('expected_contribution_of_minor_allele')
|
|
79 except Exception, eee:
|
|
80 print eee
|
|
81 stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5")
|
|
82 ALLREPEATTYPE=[1,2,3,4]
|
|
83 ALLREPEATTYPENAME=['mono','di','tri','tetra']
|
|
84 monomotif=['A','C']
|
|
85 dimotif=['AC','AG','AT','CG']
|
|
86 trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG']
|
|
87 tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\
|
|
88 'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\
|
|
89 'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC']
|
|
90 ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}
|
|
91 monorange=range(5,60)
|
|
92 dirange=range(6,60)
|
|
93 trirange=range(9,60)
|
|
94 tetrarange=range(12,80)
|
|
95 ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}
|
|
96
|
|
97 #########################################
|
|
98 ######## Prob calculation sector ########
|
|
99 #########################################
|
|
100 def multinomial_prob(majorallele,STRlength,motif,probdatabase):
|
|
101 '''int,int,str,dict-->int
|
|
102 ### get prob for each STRlength to be generated from major allele
|
|
103 '''
|
|
104 #print (majorallele,STRlength,motif)
|
|
105 prob=probdatabase[len(motif)][motif][majorallele][STRlength]
|
|
106 return prob
|
|
107
|
|
108 ################################################
|
|
109 ######## error model database sector ###########
|
|
110 ################################################
|
|
111
|
|
112 ## structure generator
|
|
113 errormodeldatabase={1:{},2:{},3:{},4:{}}
|
|
114 sumbymajoralleledatabase={1:{},2:{},3:{},4:{}}
|
|
115 for repeattype in ALLREPEATTYPE:
|
|
116 for motif in ALLMOTIF[repeattype]:
|
|
117 errormodeldatabase[repeattype][motif]={}
|
|
118 sumbymajoralleledatabase[repeattype][motif]={}
|
|
119 for motifsize1 in ALLRANGE[repeattype]:
|
|
120 errormodeldatabase[repeattype][motif][motifsize1]={}
|
|
121 sumbymajoralleledatabase[repeattype][motif][motifsize1]=0
|
|
122 for motifsize2 in ALLRANGE[repeattype]:
|
|
123 errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE
|
|
124
|
|
125 #print errormodeldatabase
|
|
126 ## read database
|
|
127
|
|
128
|
|
129 ## get read count for each major allele
|
|
130 fd=open(errorprofile)
|
|
131 lines=fd.readlines()
|
|
132 for line in lines:
|
|
133 temp=line.strip().split('\t')
|
|
134 t_major=int(temp[0])
|
|
135 t_count=int(temp[2])
|
|
136 motif=temp[3]
|
|
137 sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count
|
|
138 fd.close()
|
|
139 ##print sumbymajoralleledatabase
|
|
140
|
|
141 ## get probability
|
|
142 fd=open(errorprofile)
|
|
143 lines=fd.readlines()
|
|
144 for line in lines:
|
|
145 temp=line.strip().split('\t')
|
|
146 t_major=int(temp[0])
|
|
147 t_read=int(temp[1])
|
|
148 t_count=int(temp[2])
|
|
149 motif=temp[3]
|
|
150 if sumbymajoralleledatabase[len(motif)][motif][t_major]>0:
|
|
151 errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0)
|
|
152 #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0))
|
|
153
|
|
154 #else:
|
|
155 # errormodeldatabase[repeattype][motif][t_major][t_read]=0
|
|
156 fd.close()
|
|
157
|
|
158 #########################################
|
|
159 ######## input reading sector ###########
|
|
160 #########################################
|
|
161 fdout=open(Genotypingcorrected,'w')
|
|
162
|
|
163 fd = open(inputname)
|
|
164
|
|
165 lines=fd.xreadlines()
|
|
166 for line in lines:
|
|
167 i_read=[]
|
|
168 i2_read=[]
|
|
169 temp=line.strip().split('\t')
|
|
170 i_coordinate=temp[COORDINATECOLUMN-1]
|
|
171 i_motif=motifsimplify(temp[MOTIFCOLUMN-1])
|
|
172 i_read=temp[ALLELECOLUMN-1].split(',')
|
|
173 i_read=map(int,i_read)
|
|
174 coverage=len(i_read)
|
|
175
|
|
176 ### Evaluate 1 major allele ###
|
|
177 i_all_allele=list(set(i_read))
|
|
178 i_major_allele=majorallele(i_read)
|
|
179 f_majorallele=i_read.count(i_major_allele)
|
|
180 ### Evaluate 2 major allele ###
|
|
181 if len(i_all_allele)>1:
|
|
182 i2_read=filter(lambda a: a != i_major_allele, i_read)
|
|
183 i_major2_allele=majorallele(i2_read)
|
|
184 f_majorallele2=i_read.count(i_major2_allele)
|
|
185 ### Evaluate 3 major allele ###
|
|
186 if len(i_all_allele)>2:
|
|
187 i3_read=filter(lambda a: a != i_major2_allele, i2_read)
|
|
188 i_major3_allele=majorallele(i3_read)
|
|
189 f_majorallele3=i_read.count(i_major3_allele)
|
|
190 ### No 3 major allele ###
|
|
191 elif len(i_all_allele)==2:
|
|
192 i_major3_allele=i_major2_allele
|
|
193 ### No 2 major allele ###
|
|
194 elif len(i_all_allele)==1:
|
|
195 #i_major2_allele=majorallele(i_read)
|
|
196 i_major2_allele=i_major_allele+len(i_motif)
|
|
197 i_major3_allele=i_major2_allele
|
|
198 #print line.strip()+'\t'+'\t'.join(['homo','only',str(i_major_allele),str(i_major_allele),'NA'])
|
|
199 #continue
|
|
200 else:
|
|
201 print("no allele is reading")
|
|
202 sys.exit()
|
|
203
|
|
204 ## scope filter
|
|
205
|
|
206 #########################################
|
5
|
207 ######## prob calculation sector ########
|
0
|
208 #########################################
|
|
209 homozygous_collector=0
|
|
210 heterozygous_collector=0
|
|
211
|
|
212
|
|
213 alist=[multinomial_prob(i_major_allele,x,i_motif,errormodeldatabase)for x in i_read]
|
|
214 blist=[multinomial_prob(i_major2_allele,x,i_motif,errormodeldatabase)for x in i_read]
|
|
215 clist=[multinomial_prob(i_major3_allele,x,i_motif,errormodeldatabase)for x in i_read]
|
|
216
|
|
217 ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR)
|
|
218 bclist=averagelist(blist,clist,EXPECTEDLEVELOFMINOR)
|
|
219 aclist=averagelist(alist,clist,EXPECTEDLEVELOFMINOR)
|
|
220
|
|
221 #print alist,blist,clist
|
|
222 majora=sum([math.log(i,10) for i in alist])
|
|
223 majorb=sum([math.log(i,10) for i in blist])
|
|
224 majorc=sum([math.log(i,10) for i in clist])
|
|
225 homozygous_collector=max(majora,majorb,majorc)
|
|
226
|
|
227 homomajor1=max([(majora,i_major_allele),(majorb,i_major2_allele),(majorc,i_major3_allele)])[1]
|
|
228 homomajordict={i_major_allele:majora,i_major2_allele:majorb,i_major3_allele:majorc}
|
|
229
|
|
230 majorab=sum([math.log(i,10) for i in ablist])
|
|
231 majorbc=sum([math.log(i,10) for i in bclist])
|
|
232 majorac=sum([math.log(i,10) for i in aclist])
|
|
233 heterozygous_collector=max(majorab,majorbc,majorac)
|
|
234 bothheteromajor=max([(majorab,(i_major_allele,i_major2_allele)),(majorbc,(i_major2_allele,i_major3_allele)),(majorac,(i_major_allele,i_major3_allele))])[1]
|
|
235 ##heteromajor1=max(bothheteromajor)
|
|
236 ##heteromajor2=min(bothheteromajor)
|
|
237 pre_heteromajor1=bothheteromajor[0]
|
|
238 pre_heteromajor2=bothheteromajor[1]
|
|
239 heteromajor1=max((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]
|
|
240 heteromajor2=min((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]
|
|
241
|
|
242 logratio_homo=homozygous_collector-heterozygous_collector
|
|
243
|
|
244 if logratio_homo>0:
|
|
245 fdout.writelines(line.strip()+'\t'+'\t'.join(['homo',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n')
|
|
246 elif logratio_homo<0:
|
|
247 fdout.writelines(line.strip()+'\t'+'\t'.join(['hetero',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n')
|
|
248 fd.close()
|
|
249 fdout.close()
|