comparison GenotypeTRcorrection.py @ 0:70f8259b0b30 draft

Uploaded
author arkarachai-fungtammasan
date Wed, 01 Apr 2015 16:48:58 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:70f8259b0b30
1 ### import libraries ###
2 import sys
3 import collections, math
4 import heapq
5 from galaxy import eggs
6
7
8
9
10
11 ### basic function ###
12 def stop_err(msg):
13 sys.stderr.write(msg)
14 sys.exit()
15
16 def averagelist(a,b,expectedlevelofminor):
17 product=[]
18 for i in range(len(a)):
19 product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i])
20
21 return product
22
23 def complement_base(read):
24 collect=''
25 for i in read:
26 if i.upper()=='A':
27 collect+='T'
28 elif i.upper()=='T':
29 collect+='A'
30 elif i.upper()=='C':
31 collect+='G'
32 elif i.upper()=='G':
33 collect+='C'
34 return collect
35 def makeallpossible(read):
36 collect=[]
37 for i in range(len(read)):
38 tmp= read[i:]+read[:i]
39 collect.append(tmp)
40 collect.append(complement_base(tmp))
41 return collect
42
43 def motifsimplify(base):
44 '''str--> str
45 '''
46 motiflength=len(base)
47 temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base))))
48
49 return temp[0]
50
51 def majorallele(seq):
52 binseq=list(set(seq))
53 binseq.sort(reverse=True) # highly mutate mode
54 #binseq.sort() # majority mode
55 storeform=''
56 storevalue=0
57 for i in binseq:
58 if seq.count(i)>storevalue:
59 storeform=i
60 storevalue=seq.count(i)
61
62 return int(storeform)
63
64 ### decide global parameter ###
65 COORDINATECOLUMN=1
66 ALLELECOLUMN=2
67 MOTIFCOLUMN=3
68 ##(0.01-0.5)
69 MINIMUMMUTABLE=1.2*(1.0/(10**8)) #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012
70
71
72 ## Fixed global variable
73 inputname=sys.argv[1]
74 errorprofile=sys.argv[2]
75 Genotypingcorrected=sys.argv[3]
76 EXPECTEDLEVELOFMINOR=float(sys.argv[4])
77 if EXPECTEDLEVELOFMINOR >0.5:
78 try:
79 expected_contribution_of_minor_allele=int('expected_contribution_of_minor_allele')
80 except Exception, eee:
81 print eee
82 stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5")
83 ALLREPEATTYPE=[1,2,3,4]
84 ALLREPEATTYPENAME=['mono','di','tri','tetra']
85 monomotif=['A','C']
86 dimotif=['AC','AG','AT','CG']
87 trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG']
88 tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\
89 'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\
90 'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC']
91 ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}
92 monorange=range(5,60)
93 dirange=range(6,60)
94 trirange=range(9,60)
95 tetrarange=range(12,80)
96 ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}
97
98 #########################################
99 ######## Prob calculation sector ########
100 #########################################
101 def multinomial_prob(majorallele,STRlength,motif,probdatabase):
102 '''int,int,str,dict-->int
103 ### get prob for each STRlength to be generated from major allele
104 '''
105 #print (majorallele,STRlength,motif)
106 prob=probdatabase[len(motif)][motif][majorallele][STRlength]
107 return prob
108
109 ################################################
110 ######## error model database sector ###########
111 ################################################
112
113 ## structure generator
114 errormodeldatabase={1:{},2:{},3:{},4:{}}
115 sumbymajoralleledatabase={1:{},2:{},3:{},4:{}}
116 for repeattype in ALLREPEATTYPE:
117 for motif in ALLMOTIF[repeattype]:
118 errormodeldatabase[repeattype][motif]={}
119 sumbymajoralleledatabase[repeattype][motif]={}
120 for motifsize1 in ALLRANGE[repeattype]:
121 errormodeldatabase[repeattype][motif][motifsize1]={}
122 sumbymajoralleledatabase[repeattype][motif][motifsize1]=0
123 for motifsize2 in ALLRANGE[repeattype]:
124 errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE
125
126 #print errormodeldatabase
127 ## read database
128
129
130 ## get read count for each major allele
131 fd=open(errorprofile)
132 lines=fd.readlines()
133 for line in lines:
134 temp=line.strip().split('\t')
135 t_major=int(temp[0])
136 t_count=int(temp[2])
137 motif=temp[3]
138 sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count
139 fd.close()
140 ##print sumbymajoralleledatabase
141
142 ## get probability
143 fd=open(errorprofile)
144 lines=fd.readlines()
145 for line in lines:
146 temp=line.strip().split('\t')
147 t_major=int(temp[0])
148 t_read=int(temp[1])
149 t_count=int(temp[2])
150 motif=temp[3]
151 if sumbymajoralleledatabase[len(motif)][motif][t_major]>0:
152 errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0)
153 #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0))
154
155 #else:
156 # errormodeldatabase[repeattype][motif][t_major][t_read]=0
157 fd.close()
158
159 #########################################
160 ######## input reading sector ###########
161 #########################################
162 fdout=open(Genotypingcorrected,'w')
163
164 fd = open(inputname)
165
166 lines=fd.xreadlines()
167 for line in lines:
168 i_read=[]
169 i2_read=[]
170 temp=line.strip().split('\t')
171 i_coordinate=temp[COORDINATECOLUMN-1]
172 i_motif=motifsimplify(temp[MOTIFCOLUMN-1])
173 i_read=temp[ALLELECOLUMN-1].split(',')
174 i_read=map(int,i_read)
175 coverage=len(i_read)
176
177 ### Evaluate 1 major allele ###
178 i_all_allele=list(set(i_read))
179 i_major_allele=majorallele(i_read)
180 f_majorallele=i_read.count(i_major_allele)
181 ### Evaluate 2 major allele ###
182 if len(i_all_allele)>1:
183 i2_read=filter(lambda a: a != i_major_allele, i_read)
184 i_major2_allele=majorallele(i2_read)
185 f_majorallele2=i_read.count(i_major2_allele)
186 ### Evaluate 3 major allele ###
187 if len(i_all_allele)>2:
188 i3_read=filter(lambda a: a != i_major2_allele, i2_read)
189 i_major3_allele=majorallele(i3_read)
190 f_majorallele3=i_read.count(i_major3_allele)
191 ### No 3 major allele ###
192 elif len(i_all_allele)==2:
193 i_major3_allele=i_major2_allele
194 ### No 2 major allele ###
195 elif len(i_all_allele)==1:
196 #i_major2_allele=majorallele(i_read)
197 i_major2_allele=i_major_allele+len(i_motif)
198 i_major3_allele=i_major2_allele
199 #print line.strip()+'\t'+'\t'.join(['homo','only',str(i_major_allele),str(i_major_allele),'NA'])
200 #continue
201 else:
202 print("no allele is reading")
203 sys.exit()
204
205 ## scope filter
206
207 #########################################
208 ######## prob calculation option ########
209 #########################################
210 homozygous_collector=0
211 heterozygous_collector=0
212
213
214 alist=[multinomial_prob(i_major_allele,x,i_motif,errormodeldatabase)for x in i_read]
215 blist=[multinomial_prob(i_major2_allele,x,i_motif,errormodeldatabase)for x in i_read]
216 clist=[multinomial_prob(i_major3_allele,x,i_motif,errormodeldatabase)for x in i_read]
217
218 ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR)
219 bclist=averagelist(blist,clist,EXPECTEDLEVELOFMINOR)
220 aclist=averagelist(alist,clist,EXPECTEDLEVELOFMINOR)
221
222 #print alist,blist,clist
223 majora=sum([math.log(i,10) for i in alist])
224 majorb=sum([math.log(i,10) for i in blist])
225 majorc=sum([math.log(i,10) for i in clist])
226 homozygous_collector=max(majora,majorb,majorc)
227
228 homomajor1=max([(majora,i_major_allele),(majorb,i_major2_allele),(majorc,i_major3_allele)])[1]
229 homomajordict={i_major_allele:majora,i_major2_allele:majorb,i_major3_allele:majorc}
230
231 majorab=sum([math.log(i,10) for i in ablist])
232 majorbc=sum([math.log(i,10) for i in bclist])
233 majorac=sum([math.log(i,10) for i in aclist])
234 heterozygous_collector=max(majorab,majorbc,majorac)
235 bothheteromajor=max([(majorab,(i_major_allele,i_major2_allele)),(majorbc,(i_major2_allele,i_major3_allele)),(majorac,(i_major_allele,i_major3_allele))])[1]
236 ##heteromajor1=max(bothheteromajor)
237 ##heteromajor2=min(bothheteromajor)
238 pre_heteromajor1=bothheteromajor[0]
239 pre_heteromajor2=bothheteromajor[1]
240 heteromajor1=max((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]
241 heteromajor2=min((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]
242
243 logratio_homo=homozygous_collector-heterozygous_collector
244
245 if logratio_homo>0:
246 fdout.writelines(line.strip()+'\t'+'\t'.join(['homo',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n')
247 elif logratio_homo<0:
248 fdout.writelines(line.strip()+'\t'+'\t'.join(['hetero',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n')
249 fd.close()
250 fdout.close()