Mercurial > repos > arkarachai-fungtammasan > microsatellite_ngs
comparison GenotypeTRcorrection.py @ 0:20ab85af9505
Uploaded
author | arkarachai-fungtammasan |
---|---|
date | Fri, 03 Oct 2014 20:54:30 -0400 |
parents | |
children | b27006b0a953 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:20ab85af9505 |
---|---|
1 ### import libraries ### | |
2 import sys | |
3 import collections, math | |
4 import heapq | |
5 from galaxy import eggs | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 ### basic function ### | |
12 def stop_err(msg): | |
13 sys.stderr.write(msg) | |
14 sys.exit() | |
15 | |
16 def averagelist(a,b,expectedlevelofminor): | |
17 product=[] | |
18 for i in range(len(a)): | |
19 product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i]) | |
20 | |
21 return product | |
22 | |
23 def complement_base(read): | |
24 collect='' | |
25 for i in read: | |
26 if i.upper()=='A': | |
27 collect+='T' | |
28 elif i.upper()=='T': | |
29 collect+='A' | |
30 elif i.upper()=='C': | |
31 collect+='G' | |
32 elif i.upper()=='G': | |
33 collect+='C' | |
34 return collect | |
35 def makeallpossible(read): | |
36 collect=[] | |
37 for i in range(len(read)): | |
38 tmp= read[i:]+read[:i] | |
39 collect.append(tmp) | |
40 collect.append(complement_base(tmp)) | |
41 return collect | |
42 | |
43 def motifsimplify(base): | |
44 '''str--> str | |
45 ''' | |
46 motiflength=len(base) | |
47 temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base)))) | |
48 | |
49 return temp[0] | |
50 | |
51 def majorallele(seq): | |
52 binseq=list(set(seq)) | |
53 binseq.sort(reverse=True) # highly mutate mode | |
54 #binseq.sort() # majority mode | |
55 storeform='' | |
56 storevalue=0 | |
57 for i in binseq: | |
58 if seq.count(i)>storevalue: | |
59 storeform=i | |
60 storevalue=seq.count(i) | |
61 | |
62 return int(storeform) | |
63 | |
64 ### decide global parameter ### | |
65 COORDINATECOLUMN=1 | |
66 ALLELECOLUMN=2 | |
67 MOTIFCOLUMN=3 | |
68 ##(0.01-0.5) | |
69 MINIMUMMUTABLE=1.2*(1.0/(10**8)) #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012 | |
70 | |
71 | |
72 ## Fixed global variable | |
73 inputname=sys.argv[1] | |
74 errorprofile=sys.argv[2] | |
75 Genotypingcorrected=sys.argv[3] | |
76 EXPECTEDLEVELOFMINOR=float(sys.argv[4]) | |
77 if EXPECTEDLEVELOFMINOR >0.5: | |
78 try: | |
79 expected_contribution_of_minor_allele=int('expected_contribution_of_minor_allele') | |
80 except Exception, eee: | |
81 print eee | |
82 stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5") | |
83 ALLREPEATTYPE=[1,2,3,4] | |
84 ALLREPEATTYPENAME=['mono','di','tri','tetra'] | |
85 monomotif=['A','C'] | |
86 dimotif=['AC','AG','AT','CG'] | |
87 trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG'] | |
88 tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\ | |
89 'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\ | |
90 'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC'] | |
91 ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif} | |
92 monorange=range(5,60) | |
93 dirange=range(6,60) | |
94 trirange=range(9,60) | |
95 tetrarange=range(12,80) | |
96 ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange} | |
97 | |
98 ######################################### | |
99 ######## Prob calculation sector ######## | |
100 ######################################### | |
101 def multinomial_prob(majorallele,STRlength,motif,probdatabase): | |
102 '''int,int,str,dict-->int | |
103 ### get prob for each STRlength to be generated from major allele | |
104 ''' | |
105 #print (majorallele,STRlength,motif) | |
106 prob=probdatabase[len(motif)][motif][majorallele][STRlength] | |
107 return prob | |
108 | |
109 ################################################ | |
110 ######## error model database sector ########### | |
111 ################################################ | |
112 | |
113 ## structure generator | |
114 errormodeldatabase={1:{},2:{},3:{},4:{}} | |
115 sumbymajoralleledatabase={1:{},2:{},3:{},4:{}} | |
116 for repeattype in ALLREPEATTYPE: | |
117 for motif in ALLMOTIF[repeattype]: | |
118 errormodeldatabase[repeattype][motif]={} | |
119 sumbymajoralleledatabase[repeattype][motif]={} | |
120 for motifsize1 in ALLRANGE[repeattype]: | |
121 errormodeldatabase[repeattype][motif][motifsize1]={} | |
122 sumbymajoralleledatabase[repeattype][motif][motifsize1]=0 | |
123 for motifsize2 in ALLRANGE[repeattype]: | |
124 errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE | |
125 | |
126 #print errormodeldatabase | |
127 ## read database | |
128 | |
129 | |
130 ## get read count for each major allele | |
131 fd=open(errorprofile) | |
132 lines=fd.readlines() | |
133 for line in lines: | |
134 temp=line.strip().split('\t') | |
135 t_major=int(temp[0]) | |
136 t_count=int(temp[2]) | |
137 motif=temp[3] | |
138 sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count | |
139 fd.close() | |
140 ##print sumbymajoralleledatabase | |
141 | |
142 ## get probability | |
143 fd=open(errorprofile) | |
144 lines=fd.readlines() | |
145 for line in lines: | |
146 temp=line.strip().split('\t') | |
147 t_major=int(temp[0]) | |
148 t_read=int(temp[1]) | |
149 t_count=int(temp[2]) | |
150 motif=temp[3] | |
151 if sumbymajoralleledatabase[len(motif)][motif][t_major]>0: | |
152 errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0) | |
153 #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0)) | |
154 | |
155 #else: | |
156 # errormodeldatabase[repeattype][motif][t_major][t_read]=0 | |
157 fd.close() | |
158 | |
159 ######################################### | |
160 ######## input reading sector ########### | |
161 ######################################### | |
162 fdout=open(Genotypingcorrected,'w') | |
163 | |
164 fd = open(inputname) | |
165 | |
166 lines=fd.xreadlines() | |
167 for line in lines: | |
168 i_read=[] | |
169 i2_read=[] | |
170 temp=line.strip().split('\t') | |
171 i_coordinate=temp[COORDINATECOLUMN-1] | |
172 i_motif=motifsimplify(temp[MOTIFCOLUMN-1]) | |
173 i_read=temp[ALLELECOLUMN-1].split(',') | |
174 i_read=map(int,i_read) | |
175 coverage=len(i_read) | |
176 | |
177 ### Evaluate 1 major allele ### | |
178 i_all_allele=list(set(i_read)) | |
179 i_major_allele=majorallele(i_read) | |
180 f_majorallele=i_read.count(i_major_allele) | |
181 ### Evaluate 2 major allele ### | |
182 if len(i_all_allele)>1: | |
183 i2_read=filter(lambda a: a != i_major_allele, i_read) | |
184 i_major2_allele=majorallele(i2_read) | |
185 f_majorallele2=i_read.count(i_major2_allele) | |
186 ### Evaluate 3 major allele ### | |
187 if len(i_all_allele)>2: | |
188 i3_read=filter(lambda a: a != i_major2_allele, i2_read) | |
189 i_major3_allele=majorallele(i3_read) | |
190 f_majorallele3=i_read.count(i_major3_allele) | |
191 ### No 3 major allele ### | |
192 elif len(i_all_allele)==2: | |
193 i_major3_allele=i_major2_allele | |
194 ### No 2 major allele ### | |
195 elif len(i_all_allele)==1: | |
196 #i_major2_allele=majorallele(i_read) | |
197 i_major2_allele=i_major_allele+len(i_motif) | |
198 i_major3_allele=i_major2_allele | |
199 #print line.strip()+'\t'+'\t'.join(['homo','only',str(i_major_allele),str(i_major_allele),'NA']) | |
200 #continue | |
201 else: | |
202 print("no allele is reading") | |
203 sys.exit() | |
204 | |
205 ## scope filter | |
206 | |
207 ######################################### | |
208 ######## prob calculation option ######## | |
209 ######################################### | |
210 homozygous_collector=0 | |
211 heterozygous_collector=0 | |
212 | |
213 | |
214 alist=[multinomial_prob(i_major_allele,x,i_motif,errormodeldatabase)for x in i_read] | |
215 blist=[multinomial_prob(i_major2_allele,x,i_motif,errormodeldatabase)for x in i_read] | |
216 clist=[multinomial_prob(i_major3_allele,x,i_motif,errormodeldatabase)for x in i_read] | |
217 | |
218 ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR) | |
219 bclist=averagelist(blist,clist,EXPECTEDLEVELOFMINOR) | |
220 aclist=averagelist(alist,clist,EXPECTEDLEVELOFMINOR) | |
221 | |
222 #print alist,blist,clist | |
223 majora=sum([math.log(i,10) for i in alist]) | |
224 majorb=sum([math.log(i,10) for i in blist]) | |
225 majorc=sum([math.log(i,10) for i in clist]) | |
226 homozygous_collector=max(majora,majorb,majorc) | |
227 | |
228 homomajor1=max([(majora,i_major_allele),(majorb,i_major2_allele),(majorc,i_major3_allele)])[1] | |
229 homomajordict={i_major_allele:majora,i_major2_allele:majorb,i_major3_allele:majorc} | |
230 | |
231 majorab=sum([math.log(i,10) for i in ablist]) | |
232 majorbc=sum([math.log(i,10) for i in bclist]) | |
233 majorac=sum([math.log(i,10) for i in aclist]) | |
234 heterozygous_collector=max(majorab,majorbc,majorac) | |
235 bothheteromajor=max([(majorab,(i_major_allele,i_major2_allele)),(majorbc,(i_major2_allele,i_major3_allele)),(majorac,(i_major_allele,i_major3_allele))])[1] | |
236 ##heteromajor1=max(bothheteromajor) | |
237 ##heteromajor2=min(bothheteromajor) | |
238 pre_heteromajor1=bothheteromajor[0] | |
239 pre_heteromajor2=bothheteromajor[1] | |
240 heteromajor1=max((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1] | |
241 heteromajor2=min((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1] | |
242 | |
243 logratio_homo=homozygous_collector-heterozygous_collector | |
244 | |
245 if logratio_homo>0: | |
246 fdout.writelines(line.strip()+'\t'+'\t'.join(['homo',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n') | |
247 elif logratio_homo<0: | |
248 fdout.writelines(line.strip()+'\t'+'\t'.join(['hetero',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n') | |
249 fd.close() | |
250 fdout.close() |