Mercurial > repos > sebastian-boegel > seq2hla
comparison seq2HLA.py @ 0:913ea6991ee4 draft
Uploaded
author | sebastian-boegel |
---|---|
date | Thu, 20 Dec 2012 10:37:01 -0500 |
parents | |
children | 155b796033b6 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:913ea6991ee4 |
---|---|
1 ########################################################################################################## | |
2 #Title: | |
3 #seq2HLA - HLA genotyping from RNA-Seq sequence reads | |
4 # | |
5 #Release: 1.0 | |
6 # | |
7 #Author: | |
8 #Sebastian Boegel, 2012 (c) | |
9 #TRON - Translational Oncology at the University Medical Center Mainz, 55131 Mainz, Germany | |
10 #University Medical Center of the Johannes Gutenberg-University Mainz, III. Medical Department, Mainz, Germany | |
11 # | |
12 #Contact: | |
13 #boegels@uni-mainz.de | |
14 # | |
15 #Synopsis: | |
16 #We developed an in-silico method "Seq2HLA", written in python and R, which takes standard RNA-Seq sequence reads in fastq format | |
17 #as input, uses a bowtie index comprising all HLA alleles and outputs the most likely HLA class I and class II genotypes, | |
18 #a p-value for each call, and the expression of each class | |
19 # | |
20 #Usage: | |
21 #python seq2HLA.py -1 <readfile1> -2 <readfile2> -r "<runname>" -l <readlength> [-3 <int>]* | |
22 #*optional (Default:0) | |
23 # | |
24 #Dependencies: | |
25 #0.) seq2HLA is a python script, developed with Python 2.6.8 | |
26 #1.) bowtie must be reachable by the command "bowtie". seq2HLA was developed and tested with bowtie version 0.12.7 (64-bit). The call to bowtie is invoked with 6 CPUs. You can change that in the function "mapping". | |
27 #2.) R must be installed, seq2HLA.py was developed and tested with R version 2.12.2 (2011-02-25) | |
28 #3.) Input must be paired-end reads in fastq-format | |
29 #4.) Index files must be located in the folder "references". | |
30 #5.) Packages: biopython (developed with V1.58), numpy (1.3.0) | |
31 ########################################################################################################### | |
32 | |
33 from operator import itemgetter | |
34 import sys | |
35 import linecache | |
36 import ast | |
37 import os | |
38 import shutil | |
39 import tempfile | |
40 import subprocess | |
41 from Bio import SeqIO | |
42 import numpy | |
43 import operator | |
44 from optparse import OptionParser | |
45 | |
46 log_stderr = sys.stderr | |
47 log_file = open("err.log","w") | |
48 sys.stderr = log_file | |
49 | |
50 import pysam | |
51 | |
52 #These variables need to be global, as they are filled and used by different modules | |
53 readcount={} | |
54 readspergroup={} | |
55 allelesPerLocus={} | |
56 | |
57 def main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped): | |
58 log = open(logfile,"w") | |
59 # mapopt="-a -v"+str(mismatch) | |
60 mapopt="-a -v"+str(mismatch) | |
61 #call HLA typing for Class I | |
62 mainClassI(runName+"-ClassI",readFile1,readFile2,bowtiebuildClassI,fastaClassI,mapopt,trim3,output1,log,gzipped) | |
63 #call HLA typing for Class II | |
64 mainClassII(runName+"-ClassII",readFile1,readFile2,bowtiebuildClassII,fastaClassII,mapopt,trim3,output2,log,gzipped) | |
65 | |
66 #---------------Class I------------------------------- | |
67 def mainClassI(runName,readFile1,readFile2,bowtiebuild,hla1fasta,mapopt,trim3,finaloutput,log,gzipped): | |
68 #-------1st iteration----------------------------------- | |
69 log.write("----------HLA class I------------\n") | |
70 sam1=runName+"-iteration1.bam" | |
71 iteration=1 | |
72 log.write("First iteration starts....\nMapping ......\n") | |
73 mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped) | |
74 medians=[] | |
75 medians.extend([0,0,0]) | |
76 medianflag=False | |
77 #Calculation of first digital haplototype ..... | |
78 output1=runName+".digitalhaplotype1" | |
79 log.write("Calculation of first digital haplototype .....\n") | |
80 map=createRefDict(hla1fasta,"A","B","C") | |
81 readMapping(map,sam1) | |
82 predictHLA1(sam1,medians,output1,medianflag,log) | |
83 log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n") | |
84 removeReads(runName,createRemoveList(runName,map)) | |
85 | |
86 #------2nd iteration------------------------------------------ | |
87 log.write("Second iterations starts .....\n Mapping ......\n") | |
88 medians=[] | |
89 iteration=2 | |
90 sam2=runName+"-iteration2.bam" | |
91 newReadFile1=runName+"-2nditeration_1.fq" | |
92 newReadFile2=runName+"-2nditeration_2.fq" | |
93 mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped) | |
94 medianfile=runName+".digitalhaplotype1" | |
95 for i in range(2,5): | |
96 medians.append(linecache.getline(medianfile, i).split('\t', 3)[2]) | |
97 medianflag=True | |
98 output2=runName+".digitalhaplotype2" | |
99 #Calculation of second digital haplototype | |
100 log.write("Calculation of second digital haplototype .....\n") | |
101 map=createRefDict(hla1fasta,"A","B","C") | |
102 readMapping(map,sam2) | |
103 predictHLA1(sam2,medians,output2,medianflag,log) | |
104 log.write("2nd iteration done.") | |
105 reportHLAgenotype(output1,output2,finaloutput) | |
106 log.write("Calculation of locus-specific expression ...\n") | |
107 expression("A","B","C",694,694,694,map,runName,readFile1,finaloutput,gzipped) | |
108 | |
109 #--------------Class II---------------------------------------- | |
110 def mainClassII(runName,readFile1,readFile2,bowtiebuild,hla2fasta,mapopt,trim3,finaloutput,log,gzipped): | |
111 #-------1st iteration---------------------------------------- | |
112 log.write("----------HLA class II------------\n") | |
113 sam1=runName+"-iteration1.bam" | |
114 iteration=1 | |
115 log.write("ClassII: first iteration starts....\nMapping ......\n") | |
116 mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped) | |
117 medians=[] | |
118 medians.extend([0,0,0]) | |
119 medianflag=False | |
120 output1=runName+".digitalhaplotype1" | |
121 log.write("ClassII: calculation of first digital haplototype .....\n") | |
122 map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1") | |
123 readMapping(map,sam1) | |
124 predictHLA2(sam1,medians,output1,medianflag,log) | |
125 log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n") | |
126 removeReads(runName,createRemoveList(runName,map)) | |
127 | |
128 #------2nd iteration------------------------------------------ | |
129 log.write("Second iterations starts .....\n Mapping ......\n") | |
130 medians=[] | |
131 iteration=2 | |
132 sam2=runName+"-iteration2.bam" | |
133 newReadFile1=runName+"-2nditeration_1.fq" | |
134 newReadFile2=runName+"-2nditeration_2.fq" | |
135 mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped) | |
136 medianfile=runName+".digitalhaplotype1" | |
137 for i in range(2,5): | |
138 medians.append(float(linecache.getline(medianfile, i).split('\t', 3)[2])/2.0) | |
139 medianflag=True | |
140 output2=runName+".digitalhaplotype2" | |
141 #Calculation of second digital haplototype | |
142 log.write("ClassII: calculation of second digital haplototype .....\n") | |
143 map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1") | |
144 readMapping(map,sam2) | |
145 predictHLA2(sam2,medians,output2,medianflag,log) | |
146 log.write("2nd iteration done.\n") | |
147 reportHLAgenotype(output1,output2,finaloutput) | |
148 log.write("Calculation of locus-specific expression ...\n") | |
149 expression("DQA1","DQB1","DRB1",400,421,421,map,runName,readFile1,finaloutput,gzipped) | |
150 | |
151 #performs the bowtie mapping for the 2 iterations using the given parameters | |
152 def mapping(sam,runName,readFile1,readFile2,bowtiebuild,iteration,mapopt,trim3,log,gzipped): | |
153 if iteration==1: | |
154 if gzipped == "True": | |
155 sam = sam.rsplit(".",1)[0] | |
156 cmd1 = "bowtie --threads 4 -3 %s --quiet -S %s --al %s.aligned %s -1 <(zcat %s) -2 <(zcat %s) | awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam) | |
157 mappingOutput = execBowtie(cmd1) | |
158 | |
159 cmd2 = "samtools index %s.bam" % sam | |
160 createBAMIndex(cmd2) | |
161 else: | |
162 sam = sam.rsplit(".",1)[0] | |
163 cmd1 = "bowtie --threads 4 -3 %s --quiet -S %s --al %s.aligned %s -1 %s -2 %s | awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam) | |
164 mappingOutput = execBowtie(cmd1) | |
165 | |
166 cmd2 = "samtools index %s.bam" % sam | |
167 createBAMIndex(cmd2) | |
168 # if iteration==2: | |
169 else: | |
170 | |
171 sam = sam.rsplit(".",1)[0] | |
172 cmd2 = "bowtie --threads 4 -3 %s --quiet -S %s %s -1 %s -2 %s | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,bowtiebuild,readFile1,readFile2,sam) | |
173 mappingOutput = execBowtie(cmd2) | |
174 | |
175 cmd3 = "samtools index %s.bam" % sam | |
176 createBAMIndex(cmd3) | |
177 | |
178 log.write(mappingOutput+"\n") | |
179 | |
180 def createBAMIndex(cmd): | |
181 proc = subprocess.Popen(["/bin/bash","-c",cmd],stderr=subprocess.PIPE) | |
182 (stdoutdata,stderrdata) = proc.communicate() | |
183 r = proc.returncode | |
184 if r != 0: | |
185 raise IOError(stderrdata) | |
186 | |
187 def execBowtie(cmd): | |
188 proc = subprocess.Popen(["/bin/bash","-c",cmd],stdout=subprocess.PIPE,stderr=subprocess.PIPE) | |
189 (stdoutdata,stderrdata) = proc.communicate() | |
190 mappingOutput = stdoutdata | |
191 r = proc.returncode | |
192 if r != 0: | |
193 raise IOError(stderrdata) | |
194 return mappingOutput | |
195 | |
196 #create dictionary "map", that contains all IMGT/HLA-allele names as keys and the allele name (e.g. A*02:01:01) as value | |
197 #dictionary "readcount" is initialized with 0 for each allele | |
198 #dictionary "readspergroup" is initialized with 0 for each group (2digit, e.g. A*01) | |
199 #dictionary "allelesPerLocus" stores the number of alleles per locus. | |
200 def createRefDict(hlafasta,locus1,locus2,locus3): | |
201 map={} | |
202 for locus in (locus1,locus2,locus3): | |
203 allelesPerLocus[locus]=0 | |
204 handle=open(hlafasta,'r') | |
205 for record in SeqIO.parse(handle, "fasta") : | |
206 l=record.description.split(' ') | |
207 hlapseudoname=l[0] | |
208 hlaallele=l[1] | |
209 map[hlapseudoname]=hlaallele | |
210 readcount[hlaallele]=0 | |
211 readspergroup[hlaallele.split(":")[0]]=0 | |
212 allelesPerLocus[hlaallele.split('*')[0]]+=1 | |
213 handle.close() | |
214 return map | |
215 | |
216 #Open sam-file and count mappings for each allele | |
217 def readMapping(map,sam): | |
218 samhandle = pysam.Samfile(sam,"rb") | |
219 for ar in samhandle.fetch(): | |
220 hla = samhandle.getrname(ar.tid) | |
221 readcount[map[hla]]+=1 | |
222 samhandle.close() | |
223 | |
224 #predict HLA class II genotype | |
225 def predictHLA2(sam,medians,output,medianflag,log): | |
226 | |
227 readspergroupDQAlist=[] | |
228 readspergroupDQBlist=[] | |
229 readspergroupDRBlist=[] | |
230 readspergroupDQA={} | |
231 readspergroupDQB={} | |
232 readspergroupDRB={} | |
233 maxallelepergroup={} | |
234 | |
235 #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. DQA*02") and save | |
236 #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele | |
237 #ii) the number of reads mapping to the top-scoring allele => readspergroup | |
238 for key in readcount: | |
239 if readcount[key] > 0: | |
240 group=key.split(":")[0] | |
241 if readspergroup[group] <=readcount[key]: | |
242 readspergroup[group]=readcount[key] | |
243 maxallelepergroup[group]=key | |
244 | |
245 #consider all DQA-,DQB-, and DRB-groups seperately | |
246 #readspergroup<DQA|DQB|DRB>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration) | |
247 #readspergroup<DQA|DQB|DRB> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. DQA*01) | |
248 for key in readspergroup: | |
249 keyBegin = key[0:3] | |
250 if keyBegin=='DQA': | |
251 readspergroupDQAlist.append(readspergroup[key]-float(medians[0])) | |
252 readspergroupDQA[key]=readspergroup[key]-float(medians[0]) | |
253 if keyBegin=='DQB': | |
254 readspergroupDQBlist.append(readspergroup[key]-float(medians[1])) | |
255 readspergroupDQB[key]=readspergroup[key]-float(medians[1]) | |
256 if keyBegin=='DRB': | |
257 readspergroupDRBlist.append(readspergroup[key]-float(medians[2])) | |
258 readspergroupDRB[key]=readspergroup[key]-float(medians[2]) | |
259 | |
260 #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration | |
261 medianDQA=numpy.median(readspergroupDQAlist) | |
262 medianDQB=numpy.median(readspergroupDQBlist) | |
263 medianDRB=numpy.median(readspergroupDRBlist) | |
264 a = b = c = "" | |
265 | |
266 #Determine top-scoring group of the whole locus (DQA,DQBB,DRB) and store it | |
267 #maxkey<DQA,DQB,DRB> = group (e.g. DQA*01) with the most reads | |
268 #It can be that, e.g. in cancer cells a whole locus is lost. For that reason it is checked if the | |
269 #number of mapping reads of the top-scoring group maxkey<DQA,DQB,DRB> is > 0, otherwise "no" ist reported for this locus | |
270 if len(readspergroupDQAlist)>0: | |
271 maxkeyDQA=max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a)) | |
272 if readspergroupDQA[maxkeyDQA] > 0: | |
273 maxDQA=maxallelepergroup[maxkeyDQA] | |
274 a = max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a)) | |
275 else: | |
276 a = "no" | |
277 else: | |
278 a = "no" | |
279 | |
280 if len(readspergroupDQBlist)>0: | |
281 maxkeyDQB=max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a)) | |
282 if readspergroupDQB[maxkeyDQB] > 0: | |
283 maxDQB=maxallelepergroup[maxkeyDQB] | |
284 b = max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a)) | |
285 else: | |
286 b = "no" | |
287 else: | |
288 b = "no" | |
289 | |
290 if len(readspergroupDRBlist)>0: | |
291 maxkeyDRB=max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a)) | |
292 if readspergroupDRB[maxkeyDRB] > 0: | |
293 maxDRB=maxallelepergroup[maxkeyDRB] | |
294 c = max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a)) | |
295 else: | |
296 c = "no" | |
297 else: | |
298 c = "no" | |
299 | |
300 readspergroupDQA=sorted(readspergroupDQA.items(), key=itemgetter(1),reverse=True) | |
301 readspergroupDQB=sorted(readspergroupDQB.items(), key=itemgetter(1),reverse=True) | |
302 readspergroupDRB=sorted(readspergroupDRB.items(), key=itemgetter(1),reverse=True) | |
303 | |
304 dqavec=dqbvec=drbvec="" | |
305 if medianflag: | |
306 #in the 2nd iteration: | |
307 #1.) DO NOT remove the top-scoring group from the set <dqa,dqb,drb>vec as this is more strict when calculating the probability of the top scoring group being an outlier | |
308 #The strings <dqa,dqb,drb>vec are used by the R-script to calculate the probability of the top scoring group being an outlier | |
309 for key in maxallelepergroup: | |
310 keyBegin = key[0:3] | |
311 if keyBegin=="DQA": | |
312 dqavec=dqavec+str(readcount[maxallelepergroup[key]])+"," | |
313 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
314 if keyBegin=="DQB": | |
315 dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+"," | |
316 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
317 if keyBegin=="DRB": | |
318 drbvec=drbvec+str(readcount[maxallelepergroup[key]])+"," | |
319 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
320 #2.) Add the decision thresholds to the sets <dqa,dqb,drb>vec, as this enables measuring the distance of the top-scoring group to this distance | |
321 if a=="no": | |
322 dqavec+=str(medians[0]) | |
323 if b=="no": | |
324 dqbvec+=str(medians[1]) | |
325 if c=="no": | |
326 drbvec+=str(medians[2]) | |
327 #3.) In case of, e.g. a loss of whole HLA-locus (DQA/B,DRB), <dqa,dqb,drb>vec only contain medians[<0|1|2>]. | |
328 #To avoid errors in the R-Script, set to 0 | |
329 if dqavec==str(medians[0]): | |
330 dqavec = dqacount="0" | |
331 else: | |
332 dqacount=str(readcount[maxallelepergroup[maxkeyDQA]]) | |
333 if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",": | |
334 dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+"," | |
335 if dqbvec==str(medians[1]): | |
336 dqbvec = dqbcount="0" | |
337 else: | |
338 dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]]) | |
339 if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",": | |
340 dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+"," | |
341 if drbvec==str(medians[2]): | |
342 drbvec = drbcount="0" | |
343 else: | |
344 drbcount=str(readcount[maxallelepergroup[maxkeyDRB]]) | |
345 if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",": | |
346 drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+"," | |
347 else: | |
348 #in the 1st iteration: remove the top-scoring group from the set <dqa,dqb,drb>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier | |
349 for key in maxallelepergroup: | |
350 keyBegin = key[0:3] | |
351 if keyBegin=="DQA": | |
352 if key!=maxkeyDQA: | |
353 dqavec=dqavec+str(readcount[maxallelepergroup[key]])+"," | |
354 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
355 if keyBegin=="DQB": | |
356 if key!=maxkeyDQB: | |
357 dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+"," | |
358 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
359 if keyBegin=="DRB": | |
360 if key!=maxkeyDRB: | |
361 drbvec=drbvec+str(readcount[maxallelepergroup[key]])+"," | |
362 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
363 #2.) DO NOT add the decision thresholds to the sets <dqa,dqb,drb>vec | |
364 if dqavec=="": | |
365 dqacount=dqavec="0" | |
366 else: | |
367 dqacount=str(readcount[maxallelepergroup[maxkeyDQA]]) | |
368 if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",": | |
369 dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+"," | |
370 if len(dqavec.split(","))==2: | |
371 dqavec=dqavec+"0," | |
372 | |
373 if dqbvec=="": | |
374 dqbcount=dqbvec="0" | |
375 else: | |
376 dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]]) | |
377 if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",": | |
378 dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+"," | |
379 if len(dqbvec.split(","))==2: | |
380 dqbvec=dqbvec+"0," | |
381 | |
382 if drbvec=="": | |
383 drbcount=drbvec="0" | |
384 else: | |
385 drbcount=str(readcount[maxallelepergroup[maxkeyDRB]]) | |
386 if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",": | |
387 drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+"," | |
388 if len(drbvec.split(","))==2: | |
389 drbvec=drbvec+"0," | |
390 | |
391 #call R-script "commmand.R" to calculate the confidence of the top-scoring allele | |
392 routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+dqacount+" "+dqavec+" "+maxkeyDQA+" "+dqbcount+" "+dqbvec+" "+maxkeyDQB+" "+drbcount+" "+drbvec+" "+maxkeyDRB).read() | |
393 parseOutput=routput.split("\n") | |
394 | |
395 entries = [] | |
396 for entry in parseOutput: | |
397 if entry[0:3]=="[1]": | |
398 entries.append(str(entry[4:len(entry)])) | |
399 | |
400 if a=="no" and entries[1]!="NA": | |
401 entries[1]=str(1-float(entries[1])) | |
402 if b=="no" and entries[3]!="NA": | |
403 entries[3]=str(1-float(entries[3])) | |
404 if c=="no" and entries[5]!="NA": | |
405 entries[5]=str(1-float(entries[5])) | |
406 | |
407 pred2File(entries,medianDQA,medianDQB,medianDRB,output,a,b,c,log) | |
408 | |
409 #predict HLA class I genotype | |
410 def predictHLA1(sam,medians,output,medianflag,log): | |
411 | |
412 readspergroupAlist=[] | |
413 readspergroupBlist=[] | |
414 readspergroupClist=[] | |
415 readspergroupA={} | |
416 readspergroupB={} | |
417 readspergroupC={} | |
418 maxallelepergroup={} | |
419 | |
420 #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save | |
421 #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele | |
422 #ii) the number of reads mapping to the top-scoring allele => readspergroup | |
423 for key in readcount: | |
424 if readcount[key] > 0: | |
425 group=key.split(":")[0] | |
426 if readspergroup[group] <=readcount[key]: | |
427 readspergroup[group]=readcount[key] | |
428 maxallelepergroup[group]=key | |
429 | |
430 | |
431 #consider all A-,B-, and C-groups seperately | |
432 #readspergroup<A|B|C>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration) | |
433 #readspergroup<A|B|C> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. B*27) | |
434 for key in readspergroup: | |
435 firstKey = key[0] | |
436 if key[0]=='A': | |
437 readspergroupAlist.append(readspergroup[key]-float(medians[0])) | |
438 readspergroupA[key]=readspergroup[key]-float(medians[0]) | |
439 if key[0]=='B': | |
440 readspergroupBlist.append(readspergroup[key]-float(medians[1])) | |
441 readspergroupB[key]=readspergroup[key]-float(medians[1]) | |
442 if key[0]=='C': | |
443 readspergroupClist.append(readspergroup[key]-float(medians[2])) | |
444 readspergroupC[key]=readspergroup[key]-float(medians[2]) | |
445 | |
446 #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration | |
447 medianA=numpy.median(readspergroupAlist) | |
448 medianB=numpy.median(readspergroupBlist) | |
449 medianC=numpy.median(readspergroupClist) | |
450 a = b = c = "" | |
451 | |
452 #Determine top-scoring group of the whole locus (A,B,C) and store it | |
453 #maxkey<A,B,C> = group (e.g. A*02) with the most reads | |
454 #It can be that, e.g. in cancer cells A whole locus is lost. For that reason it is checked if the | |
455 #number of mapping reads of the top-scoring group maxkey<A,B,C> is > 0, otherwise "no" ist reported for this locus | |
456 if len(readspergroupAlist)>0: | |
457 maxkeyA=max(readspergroupA,key=lambda a:readspergroupA.get(a)) | |
458 if readspergroupA[maxkeyA] > 0: | |
459 maxA=maxallelepergroup[maxkeyA] | |
460 a = max(readspergroupA,key=lambda a:readspergroupA.get(a)) | |
461 else: | |
462 a = "no" | |
463 else: | |
464 a = "no" | |
465 | |
466 if len(readspergroupBlist)>0: | |
467 maxkeyB=max(readspergroupB,key=lambda a:readspergroupB.get(a)) | |
468 if readspergroupB[maxkeyB] > 0: | |
469 maxB=maxallelepergroup[maxkeyB] | |
470 b = max(readspergroupB,key=lambda a:readspergroupB.get(a)) | |
471 else: | |
472 b = "no" | |
473 else: | |
474 b = "no" | |
475 | |
476 if len(readspergroupClist)>0: | |
477 maxkeyC=max(readspergroupC,key=lambda a:readspergroupC.get(a)) | |
478 if readspergroupC[maxkeyC] > 0: | |
479 maxC=maxallelepergroup[maxkeyC] | |
480 c = max(readspergroupC,key=lambda a:readspergroupC.get(a)) | |
481 else: | |
482 c = "no" | |
483 else: | |
484 c = "no" | |
485 | |
486 readspergroupA=sorted(readspergroupA.items(), key=itemgetter(1),reverse=True) | |
487 readspergroupB=sorted(readspergroupB.items(), key=itemgetter(1),reverse=True) | |
488 readspergroupC=sorted(readspergroupC.items(), key=itemgetter(1),reverse=True) | |
489 | |
490 avec=bvec=cvec="" | |
491 | |
492 if medianflag: | |
493 #in the 2nd iteration: | |
494 #1.) DO NOT remove the top-scoring group from the set <a,b,c>vec as this is more strict when calculating the probability of the top scoring group being an outlier | |
495 #The strings <a,b,c>vec are used by the R-script to calculate the probability of the top scoring group being an outlier | |
496 for key in maxallelepergroup: | |
497 if key[0]=="A": | |
498 avec=avec+str(readcount[maxallelepergroup[key]])+"," | |
499 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
500 if key[0]=="B": | |
501 bvec=bvec+str(readcount[maxallelepergroup[key]])+"," | |
502 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
503 if key[0]=="C": | |
504 cvec=cvec+str(readcount[maxallelepergroup[key]])+"," | |
505 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
506 #2.) Add the decision thresholds to the sets <a,b,c>vec, as this enables measuring the distance of the top-scoring group to this distance | |
507 if a=="no": | |
508 avec+=str(medians[0]) | |
509 if b=="no": | |
510 bvec+=str(medians[1]) | |
511 if c=="no": | |
512 cvec+=str(medians[2]) | |
513 #3.) In case of, e.g. a loss of whole HLA-locus (A,B,C), <a,b,c>vec only contain median[<0|1|2>]. | |
514 #To avoid errors in the R-Script, set to 0 | |
515 if avec=="" or avec==medians[0]: | |
516 avec = acount="0" | |
517 else: | |
518 acount=str(readcount[maxallelepergroup[maxkeyA]]) | |
519 | |
520 if bvec=="" or bvec==medians[1]: | |
521 bvec = bcount="0" | |
522 else: | |
523 bcount=str(readcount[maxallelepergroup[maxkeyB]]) | |
524 | |
525 if cvec=="" or cvec==medians[2]: | |
526 cvec = ccount="0" | |
527 else: | |
528 ccount=str(readcount[maxallelepergroup[maxkeyC]]) | |
529 else: | |
530 #in the 1st iteration: remove the top-scoring group from the set <a,b,c>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier | |
531 for key in maxallelepergroup: | |
532 if key[0]=="A" and key!=maxkeyA: | |
533 avec=avec+str(readcount[maxallelepergroup[key]])+"," | |
534 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
535 if key[0]=="B" and key!=maxkeyB: | |
536 bvec=bvec+str(readcount[maxallelepergroup[key]])+"," | |
537 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
538 if key[0]=="C" and key!=maxkeyC: | |
539 cvec=cvec+str(readcount[maxallelepergroup[key]])+"," | |
540 #print key+": "+str(readcount[maxallelepergroup[key]]) | |
541 #2.) DO NOT add the decision thresholds to the sets <a,b,c>vec | |
542 if avec=="": | |
543 acount=avec="0" | |
544 else: | |
545 acount=str(readcount[maxallelepergroup[maxkeyA]]) | |
546 if bvec=="": | |
547 bcount=bvec="0" | |
548 else: | |
549 bcount=str(readcount[maxallelepergroup[maxkeyB]]) | |
550 if cvec=="": | |
551 ccount=cvec="0" | |
552 else: | |
553 ccount=str(readcount[maxallelepergroup[maxkeyC]]) | |
554 | |
555 #call R-script "commmand.R" to calculate the confidence of the top-scoring allele | |
556 routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+acount+" "+avec+" "+maxkeyA+" "+bcount+" "+bvec+" "+maxkeyB+" "+ccount+" "+cvec+" "+maxkeyC).read() | |
557 parseOutput=routput.split("\n") | |
558 | |
559 entries = [] | |
560 for entry in parseOutput: | |
561 if entry[0:3]=="[1]": | |
562 entries.append(str(entry[4:len(entry)])) | |
563 | |
564 if a=="no" and entries[1]!="NA": | |
565 entries[1]=str(1-float(entries[1])) | |
566 if b=="no" and entries[3]!="NA": | |
567 entries[3]=str(1-float(entries[3])) | |
568 if c=="no" and entries[5]!="NA": | |
569 entries[5]=str(1-float(entries[5])) | |
570 | |
571 pred2File(entries,medianA,medianB,medianC,output,a,b,c,log) | |
572 | |
573 #write digital haplotype into file | |
574 def pred2File(entries,medianA,medianB,medianC,output,a,b,c,log): | |
575 out = open(output, 'w') | |
576 out.write("HLA\tHLA1\tmedian-Value\talternative\tp-Value\n") | |
577 out.write("A\t"+a+"\t") | |
578 # write the median-Value for A | |
579 out.write(str(medianA)+"\t") | |
580 out.write(entries[0]+"\t"+entries[1]+"\n") | |
581 | |
582 out.write("B\t"+b+"\t") | |
583 # write the median-Value for B | |
584 out.write(str(medianB)+"\t") | |
585 out.write(entries[2]+"\t"+entries[3]+"\n") | |
586 | |
587 out.write("C\t"+c+"\t") | |
588 # write the median-Value for C | |
589 out.write(str(medianC)+"\t") | |
590 out.write(entries[4]+"\t"+entries[5]+"\n") | |
591 out.close() | |
592 log.write("The digital haplotype is written into "+output+"\n") | |
593 | |
594 #open mapping file and all read ids to the list "removeList", which map to one of the three groups in "alleles" | |
595 def createRemoveList(runName,map): | |
596 removeList={} | |
597 alleles = [] | |
598 sam=runName+"-iteration1.bam" | |
599 alleles_in=runName+".digitalhaplotype1" | |
600 for i in range(2,5): | |
601 alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1]) | |
602 | |
603 samhandle=pysam.Samfile(sam,'rb') | |
604 for read in samhandle.fetch(): | |
605 illuminaid = read.qname | |
606 hlapseudoname = samhandle.getrname(read.tid) | |
607 if map[hlapseudoname].split(':')[0] in alleles: | |
608 removeList[illuminaid]=1 | |
609 samhandle.close() | |
610 return removeList | |
611 | |
612 #Remove reads that mapped to the three top-scoring alleles and write the remaining reads into two new read files | |
613 def removeReads(runName,removeList): | |
614 aligned1=runName+"_1.aligned" | |
615 aligned2=runName+"_2.aligned" | |
616 newReadFile1=runName+"-2nditeration_1.fq" | |
617 newReadFile2=runName+"-2nditeration_2.fq" | |
618 #r1 and r2, which are the input of bowtie in the 2nd iteration | |
619 r1=open(newReadFile1,"w") | |
620 r2=open(newReadFile2,"w") | |
621 #open the 2 files, that contain the reads that mapped in the 1st iteration | |
622 aligned_handle1=open(aligned1,"r") | |
623 aligned_handle2=open(aligned2,"r") | |
624 | |
625 #One read entry consists of 4 lines: header, seq, "+", qualities. | |
626 for record in SeqIO.parse(aligned_handle1, "fastq"): | |
627 illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file | |
628 if not illuminaid in removeList: | |
629 SeqIO.write(record, r1, "fastq") | |
630 | |
631 for record in SeqIO.parse(aligned_handle2, "fastq"): | |
632 illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file | |
633 if not illuminaid in removeList: | |
634 SeqIO.write(record, r2, "fastq") | |
635 | |
636 #write the final prediction (both digital haplotypes) to file and stdout | |
637 def reportHLAgenotype(output1,output2,finaloutput): | |
638 filehandle1=open(output1,'r').readlines()[1:4] | |
639 filehandle2=open(output2,'r').readlines()[1:4] | |
640 outfile=open(finaloutput, 'w') | |
641 | |
642 outfile.write("HLA\tHLA1\tp-Value\tHLA2\tp-Value\n") | |
643 for i in range(len(filehandle1)): | |
644 filehandle1[i]=filehandle1[i][0:-1] | |
645 for i in range(len(filehandle2)): | |
646 filehandle2[i]=filehandle2[i][0:-1] | |
647 | |
648 a1 = filehandle1[0].split('\t',2)[1] | |
649 a1score = filehandle1[0].split('\t')[4] | |
650 b1 = filehandle1[1].split('\t',2)[1] | |
651 b1score = filehandle1[1].split('\t')[4] | |
652 c1 = filehandle1[2].split('\t',2)[1] | |
653 c1score = filehandle1[2].split('\t')[4] | |
654 | |
655 a2 = filehandle2[0].split('\t',2)[1] | |
656 if a2 == "no": | |
657 a2 = "hoz("+filehandle2[0].split('\t')[3]+")" | |
658 a2score = filehandle2[0].split('\t')[4] | |
659 b2 = filehandle2[1].split('\t',2)[1] | |
660 if b2 == "no": | |
661 b2 = "hoz("+filehandle2[1].split('\t')[3]+")" | |
662 b2score = filehandle2[1].split('\t')[4] | |
663 c2 = filehandle2[2].split('\t', 2)[1] | |
664 if c2 == "no": | |
665 c2 = "hoz("+filehandle2[2].split('\t')[3]+")" | |
666 c2score = filehandle2[2].split('\t')[4] | |
667 #write complete HLA genotype to file | |
668 outfile.write("A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score+"\n") | |
669 outfile.write("B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score+"\n") | |
670 outfile.write("C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score+"\n") | |
671 #.. and print it to STDOUT | |
672 #print "A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score | |
673 #print "B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score | |
674 #print "C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score | |
675 | |
676 #calculate locus-specific expression | |
677 def expression(locus1,locus2,locus3,length1,length2,length3,map,runName,readFile1,finaloutput,gzipped): | |
678 outfile = open(finaloutput,'a') | |
679 aligned1=runName+"_1.aligned" | |
680 sam=runName+"-iteration1.bam" | |
681 alleles_in=runName+".digitalhaplotype1" | |
682 if gzipped: | |
683 proc=subprocess.Popen(["/bin/bash","-c","wc -l <(zcat " + readFile1 + ")"],stdout=subprocess.PIPE,stderr=subprocess.PIPE) | |
684 (stdoutdata,stderrdata) = proc.communicate() | |
685 numberlines = stdoutdata | |
686 else: | |
687 proc=subprocess.Popen(["/bin/bash","-c","wc -l " + readFile1],stdout=subprocess.PIPE,stderr=subprocess.PIPE) | |
688 (stdoutdata,stderrdata) = proc.communicate() | |
689 numberlines = stdoutdata | |
690 totalreads=int(numberlines.split(" ")[0])/4 | |
691 alleles=[] | |
692 for i in range(2,5): | |
693 alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1]) | |
694 alleles.append(linecache.getline(alleles_in, i).split('\t')[3]) | |
695 | |
696 #create read dictionary | |
697 reads={} | |
698 aligned_handle1=open(aligned1,"r") | |
699 for record in SeqIO.parse(aligned_handle1, "fastq"): | |
700 illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file | |
701 reads[illuminaid]={} | |
702 reads[illuminaid][locus1]=0 | |
703 reads[illuminaid][locus2]=0 | |
704 reads[illuminaid][locus3]=0 | |
705 samhandle = pysam.Samfile(sam,"rb") | |
706 for read in samhandle.fetch(): | |
707 illuminaid = read.qname | |
708 hlapseudoname = samhandle.getrname(read.tid) | |
709 if map[hlapseudoname].split(':')[0] in alleles: | |
710 reads[illuminaid][map[hlapseudoname].split('*')[0]]+=1 | |
711 | |
712 count={} | |
713 count[locus1]=0 | |
714 count[locus2]=0 | |
715 count[locus3]=0 | |
716 for key in reads: | |
717 n=0 | |
718 for locus in reads[key]: | |
719 if reads[key][locus] > 0: | |
720 n+=1 | |
721 for locus in reads[key]: | |
722 if reads[key][locus] > 0: | |
723 count[locus]+=float(1.0/float(n)) | |
724 | |
725 outfile.write("\n") | |
726 #Calculate RPKM and print expression values for each locus to stdout | |
727 for locus in count: | |
728 if locus==locus1: | |
729 outfile.write(locus+": "+str(round(float((1000.0/length1))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n") | |
730 if locus==locus2: | |
731 outfile.write(locus+": "+str(round(float((1000.0/length2))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n") | |
732 if locus==locus3: | |
733 outfile.write(locus+": "+str(round(float((1000.0/length3))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n") | |
734 | |
735 | |
736 if __name__ == '__main__': | |
737 parser = OptionParser(usage="usage: %prog -1 readFile1 -2 readFile2 -r runName -l readlength [-3 <int>]", version="%prog 1.0") | |
738 parser.add_option("-z","--gzipped",action="store",dest="gzipped",help="Select if input reads are in gzip format") | |
739 parser.add_option("-1", | |
740 action="store", | |
741 dest="readFile1", | |
742 help="File name of #1 mates ") | |
743 parser.add_option("-2", | |
744 action="store", | |
745 dest="readFile2", | |
746 help="File name of #2 mates") | |
747 parser.add_option("-r", "--runName", | |
748 action="store", | |
749 dest="runName", | |
750 help="Name of this HLA typing run. Wil be used throughout this process as part of the name of the newly created files.") | |
751 parser.add_option("-l", "--length", | |
752 action="store", | |
753 dest="length", | |
754 help="Readlength") | |
755 parser.add_option("-3", "--trim3", | |
756 action="store", | |
757 dest="trim3", | |
758 default="0", | |
759 help="Bowtie option: -3 <int> trims <int> bases from the low quality 3' end of each read. Default: 0") | |
760 parser.add_option("-o", "--output1", | |
761 action="store", | |
762 dest="output1", | |
763 help="Output file 1") | |
764 parser.add_option("-p", "--output2", | |
765 action="store", | |
766 dest="output2", | |
767 help="Output file 2") | |
768 parser.add_option("-g","--log",action="store",dest="logfile",help="Output Log File") | |
769 | |
770 | |
771 (options, args) = parser.parse_args() | |
772 if not options.readFile1: | |
773 parser.error('File name #1 pair not given.') | |
774 if not options.readFile2: | |
775 parser.error('File name #2 pair not given.') | |
776 if not options.runName: | |
777 parser.error('Run name not given.') | |
778 if not options.length: | |
779 parser.error('Readlength not given.') | |
780 gzipped = options.gzipped | |
781 readFile1=options.readFile1 | |
782 readFile2=options.readFile2 | |
783 runName=options.runName | |
784 output1 = options.output1 | |
785 output2 = options.output2 | |
786 logfile = options.logfile | |
787 bowtiebuildClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75" | |
788 bowtiebuildClassII=sys.path[0]+"/references/HLA2.ex2.plus75" | |
789 fastaClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75.fasta" | |
790 fastaClassII=sys.path[0]+"/references/HLA2.ex2.plus75.fasta" | |
791 #as shown in the publication HLA typing with RNA-Seq works best by allowing as less mismatches as necessary | |
792 if int(options.length)<=50: | |
793 mismatch=1 | |
794 elif int(options.length)>50 and int(options.length)<=100: | |
795 mismatch=2 | |
796 else: | |
797 mismatch=3 | |
798 trim3=str(options.trim3) | |
799 main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped) | |
800 sys.stderr = log_stderr | |
801 log_file.close() |