comparison seq2HLA.py @ 0:913ea6991ee4 draft

Uploaded
author sebastian-boegel
date Thu, 20 Dec 2012 10:37:01 -0500
parents
children 155b796033b6
comparison
equal deleted inserted replaced
-1:000000000000 0:913ea6991ee4
1 ##########################################################################################################
2 #Title:
3 #seq2HLA - HLA genotyping from RNA-Seq sequence reads
4 #
5 #Release: 1.0
6 #
7 #Author:
8 #Sebastian Boegel, 2012 (c)
9 #TRON - Translational Oncology at the University Medical Center Mainz, 55131 Mainz, Germany
10 #University Medical Center of the Johannes Gutenberg-University Mainz, III. Medical Department, Mainz, Germany
11 #
12 #Contact:
13 #boegels@uni-mainz.de
14 #
15 #Synopsis:
16 #We developed an in-silico method "Seq2HLA", written in python and R, which takes standard RNA-Seq sequence reads in fastq format
17 #as input, uses a bowtie index comprising all HLA alleles and outputs the most likely HLA class I and class II genotypes,
18 #a p-value for each call, and the expression of each class
19 #
20 #Usage:
21 #python seq2HLA.py -1 <readfile1> -2 <readfile2> -r "<runname>" -l <readlength> [-3 <int>]*
22 #*optional (Default:0)
23 #
24 #Dependencies:
25 #0.) seq2HLA is a python script, developed with Python 2.6.8
26 #1.) bowtie must be reachable by the command "bowtie". seq2HLA was developed and tested with bowtie version 0.12.7 (64-bit). The call to bowtie is invoked with 6 CPUs. You can change that in the function "mapping".
27 #2.) R must be installed, seq2HLA.py was developed and tested with R version 2.12.2 (2011-02-25)
28 #3.) Input must be paired-end reads in fastq-format
29 #4.) Index files must be located in the folder "references".
30 #5.) Packages: biopython (developed with V1.58), numpy (1.3.0)
31 ###########################################################################################################
32
33 from operator import itemgetter
34 import sys
35 import linecache
36 import ast
37 import os
38 import shutil
39 import tempfile
40 import subprocess
41 from Bio import SeqIO
42 import numpy
43 import operator
44 from optparse import OptionParser
45
46 log_stderr = sys.stderr
47 log_file = open("err.log","w")
48 sys.stderr = log_file
49
50 import pysam
51
52 #These variables need to be global, as they are filled and used by different modules
53 readcount={}
54 readspergroup={}
55 allelesPerLocus={}
56
57 def main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped):
58 log = open(logfile,"w")
59 # mapopt="-a -v"+str(mismatch)
60 mapopt="-a -v"+str(mismatch)
61 #call HLA typing for Class I
62 mainClassI(runName+"-ClassI",readFile1,readFile2,bowtiebuildClassI,fastaClassI,mapopt,trim3,output1,log,gzipped)
63 #call HLA typing for Class II
64 mainClassII(runName+"-ClassII",readFile1,readFile2,bowtiebuildClassII,fastaClassII,mapopt,trim3,output2,log,gzipped)
65
66 #---------------Class I-------------------------------
67 def mainClassI(runName,readFile1,readFile2,bowtiebuild,hla1fasta,mapopt,trim3,finaloutput,log,gzipped):
68 #-------1st iteration-----------------------------------
69 log.write("----------HLA class I------------\n")
70 sam1=runName+"-iteration1.bam"
71 iteration=1
72 log.write("First iteration starts....\nMapping ......\n")
73 mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped)
74 medians=[]
75 medians.extend([0,0,0])
76 medianflag=False
77 #Calculation of first digital haplototype .....
78 output1=runName+".digitalhaplotype1"
79 log.write("Calculation of first digital haplototype .....\n")
80 map=createRefDict(hla1fasta,"A","B","C")
81 readMapping(map,sam1)
82 predictHLA1(sam1,medians,output1,medianflag,log)
83 log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n")
84 removeReads(runName,createRemoveList(runName,map))
85
86 #------2nd iteration------------------------------------------
87 log.write("Second iterations starts .....\n Mapping ......\n")
88 medians=[]
89 iteration=2
90 sam2=runName+"-iteration2.bam"
91 newReadFile1=runName+"-2nditeration_1.fq"
92 newReadFile2=runName+"-2nditeration_2.fq"
93 mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped)
94 medianfile=runName+".digitalhaplotype1"
95 for i in range(2,5):
96 medians.append(linecache.getline(medianfile, i).split('\t', 3)[2])
97 medianflag=True
98 output2=runName+".digitalhaplotype2"
99 #Calculation of second digital haplototype
100 log.write("Calculation of second digital haplototype .....\n")
101 map=createRefDict(hla1fasta,"A","B","C")
102 readMapping(map,sam2)
103 predictHLA1(sam2,medians,output2,medianflag,log)
104 log.write("2nd iteration done.")
105 reportHLAgenotype(output1,output2,finaloutput)
106 log.write("Calculation of locus-specific expression ...\n")
107 expression("A","B","C",694,694,694,map,runName,readFile1,finaloutput,gzipped)
108
109 #--------------Class II----------------------------------------
110 def mainClassII(runName,readFile1,readFile2,bowtiebuild,hla2fasta,mapopt,trim3,finaloutput,log,gzipped):
111 #-------1st iteration----------------------------------------
112 log.write("----------HLA class II------------\n")
113 sam1=runName+"-iteration1.bam"
114 iteration=1
115 log.write("ClassII: first iteration starts....\nMapping ......\n")
116 mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped)
117 medians=[]
118 medians.extend([0,0,0])
119 medianflag=False
120 output1=runName+".digitalhaplotype1"
121 log.write("ClassII: calculation of first digital haplototype .....\n")
122 map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1")
123 readMapping(map,sam1)
124 predictHLA2(sam1,medians,output1,medianflag,log)
125 log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n")
126 removeReads(runName,createRemoveList(runName,map))
127
128 #------2nd iteration------------------------------------------
129 log.write("Second iterations starts .....\n Mapping ......\n")
130 medians=[]
131 iteration=2
132 sam2=runName+"-iteration2.bam"
133 newReadFile1=runName+"-2nditeration_1.fq"
134 newReadFile2=runName+"-2nditeration_2.fq"
135 mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped)
136 medianfile=runName+".digitalhaplotype1"
137 for i in range(2,5):
138 medians.append(float(linecache.getline(medianfile, i).split('\t', 3)[2])/2.0)
139 medianflag=True
140 output2=runName+".digitalhaplotype2"
141 #Calculation of second digital haplototype
142 log.write("ClassII: calculation of second digital haplototype .....\n")
143 map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1")
144 readMapping(map,sam2)
145 predictHLA2(sam2,medians,output2,medianflag,log)
146 log.write("2nd iteration done.\n")
147 reportHLAgenotype(output1,output2,finaloutput)
148 log.write("Calculation of locus-specific expression ...\n")
149 expression("DQA1","DQB1","DRB1",400,421,421,map,runName,readFile1,finaloutput,gzipped)
150
151 #performs the bowtie mapping for the 2 iterations using the given parameters
152 def mapping(sam,runName,readFile1,readFile2,bowtiebuild,iteration,mapopt,trim3,log,gzipped):
153 if iteration==1:
154 if gzipped == "True":
155 sam = sam.rsplit(".",1)[0]
156 cmd1 = "bowtie --threads 4 -3 %s --quiet -S %s --al %s.aligned %s -1 <(zcat %s) -2 <(zcat %s) | awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam)
157 mappingOutput = execBowtie(cmd1)
158
159 cmd2 = "samtools index %s.bam" % sam
160 createBAMIndex(cmd2)
161 else:
162 sam = sam.rsplit(".",1)[0]
163 cmd1 = "bowtie --threads 4 -3 %s --quiet -S %s --al %s.aligned %s -1 %s -2 %s | awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam)
164 mappingOutput = execBowtie(cmd1)
165
166 cmd2 = "samtools index %s.bam" % sam
167 createBAMIndex(cmd2)
168 # if iteration==2:
169 else:
170
171 sam = sam.rsplit(".",1)[0]
172 cmd2 = "bowtie --threads 4 -3 %s --quiet -S %s %s -1 %s -2 %s | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,bowtiebuild,readFile1,readFile2,sam)
173 mappingOutput = execBowtie(cmd2)
174
175 cmd3 = "samtools index %s.bam" % sam
176 createBAMIndex(cmd3)
177
178 log.write(mappingOutput+"\n")
179
180 def createBAMIndex(cmd):
181 proc = subprocess.Popen(["/bin/bash","-c",cmd],stderr=subprocess.PIPE)
182 (stdoutdata,stderrdata) = proc.communicate()
183 r = proc.returncode
184 if r != 0:
185 raise IOError(stderrdata)
186
187 def execBowtie(cmd):
188 proc = subprocess.Popen(["/bin/bash","-c",cmd],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
189 (stdoutdata,stderrdata) = proc.communicate()
190 mappingOutput = stdoutdata
191 r = proc.returncode
192 if r != 0:
193 raise IOError(stderrdata)
194 return mappingOutput
195
196 #create dictionary "map", that contains all IMGT/HLA-allele names as keys and the allele name (e.g. A*02:01:01) as value
197 #dictionary "readcount" is initialized with 0 for each allele
198 #dictionary "readspergroup" is initialized with 0 for each group (2digit, e.g. A*01)
199 #dictionary "allelesPerLocus" stores the number of alleles per locus.
200 def createRefDict(hlafasta,locus1,locus2,locus3):
201 map={}
202 for locus in (locus1,locus2,locus3):
203 allelesPerLocus[locus]=0
204 handle=open(hlafasta,'r')
205 for record in SeqIO.parse(handle, "fasta") :
206 l=record.description.split(' ')
207 hlapseudoname=l[0]
208 hlaallele=l[1]
209 map[hlapseudoname]=hlaallele
210 readcount[hlaallele]=0
211 readspergroup[hlaallele.split(":")[0]]=0
212 allelesPerLocus[hlaallele.split('*')[0]]+=1
213 handle.close()
214 return map
215
216 #Open sam-file and count mappings for each allele
217 def readMapping(map,sam):
218 samhandle = pysam.Samfile(sam,"rb")
219 for ar in samhandle.fetch():
220 hla = samhandle.getrname(ar.tid)
221 readcount[map[hla]]+=1
222 samhandle.close()
223
224 #predict HLA class II genotype
225 def predictHLA2(sam,medians,output,medianflag,log):
226
227 readspergroupDQAlist=[]
228 readspergroupDQBlist=[]
229 readspergroupDRBlist=[]
230 readspergroupDQA={}
231 readspergroupDQB={}
232 readspergroupDRB={}
233 maxallelepergroup={}
234
235 #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. DQA*02") and save
236 #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele
237 #ii) the number of reads mapping to the top-scoring allele => readspergroup
238 for key in readcount:
239 if readcount[key] > 0:
240 group=key.split(":")[0]
241 if readspergroup[group] <=readcount[key]:
242 readspergroup[group]=readcount[key]
243 maxallelepergroup[group]=key
244
245 #consider all DQA-,DQB-, and DRB-groups seperately
246 #readspergroup<DQA|DQB|DRB>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration)
247 #readspergroup<DQA|DQB|DRB> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. DQA*01)
248 for key in readspergroup:
249 keyBegin = key[0:3]
250 if keyBegin=='DQA':
251 readspergroupDQAlist.append(readspergroup[key]-float(medians[0]))
252 readspergroupDQA[key]=readspergroup[key]-float(medians[0])
253 if keyBegin=='DQB':
254 readspergroupDQBlist.append(readspergroup[key]-float(medians[1]))
255 readspergroupDQB[key]=readspergroup[key]-float(medians[1])
256 if keyBegin=='DRB':
257 readspergroupDRBlist.append(readspergroup[key]-float(medians[2]))
258 readspergroupDRB[key]=readspergroup[key]-float(medians[2])
259
260 #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration
261 medianDQA=numpy.median(readspergroupDQAlist)
262 medianDQB=numpy.median(readspergroupDQBlist)
263 medianDRB=numpy.median(readspergroupDRBlist)
264 a = b = c = ""
265
266 #Determine top-scoring group of the whole locus (DQA,DQBB,DRB) and store it
267 #maxkey<DQA,DQB,DRB> = group (e.g. DQA*01) with the most reads
268 #It can be that, e.g. in cancer cells a whole locus is lost. For that reason it is checked if the
269 #number of mapping reads of the top-scoring group maxkey<DQA,DQB,DRB> is > 0, otherwise "no" ist reported for this locus
270 if len(readspergroupDQAlist)>0:
271 maxkeyDQA=max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a))
272 if readspergroupDQA[maxkeyDQA] > 0:
273 maxDQA=maxallelepergroup[maxkeyDQA]
274 a = max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a))
275 else:
276 a = "no"
277 else:
278 a = "no"
279
280 if len(readspergroupDQBlist)>0:
281 maxkeyDQB=max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a))
282 if readspergroupDQB[maxkeyDQB] > 0:
283 maxDQB=maxallelepergroup[maxkeyDQB]
284 b = max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a))
285 else:
286 b = "no"
287 else:
288 b = "no"
289
290 if len(readspergroupDRBlist)>0:
291 maxkeyDRB=max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a))
292 if readspergroupDRB[maxkeyDRB] > 0:
293 maxDRB=maxallelepergroup[maxkeyDRB]
294 c = max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a))
295 else:
296 c = "no"
297 else:
298 c = "no"
299
300 readspergroupDQA=sorted(readspergroupDQA.items(), key=itemgetter(1),reverse=True)
301 readspergroupDQB=sorted(readspergroupDQB.items(), key=itemgetter(1),reverse=True)
302 readspergroupDRB=sorted(readspergroupDRB.items(), key=itemgetter(1),reverse=True)
303
304 dqavec=dqbvec=drbvec=""
305 if medianflag:
306 #in the 2nd iteration:
307 #1.) DO NOT remove the top-scoring group from the set <dqa,dqb,drb>vec as this is more strict when calculating the probability of the top scoring group being an outlier
308 #The strings <dqa,dqb,drb>vec are used by the R-script to calculate the probability of the top scoring group being an outlier
309 for key in maxallelepergroup:
310 keyBegin = key[0:3]
311 if keyBegin=="DQA":
312 dqavec=dqavec+str(readcount[maxallelepergroup[key]])+","
313 #print key+": "+str(readcount[maxallelepergroup[key]])
314 if keyBegin=="DQB":
315 dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+","
316 #print key+": "+str(readcount[maxallelepergroup[key]])
317 if keyBegin=="DRB":
318 drbvec=drbvec+str(readcount[maxallelepergroup[key]])+","
319 #print key+": "+str(readcount[maxallelepergroup[key]])
320 #2.) Add the decision thresholds to the sets <dqa,dqb,drb>vec, as this enables measuring the distance of the top-scoring group to this distance
321 if a=="no":
322 dqavec+=str(medians[0])
323 if b=="no":
324 dqbvec+=str(medians[1])
325 if c=="no":
326 drbvec+=str(medians[2])
327 #3.) In case of, e.g. a loss of whole HLA-locus (DQA/B,DRB), <dqa,dqb,drb>vec only contain medians[<0|1|2>].
328 #To avoid errors in the R-Script, set to 0
329 if dqavec==str(medians[0]):
330 dqavec = dqacount="0"
331 else:
332 dqacount=str(readcount[maxallelepergroup[maxkeyDQA]])
333 if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",":
334 dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+","
335 if dqbvec==str(medians[1]):
336 dqbvec = dqbcount="0"
337 else:
338 dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]])
339 if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",":
340 dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+","
341 if drbvec==str(medians[2]):
342 drbvec = drbcount="0"
343 else:
344 drbcount=str(readcount[maxallelepergroup[maxkeyDRB]])
345 if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",":
346 drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+","
347 else:
348 #in the 1st iteration: remove the top-scoring group from the set <dqa,dqb,drb>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier
349 for key in maxallelepergroup:
350 keyBegin = key[0:3]
351 if keyBegin=="DQA":
352 if key!=maxkeyDQA:
353 dqavec=dqavec+str(readcount[maxallelepergroup[key]])+","
354 #print key+": "+str(readcount[maxallelepergroup[key]])
355 if keyBegin=="DQB":
356 if key!=maxkeyDQB:
357 dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+","
358 #print key+": "+str(readcount[maxallelepergroup[key]])
359 if keyBegin=="DRB":
360 if key!=maxkeyDRB:
361 drbvec=drbvec+str(readcount[maxallelepergroup[key]])+","
362 #print key+": "+str(readcount[maxallelepergroup[key]])
363 #2.) DO NOT add the decision thresholds to the sets <dqa,dqb,drb>vec
364 if dqavec=="":
365 dqacount=dqavec="0"
366 else:
367 dqacount=str(readcount[maxallelepergroup[maxkeyDQA]])
368 if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",":
369 dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+","
370 if len(dqavec.split(","))==2:
371 dqavec=dqavec+"0,"
372
373 if dqbvec=="":
374 dqbcount=dqbvec="0"
375 else:
376 dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]])
377 if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",":
378 dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+","
379 if len(dqbvec.split(","))==2:
380 dqbvec=dqbvec+"0,"
381
382 if drbvec=="":
383 drbcount=drbvec="0"
384 else:
385 drbcount=str(readcount[maxallelepergroup[maxkeyDRB]])
386 if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",":
387 drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+","
388 if len(drbvec.split(","))==2:
389 drbvec=drbvec+"0,"
390
391 #call R-script "commmand.R" to calculate the confidence of the top-scoring allele
392 routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+dqacount+" "+dqavec+" "+maxkeyDQA+" "+dqbcount+" "+dqbvec+" "+maxkeyDQB+" "+drbcount+" "+drbvec+" "+maxkeyDRB).read()
393 parseOutput=routput.split("\n")
394
395 entries = []
396 for entry in parseOutput:
397 if entry[0:3]=="[1]":
398 entries.append(str(entry[4:len(entry)]))
399
400 if a=="no" and entries[1]!="NA":
401 entries[1]=str(1-float(entries[1]))
402 if b=="no" and entries[3]!="NA":
403 entries[3]=str(1-float(entries[3]))
404 if c=="no" and entries[5]!="NA":
405 entries[5]=str(1-float(entries[5]))
406
407 pred2File(entries,medianDQA,medianDQB,medianDRB,output,a,b,c,log)
408
409 #predict HLA class I genotype
410 def predictHLA1(sam,medians,output,medianflag,log):
411
412 readspergroupAlist=[]
413 readspergroupBlist=[]
414 readspergroupClist=[]
415 readspergroupA={}
416 readspergroupB={}
417 readspergroupC={}
418 maxallelepergroup={}
419
420 #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save
421 #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele
422 #ii) the number of reads mapping to the top-scoring allele => readspergroup
423 for key in readcount:
424 if readcount[key] > 0:
425 group=key.split(":")[0]
426 if readspergroup[group] <=readcount[key]:
427 readspergroup[group]=readcount[key]
428 maxallelepergroup[group]=key
429
430
431 #consider all A-,B-, and C-groups seperately
432 #readspergroup<A|B|C>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration)
433 #readspergroup<A|B|C> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. B*27)
434 for key in readspergroup:
435 firstKey = key[0]
436 if key[0]=='A':
437 readspergroupAlist.append(readspergroup[key]-float(medians[0]))
438 readspergroupA[key]=readspergroup[key]-float(medians[0])
439 if key[0]=='B':
440 readspergroupBlist.append(readspergroup[key]-float(medians[1]))
441 readspergroupB[key]=readspergroup[key]-float(medians[1])
442 if key[0]=='C':
443 readspergroupClist.append(readspergroup[key]-float(medians[2]))
444 readspergroupC[key]=readspergroup[key]-float(medians[2])
445
446 #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration
447 medianA=numpy.median(readspergroupAlist)
448 medianB=numpy.median(readspergroupBlist)
449 medianC=numpy.median(readspergroupClist)
450 a = b = c = ""
451
452 #Determine top-scoring group of the whole locus (A,B,C) and store it
453 #maxkey<A,B,C> = group (e.g. A*02) with the most reads
454 #It can be that, e.g. in cancer cells A whole locus is lost. For that reason it is checked if the
455 #number of mapping reads of the top-scoring group maxkey<A,B,C> is > 0, otherwise "no" ist reported for this locus
456 if len(readspergroupAlist)>0:
457 maxkeyA=max(readspergroupA,key=lambda a:readspergroupA.get(a))
458 if readspergroupA[maxkeyA] > 0:
459 maxA=maxallelepergroup[maxkeyA]
460 a = max(readspergroupA,key=lambda a:readspergroupA.get(a))
461 else:
462 a = "no"
463 else:
464 a = "no"
465
466 if len(readspergroupBlist)>0:
467 maxkeyB=max(readspergroupB,key=lambda a:readspergroupB.get(a))
468 if readspergroupB[maxkeyB] > 0:
469 maxB=maxallelepergroup[maxkeyB]
470 b = max(readspergroupB,key=lambda a:readspergroupB.get(a))
471 else:
472 b = "no"
473 else:
474 b = "no"
475
476 if len(readspergroupClist)>0:
477 maxkeyC=max(readspergroupC,key=lambda a:readspergroupC.get(a))
478 if readspergroupC[maxkeyC] > 0:
479 maxC=maxallelepergroup[maxkeyC]
480 c = max(readspergroupC,key=lambda a:readspergroupC.get(a))
481 else:
482 c = "no"
483 else:
484 c = "no"
485
486 readspergroupA=sorted(readspergroupA.items(), key=itemgetter(1),reverse=True)
487 readspergroupB=sorted(readspergroupB.items(), key=itemgetter(1),reverse=True)
488 readspergroupC=sorted(readspergroupC.items(), key=itemgetter(1),reverse=True)
489
490 avec=bvec=cvec=""
491
492 if medianflag:
493 #in the 2nd iteration:
494 #1.) DO NOT remove the top-scoring group from the set <a,b,c>vec as this is more strict when calculating the probability of the top scoring group being an outlier
495 #The strings <a,b,c>vec are used by the R-script to calculate the probability of the top scoring group being an outlier
496 for key in maxallelepergroup:
497 if key[0]=="A":
498 avec=avec+str(readcount[maxallelepergroup[key]])+","
499 #print key+": "+str(readcount[maxallelepergroup[key]])
500 if key[0]=="B":
501 bvec=bvec+str(readcount[maxallelepergroup[key]])+","
502 #print key+": "+str(readcount[maxallelepergroup[key]])
503 if key[0]=="C":
504 cvec=cvec+str(readcount[maxallelepergroup[key]])+","
505 #print key+": "+str(readcount[maxallelepergroup[key]])
506 #2.) Add the decision thresholds to the sets <a,b,c>vec, as this enables measuring the distance of the top-scoring group to this distance
507 if a=="no":
508 avec+=str(medians[0])
509 if b=="no":
510 bvec+=str(medians[1])
511 if c=="no":
512 cvec+=str(medians[2])
513 #3.) In case of, e.g. a loss of whole HLA-locus (A,B,C), <a,b,c>vec only contain median[<0|1|2>].
514 #To avoid errors in the R-Script, set to 0
515 if avec=="" or avec==medians[0]:
516 avec = acount="0"
517 else:
518 acount=str(readcount[maxallelepergroup[maxkeyA]])
519
520 if bvec=="" or bvec==medians[1]:
521 bvec = bcount="0"
522 else:
523 bcount=str(readcount[maxallelepergroup[maxkeyB]])
524
525 if cvec=="" or cvec==medians[2]:
526 cvec = ccount="0"
527 else:
528 ccount=str(readcount[maxallelepergroup[maxkeyC]])
529 else:
530 #in the 1st iteration: remove the top-scoring group from the set <a,b,c>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier
531 for key in maxallelepergroup:
532 if key[0]=="A" and key!=maxkeyA:
533 avec=avec+str(readcount[maxallelepergroup[key]])+","
534 #print key+": "+str(readcount[maxallelepergroup[key]])
535 if key[0]=="B" and key!=maxkeyB:
536 bvec=bvec+str(readcount[maxallelepergroup[key]])+","
537 #print key+": "+str(readcount[maxallelepergroup[key]])
538 if key[0]=="C" and key!=maxkeyC:
539 cvec=cvec+str(readcount[maxallelepergroup[key]])+","
540 #print key+": "+str(readcount[maxallelepergroup[key]])
541 #2.) DO NOT add the decision thresholds to the sets <a,b,c>vec
542 if avec=="":
543 acount=avec="0"
544 else:
545 acount=str(readcount[maxallelepergroup[maxkeyA]])
546 if bvec=="":
547 bcount=bvec="0"
548 else:
549 bcount=str(readcount[maxallelepergroup[maxkeyB]])
550 if cvec=="":
551 ccount=cvec="0"
552 else:
553 ccount=str(readcount[maxallelepergroup[maxkeyC]])
554
555 #call R-script "commmand.R" to calculate the confidence of the top-scoring allele
556 routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+acount+" "+avec+" "+maxkeyA+" "+bcount+" "+bvec+" "+maxkeyB+" "+ccount+" "+cvec+" "+maxkeyC).read()
557 parseOutput=routput.split("\n")
558
559 entries = []
560 for entry in parseOutput:
561 if entry[0:3]=="[1]":
562 entries.append(str(entry[4:len(entry)]))
563
564 if a=="no" and entries[1]!="NA":
565 entries[1]=str(1-float(entries[1]))
566 if b=="no" and entries[3]!="NA":
567 entries[3]=str(1-float(entries[3]))
568 if c=="no" and entries[5]!="NA":
569 entries[5]=str(1-float(entries[5]))
570
571 pred2File(entries,medianA,medianB,medianC,output,a,b,c,log)
572
573 #write digital haplotype into file
574 def pred2File(entries,medianA,medianB,medianC,output,a,b,c,log):
575 out = open(output, 'w')
576 out.write("HLA\tHLA1\tmedian-Value\talternative\tp-Value\n")
577 out.write("A\t"+a+"\t")
578 # write the median-Value for A
579 out.write(str(medianA)+"\t")
580 out.write(entries[0]+"\t"+entries[1]+"\n")
581
582 out.write("B\t"+b+"\t")
583 # write the median-Value for B
584 out.write(str(medianB)+"\t")
585 out.write(entries[2]+"\t"+entries[3]+"\n")
586
587 out.write("C\t"+c+"\t")
588 # write the median-Value for C
589 out.write(str(medianC)+"\t")
590 out.write(entries[4]+"\t"+entries[5]+"\n")
591 out.close()
592 log.write("The digital haplotype is written into "+output+"\n")
593
594 #open mapping file and all read ids to the list "removeList", which map to one of the three groups in "alleles"
595 def createRemoveList(runName,map):
596 removeList={}
597 alleles = []
598 sam=runName+"-iteration1.bam"
599 alleles_in=runName+".digitalhaplotype1"
600 for i in range(2,5):
601 alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1])
602
603 samhandle=pysam.Samfile(sam,'rb')
604 for read in samhandle.fetch():
605 illuminaid = read.qname
606 hlapseudoname = samhandle.getrname(read.tid)
607 if map[hlapseudoname].split(':')[0] in alleles:
608 removeList[illuminaid]=1
609 samhandle.close()
610 return removeList
611
612 #Remove reads that mapped to the three top-scoring alleles and write the remaining reads into two new read files
613 def removeReads(runName,removeList):
614 aligned1=runName+"_1.aligned"
615 aligned2=runName+"_2.aligned"
616 newReadFile1=runName+"-2nditeration_1.fq"
617 newReadFile2=runName+"-2nditeration_2.fq"
618 #r1 and r2, which are the input of bowtie in the 2nd iteration
619 r1=open(newReadFile1,"w")
620 r2=open(newReadFile2,"w")
621 #open the 2 files, that contain the reads that mapped in the 1st iteration
622 aligned_handle1=open(aligned1,"r")
623 aligned_handle2=open(aligned2,"r")
624
625 #One read entry consists of 4 lines: header, seq, "+", qualities.
626 for record in SeqIO.parse(aligned_handle1, "fastq"):
627 illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
628 if not illuminaid in removeList:
629 SeqIO.write(record, r1, "fastq")
630
631 for record in SeqIO.parse(aligned_handle2, "fastq"):
632 illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
633 if not illuminaid in removeList:
634 SeqIO.write(record, r2, "fastq")
635
636 #write the final prediction (both digital haplotypes) to file and stdout
637 def reportHLAgenotype(output1,output2,finaloutput):
638 filehandle1=open(output1,'r').readlines()[1:4]
639 filehandle2=open(output2,'r').readlines()[1:4]
640 outfile=open(finaloutput, 'w')
641
642 outfile.write("HLA\tHLA1\tp-Value\tHLA2\tp-Value\n")
643 for i in range(len(filehandle1)):
644 filehandle1[i]=filehandle1[i][0:-1]
645 for i in range(len(filehandle2)):
646 filehandle2[i]=filehandle2[i][0:-1]
647
648 a1 = filehandle1[0].split('\t',2)[1]
649 a1score = filehandle1[0].split('\t')[4]
650 b1 = filehandle1[1].split('\t',2)[1]
651 b1score = filehandle1[1].split('\t')[4]
652 c1 = filehandle1[2].split('\t',2)[1]
653 c1score = filehandle1[2].split('\t')[4]
654
655 a2 = filehandle2[0].split('\t',2)[1]
656 if a2 == "no":
657 a2 = "hoz("+filehandle2[0].split('\t')[3]+")"
658 a2score = filehandle2[0].split('\t')[4]
659 b2 = filehandle2[1].split('\t',2)[1]
660 if b2 == "no":
661 b2 = "hoz("+filehandle2[1].split('\t')[3]+")"
662 b2score = filehandle2[1].split('\t')[4]
663 c2 = filehandle2[2].split('\t', 2)[1]
664 if c2 == "no":
665 c2 = "hoz("+filehandle2[2].split('\t')[3]+")"
666 c2score = filehandle2[2].split('\t')[4]
667 #write complete HLA genotype to file
668 outfile.write("A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score+"\n")
669 outfile.write("B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score+"\n")
670 outfile.write("C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score+"\n")
671 #.. and print it to STDOUT
672 #print "A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score
673 #print "B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score
674 #print "C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score
675
676 #calculate locus-specific expression
677 def expression(locus1,locus2,locus3,length1,length2,length3,map,runName,readFile1,finaloutput,gzipped):
678 outfile = open(finaloutput,'a')
679 aligned1=runName+"_1.aligned"
680 sam=runName+"-iteration1.bam"
681 alleles_in=runName+".digitalhaplotype1"
682 if gzipped:
683 proc=subprocess.Popen(["/bin/bash","-c","wc -l <(zcat " + readFile1 + ")"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
684 (stdoutdata,stderrdata) = proc.communicate()
685 numberlines = stdoutdata
686 else:
687 proc=subprocess.Popen(["/bin/bash","-c","wc -l " + readFile1],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
688 (stdoutdata,stderrdata) = proc.communicate()
689 numberlines = stdoutdata
690 totalreads=int(numberlines.split(" ")[0])/4
691 alleles=[]
692 for i in range(2,5):
693 alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1])
694 alleles.append(linecache.getline(alleles_in, i).split('\t')[3])
695
696 #create read dictionary
697 reads={}
698 aligned_handle1=open(aligned1,"r")
699 for record in SeqIO.parse(aligned_handle1, "fastq"):
700 illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
701 reads[illuminaid]={}
702 reads[illuminaid][locus1]=0
703 reads[illuminaid][locus2]=0
704 reads[illuminaid][locus3]=0
705 samhandle = pysam.Samfile(sam,"rb")
706 for read in samhandle.fetch():
707 illuminaid = read.qname
708 hlapseudoname = samhandle.getrname(read.tid)
709 if map[hlapseudoname].split(':')[0] in alleles:
710 reads[illuminaid][map[hlapseudoname].split('*')[0]]+=1
711
712 count={}
713 count[locus1]=0
714 count[locus2]=0
715 count[locus3]=0
716 for key in reads:
717 n=0
718 for locus in reads[key]:
719 if reads[key][locus] > 0:
720 n+=1
721 for locus in reads[key]:
722 if reads[key][locus] > 0:
723 count[locus]+=float(1.0/float(n))
724
725 outfile.write("\n")
726 #Calculate RPKM and print expression values for each locus to stdout
727 for locus in count:
728 if locus==locus1:
729 outfile.write(locus+": "+str(round(float((1000.0/length1))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n")
730 if locus==locus2:
731 outfile.write(locus+": "+str(round(float((1000.0/length2))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n")
732 if locus==locus3:
733 outfile.write(locus+": "+str(round(float((1000.0/length3))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n")
734
735
736 if __name__ == '__main__':
737 parser = OptionParser(usage="usage: %prog -1 readFile1 -2 readFile2 -r runName -l readlength [-3 <int>]", version="%prog 1.0")
738 parser.add_option("-z","--gzipped",action="store",dest="gzipped",help="Select if input reads are in gzip format")
739 parser.add_option("-1",
740 action="store",
741 dest="readFile1",
742 help="File name of #1 mates ")
743 parser.add_option("-2",
744 action="store",
745 dest="readFile2",
746 help="File name of #2 mates")
747 parser.add_option("-r", "--runName",
748 action="store",
749 dest="runName",
750 help="Name of this HLA typing run. Wil be used throughout this process as part of the name of the newly created files.")
751 parser.add_option("-l", "--length",
752 action="store",
753 dest="length",
754 help="Readlength")
755 parser.add_option("-3", "--trim3",
756 action="store",
757 dest="trim3",
758 default="0",
759 help="Bowtie option: -3 <int> trims <int> bases from the low quality 3' end of each read. Default: 0")
760 parser.add_option("-o", "--output1",
761 action="store",
762 dest="output1",
763 help="Output file 1")
764 parser.add_option("-p", "--output2",
765 action="store",
766 dest="output2",
767 help="Output file 2")
768 parser.add_option("-g","--log",action="store",dest="logfile",help="Output Log File")
769
770
771 (options, args) = parser.parse_args()
772 if not options.readFile1:
773 parser.error('File name #1 pair not given.')
774 if not options.readFile2:
775 parser.error('File name #2 pair not given.')
776 if not options.runName:
777 parser.error('Run name not given.')
778 if not options.length:
779 parser.error('Readlength not given.')
780 gzipped = options.gzipped
781 readFile1=options.readFile1
782 readFile2=options.readFile2
783 runName=options.runName
784 output1 = options.output1
785 output2 = options.output2
786 logfile = options.logfile
787 bowtiebuildClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75"
788 bowtiebuildClassII=sys.path[0]+"/references/HLA2.ex2.plus75"
789 fastaClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75.fasta"
790 fastaClassII=sys.path[0]+"/references/HLA2.ex2.plus75.fasta"
791 #as shown in the publication HLA typing with RNA-Seq works best by allowing as less mismatches as necessary
792 if int(options.length)<=50:
793 mismatch=1
794 elif int(options.length)>50 and int(options.length)<=100:
795 mismatch=2
796 else:
797 mismatch=3
798 trim3=str(options.trim3)
799 main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped)
800 sys.stderr = log_stderr
801 log_file.close()