0
|
1 ##########################################################################################################
|
|
2 #Title:
|
|
3 #seq2HLA - HLA genotyping from RNA-Seq sequence reads
|
|
4 #
|
|
5 #Release: 1.0
|
|
6 #
|
|
7 #Author:
|
|
8 #Sebastian Boegel, 2012 (c)
|
|
9 #TRON - Translational Oncology at the University Medical Center Mainz, 55131 Mainz, Germany
|
|
10 #University Medical Center of the Johannes Gutenberg-University Mainz, III. Medical Department, Mainz, Germany
|
|
11 #
|
|
12 #Contact:
|
|
13 #boegels@uni-mainz.de
|
|
14 #
|
|
15 #Synopsis:
|
|
16 #We developed an in-silico method "Seq2HLA", written in python and R, which takes standard RNA-Seq sequence reads in fastq format
|
|
17 #as input, uses a bowtie index comprising all HLA alleles and outputs the most likely HLA class I and class II genotypes,
|
|
18 #a p-value for each call, and the expression of each class
|
|
19 #
|
|
20 #Usage:
|
|
21 #python seq2HLA.py -1 <readfile1> -2 <readfile2> -r "<runname>" -l <readlength> [-3 <int>]*
|
|
22 #*optional (Default:0)
|
|
23 #
|
|
24 #Dependencies:
|
|
25 #0.) seq2HLA is a python script, developed with Python 2.6.8
|
|
26 #1.) bowtie must be reachable by the command "bowtie". seq2HLA was developed and tested with bowtie version 0.12.7 (64-bit). The call to bowtie is invoked with 6 CPUs. You can change that in the function "mapping".
|
|
27 #2.) R must be installed, seq2HLA.py was developed and tested with R version 2.12.2 (2011-02-25)
|
|
28 #3.) Input must be paired-end reads in fastq-format
|
|
29 #4.) Index files must be located in the folder "references".
|
|
30 #5.) Packages: biopython (developed with V1.58), numpy (1.3.0)
|
|
31 ###########################################################################################################
|
|
32
|
|
33 from operator import itemgetter
|
|
34 import sys
|
|
35 import linecache
|
|
36 import ast
|
|
37 import os
|
|
38 import shutil
|
|
39 import tempfile
|
|
40 import subprocess
|
|
41 from Bio import SeqIO
|
|
42 import numpy
|
|
43 import operator
|
|
44 from optparse import OptionParser
|
|
45
|
|
# Redirect stderr into err.log so noisy library output is captured on disk;
# the original stream is kept in log_stderr in case it is needed later.
log_stderr = sys.stderr
log_file = open("err.log","w")
sys.stderr = log_file

# imported after the redirect so any import-time warnings land in err.log
import pysam

# These dictionaries are global on purpose: they are filled and read by
# several functions (createRefDict, readMapping, predictHLA1/2).
readcount = {}        # reads mapped per HLA allele
readspergroup = {}    # reads mapped to the best allele of each 2-digit group
allelesPerLocus = {}  # number of alleles per locus
|
|
56
|
1
|
def main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped,num_threads):
    """Run the full seq2HLA pipeline: HLA class I typing, then class II typing.

    `mismatch` feeds bowtie's -v option, `trim3` its -3 option. Class I results
    go to `output1`, class II results to `output2`; progress is appended to
    `logfile`.
    """
    log = open(logfile,"w")
    try:
        # bowtie options: report all alignments (-a), allow `mismatch` mismatches (-v)
        mapopt = "-a -v"+str(mismatch)
        # HLA class I typing (A/B/C)
        mainClassI(runName+"-ClassI",readFile1,readFile2,bowtiebuildClassI,fastaClassI,mapopt,trim3,output1,log,gzipped,num_threads)
        # HLA class II typing (DQA1/DQB1/DRB1)
        mainClassII(runName+"-ClassII",readFile1,readFile2,bowtiebuildClassII,fastaClassII,mapopt,trim3,output2,log,gzipped,num_threads)
    finally:
        # the log handle used to be leaked; flush and close it even on error
        log.close()
|
0
|
65
|
|
66 #---------------Class I-------------------------------
|
1
|
def mainClassI(runName,readFile1,readFile2,bowtiebuild,hla1fasta,mapopt,trim3,finaloutput,log,gzipped,num_threads):
    """Two-iteration HLA class I (A/B/C) genotyping.

    Iteration 1 maps all reads, calls the first digital haplotype and removes
    reads belonging to the three winning groups; iteration 2 re-maps the
    remainder using the locus medians from iteration 1 as decision thresholds.
    The combined genotype is written to `finaloutput`.
    """
    #-------1st iteration-----------------------------------
    log.write("----------HLA class I------------\n")
    sam1 = runName+"-iteration1.bam"
    log.write("First iteration starts....\nMapping ......\n")
    mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped,num_threads)
    # thresholds are 0 in the first iteration
    medians = [0,0,0]
    medianflag = False
    output1 = runName+".digitalhaplotype1"
    log.write("Calculation of first digital haplototype .....\n")
    # renamed from `map` to avoid shadowing the builtin
    allele_map = createRefDict(hla1fasta,"A","B","C")
    readMapping(allele_map,sam1)
    predictHLA1(sam1,medians,output1,medianflag,log)
    log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n")
    removeReads(runName,createRemoveList(runName,allele_map))

    #------2nd iteration------------------------------------------
    log.write("Second iterations starts .....\n Mapping ......\n")
    sam2 = runName+"-iteration2.bam"
    newReadFile1 = runName+"-2nditeration_1.fq"
    newReadFile2 = runName+"-2nditeration_2.fq"
    mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped,num_threads)
    # locus medians from iteration 1 (column 3 of rows 2-4) become the thresholds
    medianfile = runName+".digitalhaplotype1"
    medians = [linecache.getline(medianfile, i).split('\t', 3)[2] for i in range(2,5)]
    medianflag = True
    output2 = runName+".digitalhaplotype2"
    log.write("Calculation of second digital haplototype .....\n")
    allele_map = createRefDict(hla1fasta,"A","B","C")
    readMapping(allele_map,sam2)
    predictHLA1(sam2,medians,output2,medianflag,log)
    log.write("2nd iteration done.")
    reportHLAgenotype(output1,output2,finaloutput)
    log.write("Calculation of locus-specific expression ...\n")
    # 694 = reference length used to normalise class I expression (per original)
    expression("A","B","C",694,694,694,allele_map,runName,readFile1,finaloutput,gzipped)
|
|
108
|
|
109 #--------------Class II----------------------------------------
|
1
|
def mainClassII(runName,readFile1,readFile2,bowtiebuild,hla2fasta,mapopt,trim3,finaloutput,log,gzipped,num_threads):
    """Two-iteration HLA class II (DQA1/DQB1/DRB1) genotyping.

    Same scheme as mainClassI, but the iteration-1 medians are halved before
    being used as decision thresholds in iteration 2.
    """
    #-------1st iteration----------------------------------------
    log.write("----------HLA class II------------\n")
    sam1 = runName+"-iteration1.bam"
    log.write("ClassII: first iteration starts....\nMapping ......\n")
    mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped,num_threads)
    # thresholds are 0 in the first iteration
    medians = [0,0,0]
    medianflag = False
    output1 = runName+".digitalhaplotype1"
    log.write("ClassII: calculation of first digital haplototype .....\n")
    # renamed from `map` to avoid shadowing the builtin
    allele_map = createRefDict(hla2fasta,"DQA1","DQB1","DRB1")
    readMapping(allele_map,sam1)
    predictHLA2(sam1,medians,output1,medianflag,log)
    log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n")
    removeReads(runName,createRemoveList(runName,allele_map))

    #------2nd iteration------------------------------------------
    log.write("Second iterations starts .....\n Mapping ......\n")
    sam2 = runName+"-iteration2.bam"
    newReadFile1 = runName+"-2nditeration_1.fq"
    newReadFile2 = runName+"-2nditeration_2.fq"
    mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped,num_threads)
    # class II uses half of the iteration-1 medians as thresholds
    medianfile = runName+".digitalhaplotype1"
    medians = [float(linecache.getline(medianfile, i).split('\t', 3)[2])/2.0 for i in range(2,5)]
    medianflag = True
    output2 = runName+".digitalhaplotype2"
    log.write("ClassII: calculation of second digital haplototype .....\n")
    allele_map = createRefDict(hla2fasta,"DQA1","DQB1","DRB1")
    readMapping(allele_map,sam2)
    predictHLA2(sam2,medians,output2,medianflag,log)
    log.write("2nd iteration done.\n")
    reportHLAgenotype(output1,output2,finaloutput)
    log.write("Calculation of locus-specific expression ...\n")
    # 400/421/421 = reference lengths used to normalise class II expression (per original)
    expression("DQA1","DQB1","DRB1",400,421,421,allele_map,runName,readFile1,finaloutput,gzipped)
|
|
150
|
|
151 #performs the bowtie mapping for the 2 iterations using the given parameters
|
1
|
#performs the bowtie mapping for the 2 iterations using the given parameters
def mapping(sam,runName,readFile1,readFile2,bowtiebuild,iteration,mapopt,trim3,log,gzipped,num_threads):
    """Map a read pair with bowtie, pipe into samtools to produce a sorted,
    indexed BAM named `<sam minus extension>.bam`.

    Iteration 1 additionally writes the aligned reads (--al) and filters out
    unmapped records (awk on column 3); iteration 2 maps the already-filtered
    2nd-iteration fastq files directly.

    NOTE(review): `gzipped` is compared against the *string* "True" here but
    used as a plain truthy value in expression() — confirm callers pass the
    string form.
    """
    # strip the ".bam" extension: samtools sort (pre-1.0 syntax) takes a prefix
    sam = sam.rsplit(".",1)[0]
    if iteration==1:
        # gzipped input is streamed through process substitution
        if gzipped == "True":
            inputs = "-1 <(zcat %s) -2 <(zcat %s)" % (readFile1,readFile2)
        else:
            inputs = "-1 %s -2 %s" % (readFile1,readFile2)
        # keep only mapped records ($3 != "*"), convert to BAM and sort
        cmd = "bowtie --threads "+num_threads+" -3 %s --quiet -S %s --al %s.aligned %s %s | awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,inputs,sam)
    else:
        # 2nd iteration: inputs are the plain fastq files written by removeReads
        cmd = "bowtie --threads "+num_threads+" -3 %s --quiet -S %s %s -1 %s -2 %s | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,bowtiebuild,readFile1,readFile2,sam)
    mappingOutput = execBowtie(cmd)

    createBAMIndex("samtools index %s.bam" % sam)
    log.write(mappingOutput+"\n")
|
|
179
|
|
def createBAMIndex(cmd):
    """Run `cmd` through bash and raise IOError (with the captured stderr)
    when the command exits non-zero."""
    process = subprocess.Popen(["/bin/bash","-c",cmd],stderr=subprocess.PIPE)
    _, err = process.communicate()
    if process.returncode != 0:
        raise IOError(err)
|
|
186
|
|
def execBowtie(cmd):
    """Run `cmd` through bash, return the captured stdout.

    Raises IOError carrying the captured stderr when the command fails.
    """
    process = subprocess.Popen(["/bin/bash","-c",cmd],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    out, err = process.communicate()
    if process.returncode != 0:
        raise IOError(err)
    return out
|
|
195
|
|
196 #create dictionary "map", that contains all IMGT/HLA-allele names as keys and the allele name (e.g. A*02:01:01) as value
|
|
197 #dictionary "readcount" is initialized with 0 for each allele
|
|
198 #dictionary "readspergroup" is initialized with 0 for each group (2digit, e.g. A*01)
|
|
199 #dictionary "allelesPerLocus" stores the number of alleles per locus.
|
|
#create dictionary "map", that contains all IMGT/HLA-allele names as keys and the allele name (e.g. A*02:01:01) as value
#dictionary "readcount" is initialized with 0 for each allele
#dictionary "readspergroup" is initialized with 0 for each group (2digit, e.g. A*01)
#dictionary "allelesPerLocus" stores the number of alleles per locus.
def createRefDict(hlafasta,locus1,locus2,locus3):
    """Parse the HLA reference fasta and return {pseudoname: allele name},
    initialising the global read-count dictionaries along the way."""
    refDict = {}
    for locus in (locus1,locus2,locus3):
        allelesPerLocus[locus] = 0
    with open(hlafasta,'r') as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # fasta headers look like "<pseudoname> <allele> ..." — keep the first two tokens
            pseudoname, allele = record.description.split(' ')[0:2]
            refDict[pseudoname] = allele
            readcount[allele] = 0
            readspergroup[allele.split(":")[0]] = 0
            allelesPerLocus[allele.split('*')[0]] += 1
    return refDict
|
|
215
|
|
216 #Open sam-file and count mappings for each allele
|
|
#Open sam-file and count mappings for each allele
def readMapping(map,sam):
    """Accumulate, into the global `readcount`, one count per aligned read
    for the allele its reference sequence belongs to."""
    handle = pysam.Samfile(sam,"rb")
    for aligned in handle.fetch():
        allele = map[handle.getrname(aligned.tid)]
        readcount[allele] += 1
    handle.close()
|
|
223
|
|
224 #predict HLA class II genotype
|
|
#predict HLA class II genotype
def predictHLA2(sam,medians,output,medianflag,log):
    """Predict the HLA class II genotype (DQA1/DQB1/DRB1).

    `medians` holds one decision threshold per locus (0 in iteration 1);
    `medianflag` is True in iteration 2. The per-locus winner and its
    p-value (computed by command.R) are written to `output` via pred2File.

    BUG FIX: the original referenced the undefined name `maxkeyDDRB` in two
    places (NameError at runtime on the DRB doubling branches); corrected to
    `maxkeyDRB`.
    """
    readspergroupDQAlist = []
    readspergroupDQBlist = []
    readspergroupDRBlist = []
    readspergroupDQA = {}
    readspergroupDQB = {}
    readspergroupDRB = {}
    maxallelepergroup = {}

    # For each allele with >=1 mapped read, record per 2-digit group
    # (e.g. DQA1*02) the best-supported allele and its read count.
    for key in readcount:
        if readcount[key] > 0:
            group = key.split(":")[0]
            if readspergroup[group] <= readcount[key]:
                readspergroup[group] = readcount[key]
                maxallelepergroup[group] = key

    # Split group scores by locus, subtracting the locus threshold
    # (0 in the first iteration).
    for key in readspergroup:
        keyBegin = key[0:3]
        if keyBegin == 'DQA':
            readspergroupDQAlist.append(readspergroup[key]-float(medians[0]))
            readspergroupDQA[key] = readspergroup[key]-float(medians[0])
        if keyBegin == 'DQB':
            readspergroupDQBlist.append(readspergroup[key]-float(medians[1]))
            readspergroupDQB[key] = readspergroup[key]-float(medians[1])
        if keyBegin == 'DRB':
            readspergroupDRBlist.append(readspergroup[key]-float(medians[2]))
            readspergroupDRB[key] = readspergroup[key]-float(medians[2])

    # Decision threshold (median) for the homozygosity call in iteration 2.
    medianDQA = numpy.median(readspergroupDQAlist)
    medianDQB = numpy.median(readspergroupDQBlist)
    medianDRB = numpy.median(readspergroupDRBlist)
    a = b = c = ""

    # Top-scoring group per locus; "no" when the locus has no positively
    # supported group (e.g. locus loss in cancer samples).
    if len(readspergroupDQAlist) > 0:
        maxkeyDQA = max(readspergroupDQA, key=readspergroupDQA.get)
        if readspergroupDQA[maxkeyDQA] > 0:
            a = maxkeyDQA
        else:
            a = "no"
    else:
        a = "no"

    if len(readspergroupDQBlist) > 0:
        maxkeyDQB = max(readspergroupDQB, key=readspergroupDQB.get)
        if readspergroupDQB[maxkeyDQB] > 0:
            b = maxkeyDQB
        else:
            b = "no"
    else:
        b = "no"

    if len(readspergroupDRBlist) > 0:
        maxkeyDRB = max(readspergroupDRB, key=readspergroupDRB.get)
        if readspergroupDRB[maxkeyDRB] > 0:
            c = maxkeyDRB
        else:
            c = "no"
    else:
        c = "no"

    dqavec = dqbvec = drbvec = ""
    if medianflag:
        # 2nd iteration: keep every group's top count (winner included) so the
        # outlier test in command.R is stricter.
        for key in maxallelepergroup:
            keyBegin = key[0:3]
            if keyBegin == "DQA":
                dqavec = dqavec+str(readcount[maxallelepergroup[key]])+","
            if keyBegin == "DQB":
                dqbvec = dqbvec+str(readcount[maxallelepergroup[key]])+","
            if keyBegin == "DRB":
                drbvec = drbvec+str(readcount[maxallelepergroup[key]])+","
        # Add the decision thresholds so the winner's distance to the
        # threshold enters the test.
        if a == "no":
            dqavec += str(medians[0])
        if b == "no":
            dqbvec += str(medians[1])
        if c == "no":
            drbvec += str(medians[2])
        # Loss-of-locus guard: if a vector only contains the threshold, fall
        # back to "0" so the R script always receives a numeric vector.
        if dqavec == str(medians[0]):
            dqavec = dqacount = "0"
        else:
            dqacount = str(readcount[maxallelepergroup[maxkeyDQA]])
            # a single entry would break the outlier test; duplicate it
            if dqavec == dqacount+",":
                dqavec = dqacount+","+dqacount+","
        if dqbvec == str(medians[1]):
            dqbvec = dqbcount = "0"
        else:
            dqbcount = str(readcount[maxallelepergroup[maxkeyDQB]])
            if dqbvec == dqbcount+",":
                dqbvec = dqbcount+","+dqbcount+","
        if drbvec == str(medians[2]):
            drbvec = drbcount = "0"
        else:
            drbcount = str(readcount[maxallelepergroup[maxkeyDRB]])
            # fixed: original used undefined `maxkeyDDRB` here
            if drbvec == drbcount+",":
                drbvec = drbcount+","+drbcount+","
    else:
        # 1st iteration: leave the winner out of the vector; this sharpens the
        # outlier probability of the top group.
        for key in maxallelepergroup:
            keyBegin = key[0:3]
            if keyBegin == "DQA" and key != maxkeyDQA:
                dqavec = dqavec+str(readcount[maxallelepergroup[key]])+","
            if keyBegin == "DQB" and key != maxkeyDQB:
                dqbvec = dqbvec+str(readcount[maxallelepergroup[key]])+","
            if keyBegin == "DRB" and key != maxkeyDRB:
                drbvec = drbvec+str(readcount[maxallelepergroup[key]])+","
        # Do NOT add the thresholds here; pad degenerate vectors instead.
        if dqavec == "":
            dqacount = dqavec = "0"
        else:
            dqacount = str(readcount[maxallelepergroup[maxkeyDQA]])
            if dqavec == dqacount+",":
                dqavec = dqacount+","+dqacount+","
            if len(dqavec.split(",")) == 2:
                dqavec = dqavec+"0,"

        if dqbvec == "":
            dqbcount = dqbvec = "0"
        else:
            dqbcount = str(readcount[maxallelepergroup[maxkeyDQB]])
            if dqbvec == dqbcount+",":
                dqbvec = dqbcount+","+dqbcount+","
            if len(dqbvec.split(",")) == 2:
                dqbvec = dqbvec+"0,"

        if drbvec == "":
            drbcount = drbvec = "0"
        else:
            drbcount = str(readcount[maxallelepergroup[maxkeyDRB]])
            # fixed: original used undefined `maxkeyDDRB` here
            if drbvec == drbcount+",":
                drbvec = drbcount+","+drbcount+","
            if len(drbvec.split(",")) == 2:
                drbvec = drbvec+"0,"

    # call R-script "command.R" to calculate the confidence of the top-scoring allele
    routput = os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+dqacount+" "+dqavec+" "+maxkeyDQA+" "+dqbcount+" "+dqbvec+" "+maxkeyDQB+" "+drbcount+" "+drbvec+" "+maxkeyDRB).read()
    parseOutput = routput.split("\n")

    # R prints results as "[1] <value>" lines
    entries = []
    for entry in parseOutput:
        if entry[0:3] == "[1]":
            entries.append(str(entry[4:len(entry)]))

    # For a lost locus the R value is the complementary probability.
    if a == "no" and entries[1] != "NA":
        entries[1] = str(1-float(entries[1]))
    if b == "no" and entries[3] != "NA":
        entries[3] = str(1-float(entries[3]))
    if c == "no" and entries[5] != "NA":
        entries[5] = str(1-float(entries[5]))

    pred2File(entries,medianDQA,medianDQB,medianDRB,output,a,b,c,log)
|
|
408
|
|
409 #predict HLA class I genotype
|
|
#predict HLA class I genotype
def predictHLA1(sam,medians,output,medianflag,log):
    """Predict the HLA class I genotype (loci A, B, C).

    `medians` holds one decision threshold per locus (0 in iteration 1);
    `medianflag` is True in iteration 2. The per-locus winner and its
    p-value (computed by command.R) are written to `output` via pred2File.
    """
    readspergroupAlist = []
    readspergroupBlist = []
    readspergroupClist = []
    readspergroupA = {}
    readspergroupB = {}
    readspergroupC = {}
    maxallelepergroup = {}

    # For each allele with >=1 mapped read, record per 2-digit group
    # (e.g. A*02) the best-supported allele and its read count.
    for allele in readcount:
        if readcount[allele] > 0:
            group = allele.split(":")[0]
            if readspergroup[group] <= readcount[allele]:
                readspergroup[group] = readcount[allele]
                maxallelepergroup[group] = allele

    # Split group scores by locus, subtracting the locus threshold
    # (0 in the first iteration).
    for group in readspergroup:
        if group[0] == 'A':
            readspergroupAlist.append(readspergroup[group]-float(medians[0]))
            readspergroupA[group] = readspergroup[group]-float(medians[0])
        elif group[0] == 'B':
            readspergroupBlist.append(readspergroup[group]-float(medians[1]))
            readspergroupB[group] = readspergroup[group]-float(medians[1])
        elif group[0] == 'C':
            readspergroupClist.append(readspergroup[group]-float(medians[2]))
            readspergroupC[group] = readspergroup[group]-float(medians[2])

    # Decision threshold (median) for the homozygosity call in iteration 2.
    medianA = numpy.median(readspergroupAlist)
    medianB = numpy.median(readspergroupBlist)
    medianC = numpy.median(readspergroupClist)
    a = b = c = ""

    # Top-scoring group per locus; "no" when the locus has no positively
    # supported group (e.g. locus loss in cancer samples).
    if len(readspergroupAlist) > 0:
        maxkeyA = max(readspergroupA, key=readspergroupA.get)
        if readspergroupA[maxkeyA] > 0:
            maxA = maxallelepergroup[maxkeyA]
            a = maxkeyA
        else:
            a = "no"
    else:
        a = "no"

    if len(readspergroupBlist) > 0:
        maxkeyB = max(readspergroupB, key=readspergroupB.get)
        if readspergroupB[maxkeyB] > 0:
            maxB = maxallelepergroup[maxkeyB]
            b = maxkeyB
        else:
            b = "no"
    else:
        b = "no"

    if len(readspergroupClist) > 0:
        maxkeyC = max(readspergroupC, key=readspergroupC.get)
        if readspergroupC[maxkeyC] > 0:
            maxC = maxallelepergroup[maxkeyC]
            c = maxkeyC
        else:
            c = "no"
    else:
        c = "no"

    readspergroupA = sorted(readspergroupA.items(), key=itemgetter(1), reverse=True)
    readspergroupB = sorted(readspergroupB.items(), key=itemgetter(1), reverse=True)
    readspergroupC = sorted(readspergroupC.items(), key=itemgetter(1), reverse=True)

    avec = bvec = cvec = ""

    if medianflag:
        # 2nd iteration: keep every group's top count (winner included) so the
        # outlier test in command.R is stricter.
        for group in maxallelepergroup:
            if group[0] == "A":
                avec += str(readcount[maxallelepergroup[group]])+","
            elif group[0] == "B":
                bvec += str(readcount[maxallelepergroup[group]])+","
            elif group[0] == "C":
                cvec += str(readcount[maxallelepergroup[group]])+","
        # Add the decision thresholds so the winner's distance to the
        # threshold enters the test.
        if a == "no":
            avec += str(medians[0])
        if b == "no":
            bvec += str(medians[1])
        if c == "no":
            cvec += str(medians[2])
        # Loss-of-locus guard: fall back to "0" so the R script always
        # receives a numeric vector.
        if avec == "" or avec == medians[0]:
            avec = acount = "0"
        else:
            acount = str(readcount[maxallelepergroup[maxkeyA]])

        if bvec == "" or bvec == medians[1]:
            bvec = bcount = "0"
        else:
            bcount = str(readcount[maxallelepergroup[maxkeyB]])

        if cvec == "" or cvec == medians[2]:
            cvec = ccount = "0"
        else:
            ccount = str(readcount[maxallelepergroup[maxkeyC]])
    else:
        # 1st iteration: leave the winner out of the vector; this sharpens the
        # outlier probability of the top group.
        for group in maxallelepergroup:
            if group[0] == "A" and group != maxkeyA:
                avec += str(readcount[maxallelepergroup[group]])+","
            elif group[0] == "B" and group != maxkeyB:
                bvec += str(readcount[maxallelepergroup[group]])+","
            elif group[0] == "C" and group != maxkeyC:
                cvec += str(readcount[maxallelepergroup[group]])+","
        # Do NOT add the decision thresholds here.
        if avec == "":
            acount = avec = "0"
        else:
            acount = str(readcount[maxallelepergroup[maxkeyA]])
        if bvec == "":
            bcount = bvec = "0"
        else:
            bcount = str(readcount[maxallelepergroup[maxkeyB]])
        if cvec == "":
            ccount = cvec = "0"
        else:
            ccount = str(readcount[maxallelepergroup[maxkeyC]])

    # call R-script "command.R" to calculate the confidence of the top-scoring allele
    routput = os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+acount+" "+avec+" "+maxkeyA+" "+bcount+" "+bvec+" "+maxkeyB+" "+ccount+" "+cvec+" "+maxkeyC).read()
    parseOutput = routput.split("\n")

    # R prints results as "[1] <value>" lines
    entries = []
    for line in parseOutput:
        if line[0:3] == "[1]":
            entries.append(line[4:])

    # For a lost locus the R value is the complementary probability.
    if a == "no" and entries[1] != "NA":
        entries[1] = str(1-float(entries[1]))
    if b == "no" and entries[3] != "NA":
        entries[3] = str(1-float(entries[3]))
    if c == "no" and entries[5] != "NA":
        entries[5] = str(1-float(entries[5]))

    pred2File(entries,medianA,medianB,medianC,output,a,b,c,log)
|
|
572
|
|
573 #write digital haplotype into file
|
|
#write digital haplotype into file
def pred2File(entries,medianA,medianB,medianC,output,a,b,c,log):
    """Write the per-locus digital haplotype table (winner, median,
    alternative, p-value) to `output` and note the location in `log`."""
    with open(output, 'w') as out:
        out.write("HLA\tHLA1\tmedian-Value\talternative\tp-Value\n")
        out.write("A\t"+a+"\t"+str(medianA)+"\t"+entries[0]+"\t"+entries[1]+"\n")
        out.write("B\t"+b+"\t"+str(medianB)+"\t"+entries[2]+"\t"+entries[3]+"\n")
        out.write("C\t"+c+"\t"+str(medianC)+"\t"+entries[4]+"\t"+entries[5]+"\n")
    log.write("The digital haplotype is written into "+output+"\n")
|
|
593
|
|
594 #open mapping file and all read ids to the list "removeList", which map to one of the three groups in "alleles"
|
|
#open mapping file and all read ids to the list "removeList", which map to one of the three groups in "alleles"
def createRemoveList(runName,map):
    """Return {read id: 1} for every read of iteration 1 that mapped to one
    of the three top-scoring groups of the first digital haplotype."""
    removeList = {}
    alleles_in = runName+".digitalhaplotype1"
    # winning group per locus = column 2 of rows 2-4
    alleles = [linecache.getline(alleles_in, i).split('\t', 2)[1] for i in range(2,5)]

    samhandle = pysam.Samfile(runName+"-iteration1.bam",'rb')
    for read in samhandle.fetch():
        hlapseudoname = samhandle.getrname(read.tid)
        if map[hlapseudoname].split(':')[0] in alleles:
            removeList[read.qname] = 1
    samhandle.close()
    return removeList
|
|
611
|
|
612 #Remove reads that mapped to the three top-scoring alleles and write the remaining reads into two new read files
|
|
#Remove reads that mapped to the three top-scoring alleles and write the remaining reads into two new read files
def removeReads(runName,removeList):
    """Write every aligned read whose id is NOT in `removeList` into the
    2nd-iteration fastq files (one per mate).

    Fixes a resource leak: the original left all four file handles open.
    """
    aligned1 = runName+"_1.aligned"
    aligned2 = runName+"_2.aligned"
    newReadFile1 = runName+"-2nditeration_1.fq"
    newReadFile2 = runName+"-2nditeration_2.fq"

    # mate 1: reads that aligned in iteration 1, minus the removal set
    with open(newReadFile1,"w") as r1, open(aligned1,"r") as aligned_handle1:
        for record in SeqIO.parse(aligned_handle1, "fastq"):
            # exact id as it appears in the mapping file (strip /1 suffix and description)
            illuminaid = record.id.split('/')[0].split(' ')[0]
            if not illuminaid in removeList:
                SeqIO.write(record, r1, "fastq")

    # mate 2: same filtering
    with open(newReadFile2,"w") as r2, open(aligned2,"r") as aligned_handle2:
        for record in SeqIO.parse(aligned_handle2, "fastq"):
            illuminaid = record.id.split('/')[0].split(' ')[0]
            if not illuminaid in removeList:
                SeqIO.write(record, r2, "fastq")
|
|
635
|
|
636 #write the final prediction (both digital haplotypes) to file and stdout
|
|
#write the final prediction (both digital haplotypes) to file
def reportHLAgenotype(output1,output2,finaloutput):
    """Merge the two digital haplotypes into the final genotype table.

    Rows 2-4 of each haplotype file carry one locus each; column 2 is the
    called group, column 5 its p-value, and (second iteration only) column 4
    the alternative used for a homozygous ("hoz(...)") call.

    Fixes a resource leak: the original never closed any of the three files.
    """
    with open(output1,'r') as handle1:
        rows1 = [row[:-1] for row in handle1.readlines()[1:4]]  # strip trailing newline
    with open(output2,'r') as handle2:
        rows2 = [row[:-1] for row in handle2.readlines()[1:4]]

    a1 = rows1[0].split('\t',2)[1]
    a1score = rows1[0].split('\t')[4]
    b1 = rows1[1].split('\t',2)[1]
    b1score = rows1[1].split('\t')[4]
    c1 = rows1[2].split('\t',2)[1]
    c1score = rows1[2].split('\t')[4]

    # second-iteration "no" calls mean homozygous for the alternative allele
    a2 = rows2[0].split('\t',2)[1]
    if a2 == "no":
        a2 = "hoz("+rows2[0].split('\t')[3]+")"
    a2score = rows2[0].split('\t')[4]
    b2 = rows2[1].split('\t',2)[1]
    if b2 == "no":
        b2 = "hoz("+rows2[1].split('\t')[3]+")"
    b2score = rows2[1].split('\t')[4]
    c2 = rows2[2].split('\t',2)[1]
    if c2 == "no":
        c2 = "hoz("+rows2[2].split('\t')[3]+")"
    c2score = rows2[2].split('\t')[4]

    #write complete HLA genotype to file
    with open(finaloutput, 'w') as outfile:
        outfile.write("HLA\tHLA1\tp-Value\tHLA2\tp-Value\n")
        outfile.write("A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score+"\n")
        outfile.write("B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score+"\n")
        outfile.write("C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score+"\n")
|
|
675
|
|
#calculate locus-specific expression
def expression(locus1, locus2, locus3, length1, length2, length3, map, runName, readFile1, finaloutput, gzipped):
    """Append RPKM expression values for the three HLA loci to the report.

    locus1/locus2/locus3 -- locus names (e.g. "A", "B", "C")
    length1/length2/length3 -- reference length of each locus, in bases
    map -- dict from HLA pseudonames (bowtie reference names) to allele
           names; NOTE(review): parameter shadows the builtin "map", kept
           for interface compatibility
    runName -- prefix of the per-run intermediate files
               (<runName>_1.aligned, <runName>-iteration1.bam,
               <runName>.digitalhaplotype1)
    readFile1 -- fastq file of #1 mates, used to count the total reads
    finaloutput -- report file the RPKM lines are appended to
    gzipped -- truthy if readFile1 is gzip-compressed

    Reads that map to more than one of the called loci contribute 1/n to
    each of the n loci they hit.
    """
    outfile = open(finaloutput, 'a')
    aligned1 = runName + "_1.aligned"
    sam = runName + "-iteration1.bam"
    alleles_in = runName + ".digitalhaplotype1"
    # count lines of the fastq input via wc to derive the total read count;
    # gzipped input is streamed through zcat with a bash process substitution
    if gzipped:
        proc = subprocess.Popen(["/bin/bash", "-c", "wc -l <(zcat " + readFile1 + ")"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdoutdata, stderrdata) = proc.communicate()
        numberlines = stdoutdata
    else:
        proc = subprocess.Popen(["/bin/bash", "-c", "wc -l " + readFile1], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdoutdata, stderrdata) = proc.communicate()
        numberlines = stdoutdata
    # 4 fastq lines per read; "//" keeps the original Py2 floor semantics
    # and is portable
    totalreads = int(numberlines.split(" ")[0]) // 4
    # collect the called alleles of all three loci (digitalhaplotype lines
    # 2-4: column 2 = first allele, column 4 = second allele)
    alleles = []
    for i in range(2, 5):
        alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1])
        alleles.append(linecache.getline(alleles_in, i).split('\t')[3])

    #create read dictionary
    # per aligned read id: one hit counter per locus
    reads = {}
    aligned_handle1 = open(aligned1, "r")
    for record in SeqIO.parse(aligned_handle1, "fastq"):
        illuminaid = record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
        reads[illuminaid] = {}
        reads[illuminaid][locus1] = 0
        reads[illuminaid][locus2] = 0
        reads[illuminaid][locus3] = 0
    aligned_handle1.close()  # was leaked before

    samhandle = pysam.Samfile(sam, "rb")
    for read in samhandle.fetch():
        illuminaid = read.qname
        hlapseudoname = samhandle.getrname(read.tid)
        # only count alignments to alleles that are part of the called
        # genotype; "A*01:01:01".split(':')[0] -> "A*01" matches the
        # 2-digit alleles stored above, split('*')[0] yields the locus
        if map[hlapseudoname].split(':')[0] in alleles:
            reads[illuminaid][map[hlapseudoname].split('*')[0]] += 1
    samhandle.close()  # was leaked before

    count = {}
    count[locus1] = 0
    count[locus2] = 0
    count[locus3] = 0
    for key in reads:
        # n = number of loci this read hits; each hit locus gets 1/n
        n = 0
        for locus in reads[key]:
            if reads[key][locus] > 0:
                n += 1
        for locus in reads[key]:
            if reads[key][locus] > 0:
                count[locus] += float(1.0 / float(n))

    outfile.write("\n")
    #Calculate RPKM and print expression values for each locus to stdout
    # RPKM = reads_on_locus / (locus_length_kb * total_reads_in_millions)
    for locus in count:
        if locus == locus1:
            outfile.write(locus + ": " + str(round(float((1000.0 / length1)) * float((1000000.0 / totalreads)) * count[locus], 2)) + " RPKM\n")
        if locus == locus2:
            outfile.write(locus + ": " + str(round(float((1000.0 / length2)) * float((1000000.0 / totalreads)) * count[locus], 2)) + " RPKM\n")
        if locus == locus3:
            outfile.write(locus + ": " + str(round(float((1000.0 / length3)) * float((1000000.0 / totalreads)) * count[locus], 2)) + " RPKM\n")
    outfile.close()  # was leaked before; flushes the appended report lines
|
|
734
|
|
735
|
|
if __name__ == '__main__':
    # command-line entry point: collect all run parameters and hand them to
    # main(), which is defined earlier in this file
    parser = OptionParser(usage="usage: %prog -1 readFile1 -2 readFile2 -r runName -l readlength [-3 <int>]", version="%prog 1.0")
    # NOTE(review): "-z" is a boolean switch but is declared with
    # action="store", so it consumes a value (any non-empty value enables
    # gzip handling); switching to "store_true" would break existing
    # invocations that pass a value -- confirm before changing
    parser.add_option("-z", "--gzipped", action="store", dest="gzipped", help="Select if input reads are in gzip format")
    parser.add_option("-1",
        action="store",
        dest="readFile1",
        help="File name of #1 mates ")
    parser.add_option("-2",
        action="store",
        dest="readFile2",
        help="File name of #2 mates")
    parser.add_option("-r", "--runName",
        action="store",
        dest="runName",
        help="Name of this HLA typing run. Will be used throughout this process as part of the name of the newly created files.")
    parser.add_option("-l", "--length",
        action="store",
        dest="length",
        help="Readlength")
    parser.add_option("-3", "--trim3",
        action="store",
        dest="trim3",
        default="0",
        help="Bowtie option: -3 <int> trims <int> bases from the low quality 3' end of each read. Default: 0")
    parser.add_option("-o", "--output1",
        action="store",
        dest="output1",
        help="Output file 1")
    parser.add_option("-p", "--output2",
        action="store",
        dest="output2",
        help="Output file 2")
    parser.add_option("-g", "--log", action="store", dest="logfile", help="Output Log File")
    parser.add_option("-t", "--threads", type="int", dest="num_threads", help="Define amount of threads to use for Bowtie in this script")

    (options, args) = parser.parse_args()
    # the four core arguments are mandatory
    if not options.readFile1:
        parser.error('File name #1 pair not given.')
    if not options.readFile2:
        parser.error('File name #2 pair not given.')
    if not options.runName:
        parser.error('Run name not given.')
    if not options.length:
        parser.error('Readlength not given.')
    gzipped = options.gzipped
    readFile1 = options.readFile1
    readFile2 = options.readFile2
    runName = options.runName
    output1 = options.output1
    output2 = options.output2
    logfile = options.logfile
    # bowtie indices and fasta references shipped in the "references"
    # folder next to this script (sys.path[0] = script directory)
    bowtiebuildClassI = sys.path[0] + "/references/ClassIWithoutNQex2-3.plus75"
    bowtiebuildClassII = sys.path[0] + "/references/HLA2.ex2.plus75"
    fastaClassI = sys.path[0] + "/references/ClassIWithoutNQex2-3.plus75.fasta"
    fastaClassII = sys.path[0] + "/references/HLA2.ex2.plus75.fasta"
    #as shown in the publication HLA typing with RNA-Seq works best by allowing as less mismatches as necessary
    if int(options.length) <= 50:
        mismatch = 1
    elif int(options.length) > 50 and int(options.length) <= 100:
        mismatch = 2
    else:
        mismatch = 3
    trim3 = str(options.trim3)
    main(runName, readFile1, readFile2, fastaClassI, fastaClassII, bowtiebuildClassI, bowtiebuildClassII, mismatch, trim3, output1, output2, logfile, gzipped, options.num_threads)
    # restore stderr and close the run log; log_stderr/log_file are
    # presumably set up at module level before main() redirects stderr --
    # TODO confirm against the earlier part of the file
    sys.stderr = log_stderr
    log_file.close()
|