Mercurial > repos > sebastian-boegel > seq2hla
diff seq2HLA.py @ 0:913ea6991ee4 draft
Uploaded
author | sebastian-boegel |
---|---|
date | Thu, 20 Dec 2012 10:37:01 -0500 |
parents | |
children | 155b796033b6 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seq2HLA.py Thu Dec 20 10:37:01 2012 -0500 @@ -0,0 +1,801 @@ +########################################################################################################## +#Title: +#seq2HLA - HLA genotyping from RNA-Seq sequence reads +# +#Release: 1.0 +# +#Author: +#Sebastian Boegel, 2012 (c) +#TRON - Translational Oncology at the University Medical Center Mainz, 55131 Mainz, Germany +#University Medical Center of the Johannes Gutenberg-University Mainz, III. Medical Department, Mainz, Germany +# +#Contact: +#boegels@uni-mainz.de +# +#Synopsis: +#We developed an in-silico method "Seq2HLA", written in python and R, which takes standard RNA-Seq sequence reads in fastq format +#as input, uses a bowtie index comprising all HLA alleles and outputs the most likely HLA class I and class II genotypes, +#a p-value for each call, and the expression of each class +# +#Usage: +#python seq2HLA.py -1 <readfile1> -2 <readfile2> -r "<runname>" -l <readlength> [-3 <int>]* +#*optional (Default:0) +# +#Dependencies: +#0.) seq2HLA is a python script, developed with Python 2.6.8 +#1.) bowtie must be reachable by the command "bowtie". seq2HLA was developed and tested with bowtie version 0.12.7 (64-bit). The call to bowtie is invoked with 6 CPUs. You can change that in the function "mapping". +#2.) R must be installed, seq2HLA.py was developed and tested with R version 2.12.2 (2011-02-25) +#3.) Input must be paired-end reads in fastq-format +#4.) Index files must be located in the folder "references". +#5.) Packages: biopython (developed with V1.58), numpy (1.3.0) +########################################################################################################### + +from operator import itemgetter +import sys +import linecache +import ast +import os +import shutil +import tempfile +import subprocess +from Bio import SeqIO +import numpy +import operator +from optparse import OptionParser + +log_stderr = sys.stderr +log_file = open("err.log","w") +sys.stderr = log_file + +import pysam + +#These variables need to be global, as they are filled and used by different modules +readcount={} +readspergroup={} +allelesPerLocus={} + +def main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped): + log = open(logfile,"w") +# mapopt="-a -v"+str(mismatch) + mapopt="-a -v"+str(mismatch) + #call HLA typing for Class I + mainClassI(runName+"-ClassI",readFile1,readFile2,bowtiebuildClassI,fastaClassI,mapopt,trim3,output1,log,gzipped) + #call HLA typing for Class II + mainClassII(runName+"-ClassII",readFile1,readFile2,bowtiebuildClassII,fastaClassII,mapopt,trim3,output2,log,gzipped) + +#---------------Class I------------------------------- +def mainClassI(runName,readFile1,readFile2,bowtiebuild,hla1fasta,mapopt,trim3,finaloutput,log,gzipped): + #-------1st iteration----------------------------------- + log.write("----------HLA class I------------\n") + sam1=runName+"-iteration1.bam" + iteration=1 + log.write("First iteration starts....\nMapping ......\n") + mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped) + medians=[] + medians.extend([0,0,0]) + medianflag=False + #Calculation of first digital haplototype ..... + output1=runName+".digitalhaplotype1" + log.write("Calculation of first digital haplototype .....\n") + map=createRefDict(hla1fasta,"A","B","C") + readMapping(map,sam1) + predictHLA1(sam1,medians,output1,medianflag,log) + log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n") + removeReads(runName,createRemoveList(runName,map)) + + #------2nd iteration------------------------------------------ + log.write("Second iterations starts .....\n Mapping ......\n") + medians=[] + iteration=2 + sam2=runName+"-iteration2.bam" + newReadFile1=runName+"-2nditeration_1.fq" + newReadFile2=runName+"-2nditeration_2.fq" + mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped) + medianfile=runName+".digitalhaplotype1" + for i in range(2,5): + medians.append(linecache.getline(medianfile, i).split('\t', 3)[2]) + medianflag=True + output2=runName+".digitalhaplotype2" + #Calculation of second digital haplototype + log.write("Calculation of second digital haplototype .....\n") + map=createRefDict(hla1fasta,"A","B","C") + readMapping(map,sam2) + predictHLA1(sam2,medians,output2,medianflag,log) + log.write("2nd iteration done.") + reportHLAgenotype(output1,output2,finaloutput) + log.write("Calculation of locus-specific expression ...\n") + expression("A","B","C",694,694,694,map,runName,readFile1,finaloutput,gzipped) + +#--------------Class II---------------------------------------- +def mainClassII(runName,readFile1,readFile2,bowtiebuild,hla2fasta,mapopt,trim3,finaloutput,log,gzipped): + #-------1st iteration---------------------------------------- + log.write("----------HLA class II------------\n") + sam1=runName+"-iteration1.bam" + iteration=1 + log.write("ClassII: first iteration starts....\nMapping ......\n") + mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped) + medians=[] + medians.extend([0,0,0]) + medianflag=False + output1=runName+".digitalhaplotype1" + log.write("ClassII: calculation of first digital haplototype .....\n") + map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1") + readMapping(map,sam1) + predictHLA2(sam1,medians,output1,medianflag,log) + log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n") + removeReads(runName,createRemoveList(runName,map)) + + #------2nd iteration------------------------------------------ + log.write("Second iterations starts .....\n Mapping ......\n") + medians=[] + iteration=2 + sam2=runName+"-iteration2.bam" + newReadFile1=runName+"-2nditeration_1.fq" + newReadFile2=runName+"-2nditeration_2.fq" + mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped) + medianfile=runName+".digitalhaplotype1" + for i in range(2,5): + medians.append(float(linecache.getline(medianfile, i).split('\t', 3)[2])/2.0) + medianflag=True + output2=runName+".digitalhaplotype2" + #Calculation of second digital haplototype + log.write("ClassII: calculation of second digital haplototype .....\n") + map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1") + readMapping(map,sam2) + predictHLA2(sam2,medians,output2,medianflag,log) + log.write("2nd iteration done.\n") + reportHLAgenotype(output1,output2,finaloutput) + log.write("Calculation of locus-specific expression ...\n") + expression("DQA1","DQB1","DRB1",400,421,421,map,runName,readFile1,finaloutput,gzipped) + +#performs the bowtie mapping for the 2 iterations using the given parameters +def mapping(sam,runName,readFile1,readFile2,bowtiebuild,iteration,mapopt,trim3,log,gzipped): + if iteration==1: + if gzipped == "True": + sam = sam.rsplit(".",1)[0] + cmd1 = "bowtie --threads 4 -3 %s --quiet -S %s --al %s.aligned %s -1 <(zcat %s) -2 <(zcat %s) | awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam) + mappingOutput = execBowtie(cmd1) + + cmd2 = "samtools index %s.bam" % sam + createBAMIndex(cmd2) + else: + sam = sam.rsplit(".",1)[0] + cmd1 = "bowtie --threads 4 -3 %s --quiet -S %s --al %s.aligned %s -1 %s -2 %s | awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam) + mappingOutput = execBowtie(cmd1) + + cmd2 = "samtools index %s.bam" % sam + createBAMIndex(cmd2) +# if iteration==2: + else: + + sam = sam.rsplit(".",1)[0] + cmd2 = "bowtie --threads 4 -3 %s --quiet -S %s %s -1 %s -2 %s | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,bowtiebuild,readFile1,readFile2,sam) + mappingOutput = execBowtie(cmd2) + + cmd3 = "samtools index %s.bam" % sam + createBAMIndex(cmd3) + + log.write(mappingOutput+"\n") + +def createBAMIndex(cmd): + proc = subprocess.Popen(["/bin/bash","-c",cmd],stderr=subprocess.PIPE) + (stdoutdata,stderrdata) = proc.communicate() + r = proc.returncode + if r != 0: + raise IOError(stderrdata) + +def execBowtie(cmd): + proc = subprocess.Popen(["/bin/bash","-c",cmd],stdout=subprocess.PIPE,stderr=subprocess.PIPE) + (stdoutdata,stderrdata) = proc.communicate() + mappingOutput = stdoutdata + r = proc.returncode + if r != 0: + raise IOError(stderrdata) + return mappingOutput + +#create dictionary "map", that contains all IMGT/HLA-allele names as keys and the allele name (e.g. A*02:01:01) as value +#dictionary "readcount" is initialized with 0 for each allele +#dictionary "readspergroup" is initialized with 0 for each group (2digit, e.g. A*01) +#dictionary "allelesPerLocus" stores the number of alleles per locus. +def createRefDict(hlafasta,locus1,locus2,locus3): + map={} + for locus in (locus1,locus2,locus3): + allelesPerLocus[locus]=0 + handle=open(hlafasta,'r') + for record in SeqIO.parse(handle, "fasta") : + l=record.description.split(' ') + hlapseudoname=l[0] + hlaallele=l[1] + map[hlapseudoname]=hlaallele + readcount[hlaallele]=0 + readspergroup[hlaallele.split(":")[0]]=0 + allelesPerLocus[hlaallele.split('*')[0]]+=1 + handle.close() + return map + +#Open sam-file and count mappings for each allele +def readMapping(map,sam): + samhandle = pysam.Samfile(sam,"rb") + for ar in samhandle.fetch(): + hla = samhandle.getrname(ar.tid) + readcount[map[hla]]+=1 + samhandle.close() + +#predict HLA class II genotype +def predictHLA2(sam,medians,output,medianflag,log): + + readspergroupDQAlist=[] + readspergroupDQBlist=[] + readspergroupDRBlist=[] + readspergroupDQA={} + readspergroupDQB={} + readspergroupDRB={} + maxallelepergroup={} + + #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. DQA*02") and save + #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele + #ii) the number of reads mapping to the top-scoring allele => readspergroup + for key in readcount: + if readcount[key] > 0: + group=key.split(":")[0] + if readspergroup[group] <=readcount[key]: + readspergroup[group]=readcount[key] + maxallelepergroup[group]=key + + #consider all DQA-,DQB-, and DRB-groups seperately + #readspergroup<DQA|DQB|DRB>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration) + #readspergroup<DQA|DQB|DRB> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. DQA*01) + for key in readspergroup: + keyBegin = key[0:3] + if keyBegin=='DQA': + readspergroupDQAlist.append(readspergroup[key]-float(medians[0])) + readspergroupDQA[key]=readspergroup[key]-float(medians[0]) + if keyBegin=='DQB': + readspergroupDQBlist.append(readspergroup[key]-float(medians[1])) + readspergroupDQB[key]=readspergroup[key]-float(medians[1]) + if keyBegin=='DRB': + readspergroupDRBlist.append(readspergroup[key]-float(medians[2])) + readspergroupDRB[key]=readspergroup[key]-float(medians[2]) + + #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration + medianDQA=numpy.median(readspergroupDQAlist) + medianDQB=numpy.median(readspergroupDQBlist) + medianDRB=numpy.median(readspergroupDRBlist) + a = b = c = "" + + #Determine top-scoring group of the whole locus (DQA,DQBB,DRB) and store it + #maxkey<DQA,DQB,DRB> = group (e.g. DQA*01) with the most reads + #It can be that, e.g. in cancer cells a whole locus is lost. For that reason it is checked if the + #number of mapping reads of the top-scoring group maxkey<DQA,DQB,DRB> is > 0, otherwise "no" ist reported for this locus + if len(readspergroupDQAlist)>0: + maxkeyDQA=max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a)) + if readspergroupDQA[maxkeyDQA] > 0: + maxDQA=maxallelepergroup[maxkeyDQA] + a = max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a)) + else: + a = "no" + else: + a = "no" + + if len(readspergroupDQBlist)>0: + maxkeyDQB=max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a)) + if readspergroupDQB[maxkeyDQB] > 0: + maxDQB=maxallelepergroup[maxkeyDQB] + b = max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a)) + else: + b = "no" + else: + b = "no" + + if len(readspergroupDRBlist)>0: + maxkeyDRB=max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a)) + if readspergroupDRB[maxkeyDRB] > 0: + maxDRB=maxallelepergroup[maxkeyDRB] + c = max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a)) + else: + c = "no" + else: + c = "no" + + readspergroupDQA=sorted(readspergroupDQA.items(), key=itemgetter(1),reverse=True) + readspergroupDQB=sorted(readspergroupDQB.items(), key=itemgetter(1),reverse=True) + readspergroupDRB=sorted(readspergroupDRB.items(), key=itemgetter(1),reverse=True) + + dqavec=dqbvec=drbvec="" + if medianflag: + #in the 2nd iteration: + #1.) DO NOT remove the top-scoring group from the set <dqa,dqb,drb>vec as this is more strict when calculating the probability of the top scoring group being an outlier + #The strings <dqa,dqb,drb>vec are used by the R-script to calculate the probability of the top scoring group being an outlier + for key in maxallelepergroup: + keyBegin = key[0:3] + if keyBegin=="DQA": + dqavec=dqavec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + if keyBegin=="DQB": + dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + if keyBegin=="DRB": + drbvec=drbvec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + #2.) Add the decision thresholds to the sets <dqa,dqb,drb>vec, as this enables measuring the distance of the top-scoring group to this distance + if a=="no": + dqavec+=str(medians[0]) + if b=="no": + dqbvec+=str(medians[1]) + if c=="no": + drbvec+=str(medians[2]) + #3.) In case of, e.g. a loss of whole HLA-locus (DQA/B,DRB), <dqa,dqb,drb>vec only contain medians[<0|1|2>]. + #To avoid errors in the R-Script, set to 0 + if dqavec==str(medians[0]): + dqavec = dqacount="0" + else: + dqacount=str(readcount[maxallelepergroup[maxkeyDQA]]) + if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",": + dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+"," + if dqbvec==str(medians[1]): + dqbvec = dqbcount="0" + else: + dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]]) + if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",": + dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+"," + if drbvec==str(medians[2]): + drbvec = drbcount="0" + else: + drbcount=str(readcount[maxallelepergroup[maxkeyDRB]]) + if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",": + drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+"," + else: + #in the 1st iteration: remove the top-scoring group from the set <dqa,dqb,drb>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier + for key in maxallelepergroup: + keyBegin = key[0:3] + if keyBegin=="DQA": + if key!=maxkeyDQA: + dqavec=dqavec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + if keyBegin=="DQB": + if key!=maxkeyDQB: + dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + if keyBegin=="DRB": + if key!=maxkeyDRB: + drbvec=drbvec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + #2.) DO NOT add the decision thresholds to the sets <dqa,dqb,drb>vec + if dqavec=="": + dqacount=dqavec="0" + else: + dqacount=str(readcount[maxallelepergroup[maxkeyDQA]]) + if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",": + dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+"," + if len(dqavec.split(","))==2: + dqavec=dqavec+"0," + + if dqbvec=="": + dqbcount=dqbvec="0" + else: + dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]]) + if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",": + dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+"," + if len(dqbvec.split(","))==2: + dqbvec=dqbvec+"0," + + if drbvec=="": + drbcount=drbvec="0" + else: + drbcount=str(readcount[maxallelepergroup[maxkeyDRB]]) + if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",": + drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+"," + if len(drbvec.split(","))==2: + drbvec=drbvec+"0," + + #call R-script "commmand.R" to calculate the confidence of the top-scoring allele + routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+dqacount+" "+dqavec+" "+maxkeyDQA+" "+dqbcount+" "+dqbvec+" "+maxkeyDQB+" "+drbcount+" "+drbvec+" "+maxkeyDRB).read() + parseOutput=routput.split("\n") + + entries = [] + for entry in parseOutput: + if entry[0:3]=="[1]": + entries.append(str(entry[4:len(entry)])) + + if a=="no" and entries[1]!="NA": + entries[1]=str(1-float(entries[1])) + if b=="no" and entries[3]!="NA": + entries[3]=str(1-float(entries[3])) + if c=="no" and entries[5]!="NA": + entries[5]=str(1-float(entries[5])) + + pred2File(entries,medianDQA,medianDQB,medianDRB,output,a,b,c,log) + +#predict HLA class I genotype +def predictHLA1(sam,medians,output,medianflag,log): + + readspergroupAlist=[] + readspergroupBlist=[] + readspergroupClist=[] + readspergroupA={} + readspergroupB={} + readspergroupC={} + maxallelepergroup={} + + #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save + #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele + #ii) the number of reads mapping to the top-scoring allele => readspergroup + for key in readcount: + if readcount[key] > 0: + group=key.split(":")[0] + if readspergroup[group] <=readcount[key]: + readspergroup[group]=readcount[key] + maxallelepergroup[group]=key + + + #consider all A-,B-, and C-groups seperately + #readspergroup<A|B|C>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration) + #readspergroup<A|B|C> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. B*27) + for key in readspergroup: + firstKey = key[0] + if key[0]=='A': + readspergroupAlist.append(readspergroup[key]-float(medians[0])) + readspergroupA[key]=readspergroup[key]-float(medians[0]) + if key[0]=='B': + readspergroupBlist.append(readspergroup[key]-float(medians[1])) + readspergroupB[key]=readspergroup[key]-float(medians[1]) + if key[0]=='C': + readspergroupClist.append(readspergroup[key]-float(medians[2])) + readspergroupC[key]=readspergroup[key]-float(medians[2]) + + #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration + medianA=numpy.median(readspergroupAlist) + medianB=numpy.median(readspergroupBlist) + medianC=numpy.median(readspergroupClist) + a = b = c = "" + + #Determine top-scoring group of the whole locus (A,B,C) and store it + #maxkey<A,B,C> = group (e.g. A*02) with the most reads + #It can be that, e.g. in cancer cells A whole locus is lost. For that reason it is checked if the + #number of mapping reads of the top-scoring group maxkey<A,B,C> is > 0, otherwise "no" ist reported for this locus + if len(readspergroupAlist)>0: + maxkeyA=max(readspergroupA,key=lambda a:readspergroupA.get(a)) + if readspergroupA[maxkeyA] > 0: + maxA=maxallelepergroup[maxkeyA] + a = max(readspergroupA,key=lambda a:readspergroupA.get(a)) + else: + a = "no" + else: + a = "no" + + if len(readspergroupBlist)>0: + maxkeyB=max(readspergroupB,key=lambda a:readspergroupB.get(a)) + if readspergroupB[maxkeyB] > 0: + maxB=maxallelepergroup[maxkeyB] + b = max(readspergroupB,key=lambda a:readspergroupB.get(a)) + else: + b = "no" + else: + b = "no" + + if len(readspergroupClist)>0: + maxkeyC=max(readspergroupC,key=lambda a:readspergroupC.get(a)) + if readspergroupC[maxkeyC] > 0: + maxC=maxallelepergroup[maxkeyC] + c = max(readspergroupC,key=lambda a:readspergroupC.get(a)) + else: + c = "no" + else: + c = "no" + + readspergroupA=sorted(readspergroupA.items(), key=itemgetter(1),reverse=True) + readspergroupB=sorted(readspergroupB.items(), key=itemgetter(1),reverse=True) + readspergroupC=sorted(readspergroupC.items(), key=itemgetter(1),reverse=True) + + avec=bvec=cvec="" + + if medianflag: + #in the 2nd iteration: + #1.) DO NOT remove the top-scoring group from the set <a,b,c>vec as this is more strict when calculating the probability of the top scoring group being an outlier + #The strings <a,b,c>vec are used by the R-script to calculate the probability of the top scoring group being an outlier + for key in maxallelepergroup: + if key[0]=="A": + avec=avec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + if key[0]=="B": + bvec=bvec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + if key[0]=="C": + cvec=cvec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + #2.) Add the decision thresholds to the sets <a,b,c>vec, as this enables measuring the distance of the top-scoring group to this distance + if a=="no": + avec+=str(medians[0]) + if b=="no": + bvec+=str(medians[1]) + if c=="no": + cvec+=str(medians[2]) + #3.) In case of, e.g. a loss of whole HLA-locus (A,B,C), <a,b,c>vec only contain median[<0|1|2>]. + #To avoid errors in the R-Script, set to 0 + if avec=="" or avec==medians[0]: + avec = acount="0" + else: + acount=str(readcount[maxallelepergroup[maxkeyA]]) + + if bvec=="" or bvec==medians[1]: + bvec = bcount="0" + else: + bcount=str(readcount[maxallelepergroup[maxkeyB]]) + + if cvec=="" or cvec==medians[2]: + cvec = ccount="0" + else: + ccount=str(readcount[maxallelepergroup[maxkeyC]]) + else: + #in the 1st iteration: remove the top-scoring group from the set <a,b,c>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier + for key in maxallelepergroup: + if key[0]=="A" and key!=maxkeyA: + avec=avec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + if key[0]=="B" and key!=maxkeyB: + bvec=bvec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + if key[0]=="C" and key!=maxkeyC: + cvec=cvec+str(readcount[maxallelepergroup[key]])+"," + #print key+": "+str(readcount[maxallelepergroup[key]]) + #2.) DO NOT add the decision thresholds to the sets <a,b,c>vec + if avec=="": + acount=avec="0" + else: + acount=str(readcount[maxallelepergroup[maxkeyA]]) + if bvec=="": + bcount=bvec="0" + else: + bcount=str(readcount[maxallelepergroup[maxkeyB]]) + if cvec=="": + ccount=cvec="0" + else: + ccount=str(readcount[maxallelepergroup[maxkeyC]]) + + #call R-script "commmand.R" to calculate the confidence of the top-scoring allele + routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+acount+" "+avec+" "+maxkeyA+" "+bcount+" "+bvec+" "+maxkeyB+" "+ccount+" "+cvec+" "+maxkeyC).read() + parseOutput=routput.split("\n") + + entries = [] + for entry in parseOutput: + if entry[0:3]=="[1]": + entries.append(str(entry[4:len(entry)])) + + if a=="no" and entries[1]!="NA": + entries[1]=str(1-float(entries[1])) + if b=="no" and entries[3]!="NA": + entries[3]=str(1-float(entries[3])) + if c=="no" and entries[5]!="NA": + entries[5]=str(1-float(entries[5])) + + pred2File(entries,medianA,medianB,medianC,output,a,b,c,log) + +#write digital haplotype into file +def pred2File(entries,medianA,medianB,medianC,output,a,b,c,log): + out = open(output, 'w') + out.write("HLA\tHLA1\tmedian-Value\talternative\tp-Value\n") + out.write("A\t"+a+"\t") + # write the median-Value for A + out.write(str(medianA)+"\t") + out.write(entries[0]+"\t"+entries[1]+"\n") + + out.write("B\t"+b+"\t") + # write the median-Value for B + out.write(str(medianB)+"\t") + out.write(entries[2]+"\t"+entries[3]+"\n") + + out.write("C\t"+c+"\t") + # write the median-Value for C + out.write(str(medianC)+"\t") + out.write(entries[4]+"\t"+entries[5]+"\n") + out.close() + log.write("The digital haplotype is written into "+output+"\n") + +#open mapping file and all read ids to the list "removeList", which map to one of the three groups in "alleles" +def createRemoveList(runName,map): + removeList={} + alleles = [] + sam=runName+"-iteration1.bam" + alleles_in=runName+".digitalhaplotype1" + for i in range(2,5): + alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1]) + + samhandle=pysam.Samfile(sam,'rb') + for read in samhandle.fetch(): + illuminaid = read.qname + hlapseudoname = samhandle.getrname(read.tid) + if map[hlapseudoname].split(':')[0] in alleles: + removeList[illuminaid]=1 + samhandle.close() + return removeList + +#Remove reads that mapped to the three top-scoring alleles and write the remaining reads into two new read files +def removeReads(runName,removeList): + aligned1=runName+"_1.aligned" + aligned2=runName+"_2.aligned" + newReadFile1=runName+"-2nditeration_1.fq" + newReadFile2=runName+"-2nditeration_2.fq" + #r1 and r2, which are the input of bowtie in the 2nd iteration + r1=open(newReadFile1,"w") + r2=open(newReadFile2,"w") + #open the 2 files, that contain the reads that mapped in the 1st iteration + aligned_handle1=open(aligned1,"r") + aligned_handle2=open(aligned2,"r") + + #One read entry consists of 4 lines: header, seq, "+", qualities. + for record in SeqIO.parse(aligned_handle1, "fastq"): + illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file + if not illuminaid in removeList: + SeqIO.write(record, r1, "fastq") + + for record in SeqIO.parse(aligned_handle2, "fastq"): + illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file + if not illuminaid in removeList: + SeqIO.write(record, r2, "fastq") + +#write the final prediction (both digital haplotypes) to file and stdout +def reportHLAgenotype(output1,output2,finaloutput): + filehandle1=open(output1,'r').readlines()[1:4] + filehandle2=open(output2,'r').readlines()[1:4] + outfile=open(finaloutput, 'w') + + outfile.write("HLA\tHLA1\tp-Value\tHLA2\tp-Value\n") + for i in range(len(filehandle1)): + filehandle1[i]=filehandle1[i][0:-1] + for i in range(len(filehandle2)): + filehandle2[i]=filehandle2[i][0:-1] + + a1 = filehandle1[0].split('\t',2)[1] + a1score = filehandle1[0].split('\t')[4] + b1 = filehandle1[1].split('\t',2)[1] + b1score = filehandle1[1].split('\t')[4] + c1 = filehandle1[2].split('\t',2)[1] + c1score = filehandle1[2].split('\t')[4] + + a2 = filehandle2[0].split('\t',2)[1] + if a2 == "no": + a2 = "hoz("+filehandle2[0].split('\t')[3]+")" + a2score = filehandle2[0].split('\t')[4] + b2 = filehandle2[1].split('\t',2)[1] + if b2 == "no": + b2 = "hoz("+filehandle2[1].split('\t')[3]+")" + b2score = filehandle2[1].split('\t')[4] + c2 = filehandle2[2].split('\t', 2)[1] + if c2 == "no": + c2 = "hoz("+filehandle2[2].split('\t')[3]+")" + c2score = filehandle2[2].split('\t')[4] + #write complete HLA genotype to file + outfile.write("A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score+"\n") + outfile.write("B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score+"\n") + outfile.write("C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score+"\n") + #.. and print it to STDOUT + #print "A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score + #print "B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score + #print "C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score + +#calculate locus-specific expression +def expression(locus1,locus2,locus3,length1,length2,length3,map,runName,readFile1,finaloutput,gzipped): + outfile = open(finaloutput,'a') + aligned1=runName+"_1.aligned" + sam=runName+"-iteration1.bam" + alleles_in=runName+".digitalhaplotype1" + if gzipped: + proc=subprocess.Popen(["/bin/bash","-c","wc -l <(zcat " + readFile1 + ")"],stdout=subprocess.PIPE,stderr=subprocess.PIPE) + (stdoutdata,stderrdata) = proc.communicate() + numberlines = stdoutdata + else: + proc=subprocess.Popen(["/bin/bash","-c","wc -l " + readFile1],stdout=subprocess.PIPE,stderr=subprocess.PIPE) + (stdoutdata,stderrdata) = proc.communicate() + numberlines = stdoutdata + totalreads=int(numberlines.split(" ")[0])/4 + alleles=[] + for i in range(2,5): + alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1]) + alleles.append(linecache.getline(alleles_in, i).split('\t')[3]) + + #create read dictionary + reads={} + aligned_handle1=open(aligned1,"r") + for record in SeqIO.parse(aligned_handle1, "fastq"): + illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file + reads[illuminaid]={} + reads[illuminaid][locus1]=0 + reads[illuminaid][locus2]=0 + reads[illuminaid][locus3]=0 + samhandle = pysam.Samfile(sam,"rb") + for read in samhandle.fetch(): + illuminaid = read.qname + hlapseudoname = samhandle.getrname(read.tid) + if map[hlapseudoname].split(':')[0] in alleles: + reads[illuminaid][map[hlapseudoname].split('*')[0]]+=1 + + count={} + count[locus1]=0 + count[locus2]=0 + count[locus3]=0 + for key in reads: + n=0 + for locus in reads[key]: + if reads[key][locus] > 0: + n+=1 + for locus in reads[key]: + if reads[key][locus] > 0: + count[locus]+=float(1.0/float(n)) + + outfile.write("\n") + #Calculate RPKM and print expression values for each locus to stdout + for locus in count: + if locus==locus1: + outfile.write(locus+": "+str(round(float((1000.0/length1))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n") + if locus==locus2: + outfile.write(locus+": "+str(round(float((1000.0/length2))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n") + if locus==locus3: + outfile.write(locus+": "+str(round(float((1000.0/length3))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n") + + +if __name__ == '__main__': + parser = OptionParser(usage="usage: %prog -1 readFile1 -2 readFile2 -r runName -l readlength [-3 <int>]", version="%prog 1.0") + parser.add_option("-z","--gzipped",action="store",dest="gzipped",help="Select if input reads are in gzip format") + parser.add_option("-1", + action="store", + dest="readFile1", + help="File name of #1 mates ") + parser.add_option("-2", + action="store", + dest="readFile2", + help="File name of #2 mates") + parser.add_option("-r", "--runName", + action="store", + dest="runName", + help="Name of this HLA typing run. Wil be used throughout this process as part of the name of the newly created files.") + parser.add_option("-l", "--length", + action="store", + dest="length", + help="Readlength") + parser.add_option("-3", "--trim3", + action="store", + dest="trim3", + default="0", + help="Bowtie option: -3 <int> trims <int> bases from the low quality 3' end of each read. Default: 0") + parser.add_option("-o", "--output1", + action="store", + dest="output1", + help="Output file 1") + parser.add_option("-p", "--output2", + action="store", + dest="output2", + help="Output file 2") + parser.add_option("-g","--log",action="store",dest="logfile",help="Output Log File") + + + (options, args) = parser.parse_args() + if not options.readFile1: + parser.error('File name #1 pair not given.') + if not options.readFile2: + parser.error('File name #2 pair not given.') + if not options.runName: + parser.error('Run name not given.') + if not options.length: + parser.error('Readlength not given.') + gzipped = options.gzipped + readFile1=options.readFile1 + readFile2=options.readFile2 + runName=options.runName + output1 = options.output1 + output2 = options.output2 + logfile = options.logfile + bowtiebuildClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75" + bowtiebuildClassII=sys.path[0]+"/references/HLA2.ex2.plus75" + fastaClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75.fasta" + fastaClassII=sys.path[0]+"/references/HLA2.ex2.plus75.fasta" + #as shown in the publication HLA typing with RNA-Seq works best by allowing as less mismatches as necessary + if int(options.length)<=50: + mismatch=1 + elif int(options.length)>50 and int(options.length)<=100: + mismatch=2 + else: + mismatch=3 + trim3=str(options.trim3) + main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped) + sys.stderr = log_stderr + log_file.close()