Mercurial > repos > sebastian-boegel > seq2hla
view seq2HLA.py @ 1:155b796033b6 draft default tip
Uploaded
author | sebastian-boegel |
---|---|
date | Fri, 21 Dec 2012 03:46:15 -0500 |
parents | 913ea6991ee4 |
children |
line wrap: on
line source
########################################################################################################## #Title: #seq2HLA - HLA genotyping from RNA-Seq sequence reads # #Release: 1.0 # #Author: #Sebastian Boegel, 2012 (c) #TRON - Translational Oncology at the University Medical Center Mainz, 55131 Mainz, Germany #University Medical Center of the Johannes Gutenberg-University Mainz, III. Medical Department, Mainz, Germany # #Contact: #boegels@uni-mainz.de # #Synopsis: #We developed an in-silico method "Seq2HLA", written in python and R, which takes standard RNA-Seq sequence reads in fastq format #as input, uses a bowtie index comprising all HLA alleles and outputs the most likely HLA class I and class II genotypes, #a p-value for each call, and the expression of each class # #Usage: #python seq2HLA.py -1 <readfile1> -2 <readfile2> -r "<runname>" -l <readlength> [-3 <int>]* #*optional (Default:0) # #Dependencies: #0.) seq2HLA is a python script, developed with Python 2.6.8 #1.) bowtie must be reachable by the command "bowtie". seq2HLA was developed and tested with bowtie version 0.12.7 (64-bit). The call to bowtie is invoked with 6 CPUs. You can change that in the function "mapping". #2.) R must be installed, seq2HLA.py was developed and tested with R version 2.12.2 (2011-02-25) #3.) Input must be paired-end reads in fastq-format #4.) Index files must be located in the folder "references". #5.) Packages: biopython (developed with V1.58), numpy (1.3.0) ########################################################################################################### from operator import itemgetter import sys import linecache import ast import os import shutil import tempfile import subprocess from Bio import SeqIO import numpy import operator from optparse import OptionParser log_stderr = sys.stderr log_file = open("err.log","w") sys.stderr = log_file import pysam #These variables need to be global, as they are filled and used by different modules readcount={} readspergroup={} allelesPerLocus={} def main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped,num_threads): log = open(logfile,"w") # mapopt="-a -v"+str(mismatch) mapopt="-a -v"+str(mismatch) #call HLA typing for Class I mainClassI(runName+"-ClassI",readFile1,readFile2,bowtiebuildClassI,fastaClassI,mapopt,trim3,output1,log,gzipped,num_threads) #call HLA typing for Class II mainClassII(runName+"-ClassII",readFile1,readFile2,bowtiebuildClassII,fastaClassII,mapopt,trim3,output2,log,gzipped,num_threads) #---------------Class I------------------------------- def mainClassI(runName,readFile1,readFile2,bowtiebuild,hla1fasta,mapopt,trim3,finaloutput,log,gzipped,num_threads): #-------1st iteration----------------------------------- log.write("----------HLA class I------------\n") sam1=runName+"-iteration1.bam" iteration=1 log.write("First iteration starts....\nMapping ......\n") mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped,num_threads) medians=[] medians.extend([0,0,0]) medianflag=False #Calculation of first digital haplototype ..... output1=runName+".digitalhaplotype1" log.write("Calculation of first digital haplototype .....\n") map=createRefDict(hla1fasta,"A","B","C") readMapping(map,sam1) predictHLA1(sam1,medians,output1,medianflag,log) log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n") removeReads(runName,createRemoveList(runName,map)) #------2nd iteration------------------------------------------ log.write("Second iterations starts .....\n Mapping ......\n") medians=[] iteration=2 sam2=runName+"-iteration2.bam" newReadFile1=runName+"-2nditeration_1.fq" newReadFile2=runName+"-2nditeration_2.fq" mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped,num_threads) medianfile=runName+".digitalhaplotype1" for i in range(2,5): medians.append(linecache.getline(medianfile, i).split('\t', 3)[2]) medianflag=True output2=runName+".digitalhaplotype2" #Calculation of second digital haplototype log.write("Calculation of second digital haplototype .....\n") map=createRefDict(hla1fasta,"A","B","C") readMapping(map,sam2) predictHLA1(sam2,medians,output2,medianflag,log) log.write("2nd iteration done.") reportHLAgenotype(output1,output2,finaloutput) log.write("Calculation of locus-specific expression ...\n") expression("A","B","C",694,694,694,map,runName,readFile1,finaloutput,gzipped) #--------------Class II---------------------------------------- def mainClassII(runName,readFile1,readFile2,bowtiebuild,hla2fasta,mapopt,trim3,finaloutput,log,gzipped,num_threads): #-------1st iteration---------------------------------------- log.write("----------HLA class II------------\n") sam1=runName+"-iteration1.bam" iteration=1 log.write("ClassII: first iteration starts....\nMapping ......\n") mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped,num_threads) medians=[] medians.extend([0,0,0]) medianflag=False output1=runName+".digitalhaplotype1" log.write("ClassII: calculation of first digital haplototype .....\n") map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1") readMapping(map,sam1) predictHLA2(sam1,medians,output1,medianflag,log) log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n") removeReads(runName,createRemoveList(runName,map)) #------2nd iteration------------------------------------------ log.write("Second iterations starts .....\n Mapping ......\n") medians=[] iteration=2 sam2=runName+"-iteration2.bam" newReadFile1=runName+"-2nditeration_1.fq" newReadFile2=runName+"-2nditeration_2.fq" mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped,num_threads) medianfile=runName+".digitalhaplotype1" for i in range(2,5): medians.append(float(linecache.getline(medianfile, i).split('\t', 3)[2])/2.0) medianflag=True output2=runName+".digitalhaplotype2" #Calculation of second digital haplototype log.write("ClassII: calculation of second digital haplototype .....\n") map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1") readMapping(map,sam2) predictHLA2(sam2,medians,output2,medianflag,log) log.write("2nd iteration done.\n") reportHLAgenotype(output1,output2,finaloutput) log.write("Calculation of locus-specific expression ...\n") expression("DQA1","DQB1","DRB1",400,421,421,map,runName,readFile1,finaloutput,gzipped) #performs the bowtie mapping for the 2 iterations using the given parameters def mapping(sam,runName,readFile1,readFile2,bowtiebuild,iteration,mapopt,trim3,log,gzipped,num_threads): if iteration==1: if gzipped == "True": sam = sam.rsplit(".",1)[0] cmd1 = "bowtie --threads "+num_threads+" -3 %s --quiet -S %s --al %s.aligned %s -1 <(zcat %s) -2 <(zcat %s) | awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam) mappingOutput = execBowtie(cmd1) cmd2 = "samtools index %s.bam" % sam createBAMIndex(cmd2) else: sam = sam.rsplit(".",1)[0] cmd1 = "bowtie --threads "+num_threads+" -3 %s --quiet -S %s --al %s.aligned %s -1 %s -2 %s | awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam) mappingOutput = execBowtie(cmd1) cmd2 = "samtools index %s.bam" % sam createBAMIndex(cmd2) # if iteration==2: else: sam = sam.rsplit(".",1)[0] cmd2 = "bowtie --threads "+num_threads+" -3 %s --quiet -S %s %s -1 %s -2 %s | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,bowtiebuild,readFile1,readFile2,sam) mappingOutput = execBowtie(cmd2) cmd3 = "samtools index %s.bam" % sam createBAMIndex(cmd3) log.write(mappingOutput+"\n") def createBAMIndex(cmd): proc = subprocess.Popen(["/bin/bash","-c",cmd],stderr=subprocess.PIPE) (stdoutdata,stderrdata) = proc.communicate() r = proc.returncode if r != 0: raise IOError(stderrdata) def execBowtie(cmd): proc = subprocess.Popen(["/bin/bash","-c",cmd],stdout=subprocess.PIPE,stderr=subprocess.PIPE) (stdoutdata,stderrdata) = proc.communicate() mappingOutput = stdoutdata r = proc.returncode if r != 0: raise IOError(stderrdata) return mappingOutput #create dictionary "map", that contains all IMGT/HLA-allele names as keys and the allele name (e.g. A*02:01:01) as value #dictionary "readcount" is initialized with 0 for each allele #dictionary "readspergroup" is initialized with 0 for each group (2digit, e.g. A*01) #dictionary "allelesPerLocus" stores the number of alleles per locus. def createRefDict(hlafasta,locus1,locus2,locus3): map={} for locus in (locus1,locus2,locus3): allelesPerLocus[locus]=0 handle=open(hlafasta,'r') for record in SeqIO.parse(handle, "fasta") : l=record.description.split(' ') hlapseudoname=l[0] hlaallele=l[1] map[hlapseudoname]=hlaallele readcount[hlaallele]=0 readspergroup[hlaallele.split(":")[0]]=0 allelesPerLocus[hlaallele.split('*')[0]]+=1 handle.close() return map #Open sam-file and count mappings for each allele def readMapping(map,sam): samhandle = pysam.Samfile(sam,"rb") for ar in samhandle.fetch(): hla = samhandle.getrname(ar.tid) readcount[map[hla]]+=1 samhandle.close() #predict HLA class II genotype def predictHLA2(sam,medians,output,medianflag,log): readspergroupDQAlist=[] readspergroupDQBlist=[] readspergroupDRBlist=[] readspergroupDQA={} readspergroupDQB={} readspergroupDRB={} maxallelepergroup={} #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. DQA*02") and save #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele #ii) the number of reads mapping to the top-scoring allele => readspergroup for key in readcount: if readcount[key] > 0: group=key.split(":")[0] if readspergroup[group] <=readcount[key]: readspergroup[group]=readcount[key] maxallelepergroup[group]=key #consider all DQA-,DQB-, and DRB-groups seperately #readspergroup<DQA|DQB|DRB>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration) #readspergroup<DQA|DQB|DRB> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. DQA*01) for key in readspergroup: keyBegin = key[0:3] if keyBegin=='DQA': readspergroupDQAlist.append(readspergroup[key]-float(medians[0])) readspergroupDQA[key]=readspergroup[key]-float(medians[0]) if keyBegin=='DQB': readspergroupDQBlist.append(readspergroup[key]-float(medians[1])) readspergroupDQB[key]=readspergroup[key]-float(medians[1]) if keyBegin=='DRB': readspergroupDRBlist.append(readspergroup[key]-float(medians[2])) readspergroupDRB[key]=readspergroup[key]-float(medians[2]) #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration medianDQA=numpy.median(readspergroupDQAlist) medianDQB=numpy.median(readspergroupDQBlist) medianDRB=numpy.median(readspergroupDRBlist) a = b = c = "" #Determine top-scoring group of the whole locus (DQA,DQBB,DRB) and store it #maxkey<DQA,DQB,DRB> = group (e.g. DQA*01) with the most reads #It can be that, e.g. in cancer cells a whole locus is lost. For that reason it is checked if the #number of mapping reads of the top-scoring group maxkey<DQA,DQB,DRB> is > 0, otherwise "no" ist reported for this locus if len(readspergroupDQAlist)>0: maxkeyDQA=max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a)) if readspergroupDQA[maxkeyDQA] > 0: maxDQA=maxallelepergroup[maxkeyDQA] a = max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a)) else: a = "no" else: a = "no" if len(readspergroupDQBlist)>0: maxkeyDQB=max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a)) if readspergroupDQB[maxkeyDQB] > 0: maxDQB=maxallelepergroup[maxkeyDQB] b = max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a)) else: b = "no" else: b = "no" if len(readspergroupDRBlist)>0: maxkeyDRB=max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a)) if readspergroupDRB[maxkeyDRB] > 0: maxDRB=maxallelepergroup[maxkeyDRB] c = max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a)) else: c = "no" else: c = "no" readspergroupDQA=sorted(readspergroupDQA.items(), key=itemgetter(1),reverse=True) readspergroupDQB=sorted(readspergroupDQB.items(), key=itemgetter(1),reverse=True) readspergroupDRB=sorted(readspergroupDRB.items(), key=itemgetter(1),reverse=True) dqavec=dqbvec=drbvec="" if medianflag: #in the 2nd iteration: #1.) DO NOT remove the top-scoring group from the set <dqa,dqb,drb>vec as this is more strict when calculating the probability of the top scoring group being an outlier #The strings <dqa,dqb,drb>vec are used by the R-script to calculate the probability of the top scoring group being an outlier for key in maxallelepergroup: keyBegin = key[0:3] if keyBegin=="DQA": dqavec=dqavec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) if keyBegin=="DQB": dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) if keyBegin=="DRB": drbvec=drbvec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) #2.) Add the decision thresholds to the sets <dqa,dqb,drb>vec, as this enables measuring the distance of the top-scoring group to this distance if a=="no": dqavec+=str(medians[0]) if b=="no": dqbvec+=str(medians[1]) if c=="no": drbvec+=str(medians[2]) #3.) In case of, e.g. a loss of whole HLA-locus (DQA/B,DRB), <dqa,dqb,drb>vec only contain medians[<0|1|2>]. #To avoid errors in the R-Script, set to 0 if dqavec==str(medians[0]): dqavec = dqacount="0" else: dqacount=str(readcount[maxallelepergroup[maxkeyDQA]]) if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",": dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+"," if dqbvec==str(medians[1]): dqbvec = dqbcount="0" else: dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]]) if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",": dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+"," if drbvec==str(medians[2]): drbvec = drbcount="0" else: drbcount=str(readcount[maxallelepergroup[maxkeyDRB]]) if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",": drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+"," else: #in the 1st iteration: remove the top-scoring group from the set <dqa,dqb,drb>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier for key in maxallelepergroup: keyBegin = key[0:3] if keyBegin=="DQA": if key!=maxkeyDQA: dqavec=dqavec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) if keyBegin=="DQB": if key!=maxkeyDQB: dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) if keyBegin=="DRB": if key!=maxkeyDRB: drbvec=drbvec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) #2.) DO NOT add the decision thresholds to the sets <dqa,dqb,drb>vec if dqavec=="": dqacount=dqavec="0" else: dqacount=str(readcount[maxallelepergroup[maxkeyDQA]]) if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",": dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+"," if len(dqavec.split(","))==2: dqavec=dqavec+"0," if dqbvec=="": dqbcount=dqbvec="0" else: dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]]) if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",": dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+"," if len(dqbvec.split(","))==2: dqbvec=dqbvec+"0," if drbvec=="": drbcount=drbvec="0" else: drbcount=str(readcount[maxallelepergroup[maxkeyDRB]]) if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",": drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+"," if len(drbvec.split(","))==2: drbvec=drbvec+"0," #call R-script "commmand.R" to calculate the confidence of the top-scoring allele routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+dqacount+" "+dqavec+" "+maxkeyDQA+" "+dqbcount+" "+dqbvec+" "+maxkeyDQB+" "+drbcount+" "+drbvec+" "+maxkeyDRB).read() parseOutput=routput.split("\n") entries = [] for entry in parseOutput: if entry[0:3]=="[1]": entries.append(str(entry[4:len(entry)])) if a=="no" and entries[1]!="NA": entries[1]=str(1-float(entries[1])) if b=="no" and entries[3]!="NA": entries[3]=str(1-float(entries[3])) if c=="no" and entries[5]!="NA": entries[5]=str(1-float(entries[5])) pred2File(entries,medianDQA,medianDQB,medianDRB,output,a,b,c,log) #predict HLA class I genotype def predictHLA1(sam,medians,output,medianflag,log): readspergroupAlist=[] readspergroupBlist=[] readspergroupClist=[] readspergroupA={} readspergroupB={} readspergroupC={} maxallelepergroup={} #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele #ii) the number of reads mapping to the top-scoring allele => readspergroup for key in readcount: if readcount[key] > 0: group=key.split(":")[0] if readspergroup[group] <=readcount[key]: readspergroup[group]=readcount[key] maxallelepergroup[group]=key #consider all A-,B-, and C-groups seperately #readspergroup<A|B|C>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration) #readspergroup<A|B|C> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. B*27) for key in readspergroup: firstKey = key[0] if key[0]=='A': readspergroupAlist.append(readspergroup[key]-float(medians[0])) readspergroupA[key]=readspergroup[key]-float(medians[0]) if key[0]=='B': readspergroupBlist.append(readspergroup[key]-float(medians[1])) readspergroupB[key]=readspergroup[key]-float(medians[1]) if key[0]=='C': readspergroupClist.append(readspergroup[key]-float(medians[2])) readspergroupC[key]=readspergroup[key]-float(medians[2]) #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration medianA=numpy.median(readspergroupAlist) medianB=numpy.median(readspergroupBlist) medianC=numpy.median(readspergroupClist) a = b = c = "" #Determine top-scoring group of the whole locus (A,B,C) and store it #maxkey<A,B,C> = group (e.g. A*02) with the most reads #It can be that, e.g. in cancer cells A whole locus is lost. For that reason it is checked if the #number of mapping reads of the top-scoring group maxkey<A,B,C> is > 0, otherwise "no" ist reported for this locus if len(readspergroupAlist)>0: maxkeyA=max(readspergroupA,key=lambda a:readspergroupA.get(a)) if readspergroupA[maxkeyA] > 0: maxA=maxallelepergroup[maxkeyA] a = max(readspergroupA,key=lambda a:readspergroupA.get(a)) else: a = "no" else: a = "no" if len(readspergroupBlist)>0: maxkeyB=max(readspergroupB,key=lambda a:readspergroupB.get(a)) if readspergroupB[maxkeyB] > 0: maxB=maxallelepergroup[maxkeyB] b = max(readspergroupB,key=lambda a:readspergroupB.get(a)) else: b = "no" else: b = "no" if len(readspergroupClist)>0: maxkeyC=max(readspergroupC,key=lambda a:readspergroupC.get(a)) if readspergroupC[maxkeyC] > 0: maxC=maxallelepergroup[maxkeyC] c = max(readspergroupC,key=lambda a:readspergroupC.get(a)) else: c = "no" else: c = "no" readspergroupA=sorted(readspergroupA.items(), key=itemgetter(1),reverse=True) readspergroupB=sorted(readspergroupB.items(), key=itemgetter(1),reverse=True) readspergroupC=sorted(readspergroupC.items(), key=itemgetter(1),reverse=True) avec=bvec=cvec="" if medianflag: #in the 2nd iteration: #1.) DO NOT remove the top-scoring group from the set <a,b,c>vec as this is more strict when calculating the probability of the top scoring group being an outlier #The strings <a,b,c>vec are used by the R-script to calculate the probability of the top scoring group being an outlier for key in maxallelepergroup: if key[0]=="A": avec=avec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) if key[0]=="B": bvec=bvec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) if key[0]=="C": cvec=cvec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) #2.) Add the decision thresholds to the sets <a,b,c>vec, as this enables measuring the distance of the top-scoring group to this distance if a=="no": avec+=str(medians[0]) if b=="no": bvec+=str(medians[1]) if c=="no": cvec+=str(medians[2]) #3.) In case of, e.g. a loss of whole HLA-locus (A,B,C), <a,b,c>vec only contain median[<0|1|2>]. #To avoid errors in the R-Script, set to 0 if avec=="" or avec==medians[0]: avec = acount="0" else: acount=str(readcount[maxallelepergroup[maxkeyA]]) if bvec=="" or bvec==medians[1]: bvec = bcount="0" else: bcount=str(readcount[maxallelepergroup[maxkeyB]]) if cvec=="" or cvec==medians[2]: cvec = ccount="0" else: ccount=str(readcount[maxallelepergroup[maxkeyC]]) else: #in the 1st iteration: remove the top-scoring group from the set <a,b,c>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier for key in maxallelepergroup: if key[0]=="A" and key!=maxkeyA: avec=avec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) if key[0]=="B" and key!=maxkeyB: bvec=bvec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) if key[0]=="C" and key!=maxkeyC: cvec=cvec+str(readcount[maxallelepergroup[key]])+"," #print key+": "+str(readcount[maxallelepergroup[key]]) #2.) DO NOT add the decision thresholds to the sets <a,b,c>vec if avec=="": acount=avec="0" else: acount=str(readcount[maxallelepergroup[maxkeyA]]) if bvec=="": bcount=bvec="0" else: bcount=str(readcount[maxallelepergroup[maxkeyB]]) if cvec=="": ccount=cvec="0" else: ccount=str(readcount[maxallelepergroup[maxkeyC]]) #call R-script "commmand.R" to calculate the confidence of the top-scoring allele routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+acount+" "+avec+" "+maxkeyA+" "+bcount+" "+bvec+" "+maxkeyB+" "+ccount+" "+cvec+" "+maxkeyC).read() parseOutput=routput.split("\n") entries = [] for entry in parseOutput: if entry[0:3]=="[1]": entries.append(str(entry[4:len(entry)])) if a=="no" and entries[1]!="NA": entries[1]=str(1-float(entries[1])) if b=="no" and entries[3]!="NA": entries[3]=str(1-float(entries[3])) if c=="no" and entries[5]!="NA": entries[5]=str(1-float(entries[5])) pred2File(entries,medianA,medianB,medianC,output,a,b,c,log) #write digital haplotype into file def pred2File(entries,medianA,medianB,medianC,output,a,b,c,log): out = open(output, 'w') out.write("HLA\tHLA1\tmedian-Value\talternative\tp-Value\n") out.write("A\t"+a+"\t") # write the median-Value for A out.write(str(medianA)+"\t") out.write(entries[0]+"\t"+entries[1]+"\n") out.write("B\t"+b+"\t") # write the median-Value for B out.write(str(medianB)+"\t") out.write(entries[2]+"\t"+entries[3]+"\n") out.write("C\t"+c+"\t") # write the median-Value for C out.write(str(medianC)+"\t") out.write(entries[4]+"\t"+entries[5]+"\n") out.close() log.write("The digital haplotype is written into "+output+"\n") #open mapping file and all read ids to the list "removeList", which map to one of the three groups in "alleles" def createRemoveList(runName,map): removeList={} alleles = [] sam=runName+"-iteration1.bam" alleles_in=runName+".digitalhaplotype1" for i in range(2,5): alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1]) samhandle=pysam.Samfile(sam,'rb') for read in samhandle.fetch(): illuminaid = read.qname hlapseudoname = samhandle.getrname(read.tid) if map[hlapseudoname].split(':')[0] in alleles: removeList[illuminaid]=1 samhandle.close() return removeList #Remove reads that mapped to the three top-scoring alleles and write the remaining reads into two new read files def removeReads(runName,removeList): aligned1=runName+"_1.aligned" aligned2=runName+"_2.aligned" newReadFile1=runName+"-2nditeration_1.fq" newReadFile2=runName+"-2nditeration_2.fq" #r1 and r2, which are the input of bowtie in the 2nd iteration r1=open(newReadFile1,"w") r2=open(newReadFile2,"w") #open the 2 files, that contain the reads that mapped in the 1st iteration aligned_handle1=open(aligned1,"r") aligned_handle2=open(aligned2,"r") #One read entry consists of 4 lines: header, seq, "+", qualities. for record in SeqIO.parse(aligned_handle1, "fastq"): illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file if not illuminaid in removeList: SeqIO.write(record, r1, "fastq") for record in SeqIO.parse(aligned_handle2, "fastq"): illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file if not illuminaid in removeList: SeqIO.write(record, r2, "fastq") #write the final prediction (both digital haplotypes) to file and stdout def reportHLAgenotype(output1,output2,finaloutput): filehandle1=open(output1,'r').readlines()[1:4] filehandle2=open(output2,'r').readlines()[1:4] outfile=open(finaloutput, 'w') outfile.write("HLA\tHLA1\tp-Value\tHLA2\tp-Value\n") for i in range(len(filehandle1)): filehandle1[i]=filehandle1[i][0:-1] for i in range(len(filehandle2)): filehandle2[i]=filehandle2[i][0:-1] a1 = filehandle1[0].split('\t',2)[1] a1score = filehandle1[0].split('\t')[4] b1 = filehandle1[1].split('\t',2)[1] b1score = filehandle1[1].split('\t')[4] c1 = filehandle1[2].split('\t',2)[1] c1score = filehandle1[2].split('\t')[4] a2 = filehandle2[0].split('\t',2)[1] if a2 == "no": a2 = "hoz("+filehandle2[0].split('\t')[3]+")" a2score = filehandle2[0].split('\t')[4] b2 = filehandle2[1].split('\t',2)[1] if b2 == "no": b2 = "hoz("+filehandle2[1].split('\t')[3]+")" b2score = filehandle2[1].split('\t')[4] c2 = filehandle2[2].split('\t', 2)[1] if c2 == "no": c2 = "hoz("+filehandle2[2].split('\t')[3]+")" c2score = filehandle2[2].split('\t')[4] #write complete HLA genotype to file outfile.write("A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score+"\n") outfile.write("B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score+"\n") outfile.write("C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score+"\n") #.. and print it to STDOUT #print "A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score #print "B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score #print "C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score #calculate locus-specific expression def expression(locus1,locus2,locus3,length1,length2,length3,map,runName,readFile1,finaloutput,gzipped): outfile = open(finaloutput,'a') aligned1=runName+"_1.aligned" sam=runName+"-iteration1.bam" alleles_in=runName+".digitalhaplotype1" if gzipped: proc=subprocess.Popen(["/bin/bash","-c","wc -l <(zcat " + readFile1 + ")"],stdout=subprocess.PIPE,stderr=subprocess.PIPE) (stdoutdata,stderrdata) = proc.communicate() numberlines = stdoutdata else: proc=subprocess.Popen(["/bin/bash","-c","wc -l " + readFile1],stdout=subprocess.PIPE,stderr=subprocess.PIPE) (stdoutdata,stderrdata) = proc.communicate() numberlines = stdoutdata totalreads=int(numberlines.split(" ")[0])/4 alleles=[] for i in range(2,5): alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1]) alleles.append(linecache.getline(alleles_in, i).split('\t')[3]) #create read dictionary reads={} aligned_handle1=open(aligned1,"r") for record in SeqIO.parse(aligned_handle1, "fastq"): illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file reads[illuminaid]={} reads[illuminaid][locus1]=0 reads[illuminaid][locus2]=0 reads[illuminaid][locus3]=0 samhandle = pysam.Samfile(sam,"rb") for read in samhandle.fetch(): illuminaid = read.qname hlapseudoname = samhandle.getrname(read.tid) if map[hlapseudoname].split(':')[0] in alleles: reads[illuminaid][map[hlapseudoname].split('*')[0]]+=1 count={} count[locus1]=0 count[locus2]=0 count[locus3]=0 for key in reads: n=0 for locus in reads[key]: if reads[key][locus] > 0: n+=1 for locus in reads[key]: if reads[key][locus] > 0: count[locus]+=float(1.0/float(n)) outfile.write("\n") #Calculate RPKM and print expression values for each locus to stdout for locus in count: if locus==locus1: outfile.write(locus+": "+str(round(float((1000.0/length1))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n") if locus==locus2: outfile.write(locus+": "+str(round(float((1000.0/length2))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n") if locus==locus3: outfile.write(locus+": "+str(round(float((1000.0/length3))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n") if __name__ == '__main__': parser = OptionParser(usage="usage: %prog -1 readFile1 -2 readFile2 -r runName -l readlength [-3 <int>]", version="%prog 1.0") parser.add_option("-z","--gzipped",action="store",dest="gzipped",help="Select if input reads are in gzip format") parser.add_option("-1", action="store", dest="readFile1", help="File name of #1 mates ") parser.add_option("-2", action="store", dest="readFile2", help="File name of #2 mates") parser.add_option("-r", "--runName", action="store", dest="runName", help="Name of this HLA typing run. Wil be used throughout this process as part of the name of the newly created files.") parser.add_option("-l", "--length", action="store", dest="length", help="Readlength") parser.add_option("-3", "--trim3", action="store", dest="trim3", default="0", help="Bowtie option: -3 <int> trims <int> bases from the low quality 3' end of each read. Default: 0") parser.add_option("-o", "--output1", action="store", dest="output1", help="Output file 1") parser.add_option("-p", "--output2", action="store", dest="output2", help="Output file 2") parser.add_option("-g","--log",action="store",dest="logfile",help="Output Log File") parser.add_option("-t","--threads",type="int",dest="num_threads",help="Define amount of threads to use for Bowtie in this script") (options, args) = parser.parse_args() if not options.readFile1: parser.error('File name #1 pair not given.') if not options.readFile2: parser.error('File name #2 pair not given.') if not options.runName: parser.error('Run name not given.') if not options.length: parser.error('Readlength not given.') gzipped = options.gzipped readFile1=options.readFile1 readFile2=options.readFile2 runName=options.runName output1 = options.output1 output2 = options.output2 logfile = options.logfile bowtiebuildClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75" bowtiebuildClassII=sys.path[0]+"/references/HLA2.ex2.plus75" fastaClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75.fasta" fastaClassII=sys.path[0]+"/references/HLA2.ex2.plus75.fasta" #as shown in the publication HLA typing with RNA-Seq works best by allowing as less mismatches as necessary if int(options.length)<=50: mismatch=1 elif int(options.length)>50 and int(options.length)<=100: mismatch=2 else: mismatch=3 trim3=str(options.trim3) main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped,options.num_threads) sys.stderr = log_stderr log_file.close()