Mercurial > repos > sebastian-boegel > seq2hla

diff seq2HLA.py @ 0:913ea6991ee4 draft
Uploaded
author: sebastian-boegel
date: Thu, 20 Dec 2012 10:37:01 -0500
children: 155b796033b6
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seq2HLA.py	Thu Dec 20 10:37:01 2012 -0500
@@ -0,0 +1,801 @@
+##########################################################################################################
+#Title:
+#seq2HLA - HLA genotyping from RNA-Seq sequence reads
+#
+#Release: 1.0
+#
+#Author:
+#Sebastian Boegel, 2012 (c)
+#TRON - Translational Oncology at the University Medical Center Mainz, 55131 Mainz, Germany
+#University Medical Center of the Johannes Gutenberg-University Mainz, III.  Medical Department, Mainz, Germany
+#
+#Contact:
+#boegels@uni-mainz.de
+#
+#Synopsis:
+#We developed an in-silico method "Seq2HLA", written in python and R, which takes standard RNA-Seq sequence reads in fastq format 
+#as input, uses a bowtie index comprising all HLA alleles and outputs the most likely HLA class I and class II genotypes, 
+#a p-value for each call, and the expression of each class 
+#
+#Usage: 
+#python seq2HLA.py -1 <readfile1> -2 <readfile2> -r "<runname>" -l <readlength> [-3 <int>]*
+#*optional (Default:0)
+#
+#Dependencies:
+#0.) seq2HLA is a python script, developed with Python 2.6.8
+#1.) bowtie must be reachable by the command "bowtie". seq2HLA was developed and tested with bowtie version 0.12.7 (64-bit). The call to bowtie is invoked with 6 CPUs. You can change that in the function "mapping".
+#2.) R must be installed, seq2HLA.py was developed and tested with R version 2.12.2 (2011-02-25)
+#3.) Input must be paired-end reads in fastq-format
+#4.) Index files must be located in the folder "references".
+#5.) Packages: biopython (developed with V1.58), numpy (1.3.0)
+###########################################################################################################
+
+from operator import itemgetter
+import sys
+import linecache
+import ast
+import os
+import shutil
+import tempfile
+import subprocess
+from Bio import SeqIO
+import numpy
+import operator
+from optparse import OptionParser
+
+log_stderr = sys.stderr
+log_file = open("err.log","w")
+sys.stderr = log_file
+
+import pysam
+
+#These variables need to be global, as they are filled and used by different modules
+readcount={}
+readspergroup={}
+allelesPerLocus={}
+
+def main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped):
+	log = open(logfile,"w")
+#	mapopt="-a -v"+str(mismatch)
+	mapopt="-a -v"+str(mismatch)
+	#call HLA typing for Class I
+	mainClassI(runName+"-ClassI",readFile1,readFile2,bowtiebuildClassI,fastaClassI,mapopt,trim3,output1,log,gzipped)
+	#call HLA typing for Class II
+	mainClassII(runName+"-ClassII",readFile1,readFile2,bowtiebuildClassII,fastaClassII,mapopt,trim3,output2,log,gzipped)
+
+#---------------Class I-------------------------------
+def mainClassI(runName,readFile1,readFile2,bowtiebuild,hla1fasta,mapopt,trim3,finaloutput,log,gzipped):
+	#-------1st iteration-----------------------------------
+	log.write("----------HLA class I------------\n")
+	sam1=runName+"-iteration1.bam"
+	iteration=1
+	log.write("First iteration starts....\nMapping ......\n")
+	mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped)
+	medians=[]
+	medians.extend([0,0,0])
+	medianflag=False
+	#Calculation of first digital haplototype .....
+	output1=runName+".digitalhaplotype1"	
+	log.write("Calculation of first digital haplototype .....\n")
+	map=createRefDict(hla1fasta,"A","B","C")
+	readMapping(map,sam1)
+	predictHLA1(sam1,medians,output1,medianflag,log)
+	log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n")
+	removeReads(runName,createRemoveList(runName,map))
+	
+	#------2nd iteration------------------------------------------
+	log.write("Second iterations starts .....\n Mapping ......\n")
+	medians=[]
+	iteration=2
+	sam2=runName+"-iteration2.bam"
+	newReadFile1=runName+"-2nditeration_1.fq"
+	newReadFile2=runName+"-2nditeration_2.fq"
+	mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped)
+	medianfile=runName+".digitalhaplotype1"
+	for i in range(2,5):
+		medians.append(linecache.getline(medianfile, i).split('\t', 3)[2])
+	medianflag=True
+	output2=runName+".digitalhaplotype2"
+	#Calculation of second digital haplototype
+	log.write("Calculation of second digital haplototype .....\n")
+	map=createRefDict(hla1fasta,"A","B","C")
+	readMapping(map,sam2)
+	predictHLA1(sam2,medians,output2,medianflag,log)
+	log.write("2nd iteration done.")
+	reportHLAgenotype(output1,output2,finaloutput)
+	log.write("Calculation of locus-specific expression ...\n")
+	expression("A","B","C",694,694,694,map,runName,readFile1,finaloutput,gzipped)
+
+#--------------Class II----------------------------------------
+def mainClassII(runName,readFile1,readFile2,bowtiebuild,hla2fasta,mapopt,trim3,finaloutput,log,gzipped):
+	#-------1st iteration----------------------------------------
+	log.write("----------HLA class II------------\n")
+	sam1=runName+"-iteration1.bam"
+	iteration=1
+	log.write("ClassII: first iteration starts....\nMapping ......\n")
+	mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped)
+	medians=[]
+	medians.extend([0,0,0])
+	medianflag=False
+	output1=runName+".digitalhaplotype1"	
+	log.write("ClassII: calculation of first digital haplototype .....\n")
+	map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1")
+	readMapping(map,sam1)
+	predictHLA2(sam1,medians,output1,medianflag,log)
+	log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n")
+	removeReads(runName,createRemoveList(runName,map))
+	
+	#------2nd iteration------------------------------------------
+	log.write("Second iterations starts .....\n Mapping ......\n")
+	medians=[]
+	iteration=2
+	sam2=runName+"-iteration2.bam"
+	newReadFile1=runName+"-2nditeration_1.fq"
+	newReadFile2=runName+"-2nditeration_2.fq"
+	mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped)
+	medianfile=runName+".digitalhaplotype1"
+	for i in range(2,5):
+		medians.append(float(linecache.getline(medianfile, i).split('\t', 3)[2])/2.0)
+	medianflag=True
+	output2=runName+".digitalhaplotype2"
+	#Calculation of second digital haplototype
+	log.write("ClassII: calculation of second digital haplototype .....\n")
+	map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1")
+	readMapping(map,sam2)
+	predictHLA2(sam2,medians,output2,medianflag,log)
+	log.write("2nd iteration done.\n")
+	reportHLAgenotype(output1,output2,finaloutput)
+	log.write("Calculation of locus-specific expression ...\n")
+	expression("DQA1","DQB1","DRB1",400,421,421,map,runName,readFile1,finaloutput,gzipped)
+
+#performs the bowtie mapping for the 2 iterations using the given parameters
+def mapping(sam,runName,readFile1,readFile2,bowtiebuild,iteration,mapopt,trim3,log,gzipped):
+	if iteration==1:
+		if gzipped == "True":
+			sam = sam.rsplit(".",1)[0]
+			cmd1 = "bowtie --threads 4 -3 %s --quiet -S %s --al %s.aligned %s -1 <(zcat %s) -2 <(zcat %s) |  awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam)
+			mappingOutput = execBowtie(cmd1)
+			
+			cmd2 = "samtools index %s.bam" % sam
+			createBAMIndex(cmd2)
+		else:
+			sam = sam.rsplit(".",1)[0]
+			cmd1 = "bowtie --threads 4 -3 %s --quiet -S %s --al %s.aligned %s -1 %s -2 %s |  awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam)
+			mappingOutput = execBowtie(cmd1)
+
+			cmd2 = "samtools index %s.bam" % sam
+			createBAMIndex(cmd2)
+#	if iteration==2:
+	else:
+		
+		sam = sam.rsplit(".",1)[0]
+		cmd2 = "bowtie --threads 4 -3 %s --quiet -S %s %s -1 %s -2 %s | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,bowtiebuild,readFile1,readFile2,sam)
+		mappingOutput = execBowtie(cmd2)
+
+		cmd3 = "samtools index %s.bam" % sam
+		createBAMIndex(cmd3)
+
+	log.write(mappingOutput+"\n")
+
+def createBAMIndex(cmd):
+	proc = subprocess.Popen(["/bin/bash","-c",cmd],stderr=subprocess.PIPE)
+	(stdoutdata,stderrdata) = proc.communicate()
+	r = proc.returncode
+	if r != 0:
+		raise IOError(stderrdata)
+
+def execBowtie(cmd):
+	proc = subprocess.Popen(["/bin/bash","-c",cmd],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+	(stdoutdata,stderrdata) = proc.communicate()
+	mappingOutput = stdoutdata
+	r = proc.returncode
+	if r != 0:
+		raise IOError(stderrdata)
+	return mappingOutput
+
+#create dictionary "map", that contains all IMGT/HLA-allele names as keys and the allele name (e.g. A*02:01:01) as value
+#dictionary "readcount" is initialized with 0 for each allele
+#dictionary "readspergroup" is initialized with 0 for each group (2digit, e.g. A*01)
+#dictionary "allelesPerLocus" stores the number of alleles per locus.
+def createRefDict(hlafasta,locus1,locus2,locus3):
+	map={}
+	for locus in (locus1,locus2,locus3):
+		allelesPerLocus[locus]=0
+	handle=open(hlafasta,'r')
+	for record in SeqIO.parse(handle, "fasta") :
+		l=record.description.split(' ')
+		hlapseudoname=l[0]
+		hlaallele=l[1]
+		map[hlapseudoname]=hlaallele
+		readcount[hlaallele]=0
+		readspergroup[hlaallele.split(":")[0]]=0
+		allelesPerLocus[hlaallele.split('*')[0]]+=1
+	handle.close()
+	return map
+
+#Open sam-file and count mappings for each allele 
+def readMapping(map,sam):
+	samhandle = pysam.Samfile(sam,"rb")
+	for ar in samhandle.fetch():
+		hla = samhandle.getrname(ar.tid)
+		readcount[map[hla]]+=1
+	samhandle.close()
+
+#predict HLA class II genotype
+def predictHLA2(sam,medians,output,medianflag,log):
+
+	readspergroupDQAlist=[]
+	readspergroupDQBlist=[]
+	readspergroupDRBlist=[]
+	readspergroupDQA={}
+	readspergroupDQB={}
+	readspergroupDRB={}
+	maxallelepergroup={}
+	
+	#for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. DQA*02") and save
+	#i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele
+	#ii) the number of reads mapping to the top-scoring allele => readspergroup
+	for key in readcount:
+		if readcount[key] > 0:
+			group=key.split(":")[0]
+			if readspergroup[group] <=readcount[key]:
+				readspergroup[group]=readcount[key]
+				maxallelepergroup[group]=key
+
+	#consider all DQA-,DQB-, and DRB-groups seperately
+	#readspergroup<DQA|DQB|DRB>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration)
+	#readspergroup<DQA|DQB|DRB> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. DQA*01)
+	for key in readspergroup:
+		keyBegin = key[0:3]
+		if keyBegin=='DQA':
+			readspergroupDQAlist.append(readspergroup[key]-float(medians[0]))
+			readspergroupDQA[key]=readspergroup[key]-float(medians[0])
+		if keyBegin=='DQB':
+			readspergroupDQBlist.append(readspergroup[key]-float(medians[1]))
+			readspergroupDQB[key]=readspergroup[key]-float(medians[1])
+		if keyBegin=='DRB':
+			readspergroupDRBlist.append(readspergroup[key]-float(medians[2]))
+			readspergroupDRB[key]=readspergroup[key]-float(medians[2])
+				
+	#compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration 
+	medianDQA=numpy.median(readspergroupDQAlist)
+	medianDQB=numpy.median(readspergroupDQBlist)
+	medianDRB=numpy.median(readspergroupDRBlist)
+	a = b = c = ""
+	
+	#Determine top-scoring group of the whole locus (DQA,DQBB,DRB) and store it
+	#maxkey<DQA,DQB,DRB> = group (e.g. DQA*01) with the most reads
+	#It can be that, e.g. in cancer cells a whole locus is lost. For that reason it is checked if the 
+	#number of mapping reads of the top-scoring group maxkey<DQA,DQB,DRB> is > 0, otherwise "no" ist reported for this locus
+	if len(readspergroupDQAlist)>0:
+		maxkeyDQA=max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a))
+		if readspergroupDQA[maxkeyDQA] > 0:
+			maxDQA=maxallelepergroup[maxkeyDQA]
+			a = max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a))
+		else:
+			a = "no"
+	else:
+		a = "no"
+
+	if len(readspergroupDQBlist)>0:
+		maxkeyDQB=max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a))
+		if readspergroupDQB[maxkeyDQB] > 0:
+			maxDQB=maxallelepergroup[maxkeyDQB]
+			b = max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a))
+		else:
+			b = "no"
+	else:
+		b = "no"
+
+	if len(readspergroupDRBlist)>0:
+		maxkeyDRB=max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a))
+		if readspergroupDRB[maxkeyDRB] > 0:
+			maxDRB=maxallelepergroup[maxkeyDRB]
+			c = max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a))
+		else:
+			c = "no"
+	else:
+		c = "no"
+		
+	readspergroupDQA=sorted(readspergroupDQA.items(), key=itemgetter(1),reverse=True)
+	readspergroupDQB=sorted(readspergroupDQB.items(), key=itemgetter(1),reverse=True)
+	readspergroupDRB=sorted(readspergroupDRB.items(), key=itemgetter(1),reverse=True)
+
+	dqavec=dqbvec=drbvec=""
+	if medianflag:
+		#in the 2nd iteration: 
+		#1.) DO NOT remove the top-scoring group from the set <dqa,dqb,drb>vec as this is more strict when calculating the probability of the top scoring group being an outlier
+		#The strings <dqa,dqb,drb>vec are used by the R-script to calculate the probability of the top scoring group being an outlier
+		for key in maxallelepergroup:
+			keyBegin = key[0:3]
+			if keyBegin=="DQA":
+				dqavec=dqavec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+			if keyBegin=="DQB":
+				dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+			if keyBegin=="DRB":
+				drbvec=drbvec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+		#2.) Add the decision thresholds to the sets <dqa,dqb,drb>vec, as this enables measuring the distance of the top-scoring group to this distance
+		if a=="no":
+			dqavec+=str(medians[0])
+		if b=="no":
+			dqbvec+=str(medians[1])
+		if c=="no":
+			drbvec+=str(medians[2])
+		#3.) In case of, e.g. a loss of whole HLA-locus (DQA/B,DRB), <dqa,dqb,drb>vec only contain medians[<0|1|2>].
+		#To avoid errors in the R-Script, set to 0
+		if dqavec==str(medians[0]):
+			dqavec = dqacount="0"
+		else:
+			dqacount=str(readcount[maxallelepergroup[maxkeyDQA]])
+			if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",":
+				dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+","
+		if dqbvec==str(medians[1]):
+			dqbvec = dqbcount="0"
+		else:
+			dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]])
+			if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",":
+				dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+","
+		if drbvec==str(medians[2]):
+			drbvec = drbcount="0"
+		else:
+			drbcount=str(readcount[maxallelepergroup[maxkeyDRB]])
+			if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",":
+				drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+","
+	else:
+		#in the 1st iteration: remove the top-scoring group from the set <dqa,dqb,drb>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier
+		for key in maxallelepergroup:
+			keyBegin = key[0:3]
+			if keyBegin=="DQA":
+				if key!=maxkeyDQA:
+					dqavec=dqavec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+			if keyBegin=="DQB":
+				if key!=maxkeyDQB:
+					dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])	
+			if keyBegin=="DRB":
+				if key!=maxkeyDRB:
+					drbvec=drbvec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+		#2.) DO NOT add the decision thresholds to the sets <dqa,dqb,drb>vec
+		if dqavec=="":
+			dqacount=dqavec="0"
+		else:
+			dqacount=str(readcount[maxallelepergroup[maxkeyDQA]])
+			if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",":
+				dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+","
+			if len(dqavec.split(","))==2:
+				dqavec=dqavec+"0,"
+
+		if dqbvec=="":
+			dqbcount=dqbvec="0"
+		else:
+			dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]])
+			if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",":
+				dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+","
+			if len(dqbvec.split(","))==2:
+				dqbvec=dqbvec+"0,"
+
+		if drbvec=="":
+			drbcount=drbvec="0"
+		else:
+			drbcount=str(readcount[maxallelepergroup[maxkeyDRB]])
+			if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",":
+				drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+","
+			if len(drbvec.split(","))==2:
+				drbvec=drbvec+"0,"
+
+	#call R-script "commmand.R" to calculate the confidence of the top-scoring allele
+	routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+dqacount+" "+dqavec+" "+maxkeyDQA+" "+dqbcount+" "+dqbvec+" "+maxkeyDQB+" "+drbcount+" "+drbvec+" "+maxkeyDRB).read()
+	parseOutput=routput.split("\n")
+
+	entries = []
+	for entry in parseOutput:
+		if entry[0:3]=="[1]":
+			entries.append(str(entry[4:len(entry)]))
+
+	if a=="no" and entries[1]!="NA":
+		entries[1]=str(1-float(entries[1]))
+	if b=="no" and entries[3]!="NA":
+		entries[3]=str(1-float(entries[3]))
+	if c=="no" and entries[5]!="NA":
+		entries[5]=str(1-float(entries[5]))
+	
+	pred2File(entries,medianDQA,medianDQB,medianDRB,output,a,b,c,log)
+	
+#predict HLA class I genotype 
+def predictHLA1(sam,medians,output,medianflag,log): 
+	
+	readspergroupAlist=[]
+	readspergroupBlist=[]
+	readspergroupClist=[]
+	readspergroupA={}
+	readspergroupB={}
+	readspergroupC={}
+	maxallelepergroup={}
+	
+	#for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save
+	#i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele
+	#ii) the number of reads mapping to the top-scoring allele => readspergroup
+	for key in readcount:
+		if readcount[key] > 0:
+			group=key.split(":")[0]
+			if readspergroup[group] <=readcount[key]:
+				readspergroup[group]=readcount[key]
+				maxallelepergroup[group]=key
+
+	
+	#consider all A-,B-, and C-groups seperately
+	#readspergroup<A|B|C>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration)
+	#readspergroup<A|B|C> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. B*27)
+	for key in readspergroup:
+		firstKey = key[0]
+		if key[0]=='A':
+			readspergroupAlist.append(readspergroup[key]-float(medians[0]))
+			readspergroupA[key]=readspergroup[key]-float(medians[0])
+		if key[0]=='B':
+			readspergroupBlist.append(readspergroup[key]-float(medians[1]))
+			readspergroupB[key]=readspergroup[key]-float(medians[1])
+		if key[0]=='C':
+			readspergroupClist.append(readspergroup[key]-float(medians[2]))
+			readspergroupC[key]=readspergroup[key]-float(medians[2])
+
+	#compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration 
+	medianA=numpy.median(readspergroupAlist)
+	medianB=numpy.median(readspergroupBlist)
+	medianC=numpy.median(readspergroupClist)
+	a = b = c = ""
+	
+	#Determine top-scoring group of the whole locus (A,B,C) and store it
+	#maxkey<A,B,C> = group (e.g. A*02) with the most reads
+	#It can be that, e.g. in cancer cells A whole locus is lost. For that reason it is checked if the 
+	#number of mapping reads of the top-scoring group maxkey<A,B,C> is > 0, otherwise "no" ist reported for this locus
+	if len(readspergroupAlist)>0:
+		maxkeyA=max(readspergroupA,key=lambda a:readspergroupA.get(a))
+		if readspergroupA[maxkeyA] > 0:
+			maxA=maxallelepergroup[maxkeyA]
+			a = max(readspergroupA,key=lambda a:readspergroupA.get(a))
+		else:
+			a = "no"
+	else:
+		a = "no"
+
+	if len(readspergroupBlist)>0:
+		maxkeyB=max(readspergroupB,key=lambda a:readspergroupB.get(a))
+		if readspergroupB[maxkeyB] > 0:
+			maxB=maxallelepergroup[maxkeyB]
+			b = max(readspergroupB,key=lambda a:readspergroupB.get(a))
+		else:
+			b = "no"
+	else:
+		b = "no"
+
+	if len(readspergroupClist)>0:
+		maxkeyC=max(readspergroupC,key=lambda a:readspergroupC.get(a))
+		if readspergroupC[maxkeyC] > 0:
+			maxC=maxallelepergroup[maxkeyC]
+			c = max(readspergroupC,key=lambda a:readspergroupC.get(a))
+		else:
+			c = "no"
+	else:
+		c = "no"
+
+	readspergroupA=sorted(readspergroupA.items(), key=itemgetter(1),reverse=True)
+	readspergroupB=sorted(readspergroupB.items(), key=itemgetter(1),reverse=True)
+	readspergroupC=sorted(readspergroupC.items(), key=itemgetter(1),reverse=True)
+
+	avec=bvec=cvec=""
+
+	if medianflag:
+		#in the 2nd iteration: 
+		#1.) DO NOT remove the top-scoring group from the set <a,b,c>vec as this is more strict when calculating the probability of the top scoring group being an outlier
+		#The strings <a,b,c>vec are used by the R-script to calculate the probability of the top scoring group being an outlier
+		for key in maxallelepergroup:
+			if key[0]=="A":
+				avec=avec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+			if key[0]=="B":
+				bvec=bvec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+			if key[0]=="C":
+				cvec=cvec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+		#2.) Add the decision thresholds to the sets <a,b,c>vec, as this enables measuring the distance of the top-scoring group to this distance
+		if a=="no":
+			avec+=str(medians[0])
+		if b=="no":
+			bvec+=str(medians[1])
+		if c=="no":
+			cvec+=str(medians[2])
+		#3.) In case of, e.g. a loss of whole HLA-locus (A,B,C), <a,b,c>vec only contain median[<0|1|2>].
+		#To avoid errors in the R-Script, set to 0
+		if avec=="" or avec==medians[0]:
+			avec = acount="0"
+		else:
+			acount=str(readcount[maxallelepergroup[maxkeyA]])
+
+		if bvec=="" or bvec==medians[1]:
+			bvec = bcount="0"
+		else:
+			bcount=str(readcount[maxallelepergroup[maxkeyB]])
+
+		if cvec=="" or cvec==medians[2]:
+			cvec = ccount="0"
+		else:
+			ccount=str(readcount[maxallelepergroup[maxkeyC]])
+	else:
+		#in the 1st iteration: remove the top-scoring group from the set <a,b,c>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier
+		for key in maxallelepergroup:
+			if key[0]=="A" and key!=maxkeyA:
+				avec=avec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+			if key[0]=="B" and key!=maxkeyB:
+				bvec=bvec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])	
+			if key[0]=="C" and key!=maxkeyC:
+				cvec=cvec+str(readcount[maxallelepergroup[key]])+","
+				#print key+": "+str(readcount[maxallelepergroup[key]])
+		#2.) DO NOT add the decision thresholds to the sets <a,b,c>vec
+		if avec=="":
+			acount=avec="0"
+		else:
+			acount=str(readcount[maxallelepergroup[maxkeyA]])
+		if bvec=="":
+			bcount=bvec="0"
+		else:
+			bcount=str(readcount[maxallelepergroup[maxkeyB]])
+		if cvec=="":
+			ccount=cvec="0"
+		else:
+			ccount=str(readcount[maxallelepergroup[maxkeyC]])
+		
+	#call R-script "commmand.R" to calculate the confidence of the top-scoring allele
+	routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+acount+" "+avec+" "+maxkeyA+" "+bcount+" "+bvec+" "+maxkeyB+" "+ccount+" "+cvec+" "+maxkeyC).read()
+	parseOutput=routput.split("\n")
+
+	entries = []
+	for entry in parseOutput:
+		if entry[0:3]=="[1]":
+			entries.append(str(entry[4:len(entry)]))
+
+	if a=="no" and entries[1]!="NA":
+		entries[1]=str(1-float(entries[1]))
+	if b=="no" and entries[3]!="NA":
+		entries[3]=str(1-float(entries[3]))
+	if c=="no" and entries[5]!="NA":
+		entries[5]=str(1-float(entries[5]))
+	
+	pred2File(entries,medianA,medianB,medianC,output,a,b,c,log)
+	
+#write digital haplotype into file
+def pred2File(entries,medianA,medianB,medianC,output,a,b,c,log):
+	out = open(output, 'w')
+	out.write("HLA\tHLA1\tmedian-Value\talternative\tp-Value\n")
+	out.write("A\t"+a+"\t")
+	# write the median-Value for A
+	out.write(str(medianA)+"\t")
+	out.write(entries[0]+"\t"+entries[1]+"\n")
+
+	out.write("B\t"+b+"\t")
+	# write the median-Value for B
+	out.write(str(medianB)+"\t")
+	out.write(entries[2]+"\t"+entries[3]+"\n")
+
+	out.write("C\t"+c+"\t")
+	# write the median-Value for C
+	out.write(str(medianC)+"\t")
+	out.write(entries[4]+"\t"+entries[5]+"\n")
+	out.close()
+	log.write("The digital haplotype is written into "+output+"\n")
+
+#open mapping file and all read ids to the list "removeList", which map to one of the three groups in "alleles"
+def createRemoveList(runName,map):
+	removeList={}
+	alleles = []
+	sam=runName+"-iteration1.bam"
+	alleles_in=runName+".digitalhaplotype1"
+	for i in range(2,5):
+		alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1])
+	
+	samhandle=pysam.Samfile(sam,'rb')
+	for read in samhandle.fetch():
+		illuminaid = read.qname
+		hlapseudoname = samhandle.getrname(read.tid)
+		if map[hlapseudoname].split(':')[0] in alleles:
+			removeList[illuminaid]=1
+	samhandle.close()
+	return removeList
+
+#Remove reads that mapped to the three top-scoring alleles and write the remaining reads into two new read files
+def removeReads(runName,removeList):
+	aligned1=runName+"_1.aligned"
+	aligned2=runName+"_2.aligned"
+	newReadFile1=runName+"-2nditeration_1.fq"
+	newReadFile2=runName+"-2nditeration_2.fq"
+	#r1 and r2, which are the input of bowtie in the 2nd iteration
+	r1=open(newReadFile1,"w")
+	r2=open(newReadFile2,"w")
+	#open the 2 files, that contain the reads that mapped in the 1st iteration
+	aligned_handle1=open(aligned1,"r")
+	aligned_handle2=open(aligned2,"r")
+	
+	#One read entry consists of 4 lines: header, seq, "+", qualities.
+	for record in SeqIO.parse(aligned_handle1, "fastq"):
+		illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
+		if not illuminaid in removeList:
+			SeqIO.write(record, r1, "fastq")
+	
+	for record in SeqIO.parse(aligned_handle2, "fastq"):
+		illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
+		if not illuminaid in removeList:
+			SeqIO.write(record, r2, "fastq")
+
+#write the final prediction (both digital haplotypes) to file and stdout
+def reportHLAgenotype(output1,output2,finaloutput):
+	filehandle1=open(output1,'r').readlines()[1:4]
+	filehandle2=open(output2,'r').readlines()[1:4]
+	outfile=open(finaloutput, 'w')
+
+	outfile.write("HLA\tHLA1\tp-Value\tHLA2\tp-Value\n")
+	for i in range(len(filehandle1)):
+		filehandle1[i]=filehandle1[i][0:-1]
+	for i in range(len(filehandle2)):
+		filehandle2[i]=filehandle2[i][0:-1]
+
+	a1 = filehandle1[0].split('\t',2)[1]
+	a1score = filehandle1[0].split('\t')[4]
+	b1 = filehandle1[1].split('\t',2)[1]
+	b1score = filehandle1[1].split('\t')[4]
+	c1 = filehandle1[2].split('\t',2)[1]
+	c1score = filehandle1[2].split('\t')[4]
+
+	a2 = filehandle2[0].split('\t',2)[1]
+	if a2 == "no":
+		a2 = "hoz("+filehandle2[0].split('\t')[3]+")"
+	a2score = filehandle2[0].split('\t')[4]
+	b2 = filehandle2[1].split('\t',2)[1]
+	if b2 == "no":
+		b2 = "hoz("+filehandle2[1].split('\t')[3]+")"
+	b2score = filehandle2[1].split('\t')[4]
+	c2 = filehandle2[2].split('\t', 2)[1]
+	if c2 == "no":
+		c2 = "hoz("+filehandle2[2].split('\t')[3]+")"
+	c2score = filehandle2[2].split('\t')[4]
+	#write complete HLA genotype to file
+	outfile.write("A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score+"\n")
+	outfile.write("B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score+"\n")
+	outfile.write("C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score+"\n")
+	#.. and print it to STDOUT
+	#print "A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score
+	#print "B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score
+	#print "C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score
+
+#calculate locus-specific expression
+def expression(locus1,locus2,locus3,length1,length2,length3,map,runName,readFile1,finaloutput,gzipped):
+	outfile = open(finaloutput,'a')
+	aligned1=runName+"_1.aligned"
+	sam=runName+"-iteration1.bam"
+	alleles_in=runName+".digitalhaplotype1"
+	if gzipped:
+		proc=subprocess.Popen(["/bin/bash","-c","wc -l <(zcat " + readFile1 + ")"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+		(stdoutdata,stderrdata) = proc.communicate()
+		numberlines = stdoutdata
+	else:
+		proc=subprocess.Popen(["/bin/bash","-c","wc -l " + readFile1],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+		(stdoutdata,stderrdata) = proc.communicate()
+		numberlines = stdoutdata
+	totalreads=int(numberlines.split(" ")[0])/4
+	alleles=[]
+	for i in range(2,5):
+		alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1])
+		alleles.append(linecache.getline(alleles_in, i).split('\t')[3])
+	
+	#create read dictionary
+	reads={}
+	aligned_handle1=open(aligned1,"r")
+	for record in SeqIO.parse(aligned_handle1, "fastq"):
+		illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
+		reads[illuminaid]={}
+		reads[illuminaid][locus1]=0
+		reads[illuminaid][locus2]=0
+		reads[illuminaid][locus3]=0
+	samhandle = pysam.Samfile(sam,"rb")
+	for read in samhandle.fetch():
+		illuminaid = read.qname
+		hlapseudoname = samhandle.getrname(read.tid)
+		if map[hlapseudoname].split(':')[0] in alleles:
+			reads[illuminaid][map[hlapseudoname].split('*')[0]]+=1
+
+	count={}
+	count[locus1]=0
+	count[locus2]=0
+	count[locus3]=0
+	for key in reads:
+		n=0
+		for locus in reads[key]:
+			if reads[key][locus] > 0:
+				n+=1
+		for locus in reads[key]:
+			if reads[key][locus] > 0:
+				count[locus]+=float(1.0/float(n))
+	
+	outfile.write("\n")
+	#Calculate RPKM and print expression values for each locus to stdout
+	for locus in count:
+		if locus==locus1:
+			outfile.write(locus+": "+str(round(float((1000.0/length1))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n")
+		if locus==locus2:
+			outfile.write(locus+": "+str(round(float((1000.0/length2))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n")
+		if locus==locus3:
+			outfile.write(locus+": "+str(round(float((1000.0/length3))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n")
+
+
+if __name__ == '__main__':
+	parser = OptionParser(usage="usage: %prog -1 readFile1 -2 readFile2 -r runName -l readlength [-3 <int>]", version="%prog 1.0")
+	parser.add_option("-z","--gzipped",action="store",dest="gzipped",help="Select if input reads are in gzip format")
+	parser.add_option("-1",
+			action="store", 
+			dest="readFile1",
+			help="File name of #1 mates ")
+	parser.add_option("-2",
+			action="store", 
+			dest="readFile2",
+			help="File name of #2 mates")
+	parser.add_option("-r", "--runName",
+			action="store", 
+			dest="runName",
+			help="Name of this HLA typing run. Wil be used throughout this process as part of the name of the newly created files.")
+	parser.add_option("-l", "--length",
+			action="store",
+			dest="length",
+			help="Readlength")
+	parser.add_option("-3", "--trim3",
+			action="store",
+			dest="trim3",
+			default="0",
+			help="Bowtie option: -3 <int> trims <int> bases from the low quality 3' end of each read. Default: 0")
+	parser.add_option("-o", "--output1",
+			action="store",
+			dest="output1",
+			help="Output file 1")
+	parser.add_option("-p", "--output2",
+			action="store",
+			dest="output2",
+			help="Output file 2")
+	parser.add_option("-g","--log",action="store",dest="logfile",help="Output Log File")
+
+
+	(options, args) = parser.parse_args()
+	if not options.readFile1:   
+		parser.error('File name #1 pair not given.')
+	if not options.readFile2:   
+		parser.error('File name #2 pair not given.')
+	if not options.runName:   
+		parser.error('Run name not given.')
+	if not options.length:   
+		parser.error('Readlength not given.')
+	gzipped = options.gzipped
+	readFile1=options.readFile1
+	readFile2=options.readFile2
+	runName=options.runName
+	output1 = options.output1
+	output2 = options.output2
+	logfile = options.logfile
+	bowtiebuildClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75"
+	bowtiebuildClassII=sys.path[0]+"/references/HLA2.ex2.plus75"
+	fastaClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75.fasta"
+	fastaClassII=sys.path[0]+"/references/HLA2.ex2.plus75.fasta"
+	#as shown in the publication HLA typing with RNA-Seq works best by allowing as less mismatches as necessary
+	if int(options.length)<=50:
+		mismatch=1
+	elif int(options.length)>50 and int(options.length)<=100:
+		mismatch=2
+	else:
+		mismatch=3
+	trim3=str(options.trim3)
+	main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped)
+	sys.stderr = log_stderr
+	log_file.close()
author	sebastian-boegel
date	Thu, 20 Dec 2012 10:37:01 -0500
parents
children	155b796033b6