Mercurial > repos > sebastian-boegel > seq2hla

##########################################################################################################
#Title:
#seq2HLA - HLA genotyping from RNA-Seq sequence reads
#
#Release: 1.0
#
#Author:
#Sebastian Boegel, 2012 (c)
#TRON - Translational Oncology at the University Medical Center Mainz, 55131 Mainz, Germany
#University Medical Center of the Johannes Gutenberg-University Mainz, III.  Medical Department, Mainz, Germany
#
#Contact:
#boegels@uni-mainz.de
#
#Synopsis:
#We developed an in-silico method "Seq2HLA", written in python and R, which takes standard RNA-Seq sequence reads in fastq format
#as input, uses a bowtie index comprising all HLA alleles and outputs the most likely HLA class I and class II genotypes,
#a p-value for each call, and the expression of each class
#
#Usage:
#python seq2HLA.py -1 <readfile1> -2 <readfile2> -r "<runname>" -l <readlength> [-3 <int>]*
#*optional (Default:0)
#
#Dependencies:
#0.) seq2HLA is a python script, developed with Python 2.6.8
#1.) bowtie must be reachable by the command "bowtie". seq2HLA was developed and tested with bowtie version 0.12.7 (64-bit). The call to bowtie is invoked with 6 CPUs. You can change that in the function "mapping".
#2.) R must be installed, seq2HLA.py was developed and tested with R version 2.12.2 (2011-02-25)
#3.) Input must be paired-end reads in fastq-format
#4.) Index files must be located in the folder "references".
#5.) Packages: biopython (developed with V1.58), numpy (1.3.0)
###########################################################################################################

from operator import itemgetter
import sys
import linecache
import ast
import os
import shutil
import tempfile
import subprocess
from Bio import SeqIO
import numpy
import operator
from optparse import OptionParser

log_stderr = sys.stderr
log_file = open("err.log","w")
sys.stderr = log_file

import pysam

#These variables need to be global, as they are filled and used by different modules
readcount={}
readspergroup={}
allelesPerLocus={}

def main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped,num_threads):
	log = open(logfile,"w")
#	mapopt="-a -v"+str(mismatch)
	mapopt="-a -v"+str(mismatch)
	#call HLA typing for Class I
	mainClassI(runName+"-ClassI",readFile1,readFile2,bowtiebuildClassI,fastaClassI,mapopt,trim3,output1,log,gzipped,num_threads)
	#call HLA typing for Class II
	mainClassII(runName+"-ClassII",readFile1,readFile2,bowtiebuildClassII,fastaClassII,mapopt,trim3,output2,log,gzipped,num_threads)

#---------------Class I-------------------------------
def mainClassI(runName,readFile1,readFile2,bowtiebuild,hla1fasta,mapopt,trim3,finaloutput,log,gzipped,num_threads):
	#-------1st iteration-----------------------------------
	log.write("----------HLA class I------------\n")
	sam1=runName+"-iteration1.bam"
	iteration=1
	log.write("First iteration starts....\nMapping ......\n")
	mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped,num_threads)
	medians=[]
	medians.extend([0,0,0])
	medianflag=False
	#Calculation of first digital haplototype .....
	output1=runName+".digitalhaplotype1"
	log.write("Calculation of first digital haplototype .....\n")
	map=createRefDict(hla1fasta,"A","B","C")
	readMapping(map,sam1)
	predictHLA1(sam1,medians,output1,medianflag,log)
	log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n")
	removeReads(runName,createRemoveList(runName,map))

	#------2nd iteration------------------------------------------
	log.write("Second iterations starts .....\n Mapping ......\n")
	medians=[]
	iteration=2
	sam2=runName+"-iteration2.bam"
	newReadFile1=runName+"-2nditeration_1.fq"
	newReadFile2=runName+"-2nditeration_2.fq"
	mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped,num_threads)
	medianfile=runName+".digitalhaplotype1"
	for i in range(2,5):
		medians.append(linecache.getline(medianfile, i).split('\t', 3)[2])
	medianflag=True
	output2=runName+".digitalhaplotype2"
	#Calculation of second digital haplototype
	log.write("Calculation of second digital haplototype .....\n")
	map=createRefDict(hla1fasta,"A","B","C")
	readMapping(map,sam2)
	predictHLA1(sam2,medians,output2,medianflag,log)
	log.write("2nd iteration done.")
	reportHLAgenotype(output1,output2,finaloutput)
	log.write("Calculation of locus-specific expression ...\n")
	expression("A","B","C",694,694,694,map,runName,readFile1,finaloutput,gzipped)

#--------------Class II----------------------------------------
def mainClassII(runName,readFile1,readFile2,bowtiebuild,hla2fasta,mapopt,trim3,finaloutput,log,gzipped,num_threads):
	#-------1st iteration----------------------------------------
	log.write("----------HLA class II------------\n")
	sam1=runName+"-iteration1.bam"
	iteration=1
	log.write("ClassII: first iteration starts....\nMapping ......\n")
	mapping(sam1,runName,readFile1,readFile2,bowtiebuild,1,mapopt,trim3,log,gzipped,num_threads)
	medians=[]
	medians.extend([0,0,0])
	medianflag=False
	output1=runName+".digitalhaplotype1"
	log.write("ClassII: calculation of first digital haplototype .....\n")
	map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1")
	readMapping(map,sam1)
	predictHLA2(sam1,medians,output1,medianflag,log)
	log.write("1st iteration done.\nNow removing reads that mapped to the three top-scoring groups .......\n")
	removeReads(runName,createRemoveList(runName,map))

	#------2nd iteration------------------------------------------
	log.write("Second iterations starts .....\n Mapping ......\n")
	medians=[]
	iteration=2
	sam2=runName+"-iteration2.bam"
	newReadFile1=runName+"-2nditeration_1.fq"
	newReadFile2=runName+"-2nditeration_2.fq"
	mapping(sam2,runName,newReadFile1,newReadFile2,bowtiebuild,2,mapopt,trim3,log,gzipped,num_threads)
	medianfile=runName+".digitalhaplotype1"
	for i in range(2,5):
		medians.append(float(linecache.getline(medianfile, i).split('\t', 3)[2])/2.0)
	medianflag=True
	output2=runName+".digitalhaplotype2"
	#Calculation of second digital haplototype
	log.write("ClassII: calculation of second digital haplototype .....\n")
	map=createRefDict(hla2fasta,"DQA1","DQB1","DRB1")
	readMapping(map,sam2)
	predictHLA2(sam2,medians,output2,medianflag,log)
	log.write("2nd iteration done.\n")
	reportHLAgenotype(output1,output2,finaloutput)
	log.write("Calculation of locus-specific expression ...\n")
	expression("DQA1","DQB1","DRB1",400,421,421,map,runName,readFile1,finaloutput,gzipped)

#performs the bowtie mapping for the 2 iterations using the given parameters
def mapping(sam,runName,readFile1,readFile2,bowtiebuild,iteration,mapopt,trim3,log,gzipped,num_threads):
	if iteration==1:
		if gzipped == "True":
			sam = sam.rsplit(".",1)[0]
			cmd1 = "bowtie --threads "+num_threads+" -3 %s --quiet -S %s --al %s.aligned %s -1 <(zcat %s) -2 <(zcat %s) |  awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam)
			mappingOutput = execBowtie(cmd1)

			cmd2 = "samtools index %s.bam" % sam
			createBAMIndex(cmd2)
		else:
			sam = sam.rsplit(".",1)[0]
			cmd1 = "bowtie --threads "+num_threads+" -3 %s --quiet -S %s --al %s.aligned %s -1 %s -2 %s |  awk -F '\t' '$3 != \"*\"{ print $0 }' | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,runName,bowtiebuild,readFile1,readFile2,sam)
			mappingOutput = execBowtie(cmd1)

			cmd2 = "samtools index %s.bam" % sam
			createBAMIndex(cmd2)
#	if iteration==2:
	else:

		sam = sam.rsplit(".",1)[0]
		cmd2 = "bowtie --threads "+num_threads+" -3 %s --quiet -S %s %s -1 %s -2 %s | samtools view -bS - | samtools sort - %s" % (trim3,mapopt,bowtiebuild,readFile1,readFile2,sam)
		mappingOutput = execBowtie(cmd2)

		cmd3 = "samtools index %s.bam" % sam
		createBAMIndex(cmd3)

	log.write(mappingOutput+"\n")

def createBAMIndex(cmd):
	proc = subprocess.Popen(["/bin/bash","-c",cmd],stderr=subprocess.PIPE)
	(stdoutdata,stderrdata) = proc.communicate()
	r = proc.returncode
	if r != 0:
		raise IOError(stderrdata)

def execBowtie(cmd):
	proc = subprocess.Popen(["/bin/bash","-c",cmd],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
	(stdoutdata,stderrdata) = proc.communicate()
	mappingOutput = stdoutdata
	r = proc.returncode
	if r != 0:
		raise IOError(stderrdata)
	return mappingOutput

#create dictionary "map", that contains all IMGT/HLA-allele names as keys and the allele name (e.g. A*02:01:01) as value
#dictionary "readcount" is initialized with 0 for each allele
#dictionary "readspergroup" is initialized with 0 for each group (2digit, e.g. A*01)
#dictionary "allelesPerLocus" stores the number of alleles per locus.
def createRefDict(hlafasta,locus1,locus2,locus3):
	map={}
	for locus in (locus1,locus2,locus3):
		allelesPerLocus[locus]=0
	handle=open(hlafasta,'r')
	for record in SeqIO.parse(handle, "fasta") :
		l=record.description.split(' ')
		hlapseudoname=l[0]
		hlaallele=l[1]
		map[hlapseudoname]=hlaallele
		readcount[hlaallele]=0
		readspergroup[hlaallele.split(":")[0]]=0
		allelesPerLocus[hlaallele.split('*')[0]]+=1
	handle.close()
	return map

#Open sam-file and count mappings for each allele
def readMapping(map,sam):
	samhandle = pysam.Samfile(sam,"rb")
	for ar in samhandle.fetch():
		hla = samhandle.getrname(ar.tid)
		readcount[map[hla]]+=1
	samhandle.close()

#predict HLA class II genotype
def predictHLA2(sam,medians,output,medianflag,log):

	readspergroupDQAlist=[]
	readspergroupDQBlist=[]
	readspergroupDRBlist=[]
	readspergroupDQA={}
	readspergroupDQB={}
	readspergroupDRB={}
	maxallelepergroup={}

	#for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. DQA*02") and save
	#i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele
	#ii) the number of reads mapping to the top-scoring allele => readspergroup
	for key in readcount:
		if readcount[key] > 0:
			group=key.split(":")[0]
			if readspergroup[group] <=readcount[key]:
				readspergroup[group]=readcount[key]
				maxallelepergroup[group]=key

	#consider all DQA-,DQB-, and DRB-groups seperately
	#readspergroup<DQA|DQB|DRB>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration)
	#readspergroup<DQA|DQB|DRB> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. DQA*01)
	for key in readspergroup:
		keyBegin = key[0:3]
		if keyBegin=='DQA':
			readspergroupDQAlist.append(readspergroup[key]-float(medians[0]))
			readspergroupDQA[key]=readspergroup[key]-float(medians[0])
		if keyBegin=='DQB':
			readspergroupDQBlist.append(readspergroup[key]-float(medians[1]))
			readspergroupDQB[key]=readspergroup[key]-float(medians[1])
		if keyBegin=='DRB':
			readspergroupDRBlist.append(readspergroup[key]-float(medians[2]))
			readspergroupDRB[key]=readspergroup[key]-float(medians[2])

	#compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration
	medianDQA=numpy.median(readspergroupDQAlist)
	medianDQB=numpy.median(readspergroupDQBlist)
	medianDRB=numpy.median(readspergroupDRBlist)
	a = b = c = ""

	#Determine top-scoring group of the whole locus (DQA,DQBB,DRB) and store it
	#maxkey<DQA,DQB,DRB> = group (e.g. DQA*01) with the most reads
	#It can be that, e.g. in cancer cells a whole locus is lost. For that reason it is checked if the
	#number of mapping reads of the top-scoring group maxkey<DQA,DQB,DRB> is > 0, otherwise "no" ist reported for this locus
	if len(readspergroupDQAlist)>0:
		maxkeyDQA=max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a))
		if readspergroupDQA[maxkeyDQA] > 0:
			maxDQA=maxallelepergroup[maxkeyDQA]
			a = max(readspergroupDQA,key=lambda a:readspergroupDQA.get(a))
		else:
			a = "no"
	else:
		a = "no"

	if len(readspergroupDQBlist)>0:
		maxkeyDQB=max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a))
		if readspergroupDQB[maxkeyDQB] > 0:
			maxDQB=maxallelepergroup[maxkeyDQB]
			b = max(readspergroupDQB,key=lambda a:readspergroupDQB.get(a))
		else:
			b = "no"
	else:
		b = "no"

	if len(readspergroupDRBlist)>0:
		maxkeyDRB=max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a))
		if readspergroupDRB[maxkeyDRB] > 0:
			maxDRB=maxallelepergroup[maxkeyDRB]
			c = max(readspergroupDRB,key=lambda a:readspergroupDRB.get(a))
		else:
			c = "no"
	else:
		c = "no"

	readspergroupDQA=sorted(readspergroupDQA.items(), key=itemgetter(1),reverse=True)
	readspergroupDQB=sorted(readspergroupDQB.items(), key=itemgetter(1),reverse=True)
	readspergroupDRB=sorted(readspergroupDRB.items(), key=itemgetter(1),reverse=True)

	dqavec=dqbvec=drbvec=""
	if medianflag:
		#in the 2nd iteration:
		#1.) DO NOT remove the top-scoring group from the set <dqa,dqb,drb>vec as this is more strict when calculating the probability of the top scoring group being an outlier
		#The strings <dqa,dqb,drb>vec are used by the R-script to calculate the probability of the top scoring group being an outlier
		for key in maxallelepergroup:
			keyBegin = key[0:3]
			if keyBegin=="DQA":
				dqavec=dqavec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
			if keyBegin=="DQB":
				dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
			if keyBegin=="DRB":
				drbvec=drbvec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
		#2.) Add the decision thresholds to the sets <dqa,dqb,drb>vec, as this enables measuring the distance of the top-scoring group to this distance
		if a=="no":
			dqavec+=str(medians[0])
		if b=="no":
			dqbvec+=str(medians[1])
		if c=="no":
			drbvec+=str(medians[2])
		#3.) In case of, e.g. a loss of whole HLA-locus (DQA/B,DRB), <dqa,dqb,drb>vec only contain medians[<0|1|2>].
		#To avoid errors in the R-Script, set to 0
		if dqavec==str(medians[0]):
			dqavec = dqacount="0"
		else:
			dqacount=str(readcount[maxallelepergroup[maxkeyDQA]])
			if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",":
				dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+","
		if dqbvec==str(medians[1]):
			dqbvec = dqbcount="0"
		else:
			dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]])
			if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",":
				dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+","
		if drbvec==str(medians[2]):
			drbvec = drbcount="0"
		else:
			drbcount=str(readcount[maxallelepergroup[maxkeyDRB]])
			if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",":
				drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+","
	else:
		#in the 1st iteration: remove the top-scoring group from the set <dqa,dqb,drb>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier
		for key in maxallelepergroup:
			keyBegin = key[0:3]
			if keyBegin=="DQA":
				if key!=maxkeyDQA:
					dqavec=dqavec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
			if keyBegin=="DQB":
				if key!=maxkeyDQB:
					dqbvec=dqbvec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
			if keyBegin=="DRB":
				if key!=maxkeyDRB:
					drbvec=drbvec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
		#2.) DO NOT add the decision thresholds to the sets <dqa,dqb,drb>vec
		if dqavec=="":
			dqacount=dqavec="0"
		else:
			dqacount=str(readcount[maxallelepergroup[maxkeyDQA]])
			if dqavec==str(readcount[maxallelepergroup[maxkeyDQA]])+",":
				dqavec=str(readcount[maxallelepergroup[maxkeyDQA]])+","+str(readcount[maxallelepergroup[maxkeyDQA]])+","
			if len(dqavec.split(","))==2:
				dqavec=dqavec+"0,"

		if dqbvec=="":
			dqbcount=dqbvec="0"
		else:
			dqbcount=str(readcount[maxallelepergroup[maxkeyDQB]])
			if dqbvec==str(readcount[maxallelepergroup[maxkeyDQB]])+",":
				dqbvec=str(readcount[maxallelepergroup[maxkeyDQB]])+","+str(readcount[maxallelepergroup[maxkeyDQB]])+","
			if len(dqbvec.split(","))==2:
				dqbvec=dqbvec+"0,"

		if drbvec=="":
			drbcount=drbvec="0"
		else:
			drbcount=str(readcount[maxallelepergroup[maxkeyDRB]])
			if drbvec==str(readcount[maxallelepergroup[maxkeyDRB]])+",":
				drbvec=str(readcount[maxallelepergroup[maxkeyDDRB]])+","+str(readcount[maxallelepergroup[maxkeyDRB]])+","
			if len(drbvec.split(","))==2:
				drbvec=drbvec+"0,"

	#call R-script "commmand.R" to calculate the confidence of the top-scoring allele
	routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+dqacount+" "+dqavec+" "+maxkeyDQA+" "+dqbcount+" "+dqbvec+" "+maxkeyDQB+" "+drbcount+" "+drbvec+" "+maxkeyDRB).read()
	parseOutput=routput.split("\n")

	entries = []
	for entry in parseOutput:
		if entry[0:3]=="[1]":
			entries.append(str(entry[4:len(entry)]))

	if a=="no" and entries[1]!="NA":
		entries[1]=str(1-float(entries[1]))
	if b=="no" and entries[3]!="NA":
		entries[3]=str(1-float(entries[3]))
	if c=="no" and entries[5]!="NA":
		entries[5]=str(1-float(entries[5]))

	pred2File(entries,medianDQA,medianDQB,medianDRB,output,a,b,c,log)

#predict HLA class I genotype
def predictHLA1(sam,medians,output,medianflag,log):

	readspergroupAlist=[]
	readspergroupBlist=[]
	readspergroupClist=[]
	readspergroupA={}
	readspergroupB={}
	readspergroupC={}
	maxallelepergroup={}

	#for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save
	#i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele
	#ii) the number of reads mapping to the top-scoring allele => readspergroup
	for key in readcount:
		if readcount[key] > 0:
			group=key.split(":")[0]
			if readspergroup[group] <=readcount[key]:
				readspergroup[group]=readcount[key]
				maxallelepergroup[group]=key


	#consider all A-,B-, and C-groups seperately
	#readspergroup<A|B|C>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration)
	#readspergroup<A|B|C> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. B*27)
	for key in readspergroup:
		firstKey = key[0]
		if key[0]=='A':
			readspergroupAlist.append(readspergroup[key]-float(medians[0]))
			readspergroupA[key]=readspergroup[key]-float(medians[0])
		if key[0]=='B':
			readspergroupBlist.append(readspergroup[key]-float(medians[1]))
			readspergroupB[key]=readspergroup[key]-float(medians[1])
		if key[0]=='C':
			readspergroupClist.append(readspergroup[key]-float(medians[2]))
			readspergroupC[key]=readspergroup[key]-float(medians[2])

	#compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration
	medianA=numpy.median(readspergroupAlist)
	medianB=numpy.median(readspergroupBlist)
	medianC=numpy.median(readspergroupClist)
	a = b = c = ""

	#Determine top-scoring group of the whole locus (A,B,C) and store it
	#maxkey<A,B,C> = group (e.g. A*02) with the most reads
	#It can be that, e.g. in cancer cells A whole locus is lost. For that reason it is checked if the
	#number of mapping reads of the top-scoring group maxkey<A,B,C> is > 0, otherwise "no" ist reported for this locus
	if len(readspergroupAlist)>0:
		maxkeyA=max(readspergroupA,key=lambda a:readspergroupA.get(a))
		if readspergroupA[maxkeyA] > 0:
			maxA=maxallelepergroup[maxkeyA]
			a = max(readspergroupA,key=lambda a:readspergroupA.get(a))
		else:
			a = "no"
	else:
		a = "no"

	if len(readspergroupBlist)>0:
		maxkeyB=max(readspergroupB,key=lambda a:readspergroupB.get(a))
		if readspergroupB[maxkeyB] > 0:
			maxB=maxallelepergroup[maxkeyB]
			b = max(readspergroupB,key=lambda a:readspergroupB.get(a))
		else:
			b = "no"
	else:
		b = "no"

	if len(readspergroupClist)>0:
		maxkeyC=max(readspergroupC,key=lambda a:readspergroupC.get(a))
		if readspergroupC[maxkeyC] > 0:
			maxC=maxallelepergroup[maxkeyC]
			c = max(readspergroupC,key=lambda a:readspergroupC.get(a))
		else:
			c = "no"
	else:
		c = "no"

	readspergroupA=sorted(readspergroupA.items(), key=itemgetter(1),reverse=True)
	readspergroupB=sorted(readspergroupB.items(), key=itemgetter(1),reverse=True)
	readspergroupC=sorted(readspergroupC.items(), key=itemgetter(1),reverse=True)

	avec=bvec=cvec=""

	if medianflag:
		#in the 2nd iteration:
		#1.) DO NOT remove the top-scoring group from the set <a,b,c>vec as this is more strict when calculating the probability of the top scoring group being an outlier
		#The strings <a,b,c>vec are used by the R-script to calculate the probability of the top scoring group being an outlier
		for key in maxallelepergroup:
			if key[0]=="A":
				avec=avec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
			if key[0]=="B":
				bvec=bvec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
			if key[0]=="C":
				cvec=cvec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
		#2.) Add the decision thresholds to the sets <a,b,c>vec, as this enables measuring the distance of the top-scoring group to this distance
		if a=="no":
			avec+=str(medians[0])
		if b=="no":
			bvec+=str(medians[1])
		if c=="no":
			cvec+=str(medians[2])
		#3.) In case of, e.g. a loss of whole HLA-locus (A,B,C), <a,b,c>vec only contain median[<0|1|2>].
		#To avoid errors in the R-Script, set to 0
		if avec=="" or avec==medians[0]:
			avec = acount="0"
		else:
			acount=str(readcount[maxallelepergroup[maxkeyA]])

		if bvec=="" or bvec==medians[1]:
			bvec = bcount="0"
		else:
			bcount=str(readcount[maxallelepergroup[maxkeyB]])

		if cvec=="" or cvec==medians[2]:
			cvec = ccount="0"
		else:
			ccount=str(readcount[maxallelepergroup[maxkeyC]])
	else:
		#in the 1st iteration: remove the top-scoring group from the set <a,b,c>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier
		for key in maxallelepergroup:
			if key[0]=="A" and key!=maxkeyA:
				avec=avec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
			if key[0]=="B" and key!=maxkeyB:
				bvec=bvec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
			if key[0]=="C" and key!=maxkeyC:
				cvec=cvec+str(readcount[maxallelepergroup[key]])+","
				#print key+": "+str(readcount[maxallelepergroup[key]])
		#2.) DO NOT add the decision thresholds to the sets <a,b,c>vec
		if avec=="":
			acount=avec="0"
		else:
			acount=str(readcount[maxallelepergroup[maxkeyA]])
		if bvec=="":
			bcount=bvec="0"
		else:
			bcount=str(readcount[maxallelepergroup[maxkeyB]])
		if cvec=="":
			ccount=cvec="0"
		else:
			ccount=str(readcount[maxallelepergroup[maxkeyC]])

	#call R-script "commmand.R" to calculate the confidence of the top-scoring allele
	routput=os.popen("R --vanilla < "+sys.path[0]+"/command.R --args "+acount+" "+avec+" "+maxkeyA+" "+bcount+" "+bvec+" "+maxkeyB+" "+ccount+" "+cvec+" "+maxkeyC).read()
	parseOutput=routput.split("\n")

	entries = []
	for entry in parseOutput:
		if entry[0:3]=="[1]":
			entries.append(str(entry[4:len(entry)]))

	if a=="no" and entries[1]!="NA":
		entries[1]=str(1-float(entries[1]))
	if b=="no" and entries[3]!="NA":
		entries[3]=str(1-float(entries[3]))
	if c=="no" and entries[5]!="NA":
		entries[5]=str(1-float(entries[5]))

	pred2File(entries,medianA,medianB,medianC,output,a,b,c,log)

#write digital haplotype into file
def pred2File(entries,medianA,medianB,medianC,output,a,b,c,log):
	out = open(output, 'w')
	out.write("HLA\tHLA1\tmedian-Value\talternative\tp-Value\n")
	out.write("A\t"+a+"\t")
	# write the median-Value for A
	out.write(str(medianA)+"\t")
	out.write(entries[0]+"\t"+entries[1]+"\n")

	out.write("B\t"+b+"\t")
	# write the median-Value for B
	out.write(str(medianB)+"\t")
	out.write(entries[2]+"\t"+entries[3]+"\n")

	out.write("C\t"+c+"\t")
	# write the median-Value for C
	out.write(str(medianC)+"\t")
	out.write(entries[4]+"\t"+entries[5]+"\n")
	out.close()
	log.write("The digital haplotype is written into "+output+"\n")

#open mapping file and all read ids to the list "removeList", which map to one of the three groups in "alleles"
def createRemoveList(runName,map):
	removeList={}
	alleles = []
	sam=runName+"-iteration1.bam"
	alleles_in=runName+".digitalhaplotype1"
	for i in range(2,5):
		alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1])

	samhandle=pysam.Samfile(sam,'rb')
	for read in samhandle.fetch():
		illuminaid = read.qname
		hlapseudoname = samhandle.getrname(read.tid)
		if map[hlapseudoname].split(':')[0] in alleles:
			removeList[illuminaid]=1
	samhandle.close()
	return removeList

#Remove reads that mapped to the three top-scoring alleles and write the remaining reads into two new read files
def removeReads(runName,removeList):
	aligned1=runName+"_1.aligned"
	aligned2=runName+"_2.aligned"
	newReadFile1=runName+"-2nditeration_1.fq"
	newReadFile2=runName+"-2nditeration_2.fq"
	#r1 and r2, which are the input of bowtie in the 2nd iteration
	r1=open(newReadFile1,"w")
	r2=open(newReadFile2,"w")
	#open the 2 files, that contain the reads that mapped in the 1st iteration
	aligned_handle1=open(aligned1,"r")
	aligned_handle2=open(aligned2,"r")

	#One read entry consists of 4 lines: header, seq, "+", qualities.
	for record in SeqIO.parse(aligned_handle1, "fastq"):
		illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
		if not illuminaid in removeList:
			SeqIO.write(record, r1, "fastq")

	for record in SeqIO.parse(aligned_handle2, "fastq"):
		illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
		if not illuminaid in removeList:
			SeqIO.write(record, r2, "fastq")

#write the final prediction (both digital haplotypes) to file and stdout
def reportHLAgenotype(output1,output2,finaloutput):
	filehandle1=open(output1,'r').readlines()[1:4]
	filehandle2=open(output2,'r').readlines()[1:4]
	outfile=open(finaloutput, 'w')

	outfile.write("HLA\tHLA1\tp-Value\tHLA2\tp-Value\n")
	for i in range(len(filehandle1)):
		filehandle1[i]=filehandle1[i][0:-1]
	for i in range(len(filehandle2)):
		filehandle2[i]=filehandle2[i][0:-1]

	a1 = filehandle1[0].split('\t',2)[1]
	a1score = filehandle1[0].split('\t')[4]
	b1 = filehandle1[1].split('\t',2)[1]
	b1score = filehandle1[1].split('\t')[4]
	c1 = filehandle1[2].split('\t',2)[1]
	c1score = filehandle1[2].split('\t')[4]

	a2 = filehandle2[0].split('\t',2)[1]
	if a2 == "no":
		a2 = "hoz("+filehandle2[0].split('\t')[3]+")"
	a2score = filehandle2[0].split('\t')[4]
	b2 = filehandle2[1].split('\t',2)[1]
	if b2 == "no":
		b2 = "hoz("+filehandle2[1].split('\t')[3]+")"
	b2score = filehandle2[1].split('\t')[4]
	c2 = filehandle2[2].split('\t', 2)[1]
	if c2 == "no":
		c2 = "hoz("+filehandle2[2].split('\t')[3]+")"
	c2score = filehandle2[2].split('\t')[4]
	#write complete HLA genotype to file
	outfile.write("A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score+"\n")
	outfile.write("B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score+"\n")
	outfile.write("C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score+"\n")
	#.. and print it to STDOUT
	#print "A\t"+a1+"\t"+a1score+"\t"+a2+"\t"+a2score
	#print "B\t"+b1+"\t"+b1score+"\t"+b2+"\t"+b2score
	#print "C\t"+c1+"\t"+c1score+"\t"+c2+"\t"+c2score

#calculate locus-specific expression
def expression(locus1,locus2,locus3,length1,length2,length3,map,runName,readFile1,finaloutput,gzipped):
	outfile = open(finaloutput,'a')
	aligned1=runName+"_1.aligned"
	sam=runName+"-iteration1.bam"
	alleles_in=runName+".digitalhaplotype1"
	if gzipped:
		proc=subprocess.Popen(["/bin/bash","-c","wc -l <(zcat " + readFile1 + ")"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
		(stdoutdata,stderrdata) = proc.communicate()
		numberlines = stdoutdata
	else:
		proc=subprocess.Popen(["/bin/bash","-c","wc -l " + readFile1],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
		(stdoutdata,stderrdata) = proc.communicate()
		numberlines = stdoutdata
	totalreads=int(numberlines.split(" ")[0])/4
	alleles=[]
	for i in range(2,5):
		alleles.append(linecache.getline(alleles_in, i).split('\t', 2)[1])
		alleles.append(linecache.getline(alleles_in, i).split('\t')[3])

	#create read dictionary
	reads={}
	aligned_handle1=open(aligned1,"r")
	for record in SeqIO.parse(aligned_handle1, "fastq"):
		illuminaid=record.id.split('/')[0].split(' ')[0]#find exact id, which also appears in the mapping file
		reads[illuminaid]={}
		reads[illuminaid][locus1]=0
		reads[illuminaid][locus2]=0
		reads[illuminaid][locus3]=0
	samhandle = pysam.Samfile(sam,"rb")
	for read in samhandle.fetch():
		illuminaid = read.qname
		hlapseudoname = samhandle.getrname(read.tid)
		if map[hlapseudoname].split(':')[0] in alleles:
			reads[illuminaid][map[hlapseudoname].split('*')[0]]+=1

	count={}
	count[locus1]=0
	count[locus2]=0
	count[locus3]=0
	for key in reads:
		n=0
		for locus in reads[key]:
			if reads[key][locus] > 0:
				n+=1
		for locus in reads[key]:
			if reads[key][locus] > 0:
				count[locus]+=float(1.0/float(n))

	outfile.write("\n")
	#Calculate RPKM and print expression values for each locus to stdout
	for locus in count:
		if locus==locus1:
			outfile.write(locus+": "+str(round(float((1000.0/length1))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n")
		if locus==locus2:
			outfile.write(locus+": "+str(round(float((1000.0/length2))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n")
		if locus==locus3:
			outfile.write(locus+": "+str(round(float((1000.0/length3))*float((1000000.0/totalreads))*count[locus],2))+" RPKM\n")


if __name__ == '__main__':
	parser = OptionParser(usage="usage: %prog -1 readFile1 -2 readFile2 -r runName -l readlength [-3 <int>]", version="%prog 1.0")
	parser.add_option("-z","--gzipped",action="store",dest="gzipped",help="Select if input reads are in gzip format")
	parser.add_option("-1",
			action="store",
			dest="readFile1",
			help="File name of #1 mates ")
	parser.add_option("-2",
			action="store",
			dest="readFile2",
			help="File name of #2 mates")
	parser.add_option("-r", "--runName",
			action="store",
			dest="runName",
			help="Name of this HLA typing run. Wil be used throughout this process as part of the name of the newly created files.")
	parser.add_option("-l", "--length",
			action="store",
			dest="length",
			help="Readlength")
	parser.add_option("-3", "--trim3",
			action="store",
			dest="trim3",
			default="0",
			help="Bowtie option: -3 <int> trims <int> bases from the low quality 3' end of each read. Default: 0")
	parser.add_option("-o", "--output1",
			action="store",
			dest="output1",
			help="Output file 1")
	parser.add_option("-p", "--output2",
			action="store",
			dest="output2",
			help="Output file 2")
	parser.add_option("-g","--log",action="store",dest="logfile",help="Output Log File")
	parser.add_option("-t","--threads",type="int",dest="num_threads",help="Define amount of threads to use for Bowtie in this script")


	(options, args) = parser.parse_args()
	if not options.readFile1:
		parser.error('File name #1 pair not given.')
	if not options.readFile2:
		parser.error('File name #2 pair not given.')
	if not options.runName:
		parser.error('Run name not given.')
	if not options.length:
		parser.error('Readlength not given.')
	gzipped = options.gzipped
	readFile1=options.readFile1
	readFile2=options.readFile2
	runName=options.runName
	output1 = options.output1
	output2 = options.output2
	logfile = options.logfile
	bowtiebuildClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75"
	bowtiebuildClassII=sys.path[0]+"/references/HLA2.ex2.plus75"
	fastaClassI=sys.path[0]+"/references/ClassIWithoutNQex2-3.plus75.fasta"
	fastaClassII=sys.path[0]+"/references/HLA2.ex2.plus75.fasta"
	#as shown in the publication HLA typing with RNA-Seq works best by allowing as less mismatches as necessary
	if int(options.length)<=50:
		mismatch=1
	elif int(options.length)>50 and int(options.length)<=100:
		mismatch=2
	else:
		mismatch=3
	trim3=str(options.trim3)
	main(runName,readFile1,readFile2,fastaClassI,fastaClassII,bowtiebuildClassI,bowtiebuildClassII,mismatch,trim3,output1,output2,logfile,gzipped,options.num_threads)
	sys.stderr = log_stderr
	log_file.close()
author	sebastian-boegel
date	Fri, 21 Dec 2012 03:46:15 -0500
parents	913ea6991ee4
children