Mercurial > repos > geert-vandeweyer > dc_genotyper

#!/usr/bin/perl
#############################
## DEEP COVERAGE GENOTYPER ##
#############################
# version : 0.0.1
# Principle:
#  1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !!
#  2. remove known dbsnp positions (bcf file)
#  3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions.
#  4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction)
#  5. For abberant positions, check each alternate allele to see if it passes the background signal.
#  6. Generate VCF file.


##################
## LOAD MODULES ##
##################
use threads;
use threads::shared;
use Thread::Queue;
use Getopt::Std;

####################
## get paramaters ##
####################
# t: target file
# b: bam file
# R: reference genome files for twobit and IGV.
# p: number of threads.
# s: dbsnp file
# m: minimal coverage (defaults 400x)
# P: ploidy
# a: outfile for allele distributions
# v: vcf file output.
# d: distribution plots pdf file
getopts('t:b:R:p:s:m:P:v:a:d:', \%opts) ;

## variables
my $twobit :shared;
my $igvgenome :shared;
if (!defined($opts{'R'})) {
	die("Reference Genomes not specified\n");
}
my @refgenomes = split(",",$opts{'R'});
if (!-e $refgenomes[0]) {
	die("'$refgenomes[0]' is not a valid file path.");
}
else {
	$twobit = $refgenomes[0];
}
if (!-e $refgenomes[1]) {
	die("'$refgenomes[1]' is not a valid file path.");
}
else {
	$igvgenome = $refgenomes[1];
}


my $mincov :shared;
$mincov = 320;
if (defined($opts{'m'})) {
	$mincov = $opts{'m'};
}

my $ploidy :shared;
if (defined($opts{'P'}) && $opts{'P'} =~ m/^\d+$/) {
	$ploidy = $opts{'P'};
}
else {
	die("Ploidy (-P) was not specified or not an integer\n");
}


if (defined($opts{'v'})) {
	$outfile = $opts{'v'};
}
else {
	die("No output vcf-file specified.\n");
}
if (!defined($opts{'a'})) {
	die("No output file specified for distribution details\n");
}
## create working dir.
my $rand = int(rand(10000));
while (-d "/tmp/DC_Genotyper_$rand") {
	$rand = int(rand(10000));
}
my $wd :shared;
$wd = "/tmp/DC_Genotyper_$rand";
system("mkdir '$wd'");


my $snpfile :shared;
my $hassnp :shared;
$hassnp = 'NoDbSNP';
$snpfile = '';
if (defined($opts{'s'})) {
	$snpfile = $opts{'s'};
	if (!-e $snpfile) {
		die("'$snpfile' is not a valid file path.");
	}

	my $mime = `file $snpfile`;
	if ($mime !~ m/compressed/) {
		print "$snpfile is not in compressed format. compressing & indexing the file now.\n";
		#print "... this takes a while\n";
		system("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz");
		system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");
		$snpfile = "$wd/dbSNP.vcf.bgz";
	}
	elsif (!-e "$snpfile.tbi") {
		print "tabix index file is missing for '$snpfile'. creating now.\n";
		## check if I can write it out for future use
		$snpfile =~ m/(.*)([^\/]+)$/;
		my $d = $1;
		if (-w $d) {
			open OUT, ">$d/lock";
			flock(OUT,2);
			system("cd $d && tabix -p vcf $snpfile");
			close OUT;
			system("rm $d/lock");
		}
		else {
			system("cp $snpfile /$wd/dbSNP.vcf.bgz");
			system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");
			$snpfile = "$wd/dbSNP.vcf.bgz";
		}
	}
	$hassnp = 'WithDbSNP';
}


## 1. Get FASTA and prepare output hashes:
my $targets_one = Thread::Queue->new();
my $targets_two = Thread::Queue->new();
my $targets_three = Thread::Queue->new();
open IN, $opts{'t'} or die("Could not open $opts{'t'} file for reading");
if (-d "$wd/Fasta/") {
	system("rm $wd/Fasta/*");
}
else {
	system("mkdir $wd/Fasta");
}
## create the threads.
for (my $i = 1; $i<= $opts{'p'}; $i++) {
	${'thr'.$i} = threads->create('FetchFasta');
}

## enqueue the targets.
my %thash;
while (<IN>) {
	chomp;
	my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$_);
	$targets_one->enqueue($_);
	$targets_two->enqueue($_);
	$targets_three->enqueue($_);
	$thash{$chr}{$start} = $stop;
}
close IN;

## end the threads.
for (my $i = 1; $i <= $opts{'p'}; $i++) {
	$targets_one->enqueue(undef);
}

for (my $i = 1; $i <= $opts{'p'}; $i++) {
	${'thr'.$i}->join();
}

## load dbSNP inside target regions into shared structure.
##########################################################
my %dbsnp :shared;
if ($snpfile ne '') {
	## BCFTOOLS query is very very fast, but not available so far in the default bcftools version included in samtools package.
	# as a work-around, use tabix, but this is slower.
	#my $bcf = `which bcftools`;
	#chomp($bcf);
	#if ($bcf ne '') {
	#	my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt";
	#	system("$command");
	#	open IN, "$wd/dbsnp.txt";
	#	while (<IN>) {
	#		chomp;
	#		my @p = split(/\t/,$_);
	#		$dbsnp{$p[0].'-'.$p[1]} = $p[2].'-'.$p[3].'-'.$p[4];
	#	}
	#	close IN;
	#}
	#else {
	#	print "WARNING: BCFtools is not in the path. Skipping snp handling.\n";
	#	$snpfile = '';
	#	system("touch $wd/dbsnp.txt");
	#}
	system("tabix $snpfile -B $opts{'t'} | cut -f 1-5 > $wd/dbsnp.txt");
	my $lc = `cat $wd/dbsnp.txt | wc -l`;
	chomp($lc);
	if ($lc eq '0') {
		open SNP, ">$wd/dbsnp.txt";
		## dummy line on chr zero
		print SNP "chr0\t1\t.\tA\tT\n";
		close SNP;
	}
}
else {
	open SNP, ">$wd/dbsnp.txt";
	## dummy line on chr zero to prevent R issues on empty file.
	print SNP "chr0\t1\t.\tA\tT\n";
	close SNP;
}
open IN, "$wd/dbsnp.txt";
while (<IN>) {
	chomp;
	my @p = split(/\t/,$_);
	$dbsnp{$p[0].'-'.$p[1]} = $p[3].'-'.$p[4].'-'.$p[2];
}
close IN;

## now process the bam file.
mkdir "$wd/WIGS/";
my $bam :shared;
$bam = $opts{'b'};
# igvtools cannot handle the .dat extension, so make symlink
system("ln -s '$bam' '$wd/input.bam'");
system("cd $wd && samtools index input.bam");

for (my $i = 1; $i <= $opts{'p'}; $i++) {
	${'thr'.$i} = threads->create('CountAlleles');
}
## end the threads.
for (my $i = 1; $i <= $opts{'p'}; $i++) {
	$targets_two->enqueue(undef);
}

for (my $i = 1; $i <= $opts{'p'}; $i++) {
	${'thr'.$i}->join();
}

## generate the distributions.
##############################
my $alleles = Thread::Queue->new();
my %all = ('A' => 1,'C' => 2,'G' => 3, 'T' => 4);
foreach(keys(%all)) {
	$alleles->enqueue($_);
	my  $a = $_;
	foreach(keys(%all)) {
		if ($_ eq $a) {
			next;
		}
		$alleles->enqueue($a.'-'.$_);
	}
}
for (my $i = 1; $i <= $opts{'p'}; $i++) {
	${'thr'.$i} = threads->create('GetDistribution');
}
## end the threads.
for (my $i = 1; $i <= $opts{'p'}; $i++) {
	$alleles->enqueue(undef);
}

for (my $i = 1; $i <= $opts{'p'}; $i++) {
	${'thr'.$i}->join();
}

## group distributions into one file
####################################
my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5);
open OUT, ">".$opts{'a'};
print OUT "allele\tavg\tsd\tN\n";
foreach(keys(%map)) {
	my $r = $_;
	my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt";
	open IN, "$f";
	my $a = <IN>;
	chomp($a);
	my $s = <IN>;
	chomp($s);
	my $n = <IN>;
	chomp($n);
	close IN;
	print OUT "$r\t$a\t$s\t$n\n";
	foreach(keys(%map)) {
		if ($_ eq $r) {
			next;
		}
		my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt";
		open IN, "$f";
		my $a = <IN>;
		chomp($a);
		my $s = <IN>;
		chomp($s);
		my $n = <IN>;
		chomp($n);
		close IN;
		print OUT "$r-$_\t$a\t$s\t$n\n";
	}
}
close OUT;

## make pdf with distribution plots
###################################
open OUT, ">$wd/MakePlots.R";
print OUT "\n";
print OUT "dists <- read.table(file='".$opts{'a'}."', header=TRUE, as.is=TRUE)\n";
print OUT "pdf(file='".$opts{'d'}."',paper='a4',onefile=TRUE)\n";
print OUT "par(mfrow=c(2, 2))\n";
print OUT "for (i in 1:nrow(dists)) {\n";
print OUT "   if (dists[i,'avg'] > 0.5) {\n";
print OUT "     x <- seq(0.85,1,length=1000)\n";
print OUT "     y <- dnorm(x,mean=dists[i,'avg'],sd=dists[i,'sd'])\n";
print OUT "     plot(x,y,main=paste('Distribution for allele \"',dists[i,'allele'],'\"',sep=''),xlab='Allelic Ratio',type='l',lwd=1)\n";
print OUT "     abline(v=(dists[i,'avg']-3*dists[i,'sd']),col='red')\n";
print OUT "     text(0.855,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1))\n";
print OUT "  } else {\n";
print OUT "   x <- seq(0,0.15,length=1000)\n";
print OUT "   y <- dnorm(x,dists[i,'avg'],sd=dists[i,'sd'])\n";
print OUT "   plot(x,y,main=paste('Distribution for \"',dists[i,'allele'],'\" variation',sep=''),xlab='Allelic Ratio',type='l',lwd=1)\n";
print OUT "   abline(v=(dists[i,'avg']+3*dists[i,'sd']),col='red')\n";
print OUT "   text(0.1,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1))\n";
print OUT "  }\n";
print OUT "}\n";
print OUT "dev.off()\n";
close OUT;
system("cd $wd && Rscript MakePlots.R > /dev/null 2>&1");

## CALL SNPs
############
# create the R script.
open R, ">$wd/CallSNPs.R";
print R "\n";
print R "args <- commandArgs(trailingOnly = TRUE)\n";
print R "counts <- read.table(file = args[1],header = FALSE, as.is = TRUE)\n";
print R "ploidy <- as.integer(args[3])\n";
print R "chr <- args[2]\n";
print R "snps <- read.table(file=args[5],header=FALSE,as.is=TRUE)\n";
print R "colnames(snps) <- c('chr','pos','id','ref','alt')\n";
print R "colnames(counts) <- c('pos','ref','A','C','G','T','TotalDepth')\n";
print R "dists <- read.table(file='".$opts{'a'}."',header=TRUE,as.is=TRUE)\n";
print R 'rownames(dists) = dists$allele'."\n";
print R 'dists <- dists[,-1]'."\n";
print R "vcf <- c()\n";
print R "lower <- c()\n";
print R "higher <- c()\n";
print R "for (i in 1:(ploidy)) {\n";
print R "  lower[length(lower)+1] <- (2*i-1)/(2*ploidy)\n";
print R "  higher[length(higher)+1] <- (2*i+1)/(2*ploidy)\n";
print R "}\n";
print R "for (i in 1:nrow(counts)) {\n";
print R "  if (counts[i,'TotalDepth'] == 0) next\n";
print R "  # significantly different from reference?\n";
print R "  z <- ((counts[i,counts[i,'ref']]/counts[i,'TotalDepth']) - dists[counts[i,'ref'],'avg']) /  dists[counts[i,'ref'],'sd']\n";
print R "  if (abs(z) > 3) {\n";
print R "    # test all alterate alleles to see which one is significant.\n";
print R "    for (j in c('A','C','G','T')) {\n";
print R "      if (j == counts[i,'ref']) next\n";
print R "      z <- ((counts[i,j]/counts[i,'TotalDepth']) - dists[paste(counts[i,'ref'],'-',j,sep=''),'avg']) /  dists[paste(counts[i,'ref'],'-',j,sep=''),'sd']\n";
print R "      if (abs(z) > 3){\n";
print R "         filter <- 'PASS'\n";
print R "         phred <- round(-10*log(pnorm(-abs(z))),digits=0)\n";
print R "         if (phred > 9999) phred <- 9999\n";
print R "         frac <- counts[i,j]/counts[i,'TotalDepth']\n";
print R "         for (k in 1:ploidy) {\n";
print R "            if (frac >= lower[k] && frac < higher[k]) {\n";
print R "		sample <- paste(paste(paste(rep(0,(ploidy-k)),sep='',collapse='/'),paste(rep(1,k),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n";
print R "                af <- k/ploidy\n";
print R "                break\n";
print R "            }\n";
print R "         }\n";
print R "         if (frac < lower[1]) {\n";
print R "            sample <- paste(paste(paste(rep(0,(ploidy-1)),sep='',collapse='/'),paste(rep(1,1),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n";
print R "            af <- 1/ploidy\n";
print R "            filter <- 'LowFraction'\n";
print R "         }\n";
print R "         if (counts[i,'TotalDepth'] < $mincov) {\n";
print R "            filter <- 'LowCoverage'\n";
print R "         }\n";
print R "         info <- paste('DP=',counts[i,'TotalDepth'],';AF=',round(af,digits=5),';AR=',round(frac,digits=5),sep='')\n";
print R "	  snpids <- which(snps\$chr == chr & snps\$pos == counts[i,'pos'])\n";
print R "         id <- '.'\n";
print R "         if (length(snpids) > 0) id <- snps[snpids[1],'id']\n";
print R "	  vcf[length(vcf)+1] <- paste(chr,counts[i,'pos'],id,counts[i,'ref'],j,phred,filter,info,'GT:AD',sample,sep='\\t',collapse='')\n";
print R "      }\n";
print R "     }\n";
print R "  }\n";
print R "}\n";
print R "if (length(vcf) > 0) {\n";
print R "   write(file=args[4],paste(vcf,sep='\\n'))\n";
print R "}\n";
close R;
system("mkdir $wd/VCF/");
for (my $i = 1; $i <= $opts{'p'}; $i++) {
	${'thr'.$i} = threads->create('CallSNPs');
}
## end the threads.
for (my $i = 1; $i <= $opts{'p'}; $i++) {
	$targets_three->enqueue(undef);
}

for (my $i = 1; $i <= $opts{'p'}; $i++) {
	${'thr'.$i}->join();
}

## BUILD FINAL VCF
open OUT, ">$outfile";
print OUT "##fileformat=VCFv4.1\n";
print OUT "##source=High_Ploidy_Genotyper_v.0.1\n";
print OUT "##genome_reference=$twobit\n";
if ($snpfile ne '') {
	print OUT "##SNP_file=$snpfile\n";
}
foreach(keys(%thash)) {
	print OUT "##contig=<ID=$_,assembly=hg19,species=\"Homo Sapiens\">\n";
}
print OUT "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n";
print OUT "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n";
print OUT "##INFO=<ID=AR,Number=1,Type=Float,Description=\"Allelic Ratio\">\n";
print OUT "##FILTER=<ID=LowFraction,Description=\"Allelic Fraction under 1/2*$ploidy\">\n";
print OUT "##FILTER=<ID=LowCoverage,Description=\"Total Depth is lower than threshold of $mincov\">\n";
print OUT "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
print OUT "##FORMAT=<ID=AD,Number=2,type=Integer,Description,\"Allelic Depth\">\n";
print OUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n";
close OUT;
@i = ( 1 .. 22,'X','Y','M' );
foreach(@i) {
	my $chr = "chr$_";
	foreach(sort {$a <=> $b} keys(%{$thash{$chr}})) {
		my $v = "$wd/VCF/$chr.$_-".$thash{$chr}{$_}.".vcf";
		if (-e $v) {
			system("cat '$v' >> '$outfile'");
		}
	}
}

## clean up
system("rm -Rf '$wd'");

sub FetchFasta  {
	while(defined(my $line = $targets_one->dequeue())) {
		my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line);
		# 2bit is zero based, non-including => decrease start by one
		$startposition = $start - 1;
		my $command = "twoBitToFa -seq=$chr -start=$startposition -end=$stop -noMask $twobit $wd/Fasta/$chr-$start-$stop.fasta";
		system($command);
	}
}

sub CountAlleles {
	# local version of hashes
	my $snp = \%dbsnp;
	my %counts;
	$counts{'A'} = '';
	$counts{'C'} = '';
	$counts{'G'} = '';
	$counts{'T'} = '';
	my %map =('A' => 1,'C' => 2,'G' => 3, 'T' => 4);
	my %options;
	foreach(keys(%map)) {
		my $r = $_;
		foreach(keys(%map)) {
			if ($_ eq $r) {
				next;
			}
			$options{$r.'-'.$_} = '';
		}
	}
	while (defined(my $line = $targets_two->dequeue())) {
		$out = '';
		my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line);
		## get reference alleles
		my %ref_alleles;
		open FASTA, "$wd/Fasta/$chr-$start-$stop.fasta";
		my $head = <FASTA>;
		my $seq = '';
		while (<FASTA>) {
			chomp;
			$seq .= $_;
		}
		close FASTA;
		# this generates a hash of the reference alleles once, instead of substr-calls in every bam, on every iteration.
		for (my $pos = 0; $pos < length($seq); $pos++) {
			$ref_alleles{($pos+$start)} = substr($seq,$pos,1);
		}
		## get counts.
		my $target = "$chr:$start-$stop";
		my $command = "igvtools count -w 1 --bases --query '$target' '$wd/input.bam' '$wd/WIGS/$chr-$start-$stop.wig' '$igvgenome' > /dev/null 2>&1";
		system($command);
		open WIG, "$wd/WIGS/$chr-$start-$stop.wig";
		my $h = <WIG>;
		$h = <WIG>;
		$h = <WIG>;
		my $target_counts = '';
		while (<WIG>) {
			chomp;
			#my ($pos, $a, $c, $g, $t , $n) = split(/\t/,$_);
			my @p = split(/\t/,$_);
			my $s = $p[1] + $p[2] + $p[3] + $p[4];
			$target_counts .= "$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$s\n";
			## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty).
			if ($s > $mincov && !defined($snp->{$chr.'-'.$p[0]})) {
				## for model of 'non-reference'
				my $frac = $p[$map{$ref_alleles{$p[0]}}] / $s;
				$counts{$ref_alleles{$p[0]}} .= $frac.',';
				$out .= "$target\t$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\n";
				## for each of the options background models
				foreach(keys(%map)) {
					if ($_ eq $ref_alleles{$p[0]}) {
						next;
					}
					$options{$ref_alleles{$p[0]}.'-'.$_} .= ($p[$map{$_}] / $s) .',';
				}

			}
		}
		close WIG;
		open OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt";
		flock(OUT, 2);
		print OUT $out;
		close OUT;
		open OUT, ">$wd/WIGS/$chr.$start-$stop.txt";
		print OUT $target_counts;
		close OUT;

	}
	foreach(keys(%counts)) {
		open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";
		flock(OUT,2);
		print OUT $counts{$_};
		close OUT;
	}
	foreach(keys(%options)) {
		open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";
		flock(OUT,2);
		print OUT $options{$_};
		close OUT;
	}
}

sub GetDistribution {
	while (defined(my $allele = $alleles->dequeue())) {
		system("sed -i 's/.\$//' '$wd/counts_$allele.$mincov"."x.$hassnp.txt'");
		open OUT, ">$wd/GetDistribution.$allele.R";
		print OUT "\n";
		print OUT "nt <- '$allele'\n";
		#print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n";
		print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n";
		print OUT "nr <- length(data)\n";
		print OUT "avg <- mean(data)\n";
		print OUT "sdd <- sd(data)\n";
		print OUT "write(c(avg,sdd,nr),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n";
		close OUT;
		system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1");
	}
}


sub CallSNPs {
	while (defined(my $line = $targets_three->dequeue())) {
		# split.
		my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line);
		my $file = "$wd/WIGS/$chr.$start-$stop.txt";
		my $ofile = "$wd/VCF/$chr.$start-$stop.vcf";
		system("cd $wd && Rscript CallSNPs.R '$file' '$chr' '$ploidy' '$ofile' '$wd/dbsnp.txt'");
	}

}
author	geert-vandeweyer
date	Mon, 29 Sep 2014 07:17:11 -0400
parents	8938f339ed37
children