Mercurial > repos > geert-vandeweyer > dc_genotyper
diff DC_Genotyper.pl @ 11:845a87ad254a draft
re-upload, accidentally removed some files
author | geert-vandeweyer |
---|---|
date | Sat, 27 Sep 2014 05:38:17 -0400 |
parents | |
children | 8cc26598eeac |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DC_Genotyper.pl Sat Sep 27 05:38:17 2014 -0400 @@ -0,0 +1,530 @@ +#!/usr/bin/perl +############################# +## DEEP COVERAGE GENOTYPER ## +############################# +# version : 0.0.1 +# Principle: +# 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !! +# 2. remove known dbsnp positions (bcf file) +# 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions. +# 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction) +# 5. For abberant positions, check each alternate allele to see if it passes the background signal. +# 6. Generate VCF file. + + +################## +## LOAD MODULES ## +################## +use threads; +use threads::shared; +use Thread::Queue; +use Getopt::Std; + +#################### +## get paramaters ## +#################### +# t: target file +# b: bam file +# R: 2bit version of reference fasta. +# p: number of threads. +# s: dbsnp file +# m: minimal coverage (defaults 400x) +# P: ploidy +# a: outfile for allele distributions +# v: vcf file output. +getopts('t:b:R:p:s:m:P:v:a:', \%opts) ; + +## variables +my $twobit :shared; +my $igvgenome :shared; +if (!defined($opts{'R'})) { + die("Reference Genomes not specified\n"); +} +my @refgenomes = split(",",$opts{'r'}); +if (!-e $refgenomes[0]) { + die("'$refgenomes[0]' is not a valid file path."); +} +else { + $twobit = $refgenomes[0]; +} +if (!-e $refgenomes[1]) { + die("'$refgenomes[1]' is not a valid file path."); +} +else { + $igvgenome = $refgenomes[1]; +} + + +my $mincov :shared; +$mincov = 320; +if (defined($opts{'m'})) { + $mincov = $opts{'m'}; +} + +my $ploidy :shared; +if (defined($opts{'P'}) && $opts{'P'} =~ m/^\d+$/) { + $ploidy = $opts{'P'}; +} +else { + die("Ploidy (-P) was not specified or not an integer\n"); +} + + +if (defined($opts{'v'})) { + $outfile = $opts{'v'}; +} +else { + die("No output vcf-file specified.\n"); +} +if (!defined($opts{'a'})) { + die("No output file specified for distribution details\n"); +} +## create working dir. +my $rand = int(rand(10000)); +while (-d "/tmp/DC_Genotyper_$rand") { + $rand = int(rand(10000)); +} +my $wd :shared; +$wd = "/tmp/DC_Genotyper_$rand"; +system("mkdir '$wd'"); + + +my $snpfile :shared; +my $hassnp :shared; +$hassnp = 'NoDbSNP'; +$snpfile = ''; +if (defined($opts{'s'})) { + $snpfile = $opts{'s'}; + if (!-e $snpfile) { + die("'$snpfile' is not a valid file path."); + } + + my $mime = `file $snpfile`; + if ($mime !~ m/compressed/) { + print "$snpfile is not in compressed format. compressing & indexing the file now.\n"; + #print "... this takes a while\n"; + system("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz"); + system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz"); + $snpfile = "$wd/dbSNP.vcf.bgz"; + } + elsif (!-e "$snpfile.tbi") { + print "tabix index file is missing for '$snpfile'. creating now.\n"; + ## check if I can write it out for future use + $snpfile =~ m/(.*)([^\/]+)$/; + my $d = $1; + if (-w $d) { + open OUT, ">$d/lock"; + flock(OUT,2); + system("cd $d && tabix -p vcf $snpfile"); + close OUT; + system("rm $d/lock"); + } + else { + system("cp $snpfile /$wd/dbSNP.vcf.bgz"); + system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz"); + $snpfile = "$wd/dbSNP.vcf.bgz"; + } + } + $hassnp = 'WithDbSNP'; +} + + +## 1. Get FASTA and prepare output hashes: +my $targets_one = Thread::Queue->new(); +my $targets_two = Thread::Queue->new(); +my $targets_three = Thread::Queue->new(); +open IN, $opts{'t'} or die("Could not open $opts{'t'} file for reading"); +if (-d "$wd/Fasta/") { + system("rm $wd/Fasta/*"); +} +else { + system("mkdir $wd/Fasta"); +} +## create the threads. +for (my $i = 1; $i<= $opts{'p'}; $i++) { + ${'thr'.$i} = threads->create('FetchFasta'); +} + +## enqueue the targets. +my %thash; +while (<IN>) { + chomp; + my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$_); + $targets_one->enqueue($_); + $targets_two->enqueue($_); + $targets_three->enqueue($_); + $thash{$chr}{$start} = $stop; +} +close IN; + +## end the threads. +for (my $i = 1; $i <= $opts{'p'}; $i++) { + $targets_one->enqueue(undef); +} + +for (my $i = 1; $i <= $opts{'p'}; $i++) { + ${'thr'.$i}->join(); +} + +## load dbSNP inside target regions into shared structure. +########################################################## +my %dbsnp :shared; +if ($snpfile ne '') { + my $bcf = `which bcftools`; + chomp($bcf); + if ($bcf ne '') { + my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt"; + system("$command"); + open IN, "$wd/dbsnp.txt"; + while (<IN>) { + chomp; + my @p = split(/\t/,$_); + $dbsnp{$p[0].'-'.$p[1]} = $p[2].'-'.$p[3].'-'.$p[4]; + } + close IN; + } + else { + print "WARNING: BCFtools is not in the path. Skipping snp handling.\n"; + $snpfile = ''; + system("touch $wd/dbsnp.txt"); + } +} +else { + system("touch $wd/dbsnp.txt"); +} + +## now process the bam file. +mkdir "$wd/WIGS/"; +my $bam :shared; +$bam = $opts{'b'}; +my $bai = $bam.".bai"; +if (!-e $bai) { + print "BAI ($bai) missing for $bam : creating\n"; + system("samtools index $bam"); +} + +for (my $i = 1; $i <= $opts{'p'}; $i++) { + ${'thr'.$i} = threads->create('CountAlleles'); +} +## end the threads. +for (my $i = 1; $i <= $opts{'p'}; $i++) { + $targets_two->enqueue(undef); +} + +for (my $i = 1; $i <= $opts{'p'}; $i++) { + ${'thr'.$i}->join(); +} + +## generate the distributions. +############################## +my $alleles = Thread::Queue->new(); +my %all = ('A' => 1,'C' => 2,'G' => 3, 'T' => 4); +foreach(keys(%all)) { + $alleles->enqueue($_); + my $a = $_; + foreach(keys(%all)) { + if ($_ eq $a) { + next; + } + $alleles->enqueue($a.'-'.$_); + } +} +for (my $i = 1; $i <= $opts{'p'}; $i++) { + ${'thr'.$i} = threads->create('GetDistribution'); +} +## end the threads. +for (my $i = 1; $i <= $opts{'p'}; $i++) { + $alleles->enqueue(undef); +} + +for (my $i = 1; $i <= $opts{'p'}; $i++) { + ${'thr'.$i}->join(); +} + +## group distributions into one file +#################################### +my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5); +open OUT, ">".$opts{'a'}; +print OUT "allele\tavg\tsd\n"; +foreach(keys(%map)) { + my $r = $_; + my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt"; + open IN, "$f"; + my $a = <IN>; + chomp($a); + #$dists{$r}{'avg'} = $a; + my $s = <IN>; + chomp($s); + #$dists{$r}{'sd'} = $s; + close IN; + print OUT "$r\t$a\t$s\n"; + foreach(keys(%map)) { + if ($_ eq $r) { + next; + } + my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt"; + open IN, "$f"; + my $a = <IN>; + chomp($a); + my $s = <IN>; + chomp($s); + close IN; + print OUT "$r-$_\t$a\t$s\n"; + } +} +close OUT; + +## CALL SNPs +############ +# create the R script. +open R, ">$wd/CallSNPs.R"; +print R "args <- commandArgs(trailingOnly = TRUE)\n"; +print R "counts <- read.table(file=args[1],header=FALSE, as.is=TRUE)\n"; +print R "ploidy <- as.integer(args[3])\n"; +print R "chr <- args[2]\n"; +print R "snps <- read.table(file=args[5],header=FALSE,as.is=TRUE)\n"; +print R "colnames(snps) <- c('chr','pos','ref','alt','id')\n"; +print R "colnames(counts) <- c('pos','ref','A','C','G','T','TotalDepth')\n"; +print R "dists <- read.table(file='$wd/allelic_distributions.txt',header=TRUE,as.is=TRUE)\n"; +print R 'rownames(dists) = dists$allele'."\n"; +print R 'dists <- dists[,-1]'."\n"; +print R "vcf <- c()\n"; +print R "lower <- c()\n"; +print R "higher <- c()\n"; +print R "for (i in 1:(ploidy)) {\n"; +print R " lower[length(lower)+1] <- (2*i-1)/(2*ploidy)\n"; +print R " higher[length(higher)+1] <- (2*i+1)/(2*ploidy)\n"; +print R "}\n"; +print R "for (i in 1:nrow(counts)) {\n"; +print R " if (counts[i,'TotalDepth'] == 0) next\n"; +print R " # significantly different from reference?\n"; +print R " z <- ((counts[i,counts[i,'ref']]/counts[i,'TotalDepth']) - dists[counts[i,'ref'],'avg']) / dists[counts[i,'ref'],'sd']\n"; +print R " if (abs(z) > 3) {\n"; +print R " # test all alterate alleles to see which one is significant.\n"; +print R " for (j in c('A','C','G','T')) {\n"; +print R " if (j == counts[i,'ref']) next\n"; +print R " z <- ((counts[i,j]/counts[i,'TotalDepth']) - dists[paste(counts[i,'ref'],'-',j,sep=''),'avg']) / dists[paste(counts[i,'ref'],'-',j,sep=''),'sd']\n"; +print R " if (abs(z) > 3){\n"; +print R " filter <- 'PASS'\n"; +print R " phred <- round(-10*log(pnorm(-abs(z))),digits=0)\n"; +print R " if (phred > 9999) phred <- 9999\n"; +print R " frac <- counts[i,j]/counts[i,'TotalDepth']\n"; +print R " for (k in 1:ploidy) {\n"; +print R " if (frac >= lower[k] && frac < higher[k]) {\n"; +print R " sample <- paste(paste(paste(rep(0,(ploidy-k)),sep='',collapse='/'),paste(rep(1,k),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n"; +print R " af <- k/ploidy\n"; +print R " break\n"; +print R " }\n"; +print R " }\n"; +print R " if (frac < lower[1]) {\n"; +print R " sample <- paste(paste(paste(rep(0,(ploidy-1)),sep='',collapse='/'),paste(rep(1,1),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n"; +print R " af <- 1/ploidy\n"; +print R " filter <- 'LowFraction'\n"; +print R " }\n"; +print R " if (counts[i,'TotalDepth'] < $mincov) {\n"; +print R " filter <- 'LowCoverage'\n"; +print R " }\n"; +print R " info <- paste('DP=',counts[i,'TotalDepth'],';AF=',round(af,digits=5),';AR=',round(frac,digits=5),sep='')\n"; +print R " snpids <- which(snps\$chr == chr & snps\$pos == counts[i,'pos'])\n"; +print R " id <- '.'\n"; +print R " if (length(snpids) > 0) id <- snps[snpids[1],'id']\n"; +print R " vcf[length(vcf)+1] <- paste(chr,counts[i,'pos'],id,counts[i,'ref'],j,phred,filter,info,'GT:AD',sample,sep='\\t',collapse='')\n"; +print R " }\n"; +print R " }\n"; +print R " }\n"; +print R "}\n"; +print R "if (length(vcf) > 0) {\n"; +print R " write(file=args[4],paste(vcf,sep='\\n'))\n"; +print R "}\n"; +close R; +system("mkdir $wd/VCF/"); +for (my $i = 1; $i <= $opts{'p'}; $i++) { + ${'thr'.$i} = threads->create('CallSNPs'); +} +## end the threads. +for (my $i = 1; $i <= $opts{'p'}; $i++) { + $targets_three->enqueue(undef); +} + +for (my $i = 1; $i <= $opts{'p'}; $i++) { + ${'thr'.$i}->join(); +} + +## BUILD FINAL VCF +open OUT, ">$outfile"; +print OUT "##fileformat=VCFv4.1\n"; +print OUT "##source=High_Ploidy_Genotyper_v.0.1\n"; +print OUT "##genome_reference=$twobit\n"; +if ($snpfile ne '') { + print OUT "##SNP_file=$snpfile\n"; +} +foreach(keys(%thash)) { + print OUT "##contig=<ID=$_,assembly=hg19,species=\"Homo Sapiens\">\n"; +} +print OUT "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"; +print OUT "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n"; +print OUT "##INFO=<ID=AR,Number=1,Type=Float,Description=\"Allelic Ratio\">\n"; +print OUT "##FILTER=<ID=LowFraction,Description=\"Allelic Fraction under 1/2*$ploidy\">\n"; +print OUT "##FILTER=<ID=LowCoverage,Description=\"Total Depth is lower than threshold of $mincov\">\n"; +print OUT "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"; +print OUT "##FORMAT=<ID=AD,Number=2,type=Integer,Description,\"Allelic Depth\">\n"; +print OUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"; +close OUT; +@i = ( 1 .. 22,'X','Y','M' ); +foreach(@i) { + my $chr = "chr$_"; + foreach(sort {$a <=> $b} keys(%{$thash{$chr}})) { + my $v = "$wd/VCF/$chr.$_-".$thash{$chr}{$_}.".vcf"; + if (-e $v) { + system("cat '$v' >> '$outfile'"); + } + } +} + +## clean up +system("rm -Rf '$wd'"); + +sub FetchFasta { + while(defined(my $line = $targets_one->dequeue())) { + my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); + # 2bit is zero based, non-including => decrease start by one + $startposition = $start - 1; + my $command = "twoBitToFa -seq=$chr -start=$startposition -end=$stop -noMask $twobit $wd/Fasta/$chr-$start-$stop.fasta"; + system($command); + } +} + +sub CountAlleles { + # local version of hashes + my $snp = \%dbsnp; + my %counts; + $counts{'A'} = ''; + $counts{'C'} = ''; + $counts{'G'} = ''; + $counts{'T'} = ''; + my %map =('A' => 1,'C' => 2,'G' => 3, 'T' => 4); + my %options; + foreach(keys(%map)) { + my $r = $_; + foreach(keys(%map)) { + if ($_ eq $r) { + next; + } + $options{$r.'-'.$_} = ''; + } + } + while (defined(my $line = $targets_two->dequeue())) { + $out = ''; + my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); + ## get reference alleles + my %ref_alleles; + open FASTA, "$wd/Fasta/$chr-$start-$stop.fasta"; + my $head = <FASTA>; + my $seq = ''; + while (<FASTA>) { + chomp; + $seq .= $_; + } + close FASTA; + # this generates a hash of the reference alleles once, instead of substr-calls in every bam, on every iteration. + for (my $pos = 0; $pos < length($seq); $pos++) { + $ref_alleles{($pos+$start)} = substr($seq,$pos,1); + } + ## get counts. + my $target = "$chr:$start-$stop"; + my $command = "igvtools count -w 1 --bases --query '$target' '$bam' '$wd/WIGS/$chr-$start-$stop.wig' '$igvgenome' > /dev/null 2>&1"; + system($command); + open WIG, "$wd/WIGS/$chr-$start-$stop.wig"; + my $h = <WIG>; + $h = <WIG>; + $h = <WIG>; + my $target_counts = ''; + while (<WIG>) { + chomp; + #my ($pos, $a, $c, $g, $t , $n) = split(/\t/,$_); + my @p = split(/\t/,$_); + my $s = $p[1] + $p[2] + $p[3] + $p[4]; + $target_counts .= "$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$s\n"; + ## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty). + if ($s > $mincov && !defined($snp->{$chr.'-'.$p[0]})) { + ## for model of 'non-reference' + my $frac = $p[$map{$ref_alleles{$p[0]}}] / $s; + $counts{$ref_alleles{$p[0]}} .= $frac.','; + $out .= "$target\t$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\n"; + ## for each of the options background models + foreach(keys(%map)) { + if ($_ eq $ref_alleles{$p[0]}) { + next; + } + $options{$ref_alleles{$p[0]}.'-'.$_} .= ($p[$map{$_}] / $s) .','; + } + + } + } + close WIG; + open OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt"; + flock(OUT, 2); + print OUT $out; + close OUT; + open OUT, ">$wd/WIGS/$chr.$start-$stop.txt"; + print OUT $target_counts; + close OUT; + + } + foreach(keys(%counts)) { + open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt"; + flock(OUT,2); + print OUT $counts{$_}; + close OUT; + } + foreach(keys(%options)) { + open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt"; + flock(OUT,2); + print OUT $options{$_}; + close OUT; + } +} + +sub GetDistribution { + while (defined(my $allele = $alleles->dequeue())) { + system("sed -i 's/.\$//' '$wd/counts_$allele.$mincov"."x.$hassnp.txt'"); + open OUT, ">$wd/GetDistribution.$allele.R"; + print OUT "sample <- '$bam'\n"; + print OUT "nt <- '$allele'\n"; + #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n"; + print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n"; + print OUT "nr <- length(data)\n"; + print OUT "avg <- mean(data)\n"; + print OUT "sdd <- sd(data)\n"; + #print OUT "if (avg > 0.5) {\n"; + #print OUT " x <- seq(0.8,1,length=1000)\n"; + #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n"; + #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n"; + #print OUT " abline(v=(avg-3*sdd),col='red')\n"; + #print OUT " text(0.81,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n"; + #print OUT "} else {\n"; + #print OUT " x <- seq(0,0.3,length=1000)\n"; + #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n"; + #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n"; + #print OUT " abline(v=(avg+3*sdd),col='red')\n"; + #print OUT " text(0.2,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n"; + #print OUT "}\n"; + #print OUT "dev.off()\n"; + print OUT "write(c(avg,sdd),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n"; + close OUT; + system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1"); + } +} + + +sub CallSNPs { + while (defined(my $line = $targets_three->dequeue())) { + # split. + my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); + my $file = "$wd/WIGS/$chr.$start-$stop.txt"; + my $ofile = "$wd/VCF/$chr.$start-$stop.vcf"; + system("cd $wd && Rscript CallSNPs.R '$file' '$chr' '$ploidy' '$ofile' '$wd/dbsnp.txt'"); + } + +} +