# HG changeset patch # User geert-vandeweyer # Date 1411977610 14400 # Node ID 3ba482a2dd0ec6c08e6286ded7a74bc9d29c301f # Parent 7e4a9ee69f0b9d8bafcf36a3481500fb27cae11e Uploaded diff -r 7e4a9ee69f0b -r 3ba482a2dd0e DC_Genotyper.pl --- a/DC_Genotyper.pl Mon Sep 29 02:03:50 2014 -0400 +++ b/DC_Genotyper.pl Mon Sep 29 04:00:10 2014 -0400 @@ -32,7 +32,8 @@ # P: ploidy # a: outfile for allele distributions # v: vcf file output. -getopts('t:b:R:p:s:m:P:v:a:', \%opts) ; +# d: distribution plots pdf file +getopts('t:b:R:p:s:m:P:v:a:d:', \%opts) ; ## variables my $twobit :shared; @@ -170,6 +171,8 @@ ########################################################## my %dbsnp :shared; if ($snpfile ne '') { + ## BCFTOOLS query is very very fast, but not available so far in the default bcftools version included in samtools package. + # as a work-around, use tabix, but this is slower. #my $bcf = `which bcftools`; #chomp($bcf); #if ($bcf ne '') { @@ -196,10 +199,11 @@ ## dummy line on chr zero print SNP "chr0\t1\t.\tA\tT\n"; close SNP; + } } else { open SNP, ">$wd/dbsnp.txt"; - ## dummy line on chr zero + ## dummy line on chr zero to prevent R issues on empty file. print SNP "chr0\t1\t.\tA\tT\n"; close SNP; } @@ -208,11 +212,9 @@ mkdir "$wd/WIGS/"; my $bam :shared; $bam = $opts{'b'}; -my $bai = $bam.".bai"; -if (!-e $bai) { - print "BAI ($bai) missing for $bam : creating\n"; - system("samtools index $bam"); -} +# igvtools cannot handle the .dat extension, so make symlink +system("ln -s '$bam' '$wd/input.bam'"); +system("cd $wd && samtools index input.bam"); for (my $i = 1; $i <= $opts{'p'}; $i++) { ${'thr'.$i} = threads->create('CountAlleles'); @@ -256,19 +258,19 @@ #################################### my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5); open OUT, ">".$opts{'a'}; -print OUT "allele\tavg\tsd\n"; +print OUT "allele\tavg\tsd\tN\n"; foreach(keys(%map)) { my $r = $_; my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt"; open IN, "$f"; my $a = ; chomp($a); - #$dists{$r}{'avg'} = $a; my $s = ; chomp($s); - #$dists{$r}{'sd'} = $s; + my $n = ; + chomp($n); close IN; - print OUT "$r\t$a\t$s\n"; + print OUT "$r\t$a\t$s\t$n\n"; foreach(keys(%map)) { if ($_ eq $r) { next; @@ -279,12 +281,40 @@ chomp($a); my $s = ; chomp($s); + my $n = ; + chomp($n); close IN; - print OUT "$r-$_\t$a\t$s\n"; + print OUT "$r-$_\t$a\t$s\t$n\n"; } } close OUT; +## make pdf with distribution plots +################################### +open OUT, ">$wd/MakePlots.R"; +print OUT "\n"; +print OUT "dists <- read.table(file='".$opts{'a'}."', header=TRUE, as.is=TRUE)\n"; +print OUT "pdf(file='".$opts{'d'}."',paper='a4',onefile=TRUE)\n"; +print OUT "par(mfrow=c(2, 2))\n"; +print OUT "for (i in 1:nrow(dists)) {\n"; +print OUT " if (dists[i,'avg'] > 0.5) {\n"; +print OUT " x <- seq(0.85,1,length=1000))\n"; +print OUT " y <- dnorm(x,mean=dists[i,'avg'],sd=dists[i,'sd']))\n"; +print OUT " plot(x,y,main=paste('Distribution for allele \"',dists[i,'allele'],'\"',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n"; +print OUT " abline(v=(dists[i,'avg']-3*dists[i,'sd']),col='red'))\n"; +print OUT " text(0.855,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n"; +print OUT " } else {)\n"; +print OUT " x <- seq(0,0.15,length=1000))\n"; +print OUT " y <- dnorm(x,dists[i,'avg'],sd=dists[i,'sd']))\n"; +print OUT " plot(x,y,main=paste('Distribution for \"',dists[i,'allele'],'\" variation',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n"; +print OUT " abline(v=(dists[i,'avg']+3*dists[i,'sd']),col='red'))\n"; +print OUT " text(0.1,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n"; +print OUT " })\n"; +print OUT "})\n"; +print OUT "dev.off())\n"; +close OUT; +system("cd $wd && Rscript MakePlots.R > /dev/null 2>&1"); + ## CALL SNPs ############ # create the R script. @@ -508,21 +538,7 @@ print OUT "nr <- length(data)\n"; print OUT "avg <- mean(data)\n"; print OUT "sdd <- sd(data)\n"; - #print OUT "if (avg > 0.5) {\n"; - #print OUT " x <- seq(0.8,1,length=1000)\n"; - #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n"; - #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n"; - #print OUT " abline(v=(avg-3*sdd),col='red')\n"; - #print OUT " text(0.81,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n"; - #print OUT "} else {\n"; - #print OUT " x <- seq(0,0.3,length=1000)\n"; - #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n"; - #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n"; - #print OUT " abline(v=(avg+3*sdd),col='red')\n"; - #print OUT " text(0.2,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n"; - #print OUT "}\n"; - #print OUT "dev.off()\n"; - print OUT "write(c(avg,sdd),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n"; + print OUT "write(c(avg,sdd,nr),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n"; close OUT; system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1"); }