Mercurial > repos > geert-vandeweyer > dc_genotyper
changeset 10:a7e7f9b8f538 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Sat, 27 Sep 2014 05:36:34 -0400 |
parents | 7cbbf8aaa46c |
children | 845a87ad254a |
files | DC_Genotyper.pl DC_Genotyper.xml DC_Genotyper_indexes.loc.sample dbsnp_indexes.loc.sample readme.rst tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 7 files changed, 8 insertions(+), 667 deletions(-) [+] |
line wrap: on
line diff
--- a/DC_Genotyper.pl Sat Sep 27 02:47:31 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,530 +0,0 @@ -#!/usr/bin/perl -############################# -## DEEP COVERAGE GENOTYPER ## -############################# -# version : 0.0.1 -# Principle: -# 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !! -# 2. remove known dbsnp positions (bcf file) -# 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions. -# 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction) -# 5. For abberant positions, check each alternate allele to see if it passes the background signal. -# 6. Generate VCF file. - - -################## -## LOAD MODULES ## -################## -use threads; -use threads::shared; -use Thread::Queue; -use Getopt::Std; - -#################### -## get paramaters ## -#################### -# t: target file -# b: bam file -# R: 2bit version of reference fasta. -# p: number of threads. -# s: dbsnp file -# m: minimal coverage (defaults 400x) -# P: ploidy -# a: outfile for allele distributions -# v: vcf file output. -getopts('t:b:R:p:s:m:P:v:a:', \%opts) ; - -## variables -my $twobit :shared; -my $igvgenome :shared; -if (!defined($opts{'R'})) { - die("Reference Genomes not specified\n"); -} -my @refgenomes = split(",",$opts{'r'}); -if (!-e $refgenomes[0]) { - die("'$refgenomes[0]' is not a valid file path."); -} -else { - $twobit = $refgenomes[0]; -} -if (!-e $refgenomes[1]) { - die("'$refgenomes[1]' is not a valid file path."); -} -else { - $igvgenome = $refgenomes[1]; -} - - -my $mincov :shared; -$mincov = 320; -if (defined($opts{'m'})) { - $mincov = $opts{'m'}; -} - -my $ploidy :shared; -if (defined($opts{'P'}) && $opts{'P'} =~ m/^\d+$/) { - $ploidy = $opts{'P'}; -} -else { - die("Ploidy (-P) was not specified or not an integer\n"); -} - - -if (defined($opts{'v'})) { - $outfile = $opts{'v'}; -} -else { - die("No output vcf-file specified.\n"); -} -if (!defined($opts{'a'})) { - die("No output file specified for distribution details\n"); -} -## create working dir. -my $rand = int(rand(10000)); -while (-d "/tmp/DC_Genotyper_$rand") { - $rand = int(rand(10000)); -} -my $wd :shared; -$wd = "/tmp/DC_Genotyper_$rand"; -system("mkdir '$wd'"); - - -my $snpfile :shared; -my $hassnp :shared; -$hassnp = 'NoDbSNP'; -$snpfile = ''; -if (defined($opts{'s'})) { - $snpfile = $opts{'s'}; - if (!-e $snpfile) { - die("'$snpfile' is not a valid file path."); - } - - my $mime = `file $snpfile`; - if ($mime !~ m/compressed/) { - print "$snpfile is not in compressed format. compressing & indexing the file now.\n"; - #print "... this takes a while\n"; - system("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz"); - system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz"); - $snpfile = "$wd/dbSNP.vcf.bgz"; - } - elsif (!-e "$snpfile.tbi") { - print "tabix index file is missing for '$snpfile'. creating now.\n"; - ## check if I can write it out for future use - $snpfile =~ m/(.*)([^\/]+)$/; - my $d = $1; - if (-w $d) { - open OUT, ">$d/lock"; - flock(OUT,2); - system("cd $d && tabix -p vcf $snpfile"); - close OUT; - system("rm $d/lock"); - } - else { - system("cp $snpfile /$wd/dbSNP.vcf.bgz"); - system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz"); - $snpfile = "$wd/dbSNP.vcf.bgz"; - } - } - $hassnp = 'WithDbSNP'; -} - - -## 1. Get FASTA and prepare output hashes: -my $targets_one = Thread::Queue->new(); -my $targets_two = Thread::Queue->new(); -my $targets_three = Thread::Queue->new(); -open IN, $opts{'t'} or die("Could not open $opts{'t'} file for reading"); -if (-d "$wd/Fasta/") { - system("rm $wd/Fasta/*"); -} -else { - system("mkdir $wd/Fasta"); -} -## create the threads. -for (my $i = 1; $i<= $opts{'p'}; $i++) { - ${'thr'.$i} = threads->create('FetchFasta'); -} - -## enqueue the targets. -my %thash; -while (<IN>) { - chomp; - my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$_); - $targets_one->enqueue($_); - $targets_two->enqueue($_); - $targets_three->enqueue($_); - $thash{$chr}{$start} = $stop; -} -close IN; - -## end the threads. -for (my $i = 1; $i <= $opts{'p'}; $i++) { - $targets_one->enqueue(undef); -} - -for (my $i = 1; $i <= $opts{'p'}; $i++) { - ${'thr'.$i}->join(); -} - -## load dbSNP inside target regions into shared structure. -########################################################## -my %dbsnp :shared; -if ($snpfile ne '') { - my $bcf = `which bcftools`; - chomp($bcf); - if ($bcf ne '') { - my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt"; - system("$command"); - open IN, "$wd/dbsnp.txt"; - while (<IN>) { - chomp; - my @p = split(/\t/,$_); - $dbsnp{$p[0].'-'.$p[1]} = $p[2].'-'.$p[3].'-'.$p[4]; - } - close IN; - } - else { - print "WARNING: BCFtools is not in the path. Skipping snp handling.\n"; - $snpfile = ''; - system("touch $wd/dbsnp.txt"); - } -} -else { - system("touch $wd/dbsnp.txt"); -} - -## now process the bam file. -mkdir "$wd/WIGS/"; -my $bam :shared; -$bam = $opts{'b'}; -my $bai = $bam.".bai"; -if (!-e $bai) { - print "BAI ($bai) missing for $bam : creating\n"; - system("samtools index $bam"); -} - -for (my $i = 1; $i <= $opts{'p'}; $i++) { - ${'thr'.$i} = threads->create('CountAlleles'); -} -## end the threads. -for (my $i = 1; $i <= $opts{'p'}; $i++) { - $targets_two->enqueue(undef); -} - -for (my $i = 1; $i <= $opts{'p'}; $i++) { - ${'thr'.$i}->join(); -} - -## generate the distributions. -############################## -my $alleles = Thread::Queue->new(); -my %all = ('A' => 1,'C' => 2,'G' => 3, 'T' => 4); -foreach(keys(%all)) { - $alleles->enqueue($_); - my $a = $_; - foreach(keys(%all)) { - if ($_ eq $a) { - next; - } - $alleles->enqueue($a.'-'.$_); - } -} -for (my $i = 1; $i <= $opts{'p'}; $i++) { - ${'thr'.$i} = threads->create('GetDistribution'); -} -## end the threads. -for (my $i = 1; $i <= $opts{'p'}; $i++) { - $alleles->enqueue(undef); -} - -for (my $i = 1; $i <= $opts{'p'}; $i++) { - ${'thr'.$i}->join(); -} - -## group distributions into one file -#################################### -my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5); -open OUT, ">".$opts{'a'}; -print OUT "allele\tavg\tsd\n"; -foreach(keys(%map)) { - my $r = $_; - my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt"; - open IN, "$f"; - my $a = <IN>; - chomp($a); - #$dists{$r}{'avg'} = $a; - my $s = <IN>; - chomp($s); - #$dists{$r}{'sd'} = $s; - close IN; - print OUT "$r\t$a\t$s\n"; - foreach(keys(%map)) { - if ($_ eq $r) { - next; - } - my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt"; - open IN, "$f"; - my $a = <IN>; - chomp($a); - my $s = <IN>; - chomp($s); - close IN; - print OUT "$r-$_\t$a\t$s\n"; - } -} -close OUT; - -## CALL SNPs -############ -# create the R script. -open R, ">$wd/CallSNPs.R"; -print R "args <- commandArgs(trailingOnly = TRUE)\n"; -print R "counts <- read.table(file=args[1],header=FALSE, as.is=TRUE)\n"; -print R "ploidy <- as.integer(args[3])\n"; -print R "chr <- args[2]\n"; -print R "snps <- read.table(file=args[5],header=FALSE,as.is=TRUE)\n"; -print R "colnames(snps) <- c('chr','pos','ref','alt','id')\n"; -print R "colnames(counts) <- c('pos','ref','A','C','G','T','TotalDepth')\n"; -print R "dists <- read.table(file='$wd/allelic_distributions.txt',header=TRUE,as.is=TRUE)\n"; -print R 'rownames(dists) = dists$allele'."\n"; -print R 'dists <- dists[,-1]'."\n"; -print R "vcf <- c()\n"; -print R "lower <- c()\n"; -print R "higher <- c()\n"; -print R "for (i in 1:(ploidy)) {\n"; -print R " lower[length(lower)+1] <- (2*i-1)/(2*ploidy)\n"; -print R " higher[length(higher)+1] <- (2*i+1)/(2*ploidy)\n"; -print R "}\n"; -print R "for (i in 1:nrow(counts)) {\n"; -print R " if (counts[i,'TotalDepth'] == 0) next\n"; -print R " # significantly different from reference?\n"; -print R " z <- ((counts[i,counts[i,'ref']]/counts[i,'TotalDepth']) - dists[counts[i,'ref'],'avg']) / dists[counts[i,'ref'],'sd']\n"; -print R " if (abs(z) > 3) {\n"; -print R " # test all alterate alleles to see which one is significant.\n"; -print R " for (j in c('A','C','G','T')) {\n"; -print R " if (j == counts[i,'ref']) next\n"; -print R " z <- ((counts[i,j]/counts[i,'TotalDepth']) - dists[paste(counts[i,'ref'],'-',j,sep=''),'avg']) / dists[paste(counts[i,'ref'],'-',j,sep=''),'sd']\n"; -print R " if (abs(z) > 3){\n"; -print R " filter <- 'PASS'\n"; -print R " phred <- round(-10*log(pnorm(-abs(z))),digits=0)\n"; -print R " if (phred > 9999) phred <- 9999\n"; -print R " frac <- counts[i,j]/counts[i,'TotalDepth']\n"; -print R " for (k in 1:ploidy) {\n"; -print R " if (frac >= lower[k] && frac < higher[k]) {\n"; -print R " sample <- paste(paste(paste(rep(0,(ploidy-k)),sep='',collapse='/'),paste(rep(1,k),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n"; -print R " af <- k/ploidy\n"; -print R " break\n"; -print R " }\n"; -print R " }\n"; -print R " if (frac < lower[1]) {\n"; -print R " sample <- paste(paste(paste(rep(0,(ploidy-1)),sep='',collapse='/'),paste(rep(1,1),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n"; -print R " af <- 1/ploidy\n"; -print R " filter <- 'LowFraction'\n"; -print R " }\n"; -print R " if (counts[i,'TotalDepth'] < $mincov) {\n"; -print R " filter <- 'LowCoverage'\n"; -print R " }\n"; -print R " info <- paste('DP=',counts[i,'TotalDepth'],';AF=',round(af,digits=5),';AR=',round(frac,digits=5),sep='')\n"; -print R " snpids <- which(snps\$chr == chr & snps\$pos == counts[i,'pos'])\n"; -print R " id <- '.'\n"; -print R " if (length(snpids) > 0) id <- snps[snpids[1],'id']\n"; -print R " vcf[length(vcf)+1] <- paste(chr,counts[i,'pos'],id,counts[i,'ref'],j,phred,filter,info,'GT:AD',sample,sep='\\t',collapse='')\n"; -print R " }\n"; -print R " }\n"; -print R " }\n"; -print R "}\n"; -print R "if (length(vcf) > 0) {\n"; -print R " write(file=args[4],paste(vcf,sep='\\n'))\n"; -print R "}\n"; -close R; -system("mkdir $wd/VCF/"); -for (my $i = 1; $i <= $opts{'p'}; $i++) { - ${'thr'.$i} = threads->create('CallSNPs'); -} -## end the threads. -for (my $i = 1; $i <= $opts{'p'}; $i++) { - $targets_three->enqueue(undef); -} - -for (my $i = 1; $i <= $opts{'p'}; $i++) { - ${'thr'.$i}->join(); -} - -## BUILD FINAL VCF -open OUT, ">$outfile"; -print OUT "##fileformat=VCFv4.1\n"; -print OUT "##source=High_Ploidy_Genotyper_v.0.1\n"; -print OUT "##genome_reference=$twobit\n"; -if ($snpfile ne '') { - print OUT "##SNP_file=$snpfile\n"; -} -foreach(keys(%thash)) { - print OUT "##contig=<ID=$_,assembly=hg19,species=\"Homo Sapiens\">\n"; -} -print OUT "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"; -print OUT "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n"; -print OUT "##INFO=<ID=AR,Number=1,Type=Float,Description=\"Allelic Ratio\">\n"; -print OUT "##FILTER=<ID=LowFraction,Description=\"Allelic Fraction under 1/2*$ploidy\">\n"; -print OUT "##FILTER=<ID=LowCoverage,Description=\"Total Depth is lower than threshold of $mincov\">\n"; -print OUT "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"; -print OUT "##FORMAT=<ID=AD,Number=2,type=Integer,Description,\"Allelic Depth\">\n"; -print OUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"; -close OUT; -@i = ( 1 .. 22,'X','Y','M' ); -foreach(@i) { - my $chr = "chr$_"; - foreach(sort {$a <=> $b} keys(%{$thash{$chr}})) { - my $v = "$wd/VCF/$chr.$_-".$thash{$chr}{$_}.".vcf"; - if (-e $v) { - system("cat '$v' >> '$outfile'"); - } - } -} - -## clean up -system("rm -Rf '$wd'"); - -sub FetchFasta { - while(defined(my $line = $targets_one->dequeue())) { - my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); - # 2bit is zero based, non-including => decrease start by one - $startposition = $start - 1; - my $command = "twoBitToFa -seq=$chr -start=$startposition -end=$stop -noMask $twobit $wd/Fasta/$chr-$start-$stop.fasta"; - system($command); - } -} - -sub CountAlleles { - # local version of hashes - my $snp = \%dbsnp; - my %counts; - $counts{'A'} = ''; - $counts{'C'} = ''; - $counts{'G'} = ''; - $counts{'T'} = ''; - my %map =('A' => 1,'C' => 2,'G' => 3, 'T' => 4); - my %options; - foreach(keys(%map)) { - my $r = $_; - foreach(keys(%map)) { - if ($_ eq $r) { - next; - } - $options{$r.'-'.$_} = ''; - } - } - while (defined(my $line = $targets_two->dequeue())) { - $out = ''; - my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); - ## get reference alleles - my %ref_alleles; - open FASTA, "$wd/Fasta/$chr-$start-$stop.fasta"; - my $head = <FASTA>; - my $seq = ''; - while (<FASTA>) { - chomp; - $seq .= $_; - } - close FASTA; - # this generates a hash of the reference alleles once, instead of substr-calls in every bam, on every iteration. - for (my $pos = 0; $pos < length($seq); $pos++) { - $ref_alleles{($pos+$start)} = substr($seq,$pos,1); - } - ## get counts. - my $target = "$chr:$start-$stop"; - my $command = "igvtools count -w 1 --bases --query '$target' '$bam' '$wd/WIGS/$chr-$start-$stop.wig' '$igvgenome' > /dev/null 2>&1"; - system($command); - open WIG, "$wd/WIGS/$chr-$start-$stop.wig"; - my $h = <WIG>; - $h = <WIG>; - $h = <WIG>; - my $target_counts = ''; - while (<WIG>) { - chomp; - #my ($pos, $a, $c, $g, $t , $n) = split(/\t/,$_); - my @p = split(/\t/,$_); - my $s = $p[1] + $p[2] + $p[3] + $p[4]; - $target_counts .= "$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$s\n"; - ## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty). - if ($s > $mincov && !defined($snp->{$chr.'-'.$p[0]})) { - ## for model of 'non-reference' - my $frac = $p[$map{$ref_alleles{$p[0]}}] / $s; - $counts{$ref_alleles{$p[0]}} .= $frac.','; - $out .= "$target\t$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\n"; - ## for each of the options background models - foreach(keys(%map)) { - if ($_ eq $ref_alleles{$p[0]}) { - next; - } - $options{$ref_alleles{$p[0]}.'-'.$_} .= ($p[$map{$_}] / $s) .','; - } - - } - } - close WIG; - open OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt"; - flock(OUT, 2); - print OUT $out; - close OUT; - open OUT, ">$wd/WIGS/$chr.$start-$stop.txt"; - print OUT $target_counts; - close OUT; - - } - foreach(keys(%counts)) { - open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt"; - flock(OUT,2); - print OUT $counts{$_}; - close OUT; - } - foreach(keys(%options)) { - open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt"; - flock(OUT,2); - print OUT $options{$_}; - close OUT; - } -} - -sub GetDistribution { - while (defined(my $allele = $alleles->dequeue())) { - system("sed -i 's/.\$//' '$wd/counts_$allele.$mincov"."x.$hassnp.txt'"); - open OUT, ">$wd/GetDistribution.$allele.R"; - print OUT "sample <- '$bam'\n"; - print OUT "nt <- '$allele'\n"; - #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n"; - print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n"; - print OUT "nr <- length(data)\n"; - print OUT "avg <- mean(data)\n"; - print OUT "sdd <- sd(data)\n"; - #print OUT "if (avg > 0.5) {\n"; - #print OUT " x <- seq(0.8,1,length=1000)\n"; - #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n"; - #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n"; - #print OUT " abline(v=(avg-3*sdd),col='red')\n"; - #print OUT " text(0.81,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n"; - #print OUT "} else {\n"; - #print OUT " x <- seq(0,0.3,length=1000)\n"; - #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n"; - #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n"; - #print OUT " abline(v=(avg+3*sdd),col='red')\n"; - #print OUT " text(0.2,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n"; - #print OUT "}\n"; - #print OUT "dev.off()\n"; - print OUT "write(c(avg,sdd),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n"; - close OUT; - system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1"); - } -} - - -sub CallSNPs { - while (defined(my $line = $targets_three->dequeue())) { - # split. - my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); - my $file = "$wd/WIGS/$chr.$start-$stop.txt"; - my $ofile = "$wd/VCF/$chr.$start-$stop.vcf"; - system("cd $wd && Rscript CallSNPs.R '$file' '$chr' '$ploidy' '$ofile' '$wd/dbsnp.txt'"); - } - -} -
--- a/DC_Genotyper.xml Sat Sep 27 02:47:31 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -<tool id="DC_Genotyper" name="DC Genotyper" version='0.0.1'> - <description></description> - <requirements> - <requirement type='package' version='3.0.2'>R_3_0_2</requirement> - <requirement type='package' version='0.1.18'>samtools</requirement> - <requirement type='package' version='0.2.6'>tabix</requirement> - <requirement type='package' version='latest'>blat_server</requirement> - <requirement type='package' version='1.92'>perl_module_threads</requirement> - <requirement type='package' version='1.46'>perl_module_threads_shared</requirement> - <requirement type='package' version='3.02'>perl_module_Thread_Queue</requirement> - <requirement type='package' version='2.3.32'>igvtools</requirement> - </requirements> - <command interpreter="perl">DC_Genotyper.pl - -t "$targets" - -b "$bamfile" - -R "${ref.fields.path}" - -p "\${GALAXY_SLOTS:-4}" - #if $dbsnp.source == "history": - -s "${dbsnp.ownFile}" - #else - -s "${dbsnp.indices.fields.path}" - #end if - -m $mincov - -P $ploidy - - -a $output1 - -v $output2 - </command> - - <inputs> - <param name="bamfile" type="data" format="bam" label="Sample BAM file" /> - <param name="targets" type="data" format="bed" label="Enrichment BED file" /> - <param name="ref" type="select" label="Select a reference genome"> - <options from_data_table="DC_Genotyper_indexes"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No indexes are available" /> - </options> - </param> - <conditional name="dbsnp"> - <param name="source" type="select" label="Will you select a dbSNP file from your history, or use a built in version (which is faster)"> - <option value="indexed">Use a built-in version</option> - <option value="history">Use one from the history</option> - </param> - <when value="indexed"> - <param name="indices" type="select" label="Select a dbSNP version"> - <options from_data_table="dbsnp_indexes"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No indexes are available" /> - </options> - </param> - </when> - <when value="history"> - <param name="ownFile" type="data" format="vcf,bcf" label="Select a dbSNP file from history"/> - </when> - </conditional> - <param name="mincov" value="400" type="integer" label="Minimal Coverage Depth" /> - <param name="ploidy" type="integer" value='10' label="Expected Sample Ploidy" /> - </inputs> - - <outputs> - <data format='txt' name="output1" label="${tool.name} on ${on_string}: Allele Fraction Distributions"/> - <data format='vcf' name='output2' label="${tool.name} on ${on_string}: VCF file" /> - </outputs> -<help> - -**What it does** - - 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !! - 2. remove known dbsnp positions (bcf file) - 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions. - 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction) - 5. For abberant positions, check each alternate allele to see if it passes the background signal. - 6. Generate VCF file. - - -**Information** - -This tools is created by Geert Vandeweyer. It is a very early version with several limitations. Current limitations are : no support for indels, no plotting of the noise-models, incorrect syntax in for multi-allelic sites in the VCF file. - -Any feedback is welcome. - -</help> -</tool>
--- a/DC_Genotyper_indexes.loc.sample Sat Sep 27 02:47:31 2014 -0400 +++ b/DC_Genotyper_indexes.loc.sample Sat Sep 27 05:36:34 2014 -0400 @@ -5,11 +5,11 @@ #the directories in which those files are stored. The DC_Genotyper.loc #file has this format (white space characters are TAB characters): # -#<unique_build_id> <display_name> <2bit_path,IGVtools_genome.path> +#<unique_build_id> <dbkey> <display_name> <2bit_path,IGVtools_genome.path> # #for example: # -#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/twobit/hg19.2bit,/depot/data2/galaxy/igvtools/hg19.chrom.sizes +#hg19 hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/twobit/hg19.2bit,/depot/data2/galaxy/igvtools/hg19.chrom.sizes # #then your /depot/data2/galaxy/twobit/ directory #would need to contain the following 2bit files:
--- a/dbsnp_indexes.loc.sample Sat Sep 27 02:47:31 2014 -0400 +++ b/dbsnp_indexes.loc.sample Sat Sep 27 05:36:34 2014 -0400 @@ -5,13 +5,13 @@ #the directories in which those files are stored. The dbsnp_indexes.loc #file has this format (white space characters are TAB characters): # -#<unique_build_id> <display_name> </path/to/dbSNP.vcf.bgz> +#<unique_build_id> <dbkey> <display_name> </path/to/dbSNP.vcf.bgz> # #for example: # -#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_137.hg19.vcf.bgz -#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_138.hg19.vcf.bgz -#mm9 Mouse (Mus musculus): mm9 /depot/data2/galaxy/dbsnp/dbsnp_128.mm9.vcf.bgz +#hg19 hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_137.hg19.vcf.bgz +#hg19 hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_138.hg19.vcf.bgz +#mm9 mm9 Mouse (Mus musculus): mm9 /depot/data2/galaxy/dbsnp/dbsnp_128.mm9.vcf.bgz # #then your /depot/data2/galaxy/dbsnp/ directory #would need to contain the following files:
--- a/readme.rst Sat Sep 27 02:47:31 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -BACKGROUND: - -DC_Genotyper stands for Deep-Coverage Genotyper, and is aimed at detecting low fraction SNPs (no indels) in high-ploidy (or pooled) samples with very high coverage. It is being developed at the University of Antwerp by Geert Vandeweyer. - -METHOD: - -DC_Genotyper generates a background noise distributions on a per-sample basis, and uses these distrubutions to detect non-reference sites. For non-reference sites, Allele-specific distributions are used to estimate if an allele surpasses the background signal (e.g. A_to_G has different distribution as A_to_C, also reflected in Tr/Tv ratios). - -LIMITATION: - -This is a very early version with several limitations. Current limitations are : no support for indels, no plotting of the noise-models, incorrect syntax in for multi-allelic sites in the VCF file. - -Any feedback is welcome - - -INSTALLATION: - -After installation, complete the dbsnp.loc and dc_genotyper_indexes.loc files. The DC_genotyper_indexes.loc file has a specific format, specifying two index files per genome in a single line. Make sure you use comma to seperate these. DCG supports multithreading, but keep tests have shown that using more than 6-8 threads will lead to I/O bottlenecks.
--- a/tool_data_table_conf.xml.sample Sat Sep 27 02:47:31 2014 -0400 +++ b/tool_data_table_conf.xml.sample Sat Sep 27 05:36:34 2014 -0400 @@ -2,11 +2,11 @@ <tables> <!-- Locations of all index files under genome directory --> <table name="DC_Genotyper_indexes" comment_char="#"> - <columns>name,value, path</columns> + <columns>value, dbkey, name, path</columns> <file path="DC_Genotyper_indexes.loc" /> </table> <table name="dbsnp_indexes" comment_char="#"> - <columns>name,value, path</columns> + <columns>value, dbkey, name, path</columns> <file path="dbsnp_indexes.loc" /> </table>
--- a/tool_dependencies.xml Sat Sep 27 02:47:31 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="R_3_0_2" version="3.0.2"> - <repository changeset_revision="50f7e1e71271" name="package_r_3_0_2" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="samtools" version="0.1.18"> - <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="perl_module_threads" version="1.92"> - <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="perl_module_threads_shared" version="1.46"> - <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="perl_module_Thread_Queue" version="3.02"> - <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="tabix" version="0.2.6"> - <repository changeset_revision="3d6beba7393e" name="package_tabix_0_2_6" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="blat_server" version="latest"> - <repository changeset_revision="5920144e9e9b" name="package_blat_server" owner="jjohnson" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="igvtools" version="2.3.32"> - <repository name="package_igvtools_2_3_32" changeset_revision="79f8f883a7a0" owner='geert-vandeweyer' toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency> -