Previous changeset 10:a7e7f9b8f538 (2014-09-27) Next changeset 12:8cc26598eeac (2014-09-27) |
Commit message:
re-upload, accidentally removed some files |
added:
DC_Genotyper.pl DC_Genotyper.xml readme.rst tool_dependencies.xml |
b |
diff -r a7e7f9b8f538 -r 845a87ad254a DC_Genotyper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DC_Genotyper.pl Sat Sep 27 05:38:17 2014 -0400 |
[ |
b'@@ -0,0 +1,530 @@\n+#!/usr/bin/perl\n+#############################\n+## DEEP COVERAGE GENOTYPER ##\n+#############################\n+# version : 0.0.1\n+# Principle: \n+# 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !!\n+# 2. remove known dbsnp positions (bcf file)\n+# 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions.\n+# 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction)\n+# 5. For abberant positions, check each alternate allele to see if it passes the background signal. \n+# 6. Generate VCF file. \n+\n+\n+##################\n+## LOAD MODULES ##\n+##################\n+use threads;\n+use threads::shared;\n+use Thread::Queue;\n+use Getopt::Std;\n+\n+####################\n+## get paramaters ##\n+####################\n+# t: target file\n+# b: bam file\n+# R: 2bit version of reference fasta.\n+# p: number of threads.\n+# s: dbsnp file\n+# m: minimal coverage (defaults 400x)\n+# P: ploidy\n+# a: outfile for allele distributions\n+# v: vcf file output.\n+getopts(\'t:b:R:p:s:m:P:v:a:\', \\%opts) ;\n+\n+## variables\n+my $twobit :shared;\n+my $igvgenome :shared;\n+if (!defined($opts{\'R\'})) {\n+\tdie("Reference Genomes not specified\\n");\n+}\n+my @refgenomes = split(",",$opts{\'r\'});\n+if (!-e $refgenomes[0]) {\n+\tdie("\'$refgenomes[0]\' is not a valid file path.");\n+}\n+else {\n+\t$twobit = $refgenomes[0];\n+}\n+if (!-e $refgenomes[1]) {\n+\tdie("\'$refgenomes[1]\' is not a valid file path.");\n+}\n+else {\n+\t$igvgenome = $refgenomes[1];\n+}\n+\n+\n+my $mincov :shared;\n+$mincov = 320;\n+if (defined($opts{\'m\'})) {\n+\t$mincov = $opts{\'m\'};\n+}\n+\n+my $ploidy :shared;\n+if (defined($opts{\'P\'}) && $opts{\'P\'} =~ m/^\\d+$/) {\n+\t$ploidy = $opts{\'P\'};\n+}\n+else {\n+\tdie("Ploidy (-P) was not specified or not an integer\\n");\n+}\n+\n+\n+if (defined($opts{\'v\'})) {\n+\t$outfile = $opts{\'v\'};\n+}\n+else {\n+\tdie("No output vcf-file specified.\\n");\n+}\n+if (!defined($opts{\'a\'})) {\n+\tdie("No output file specified for distribution details\\n");\n+}\n+## create working dir.\n+my $rand = int(rand(10000));\n+while (-d "/tmp/DC_Genotyper_$rand") {\n+\t$rand = int(rand(10000));\n+}\n+my $wd :shared;\n+$wd = "/tmp/DC_Genotyper_$rand";\n+system("mkdir \'$wd\'");\n+\n+\n+my $snpfile :shared;\n+my $hassnp :shared;\n+$hassnp = \'NoDbSNP\';\n+$snpfile = \'\';\n+if (defined($opts{\'s\'})) {\n+\t$snpfile = $opts{\'s\'};\n+\tif (!-e $snpfile) {\n+\t\tdie("\'$snpfile\' is not a valid file path.");\n+\t}\n+\n+\tmy $mime = `file $snpfile`;\n+\tif ($mime !~ m/compressed/) {\n+\t\tprint "$snpfile is not in compressed format. compressing & indexing the file now.\\n";\n+\t\t#print "... this takes a while\\n";\n+\t\tsystem("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz");\n+\t\tsystem("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");\n+\t\t$snpfile = "$wd/dbSNP.vcf.bgz";\n+\t}\n+\telsif (!-e "$snpfile.tbi") {\n+\t\tprint "tabix index file is missing for \'$snpfile\'. creating now.\\n";\n+\t\t## check if I can write it out for future use\n+\t\t$snpfile =~ m/(.*)([^\\/]+)$/;\n+\t\tmy $d = $1;\n+\t\tif (-w $d) {\n+\t\t\topen OUT, ">$d/lock";\n+\t\t\tflock(OUT,2);\n+\t\t\tsystem("cd $d && tabix -p vcf $snpfile");\n+\t\t\tclose OUT;\n+\t\t\tsystem("rm $d/lock");\n+\t\t}\n+\t\telse {\n+\t\t\tsystem("cp $snpfile /$wd/dbSNP.vcf.bgz");\n+\t\t\tsystem("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");\n+\t\t\t$snpfile = "$wd/dbSNP.vcf.bgz";\n+\t\t}\n+\t}\n+\t$hassnp = \'WithDbSNP\';\n+}\n+\n+\t\n+## 1. Get FASTA and prepare output hashes: \n+my $targets_one = Thread::Queue->new();\n+my $targets_two = Thread::Queue->new();\n+my $targets_three = Thread::Queue->new();\n+open IN, $opts{\'t\'} or die("Could not open $opts{\'t\'} file for reading");\n+if (-d "$wd/Fasta/") {\n+\tsystem("rm $wd/Fasta/*");\n+}\n+else {\n+\tsystem("mkdir $wd/Fasta");\n+}\n+## create the threads.\n+for (my $i = 1; $i<= $opts{\'p\'}; $i++) {\n+\t${\'thr\'.$i} = threads->create(\'FetchFasta\');\n+}\n+\n+## enqueue the targets.\n+my %thash;\n+while (<IN>) {\n+\tchomp;\n+\tmy ($chr,$start,$stop,$name,$score,$strand) = split(/\\t/,$_);\n+\t$targets_one->enqueue($_);\n+\t$targets_two->enqueue($_);\n'..b'instead of substr-calls in every bam, on every iteration.\n+\t\tfor (my $pos = 0; $pos < length($seq); $pos++) {\n+\t\t\t$ref_alleles{($pos+$start)} = substr($seq,$pos,1);\n+\t\t}\n+\t\t## get counts.\n+\t\tmy $target = "$chr:$start-$stop";\n+\t\tmy $command = "igvtools count -w 1 --bases --query \'$target\' \'$bam\' \'$wd/WIGS/$chr-$start-$stop.wig\' \'$igvgenome\' > /dev/null 2>&1";\n+\t\tsystem($command);\n+\t\topen WIG, "$wd/WIGS/$chr-$start-$stop.wig";\n+\t\tmy $h = <WIG>;\n+\t\t$h = <WIG>;\n+\t\t$h = <WIG>;\n+\t\tmy $target_counts = \'\';\n+\t\twhile (<WIG>) {\t\n+\t\t\tchomp;\n+\t\t\t#my ($pos, $a, $c, $g, $t , $n) = split(/\\t/,$_);\n+\t\t\tmy @p = split(/\\t/,$_);\n+\t\t\tmy $s = $p[1] + $p[2] + $p[3] + $p[4];\n+\t\t\t$target_counts .= "$p[0]\\t$ref_alleles{$p[0]}\\t$p[1]\\t$p[2]\\t$p[3]\\t$p[4]\\t$s\\n";\n+\t\t\t## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty).\n+\t\t\tif ($s > $mincov && !defined($snp->{$chr.\'-\'.$p[0]})) {\n+\t\t\t\t## for model of \'non-reference\' \n+\t\t\t\tmy $frac = $p[$map{$ref_alleles{$p[0]}}] / $s;\n+\t\t\t\t$counts{$ref_alleles{$p[0]}} .= $frac.\',\';\n+\t\t\t\t$out .= "$target\\t$p[0]\\t$ref_alleles{$p[0]}\\t$p[1]\\t$p[2]\\t$p[3]\\t$p[4]\\n";\n+\t\t\t\t## for each of the options background models\n+\t\t\t\tforeach(keys(%map)) {\n+\t\t\t\t\tif ($_ eq $ref_alleles{$p[0]}) {\n+\t\t\t\t\t\tnext;\n+\t\t\t\t\t}\n+\t\t\t\t\t$options{$ref_alleles{$p[0]}.\'-\'.$_} .= ($p[$map{$_}] / $s) .\',\';\n+\t\t\t\t}\n+\t\t\t\t\t \n+\t\t\t}\n+\t\t}\n+\t\tclose WIG;\n+\t\topen OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt";\n+\t\tflock(OUT, 2);\n+\t\tprint OUT $out;\n+\t\tclose OUT;\n+\t\topen OUT, ">$wd/WIGS/$chr.$start-$stop.txt";\n+\t\tprint OUT $target_counts;\n+\t\tclose OUT;\n+\t\t\t\n+\t}\n+\tforeach(keys(%counts)) {\n+\t\topen OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";\n+\t\tflock(OUT,2);\n+\t\tprint OUT $counts{$_};\n+\t\tclose OUT;\n+\t}\n+\tforeach(keys(%options)) {\n+\t\topen OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";\n+\t\tflock(OUT,2);\n+\t\tprint OUT $options{$_};\n+\t\tclose OUT;\n+\t}\n+}\n+\n+sub GetDistribution {\n+\twhile (defined(my $allele = $alleles->dequeue())) {\n+\t\tsystem("sed -i \'s/.\\$//\' \'$wd/counts_$allele.$mincov"."x.$hassnp.txt\'");\n+\t\topen OUT, ">$wd/GetDistribution.$allele.R";\n+\t\tprint OUT "sample <- \'$bam\'\\n";\n+\t\tprint OUT "nt <- \'$allele\'\\n";\n+\t\t#print OUT "pdf(file=\'$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf\',paper=\'a4\')\\n";\n+\t\tprint OUT "data <- scan(file=\'$wd/counts_$allele.$mincov"."x.$hassnp.txt\',sep=\',\')\\n";\n+\t\tprint OUT "nr <- length(data)\\n";\n+\t\tprint OUT "avg <- mean(data)\\n";\n+\t\tprint OUT "sdd <- sd(data)\\n";\n+\t\t#print OUT "if (avg > 0.5) {\\n";\n+\t\t#print OUT " x <- seq(0.8,1,length=1000)\\n";\n+\t\t#print OUT " y <- dnorm(x,mean=avg,sd=sdd)\\n";\n+\t\t#print OUT " plot(x,y,main=\'Distribution in sample $bam for nt $allele\',xlab=\'Allelic Ratio\',type=\'l\',lwd=1)\\n";\n+\t\t#print OUT " abline(v=(avg-3*sdd),col=\'red\')\\n";\n+\t\t#print OUT " text(0.81,max(y-0.5),paste(c(\'avg: \',avg,\'\\\\nsd: \',sdd,\'\\\\nnrDataPoints:\', nr,\'\\\\n$hassnp\\\\nMin.Cov:\',$mincov),sep=\' \',collapse=\'\'),adj=c(0,1))\\n";\n+\t\t#print OUT "} else {\\n";\n+\t\t#print OUT " x <- seq(0,0.3,length=1000)\\n";\n+\t\t#print OUT " y <- dnorm(x,mean=avg,sd=sdd)\\n";\n+\t\t#print OUT " plot(x,y,main=\'Distribution in sample $bam for nt $allele\',xlab=\'Allelic Ratio\',type=\'l\',lwd=1)\\n";\n+\t\t#print OUT " abline(v=(avg+3*sdd),col=\'red\')\\n";\n+\t\t#print OUT " text(0.2,max(y-0.5),paste(c(\'avg: \',avg,\'\\\\nsd: \',sdd,\'\\\\nnrDataPoints:\', nr,\'\\\\n$hassnp\\\\nMin.Cov:\',$mincov),sep=\' \',collapse=\'\'),adj=c(0,1))\\n";\n+\t\t#print OUT "}\\n";\n+\t\t#print OUT "dev.off()\\n";\n+\t\tprint OUT "write(c(avg,sdd),file=\'$wd/model.$allele.$mincov"."x.$hassnp.txt\',ncolumns=1)\\n";\n+\t\tclose OUT; \n+\t\tsystem("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1");\n+\t}\n+}\n+\n+\n+sub CallSNPs {\n+\twhile (defined(my $line = $targets_three->dequeue())) {\n+\t\t# split.\n+\t\tmy ($chr,$start,$stop,$name,$score,$strand) = split(/\\t/,$line);\t\n+\t\tmy $file = "$wd/WIGS/$chr.$start-$stop.txt";\n+\t\tmy $ofile = "$wd/VCF/$chr.$start-$stop.vcf";\n+\t\tsystem("cd $wd && Rscript CallSNPs.R \'$file\' \'$chr\' \'$ploidy\' \'$ofile\' \'$wd/dbsnp.txt\'");\n+\t}\n+\n+}\n+\n' |
b |
diff -r a7e7f9b8f538 -r 845a87ad254a DC_Genotyper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DC_Genotyper.xml Sat Sep 27 05:38:17 2014 -0400 |
b |
@@ -0,0 +1,83 @@ +<tool id="DC_Genotyper" name="DC Genotyper" version='0.0.1'> + <description></description> + <requirements> + <requirement type='package' version='3.0.2'>R_3_0_2</requirement> + <requirement type='package' version='0.1.18'>samtools</requirement> + <requirement type='package' version='0.2.6'>tabix</requirement> + <requirement type='package' version='latest'>blat_server</requirement> + <requirement type='package' version='1.92'>perl_module_threads</requirement> + <requirement type='package' version='1.46'>perl_module_threads_shared</requirement> + <requirement type='package' version='3.02'>perl_module_Thread_Queue</requirement> + <requirement type='package' version='2.3.32'>igvtools</requirement> + </requirements> + <command interpreter="perl">DC_Genotyper.pl + -t "$targets" + -b "$bamfile" + -R "${ref.fields.path}" + -p "\${GALAXY_SLOTS:-4}" + #if $dbsnp.source == "history": + -s "${dbsnp.ownFile}" + #else + -s "${dbsnp.indices.fields.path}" + #end if + -m $mincov + -P $ploidy + + -a $output1 + -v $output2 + </command> + + <inputs> + <param name="bamfile" type="data" format="bam" label="Sample BAM file" /> + <param name="targets" type="data" format="bed" label="Enrichment BED file" /> + <param name="ref" type="select" label="Select a reference genome"> + <options from_data_table="DC_Genotyper_indexes"> + <filter type="sort_by" column="2" /> + <validator type="no_options" message="No indexes are available" /> + </options> + </param> + <conditional name="dbsnp"> + <param name="source" type="select" label="Will you select a dbSNP file from your history, or use a built in version (which is faster)"> + <option value="indexed">Use a built-in version</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="indices" type="select" label="Select a dbSNP version"> + <options from_data_table="dbsnp_indexes"> + <filter type="sort_by" column="2" /> + <validator type="no_options" message="No indexes are available" /> + </options> + </param> + </when> + <when value="history"> + <param name="ownFile" type="data" format="vcf,bcf" label="Select a dbSNP file from history"/> + </when> + </conditional> + <param name="mincov" value="400" type="integer" label="Minimal Coverage Depth" /> + <param name="ploidy" type="integer" value='10' label="Expected Sample Ploidy" /> + </inputs> + + <outputs> + <data format='txt' name="output1" label="${tool.name} on ${on_string}: Allele Fraction Distributions"/> + <data format='vcf' name='output2' label="${tool.name} on ${on_string}: VCF file" /> + </outputs> +<help> + +**What it does** + + 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !! + 2. remove known dbsnp positions (bcf file) + 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions. + 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction) + 5. For abberant positions, check each alternate allele to see if it passes the background signal. + 6. Generate VCF file. + + +**Information** + +This tools is created by Geert Vandeweyer. It is a very early version with several limitations. Current limitations are : no support for indels, no plotting of the noise-models, incorrect syntax in for multi-allelic sites in the VCF file. + +Any feedback is welcome. + +</help> +</tool> |
b |
diff -r a7e7f9b8f538 -r 845a87ad254a readme.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.rst Sat Sep 27 05:38:17 2014 -0400 |
b |
@@ -0,0 +1,18 @@ +BACKGROUND: + +DC_Genotyper stands for Deep-Coverage Genotyper, and is aimed at detecting low fraction SNPs (no indels) in high-ploidy (or pooled) samples with very high coverage. It is being developed at the University of Antwerp by Geert Vandeweyer. + +METHOD: + +DC_Genotyper generates a background noise distributions on a per-sample basis, and uses these distrubutions to detect non-reference sites. For non-reference sites, Allele-specific distributions are used to estimate if an allele surpasses the background signal (e.g. A_to_G has different distribution as A_to_C, also reflected in Tr/Tv ratios). + +LIMITATION: + +This is a very early version with several limitations. Current limitations are : no support for indels, no plotting of the noise-models, incorrect syntax in for multi-allelic sites in the VCF file. + +Any feedback is welcome + + +INSTALLATION: + +After installation, complete the dbsnp.loc and dc_genotyper_indexes.loc files. The DC_genotyper_indexes.loc file has a specific format, specifying two index files per genome in a single line. Make sure you use comma to seperate these. DCG supports multithreading, but keep tests have shown that using more than 6-8 threads will lead to I/O bottlenecks. |
b |
diff -r a7e7f9b8f538 -r 845a87ad254a tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Sat Sep 27 05:38:17 2014 -0400 |
b |
@@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="R_3_0_2" version="3.0.2"> + <repository changeset_revision="50f7e1e71271" name="package_r_3_0_2" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="samtools" version="0.1.18"> + <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="perl_module_threads" version="1.92"> + <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="perl_module_threads_shared" version="1.46"> + <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="perl_module_Thread_Queue" version="3.02"> + <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="tabix" version="0.2.6"> + <repository changeset_revision="3d6beba7393e" name="package_tabix_0_2_6" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="blat_server" version="latest"> + <repository changeset_revision="5920144e9e9b" name="package_blat_server" owner="jjohnson" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="igvtools" version="2.3.32"> + <repository name="package_igvtools_2_3_32" changeset_revision="79f8f883a7a0" owner='geert-vandeweyer' toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency> + |