Previous changeset 9:7cbbf8aaa46c (2014-09-27) Next changeset 11:845a87ad254a (2014-09-27) |
Commit message:
Uploaded |
modified:
DC_Genotyper_indexes.loc.sample dbsnp_indexes.loc.sample tool_data_table_conf.xml.sample |
removed:
DC_Genotyper.pl DC_Genotyper.xml readme.rst tool_dependencies.xml |
b |
diff -r 7cbbf8aaa46c -r a7e7f9b8f538 DC_Genotyper.pl --- a/DC_Genotyper.pl Sat Sep 27 02:47:31 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,530 +0,0 @@\n-#!/usr/bin/perl\n-#############################\n-## DEEP COVERAGE GENOTYPER ##\n-#############################\n-# version : 0.0.1\n-# Principle: \n-# 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !!\n-# 2. remove known dbsnp positions (bcf file)\n-# 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions.\n-# 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction)\n-# 5. For abberant positions, check each alternate allele to see if it passes the background signal. \n-# 6. Generate VCF file. \n-\n-\n-##################\n-## LOAD MODULES ##\n-##################\n-use threads;\n-use threads::shared;\n-use Thread::Queue;\n-use Getopt::Std;\n-\n-####################\n-## get paramaters ##\n-####################\n-# t: target file\n-# b: bam file\n-# R: 2bit version of reference fasta.\n-# p: number of threads.\n-# s: dbsnp file\n-# m: minimal coverage (defaults 400x)\n-# P: ploidy\n-# a: outfile for allele distributions\n-# v: vcf file output.\n-getopts(\'t:b:R:p:s:m:P:v:a:\', \\%opts) ;\n-\n-## variables\n-my $twobit :shared;\n-my $igvgenome :shared;\n-if (!defined($opts{\'R\'})) {\n-\tdie("Reference Genomes not specified\\n");\n-}\n-my @refgenomes = split(",",$opts{\'r\'});\n-if (!-e $refgenomes[0]) {\n-\tdie("\'$refgenomes[0]\' is not a valid file path.");\n-}\n-else {\n-\t$twobit = $refgenomes[0];\n-}\n-if (!-e $refgenomes[1]) {\n-\tdie("\'$refgenomes[1]\' is not a valid file path.");\n-}\n-else {\n-\t$igvgenome = $refgenomes[1];\n-}\n-\n-\n-my $mincov :shared;\n-$mincov = 320;\n-if (defined($opts{\'m\'})) {\n-\t$mincov = $opts{\'m\'};\n-}\n-\n-my $ploidy :shared;\n-if (defined($opts{\'P\'}) && $opts{\'P\'} =~ m/^\\d+$/) {\n-\t$ploidy = $opts{\'P\'};\n-}\n-else {\n-\tdie("Ploidy (-P) was not specified or not an integer\\n");\n-}\n-\n-\n-if (defined($opts{\'v\'})) {\n-\t$outfile = $opts{\'v\'};\n-}\n-else {\n-\tdie("No output vcf-file specified.\\n");\n-}\n-if (!defined($opts{\'a\'})) {\n-\tdie("No output file specified for distribution details\\n");\n-}\n-## create working dir.\n-my $rand = int(rand(10000));\n-while (-d "/tmp/DC_Genotyper_$rand") {\n-\t$rand = int(rand(10000));\n-}\n-my $wd :shared;\n-$wd = "/tmp/DC_Genotyper_$rand";\n-system("mkdir \'$wd\'");\n-\n-\n-my $snpfile :shared;\n-my $hassnp :shared;\n-$hassnp = \'NoDbSNP\';\n-$snpfile = \'\';\n-if (defined($opts{\'s\'})) {\n-\t$snpfile = $opts{\'s\'};\n-\tif (!-e $snpfile) {\n-\t\tdie("\'$snpfile\' is not a valid file path.");\n-\t}\n-\n-\tmy $mime = `file $snpfile`;\n-\tif ($mime !~ m/compressed/) {\n-\t\tprint "$snpfile is not in compressed format. compressing & indexing the file now.\\n";\n-\t\t#print "... this takes a while\\n";\n-\t\tsystem("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz");\n-\t\tsystem("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");\n-\t\t$snpfile = "$wd/dbSNP.vcf.bgz";\n-\t}\n-\telsif (!-e "$snpfile.tbi") {\n-\t\tprint "tabix index file is missing for \'$snpfile\'. creating now.\\n";\n-\t\t## check if I can write it out for future use\n-\t\t$snpfile =~ m/(.*)([^\\/]+)$/;\n-\t\tmy $d = $1;\n-\t\tif (-w $d) {\n-\t\t\topen OUT, ">$d/lock";\n-\t\t\tflock(OUT,2);\n-\t\t\tsystem("cd $d && tabix -p vcf $snpfile");\n-\t\t\tclose OUT;\n-\t\t\tsystem("rm $d/lock");\n-\t\t}\n-\t\telse {\n-\t\t\tsystem("cp $snpfile /$wd/dbSNP.vcf.bgz");\n-\t\t\tsystem("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");\n-\t\t\t$snpfile = "$wd/dbSNP.vcf.bgz";\n-\t\t}\n-\t}\n-\t$hassnp = \'WithDbSNP\';\n-}\n-\n-\t\n-## 1. Get FASTA and prepare output hashes: \n-my $targets_one = Thread::Queue->new();\n-my $targets_two = Thread::Queue->new();\n-my $targets_three = Thread::Queue->new();\n-open IN, $opts{\'t\'} or die("Could not open $opts{\'t\'} file for reading");\n-if (-d "$wd/Fasta/") {\n-\tsystem("rm $wd/Fasta/*");\n-}\n-else {\n-\tsystem("mkdir $wd/Fasta");\n-}\n-## create the threads.\n-for (my $i = 1; $i<= $opts{\'p\'}; $i++) {\n-\t${\'thr\'.$i} = threads->create(\'FetchFasta\');\n-}\n-\n-## enqueue the targets.\n-my %thash;\n-while (<IN>) {\n-\tchomp;\n-\tmy ($chr,$start,$stop,$name,$score,$strand) = split(/\\t/,$_);\n-\t$targets_one->enqueue($_);\n-\t$targets_two->enqueue($_);\n'..b'instead of substr-calls in every bam, on every iteration.\n-\t\tfor (my $pos = 0; $pos < length($seq); $pos++) {\n-\t\t\t$ref_alleles{($pos+$start)} = substr($seq,$pos,1);\n-\t\t}\n-\t\t## get counts.\n-\t\tmy $target = "$chr:$start-$stop";\n-\t\tmy $command = "igvtools count -w 1 --bases --query \'$target\' \'$bam\' \'$wd/WIGS/$chr-$start-$stop.wig\' \'$igvgenome\' > /dev/null 2>&1";\n-\t\tsystem($command);\n-\t\topen WIG, "$wd/WIGS/$chr-$start-$stop.wig";\n-\t\tmy $h = <WIG>;\n-\t\t$h = <WIG>;\n-\t\t$h = <WIG>;\n-\t\tmy $target_counts = \'\';\n-\t\twhile (<WIG>) {\t\n-\t\t\tchomp;\n-\t\t\t#my ($pos, $a, $c, $g, $t , $n) = split(/\\t/,$_);\n-\t\t\tmy @p = split(/\\t/,$_);\n-\t\t\tmy $s = $p[1] + $p[2] + $p[3] + $p[4];\n-\t\t\t$target_counts .= "$p[0]\\t$ref_alleles{$p[0]}\\t$p[1]\\t$p[2]\\t$p[3]\\t$p[4]\\t$s\\n";\n-\t\t\t## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty).\n-\t\t\tif ($s > $mincov && !defined($snp->{$chr.\'-\'.$p[0]})) {\n-\t\t\t\t## for model of \'non-reference\' \n-\t\t\t\tmy $frac = $p[$map{$ref_alleles{$p[0]}}] / $s;\n-\t\t\t\t$counts{$ref_alleles{$p[0]}} .= $frac.\',\';\n-\t\t\t\t$out .= "$target\\t$p[0]\\t$ref_alleles{$p[0]}\\t$p[1]\\t$p[2]\\t$p[3]\\t$p[4]\\n";\n-\t\t\t\t## for each of the options background models\n-\t\t\t\tforeach(keys(%map)) {\n-\t\t\t\t\tif ($_ eq $ref_alleles{$p[0]}) {\n-\t\t\t\t\t\tnext;\n-\t\t\t\t\t}\n-\t\t\t\t\t$options{$ref_alleles{$p[0]}.\'-\'.$_} .= ($p[$map{$_}] / $s) .\',\';\n-\t\t\t\t}\n-\t\t\t\t\t \n-\t\t\t}\n-\t\t}\n-\t\tclose WIG;\n-\t\topen OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt";\n-\t\tflock(OUT, 2);\n-\t\tprint OUT $out;\n-\t\tclose OUT;\n-\t\topen OUT, ">$wd/WIGS/$chr.$start-$stop.txt";\n-\t\tprint OUT $target_counts;\n-\t\tclose OUT;\n-\t\t\t\n-\t}\n-\tforeach(keys(%counts)) {\n-\t\topen OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";\n-\t\tflock(OUT,2);\n-\t\tprint OUT $counts{$_};\n-\t\tclose OUT;\n-\t}\n-\tforeach(keys(%options)) {\n-\t\topen OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";\n-\t\tflock(OUT,2);\n-\t\tprint OUT $options{$_};\n-\t\tclose OUT;\n-\t}\n-}\n-\n-sub GetDistribution {\n-\twhile (defined(my $allele = $alleles->dequeue())) {\n-\t\tsystem("sed -i \'s/.\\$//\' \'$wd/counts_$allele.$mincov"."x.$hassnp.txt\'");\n-\t\topen OUT, ">$wd/GetDistribution.$allele.R";\n-\t\tprint OUT "sample <- \'$bam\'\\n";\n-\t\tprint OUT "nt <- \'$allele\'\\n";\n-\t\t#print OUT "pdf(file=\'$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf\',paper=\'a4\')\\n";\n-\t\tprint OUT "data <- scan(file=\'$wd/counts_$allele.$mincov"."x.$hassnp.txt\',sep=\',\')\\n";\n-\t\tprint OUT "nr <- length(data)\\n";\n-\t\tprint OUT "avg <- mean(data)\\n";\n-\t\tprint OUT "sdd <- sd(data)\\n";\n-\t\t#print OUT "if (avg > 0.5) {\\n";\n-\t\t#print OUT " x <- seq(0.8,1,length=1000)\\n";\n-\t\t#print OUT " y <- dnorm(x,mean=avg,sd=sdd)\\n";\n-\t\t#print OUT " plot(x,y,main=\'Distribution in sample $bam for nt $allele\',xlab=\'Allelic Ratio\',type=\'l\',lwd=1)\\n";\n-\t\t#print OUT " abline(v=(avg-3*sdd),col=\'red\')\\n";\n-\t\t#print OUT " text(0.81,max(y-0.5),paste(c(\'avg: \',avg,\'\\\\nsd: \',sdd,\'\\\\nnrDataPoints:\', nr,\'\\\\n$hassnp\\\\nMin.Cov:\',$mincov),sep=\' \',collapse=\'\'),adj=c(0,1))\\n";\n-\t\t#print OUT "} else {\\n";\n-\t\t#print OUT " x <- seq(0,0.3,length=1000)\\n";\n-\t\t#print OUT " y <- dnorm(x,mean=avg,sd=sdd)\\n";\n-\t\t#print OUT " plot(x,y,main=\'Distribution in sample $bam for nt $allele\',xlab=\'Allelic Ratio\',type=\'l\',lwd=1)\\n";\n-\t\t#print OUT " abline(v=(avg+3*sdd),col=\'red\')\\n";\n-\t\t#print OUT " text(0.2,max(y-0.5),paste(c(\'avg: \',avg,\'\\\\nsd: \',sdd,\'\\\\nnrDataPoints:\', nr,\'\\\\n$hassnp\\\\nMin.Cov:\',$mincov),sep=\' \',collapse=\'\'),adj=c(0,1))\\n";\n-\t\t#print OUT "}\\n";\n-\t\t#print OUT "dev.off()\\n";\n-\t\tprint OUT "write(c(avg,sdd),file=\'$wd/model.$allele.$mincov"."x.$hassnp.txt\',ncolumns=1)\\n";\n-\t\tclose OUT; \n-\t\tsystem("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1");\n-\t}\n-}\n-\n-\n-sub CallSNPs {\n-\twhile (defined(my $line = $targets_three->dequeue())) {\n-\t\t# split.\n-\t\tmy ($chr,$start,$stop,$name,$score,$strand) = split(/\\t/,$line);\t\n-\t\tmy $file = "$wd/WIGS/$chr.$start-$stop.txt";\n-\t\tmy $ofile = "$wd/VCF/$chr.$start-$stop.vcf";\n-\t\tsystem("cd $wd && Rscript CallSNPs.R \'$file\' \'$chr\' \'$ploidy\' \'$ofile\' \'$wd/dbsnp.txt\'");\n-\t}\n-\n-}\n-\n' |
b |
diff -r 7cbbf8aaa46c -r a7e7f9b8f538 DC_Genotyper.xml --- a/DC_Genotyper.xml Sat Sep 27 02:47:31 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,83 +0,0 @@ -<tool id="DC_Genotyper" name="DC Genotyper" version='0.0.1'> - <description></description> - <requirements> - <requirement type='package' version='3.0.2'>R_3_0_2</requirement> - <requirement type='package' version='0.1.18'>samtools</requirement> - <requirement type='package' version='0.2.6'>tabix</requirement> - <requirement type='package' version='latest'>blat_server</requirement> - <requirement type='package' version='1.92'>perl_module_threads</requirement> - <requirement type='package' version='1.46'>perl_module_threads_shared</requirement> - <requirement type='package' version='3.02'>perl_module_Thread_Queue</requirement> - <requirement type='package' version='2.3.32'>igvtools</requirement> - </requirements> - <command interpreter="perl">DC_Genotyper.pl - -t "$targets" - -b "$bamfile" - -R "${ref.fields.path}" - -p "\${GALAXY_SLOTS:-4}" - #if $dbsnp.source == "history": - -s "${dbsnp.ownFile}" - #else - -s "${dbsnp.indices.fields.path}" - #end if - -m $mincov - -P $ploidy - - -a $output1 - -v $output2 - </command> - - <inputs> - <param name="bamfile" type="data" format="bam" label="Sample BAM file" /> - <param name="targets" type="data" format="bed" label="Enrichment BED file" /> - <param name="ref" type="select" label="Select a reference genome"> - <options from_data_table="DC_Genotyper_indexes"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No indexes are available" /> - </options> - </param> - <conditional name="dbsnp"> - <param name="source" type="select" label="Will you select a dbSNP file from your history, or use a built in version (which is faster)"> - <option value="indexed">Use a built-in version</option> - <option value="history">Use one from the history</option> - </param> - <when value="indexed"> - <param name="indices" type="select" label="Select a dbSNP version"> - <options from_data_table="dbsnp_indexes"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No indexes are available" /> - </options> - </param> - </when> - <when value="history"> - <param name="ownFile" type="data" format="vcf,bcf" label="Select a dbSNP file from history"/> - </when> - </conditional> - <param name="mincov" value="400" type="integer" label="Minimal Coverage Depth" /> - <param name="ploidy" type="integer" value='10' label="Expected Sample Ploidy" /> - </inputs> - - <outputs> - <data format='txt' name="output1" label="${tool.name} on ${on_string}: Allele Fraction Distributions"/> - <data format='vcf' name='output2' label="${tool.name} on ${on_string}: VCF file" /> - </outputs> -<help> - -**What it does** - - 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !! - 2. remove known dbsnp positions (bcf file) - 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions. - 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction) - 5. For abberant positions, check each alternate allele to see if it passes the background signal. - 6. Generate VCF file. - - -**Information** - -This tools is created by Geert Vandeweyer. It is a very early version with several limitations. Current limitations are : no support for indels, no plotting of the noise-models, incorrect syntax in for multi-allelic sites in the VCF file. - -Any feedback is welcome. - -</help> -</tool> |
b |
diff -r 7cbbf8aaa46c -r a7e7f9b8f538 DC_Genotyper_indexes.loc.sample --- a/DC_Genotyper_indexes.loc.sample Sat Sep 27 02:47:31 2014 -0400 +++ b/DC_Genotyper_indexes.loc.sample Sat Sep 27 05:36:34 2014 -0400 |
b |
@@ -5,11 +5,11 @@ #the directories in which those files are stored. The DC_Genotyper.loc #file has this format (white space characters are TAB characters): # -#<unique_build_id> <display_name> <2bit_path,IGVtools_genome.path> +#<unique_build_id> <dbkey> <display_name> <2bit_path,IGVtools_genome.path> # #for example: # -#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/twobit/hg19.2bit,/depot/data2/galaxy/igvtools/hg19.chrom.sizes +#hg19 hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/twobit/hg19.2bit,/depot/data2/galaxy/igvtools/hg19.chrom.sizes # #then your /depot/data2/galaxy/twobit/ directory #would need to contain the following 2bit files: |
b |
diff -r 7cbbf8aaa46c -r a7e7f9b8f538 dbsnp_indexes.loc.sample --- a/dbsnp_indexes.loc.sample Sat Sep 27 02:47:31 2014 -0400 +++ b/dbsnp_indexes.loc.sample Sat Sep 27 05:36:34 2014 -0400 |
b |
@@ -5,13 +5,13 @@ #the directories in which those files are stored. The dbsnp_indexes.loc #file has this format (white space characters are TAB characters): # -#<unique_build_id> <display_name> </path/to/dbSNP.vcf.bgz> +#<unique_build_id> <dbkey> <display_name> </path/to/dbSNP.vcf.bgz> # #for example: # -#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_137.hg19.vcf.bgz -#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_138.hg19.vcf.bgz -#mm9 Mouse (Mus musculus): mm9 /depot/data2/galaxy/dbsnp/dbsnp_128.mm9.vcf.bgz +#hg19 hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_137.hg19.vcf.bgz +#hg19 hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_138.hg19.vcf.bgz +#mm9 mm9 Mouse (Mus musculus): mm9 /depot/data2/galaxy/dbsnp/dbsnp_128.mm9.vcf.bgz # #then your /depot/data2/galaxy/dbsnp/ directory #would need to contain the following files: |
b |
diff -r 7cbbf8aaa46c -r a7e7f9b8f538 readme.rst --- a/readme.rst Sat Sep 27 02:47:31 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,18 +0,0 @@ -BACKGROUND: - -DC_Genotyper stands for Deep-Coverage Genotyper, and is aimed at detecting low fraction SNPs (no indels) in high-ploidy (or pooled) samples with very high coverage. It is being developed at the University of Antwerp by Geert Vandeweyer. - -METHOD: - -DC_Genotyper generates a background noise distributions on a per-sample basis, and uses these distrubutions to detect non-reference sites. For non-reference sites, Allele-specific distributions are used to estimate if an allele surpasses the background signal (e.g. A_to_G has different distribution as A_to_C, also reflected in Tr/Tv ratios). - -LIMITATION: - -This is a very early version with several limitations. Current limitations are : no support for indels, no plotting of the noise-models, incorrect syntax in for multi-allelic sites in the VCF file. - -Any feedback is welcome - - -INSTALLATION: - -After installation, complete the dbsnp.loc and dc_genotyper_indexes.loc files. The DC_genotyper_indexes.loc file has a specific format, specifying two index files per genome in a single line. Make sure you use comma to seperate these. DCG supports multithreading, but keep tests have shown that using more than 6-8 threads will lead to I/O bottlenecks. |
b |
diff -r 7cbbf8aaa46c -r a7e7f9b8f538 tool_data_table_conf.xml.sample --- a/tool_data_table_conf.xml.sample Sat Sep 27 02:47:31 2014 -0400 +++ b/tool_data_table_conf.xml.sample Sat Sep 27 05:36:34 2014 -0400 |
b |
@@ -2,11 +2,11 @@ <tables> <!-- Locations of all index files under genome directory --> <table name="DC_Genotyper_indexes" comment_char="#"> - <columns>name,value, path</columns> + <columns>value, dbkey, name, path</columns> <file path="DC_Genotyper_indexes.loc" /> </table> <table name="dbsnp_indexes" comment_char="#"> - <columns>name,value, path</columns> + <columns>value, dbkey, name, path</columns> <file path="dbsnp_indexes.loc" /> </table> |
b |
diff -r 7cbbf8aaa46c -r a7e7f9b8f538 tool_dependencies.xml --- a/tool_dependencies.xml Sat Sep 27 02:47:31 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,28 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="R_3_0_2" version="3.0.2"> - <repository changeset_revision="50f7e1e71271" name="package_r_3_0_2" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="samtools" version="0.1.18"> - <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="perl_module_threads" version="1.92"> - <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="perl_module_threads_shared" version="1.46"> - <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="perl_module_Thread_Queue" version="3.02"> - <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="tabix" version="0.2.6"> - <repository changeset_revision="3d6beba7393e" name="package_tabix_0_2_6" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="blat_server" version="latest"> - <repository changeset_revision="5920144e9e9b" name="package_blat_server" owner="jjohnson" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="igvtools" version="2.3.32"> - <repository name="package_igvtools_2_3_32" changeset_revision="79f8f883a7a0" owner='geert-vandeweyer' toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency> - |