Galaxy |

Changeset 0:e8a32d824f39 (2014-09-25)

Next changeset 1:15be98f10c13 (2014-09-25)

Commit message:
Uploaded

added:
DC_Genotyper.pl
DC_Genotyper.xml
DC_Genotyper_indexes.log.sample
dbsnp_indexes.loc.sample
readme.rst
tool_dependencies.xml

diff -r 000000000000 -r e8a32d824f39 DC_Genotyper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/DC_Genotyper.pl Thu Sep 25 05:24:35 2014 -0400

[

b'@@ -0,0 +1,530 @@\n+#!/usr/bin/perl\n+#############################\n+## DEEP COVERAGE GENOTYPER ##\n+#############################\n+# version : 0.0.1\n+# Principle: \n+# 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !!\n+# 2. remove known dbsnp positions (bcf file)\n+# 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions.\n+# 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction)\n+# 5. For abberant positions, check each alternate allele to see if it passes the background signal. \n+# 6. Generate VCF file. \n+\n+\n+##################\n+## LOAD MODULES ##\n+##################\n+use threads;\n+use threads::shared;\n+use Thread::Queue;\n+use Getopt::Std;\n+\n+####################\n+## get paramaters ##\n+####################\n+# t: target file\n+# b: bam file\n+# R: 2bit version of reference fasta.\n+# p: number of threads.\n+# s: dbsnp file\n+# m: minimal coverage (defaults 400x)\n+# P: ploidy\n+# a: outfile for allele distributions\n+# v: vcf file output.\n+getopts(\'t:b:R:p:s:m:P:v:a:\', \\%opts) ;\n+\n+## variables\n+my $twobit :shared;\n+my $igvgenome :shared;\n+if (!defined($opts{\'R\'})) {\n+\tdie("Reference Genomes not specified\\n");\n+}\n+my @refgenomes = split(",",$opts{\'r\'});\n+if (!-e $refgenomes[0]) {\n+\tdie("\'$refgenomes[0]\' is not a valid file path.");\n+}\n+else {\n+\t$twobit = $refgenomes[0];\n+}\n+if (!-e $refgenomes[1]) {\n+\tdie("\'$refgenomes[1]\' is not a valid file path.");\n+}\n+else {\n+\t$igvgenome = $refgenomes[1];\n+}\n+\n+\n+my $mincov :shared;\n+$mincov = 320;\n+if (defined($opts{\'m\'})) {\n+\t$mincov = $opts{\'m\'};\n+}\n+\n+my $ploidy :shared;\n+if (defined($opts{\'P\'}) && $opts{\'P\'} =~ m/^\\d+$/) {\n+\t$ploidy = $opts{\'P\'};\n+}\n+else {\n+\tdie("Ploidy (-P) was not specified or not an integer\\n");\n+}\n+\n+\n+if (defined($opts{\'v\'})) {\n+\t$outfile = $opts{\'v\'};\n+}\n+else {\n+\tdie("No output vcf-file specified.\\n");\n+}\n+if (!defined($opts{\'a\'})) {\n+\tdie("No output file specified for distribution details\\n");\n+}\n+## create working dir.\n+my $rand = int(rand(10000));\n+while (-d "/tmp/DC_Genotyper_$rand") {\n+\t$rand = int(rand(10000));\n+}\n+my $wd :shared;\n+$wd = "/tmp/DC_Genotyper_$rand";\n+system("mkdir \'$wd\'");\n+\n+\n+my $snpfile :shared;\n+my $hassnp :shared;\n+$hassnp = \'NoDbSNP\';\n+$snpfile = \'\';\n+if (defined($opts{\'s\'})) {\n+\t$snpfile = $opts{\'s\'};\n+\tif (!-e $snpfile) {\n+\t\tdie("\'$snpfile\' is not a valid file path.");\n+\t}\n+\n+\tmy $mime = `file $snpfile`;\n+\tif ($mime !~ m/compressed/) {\n+\t\tprint "$snpfile is not in compressed format. compressing & indexing the file now.\\n";\n+\t\t#print "... this takes a while\\n";\n+\t\tsystem("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz");\n+\t\tsystem("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");\n+\t\t$snpfile = "$wd/dbSNP.vcf.bgz";\n+\t}\n+\telsif (!-e "$snpfile.tbi") {\n+\t\tprint "tabix index file is missing for \'$snpfile\'. creating now.\\n";\n+\t\t## check if I can write it out for future use\n+\t\t$snpfile =~ m/(.*)/([^\\/]+)$/;\n+\t\tmy $d = $1;\n+\t\tif (-w $d) {\n+\t\t\topen OUT, ">$d/lock";\n+\t\t\tflock(OUT,2);\n+\t\t\tsystem("cd $d && tabix -p vcf $snpfile");\n+\t\t\tclose OUT;\n+\t\t\tsystem("rm $d/lock");\n+\t\t}\n+\t\telse {\n+\t\t\tsystem("cp $snpfile /$wd/dbSNP.vcf.bgz");\n+\t\t\tsystem("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");\n+\t\t\t$snpfile = "$wd/dbSNP.vcf.bgz";\n+\t\t}\n+\t}\n+\t$hassnp = \'WithDbSNP\';\n+}\n+\n+\t\n+## 1. Get FASTA and prepare output hashes: \n+my $targets_one = Thread::Queue->new();\n+my $targets_two = Thread::Queue->new();\n+my $targets_three = Thread::Queue->new();\n+open IN, $opts{\'t\'} or die("Could not open $opts{\'t\'} file for reading");\n+if (-d "$wd/Fasta/") {\n+\tsystem("rm $wd/Fasta/*");\n+}\n+else {\n+\tsystem("mkdir $wd/Fasta");\n+}\n+## create the threads.\n+for (my $i = 1; $i<= $opts{\'p\'}; $i++) {\n+\t${\'thr\'.$i} = threads->create(\'FetchFasta\');\n+}\n+\n+## enqueue the targets.\n+my %thash;\n+while (<IN>) {\n+\tchomp;\n+\tmy ($chr,$start,$stop,$name,$score,$strand) = split(/\\t/,$_);\n+\t$targets_one->enqueue($_);\n+\t$targets_two->enqueue($_);'..b'instead of substr-calls in every bam, on every iteration.\n+\t\tfor (my $pos = 0; $pos < length($seq); $pos++) {\n+\t\t\t$ref_alleles{($pos+$start)} = substr($seq,$pos,1);\n+\t\t}\n+\t\t## get counts.\n+\t\tmy $target = "$chr:$start-$stop";\n+\t\tmy $command = "igvtools count -w 1 --bases --query \'$target\' \'$bam\' \'$wd/WIGS/$chr-$start-$stop.wig\' \'$igvgenome\' > /dev/null 2>&1";\n+\t\tsystem($command);\n+\t\topen WIG, "$wd/WIGS/$chr-$start-$stop.wig";\n+\t\tmy $h = <WIG>;\n+\t\t$h = <WIG>;\n+\t\t$h = <WIG>;\n+\t\tmy $target_counts = \'\';\n+\t\twhile (<WIG>) {\t\n+\t\t\tchomp;\n+\t\t\t#my ($pos, $a, $c, $g, $t , $n) = split(/\\t/,$_);\n+\t\t\tmy @p = split(/\\t/,$_);\n+\t\t\tmy $s = $p[1] + $p[2] + $p[3] + $p[4];\n+\t\t\t$target_counts .= "$p[0]\\t$ref_alleles{$p[0]}\\t$p[1]\\t$p[2]\\t$p[3]\\t$p[4]\\t$s\\n";\n+\t\t\t## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty).\n+\t\t\tif ($s > $mincov && !defined($snp->{$chr.\'-\'.$p[0]})) {\n+\t\t\t\t## for model of \'non-reference\' \n+\t\t\t\tmy $frac = $p[$map{$ref_alleles{$p[0]}}] / $s;\n+\t\t\t\t$counts{$ref_alleles{$p[0]}} .= $frac.\',\';\n+\t\t\t\t$out .= "$target\\t$p[0]\\t$ref_alleles{$p[0]}\\t$p[1]\\t$p[2]\\t$p[3]\\t$p[4]\\n";\n+\t\t\t\t## for each of the options background models\n+\t\t\t\tforeach(keys(%map)) {\n+\t\t\t\t\tif ($_ eq $ref_alleles{$p[0]}) {\n+\t\t\t\t\t\tnext;\n+\t\t\t\t\t}\n+\t\t\t\t\t$options{$ref_alleles{$p[0]}.\'-\'.$_} .= ($p[$map{$_}] / $s) .\',\';\n+\t\t\t\t}\n+\t\t\t\t\t \n+\t\t\t}\n+\t\t}\n+\t\tclose WIG;\n+\t\topen OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt";\n+\t\tflock(OUT, 2);\n+\t\tprint OUT $out;\n+\t\tclose OUT;\n+\t\topen OUT, ">$wd/WIGS/$chr.$start-$stop.txt";\n+\t\tprint OUT $target_counts;\n+\t\tclose OUT;\n+\t\t\t\n+\t}\n+\tforeach(keys(%counts)) {\n+\t\topen OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";\n+\t\tflock(OUT,2);\n+\t\tprint OUT $counts{$_};\n+\t\tclose OUT;\n+\t}\n+\tforeach(keys(%options)) {\n+\t\topen OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";\n+\t\tflock(OUT,2);\n+\t\tprint OUT $options{$_};\n+\t\tclose OUT;\n+\t}\n+}\n+\n+sub GetDistribution {\n+\twhile (defined(my $allele = $alleles->dequeue())) {\n+\t\tsystem("sed -i \'s/.\\$//\' \'$wd/counts_$allele.$mincov"."x.$hassnp.txt\'");\n+\t\topen OUT, ">$wd/GetDistribution.$allele.R";\n+\t\tprint OUT "sample <- \'$bam\'\\n";\n+\t\tprint OUT "nt <- \'$allele\'\\n";\n+\t\t#print OUT "pdf(file=\'$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf\',paper=\'a4\')\\n";\n+\t\tprint OUT "data <- scan(file=\'$wd/counts_$allele.$mincov"."x.$hassnp.txt\',sep=\',\')\\n";\n+\t\tprint OUT "nr <- length(data)\\n";\n+\t\tprint OUT "avg <- mean(data)\\n";\n+\t\tprint OUT "sdd <- sd(data)\\n";\n+\t\t#print OUT "if (avg > 0.5) {\\n";\n+\t\t#print OUT " x <- seq(0.8,1,length=1000)\\n";\n+\t\t#print OUT " y <- dnorm(x,mean=avg,sd=sdd)\\n";\n+\t\t#print OUT " plot(x,y,main=\'Distribution in sample $bam for nt $allele\',xlab=\'Allelic Ratio\',type=\'l\',lwd=1)\\n";\n+\t\t#print OUT " abline(v=(avg-3*sdd),col=\'red\')\\n";\n+\t\t#print OUT " text(0.81,max(y-0.5),paste(c(\'avg: \',avg,\'\\\\nsd: \',sdd,\'\\\\nnrDataPoints:\', nr,\'\\\\n$hassnp\\\\nMin.Cov:\',$mincov),sep=\' \',collapse=\'\'),adj=c(0,1))\\n";\n+\t\t#print OUT "} else {\\n";\n+\t\t#print OUT " x <- seq(0,0.3,length=1000)\\n";\n+\t\t#print OUT " y <- dnorm(x,mean=avg,sd=sdd)\\n";\n+\t\t#print OUT " plot(x,y,main=\'Distribution in sample $bam for nt $allele\',xlab=\'Allelic Ratio\',type=\'l\',lwd=1)\\n";\n+\t\t#print OUT " abline(v=(avg+3*sdd),col=\'red\')\\n";\n+\t\t#print OUT " text(0.2,max(y-0.5),paste(c(\'avg: \',avg,\'\\\\nsd: \',sdd,\'\\\\nnrDataPoints:\', nr,\'\\\\n$hassnp\\\\nMin.Cov:\',$mincov),sep=\' \',collapse=\'\'),adj=c(0,1))\\n";\n+\t\t#print OUT "}\\n";\n+\t\t#print OUT "dev.off()\\n";\n+\t\tprint OUT "write(c(avg,sdd),file=\'$wd/model.$allele.$mincov"."x.$hassnp.txt\',ncolumns=1)\\n";\n+\t\tclose OUT; \n+\t\tsystem("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1");\n+\t}\n+}\n+\n+\n+sub CallSNPs {\n+\twhile (defined(my $line = $targets_three->dequeue())) {\n+\t\t# split.\n+\t\tmy ($chr,$start,$stop,$name,$score,$strand) = split(/\\t/,$line);\t\n+\t\tmy $file = "$wd/WIGS/$chr.$start-$stop.txt";\n+\t\tmy $ofile = "$wd/VCF/$chr.$start-$stop.vcf";\n+\t\tsystem("cd $wd && Rscript CallSNPs.R \'$file\' \'$chr\' \'$ploidy\' \'$ofile\' \'$wd/dbsnp.txt\'");\n+\t}\n+\n+}\n+\n'

diff -r 000000000000 -r e8a32d824f39 DC_Genotyper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/DC_Genotyper.xml Thu Sep 25 05:24:35 2014 -0400

@@ -0,0 +1,84 @@
+<tool id="Paired_fastQ_trimmer" name="Paired FastQ QC-Trimmer" >
+ <description></description>
+ <requirements>
+ <requirement type='package' version='3.0.2'>R</requirement>
+ <requirement type='package' version='0.1.18'>samtools</requirement>
+ <requirement type='package' version='0.2.6'>tabix</requirement>
+ <requirement type='package' version='latest'>twoBitToFa</requirement>
+ <requirement type='package' version='1.92'>perl_module_threads</requirement>
+ <requirement type='package' version='1.46'>perl_module_threads_shared</requirement>
+ <requirement type='package' version='3.02'>perl_module_Thread_Queue</requirement>
+ <requirement type='package' version='2.3.32'>igvtools</requirement>
+        </requirements>
+ <command interpreter="perl">DC_Genotyper.pl
+ -t "$targets"
+ -b "$bamfile"
+ -R "${ref.fields.path}"
+ -p  "\${GALAXY_SLOTS:-4}"
+ #if $dbsnp.source == "history":
+ -s "${dbsnp.ownFile}"
+ #else
+ -s "${dbsnp.indices.fields.path}"
+ #end if
+ -m $mincov
+ -P $ploidy
+
+ -a $output1
+ -v $output2
+ </command>
+
+ <inputs>
+ <param  name="bamfile" type="data" format="bam" label="Sample BAM file" />
+ <param  name="targets" type="data" format="bed" label="Enrichment BED file" />
+ <param name="ref" type="select" label="Select a reference genome">
+           <options from_data_table="DC_Genotyper_indexes">
+               <filter type="sort_by" column="2" />
+                 <validator type="no_options" message="No indexes are available" />
+           </options>
+         </param>
+ <conditional name="dbsnp">
+   <param name="source" type="select" label="Will you select a dbSNP file from your history, or use a built in version (which is faster)">
+ <option value="indexed">Use a built-in version</option>
+ <option value="history">Use one from the history</option>
+   </param>
+   <when value="indexed">
+ <param name="indices" type="select" label="Select a reference genome">
+           <options from_data_table="dbsnp_indexes">
+                          <filter type="sort_by" column="2" />
+             <validator type="no_options" message="No indexes are available" />
+           </options>
+         </param>
+   </when>
+                  <when value="history">
+ <param name="ownFile" type="data" format="vcf,bcf" label="Select a dbSNP file from history"/>
+   </when>
+ </param>
+           </conditional>
+ <param name="mincov" value="400" type="integer" label="Minimal Coverage Depth" />
+ <param name="ploidy" type="integer" value='10' label="Expected Sample Ploidy" />
+ </inputs>
+
+ <outputs>
+ <data format='txt' name="output1" label="${tool.name} on ${on_string}: Allele Fraction Distributions"/>
+ <data format='vcf' name='output2' label="${tool.name} on ${on_string}: VCF file">
+ </outputs>
+<help>
+
+**What it does**
+
+ 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !!
+ 2. remove known dbsnp positions (bcf file)
+ 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions.
+ 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction)
+ 5. For abberant positions, check each alternate allele to see if it passes the background signal.
+ 6. Generate VCF file.
+
+
+**Information**
+
+This tools is created by Geert Vandeweyer. It is a very early version with several limitations. Current limitations are : no support for indels, no plotting of the noise-models, incorrect syntax in for multi-allelic sites in the VCF file.
+
+Any feedback is welcome.
+
+</help>
+</tool>

diff -r 000000000000 -r e8a32d824f39 DC_Genotyper_indexes.log.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/DC_Genotyper_indexes.log.sample Thu Sep 25 05:24:35 2014 -0400

@@ -0,0 +1,26 @@
+#This is a sample file distributed with Galaxy that enables the DC_Genotyper
+#tool to locate genome files for use with twoBitToFa, and IGVtools. You will
+#need to supply these files and then create a DC_Genotyper.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The DC_Genotyper.loc
+#file has this format (white space characters are TAB characters):
+#
+#<unique_build_id> <display_name> <2bit_path;IGVtools_genome.path>
+#
+#for example:
+#
+#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/twobit/hg19.2bit;/depot/data2/galaxy/igvtools/hg19.chrom.sizes
+#
+#then your /depot/data2/galaxy/twobit/ directory
+#would need to contain the following 2bit files:
+#
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 hg19.2bit
+#-rw-r--r--  1 james    universe 269808 2005-09-13 10:12 mm9.2bit
+#
+# your igvtools folder needs the following files:
+#
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 hg19.chrom.sizes
+#
+# igv chrom.sizes files are tab delimited files. First column is chromosome name, second column is length. Additional columns are ignored.
+# example genome files are included in the installation location of the package_igvtools repository.
+

diff -r 000000000000 -r e8a32d824f39 dbsnp_indexes.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dbsnp_indexes.loc.sample Thu Sep 25 05:24:35 2014 -0400

@@ -0,0 +1,24 @@
+#This is a sample file distributed with Galaxy that enables the DC_Genotyper
+#tool to locate bgzip-compressed and indexed versions of dbSNP. You will
+#need to supply these files and then create a dbsnp_indexes.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The dbsnp_indexes.loc
+#file has this format (white space characters are TAB characters):
+#
+#<unique_build_id> <display_name> </path/to/dbSNP.vcf.bgz>
+#
+#for example:
+#
+#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_137.hg19.vcf.bgz
+#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/dbsnp/dbsnp_138.hg19.vcf.bgz
+#mm9 Mouse (Mus musculus): mm9 /depot/data2/galaxy/dbsnp/dbsnp_128.mm9.vcf.bgz
+#
+#then your /depot/data2/galaxy/dbsnp/ directory
+#would need to contain the following files:
+#
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 dbsnp_137.hg19.vcf.bgz
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 dbsnp_137.hg19.vcf.bgz.tbi
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 dbsnp_138.hg19.vcf.bgz
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 dbsnp_138.hg19.vcf.bgz.tbi
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 dbsnp_128.mm9.vcf.bgz
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 dbsnp_128.mm9.vcf.bgz.tbi

diff -r 000000000000 -r e8a32d824f39 readme.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.rst Thu Sep 25 05:24:35 2014 -0400

@@ -0,0 +1,13 @@
+BACKGROUND:
+
+DC_Genotyper stands for Deep-Coverage Genotyper, and is aimed at detecting low fraction SNPs (no indels) in high-ploidy (or pooled) samples with very high coverage. It is being developed at the University of Antwerp by Geert Vandeweyer.
+
+METHOD:
+
+DC_Genotyper generates a background noise distributions on a per-sample basis, and uses these distrubutions to detect non-reference sites. For non-reference sites, Allele-specific distributions are used to estimate if an allele surpasses the background signal (e.g. A_to_G has different distribution as A_to_C, also reflected in Tr/Tv ratios).
+
+LIMITATION:
+
+This is a very early version with several limitations. Current limitations are : no support for indels, no plotting of the noise-models, incorrect syntax in for multi-allelic sites in the VCF file.
+
+Any feedback is welcome

diff -r 000000000000 -r e8a32d824f39 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Thu Sep 25 05:24:35 2014 -0400

@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="R" version="3.0.2">
+        <repository changeset_revision="50f7e1e71271" name="package_r_3_0_2" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+   <package name="samtools" version="0.1.18">
+ <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+   </package>
+   <package name="perl_module_threads" version="1.92">
+        <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="perl_module_threads_shared" version="1.46">
+        <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="perl_module_Thread_Queue" version="3.02">
+        <repository changeset_revision="01ed0b079724" name="package_perl_threading" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="tabix" version="0.2.6">
+ <repository changeset_revision="3d6beba7393e" name="package_tabix_0_2_6" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="twoBitToFa" version="latest">
+ <repository changeset_revision="5920144e9e9b" name="package_blat_server" owner="jjohnson" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="igvtools" version="2.3.32">
+ <repository name="package_igvtools_2_3_32" changeset_revision="79f8f883a7a0" owner='geert-vandeweyer' toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>
+