changeset 0:72ea0d13dd66 draft

Imported from capsule None
author devteam
date Mon, 28 Jul 2014 11:56:46 -0400
parents
children 5c3ac057608b
files snpFreq.xml snpFreq2.pl test-data/snpFreqInput.txt test-data/snpFreqTestOut.txt tool_dependencies.xml
diffstat 5 files changed, 350 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpFreq.xml	Mon Jul 28 11:56:46 2014 -0400
@@ -0,0 +1,125 @@
+<tool id="hgv_snpFreq" name="snpFreq" version="1.0.1">
+  <description>significant SNPs in case-control data</description>
+
+  <requirements>
+    <requirement type="package" version="2.11.0">R</requirement>
+    <requirement type="package" version="1.34.0">bioc_qvalue</requirement>
+  </requirements>
+
+  <command interpreter="perl">
+    snpFreq2.pl $inTypeCond.inType 0.05 $input $output
+    #if $inTypeCond.inType == "tab" 
+       $inTypeCond.group1_1 $inTypeCond.group1_2 $inTypeCond.group1_3 
+       $inTypeCond.group2_1 $inTypeCond.group2_2 $inTypeCond.group2_3 0.05
+    #else if $inTypeCond.inType == "snp" 
+       $group1 $group2
+    #end if
+  </command>
+
+  <inputs>
+    <conditional name="inTypeCond">
+       <param name="inType" type="select" label="Format of input" >
+          <option value="tab">Alleles pre-counted</option>
+          <option value="snp">SNP table</option>
+       </param>
+       <when value="tab">
+       <param format="tabular" name="input" type="data" label="Dataset" />
+       <param name="group1_1" label="Column with genotype 1 count for group 1" type="data_column" data_ref="input" />
+       <param name="group1_2" label="Column with genotype 2 count for group 1" type="data_column" data_ref="input" />
+       <param name="group1_3" label="Column with genotype 3 count for group 1" type="data_column" data_ref="input" />
+       <param name="group2_1" label="Column with genotype 1 count for group 2" type="data_column" data_ref="input" />
+       <param name="group2_2" label="Column with genotype 2 count for group 2" type="data_column" data_ref="input" />
+       <param name="group2_3" label="Column with genotype 3 count for group 2" type="data_column" data_ref="input" />
+       </when>
+       <when value="snp">
+       <param format="snp" name="input" type="data" label="SNP Dataset" />
+       <param format="ind" name="group1" type="data" label="Group 1" />
+       <param format="ind" name="group2" type="data" label="Group 2" />
+       </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="inType" value="tab" />
+      <param name="input" ftype="tabular" value="snpFreqInput.txt" dbkey="hg18" />
+      <param name="group1_1" value="4" />
+      <param name="group1_2" value="5" />
+      <param name="group1_3" value="6" />
+      <param name="group2_1" value="7" />
+      <param name="group2_2" value="8" />
+      <param name="group2_3" value="9" />
+      <output name="output" file="snpFreqTestOut.txt" />
+    </test>
+  </tests>
+
+  <help>
+
+**Dataset formats**
+
+The input is tabular_, with six columns of allele counts.  The output is also tabular,
+and includes all of the input data plus the additional columns described below.
+(`Dataset missing?`_)
+
+.. _tabular: ${static_path}/formatHelp.html#tab
+.. _Dataset missing?: ${static_path}/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool performs a basic analysis of bi-allelic SNPs in case-control
+data, using the R statistical environment and Fisher's exact test to
+identify SNPs with a significant difference in the allele frequencies
+between the two groups.  R's "qvalue" package is used to correct for
+multiple testing.
+
+The input file includes counts for each allele combination (AA aa Aa)
+for each group at each SNP position.  The assignment of codes (1 2 3)
+to these genotypes is arbitrary, as long as it is consistent for both
+groups.  Any other input columns are ignored in the computation, but
+are copied to the output.  The output appends eight additional columns,
+namely the minimum expected counts of the three genotypes for each
+group, the p-value, and the q-value.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr1  210  211  38  4  15  56  0   1   x
+    chr1  228  229  55  0  2   56  0   1   x
+    chr1  230  231  46  0  11  55  0   2   x
+    chr1  234  235  43  0  14  55  0   2   x
+    chr1  236  237  55  0  2   13  10  34  x
+    chr1  437  438  55  0  2   46  0   11  x
+    chr1  439  440  56  0  1   55  0   2   x
+    chr1  449  450  56  0  1   13  20  24  x
+    chr1  518  519  56  0  1   38  4   15  x
+
+Here the group 1 genotype counts are in columns 4 - 6, while those
+for group 2 are in columns 7 - 9.
+
+Note that the "x" column has no meaning.  It was added to this example
+to show that extra columns can be included, and to make it easier
+to see where the new columns are appended in the output.
+
+- output file::
+
+    chr1  210  211  38  4  15  56  0   1   x  47    2   8     47    2   8     1.50219088598917e-05  6.32501425679652e-06
+    chr1  228  229  55  0  2   56  0   1   x  55.5  0   1.5   55.5  0   1.5   1                     0.210526315789474
+    chr1  230  231  46  0  11  55  0   2   x  50.5  0   6.5   50.5  0   6.5   0.0155644201009862    0.00409590002657532
+    chr1  234  235  43  0  14  55  0   2   x  49    0   8     49    0   8     0.00210854461554067   0.000739840215979182
+    chr1  236  237  55  0  2   13  10  34  x  34    5   18    34    5   18    6.14613878554783e-17  4.31307984950725e-17
+    chr1  437  438  55  0  2   46  0   11  x  50.5  0   6.5   50.5  0   6.5   0.0155644201009862    0.00409590002657532
+    chr1  439  440  56  0  1   55  0   2   x  55.5  0   1.5   55.5  0   1.5   1                     0.210526315789474
+    chr1  449  450  56  0  1   13  20  24  x  34.5  10  12.5  34.5  10  12.5  2.25757007974134e-18  2.37638955762246e-18
+    chr1  518  519  56  0  1   38  4   15  x  47    2   8     47    2   8     1.50219088598917e-05  6.32501425679652e-06
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpFreq2.pl	Mon Jul 28 11:56:46 2014 -0400
@@ -0,0 +1,196 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+#using large SNP tables (~1G) may require large memory ~15G in R
+
+#expected input: path to file, cols of counts (2 sets of 3), threshold
+if (!@ARGV or scalar @ARGV != 11) {
+   if (!@ARGV or scalar @ARGV != 6) { #snpTable usage
+      print "usage for tab separated allele counts\n",
+            "snpFreq.pl inputType #threshold /path/to/snps.txt outfile <6 column numbers(1 based) with counts for alleles, first one group then another>\n";
+      print "OR for SNP tables\n";
+      print "usage snpFreq.pl inputType #threshold /path/to/snpTable.txt outfile group1File group2File\n";
+      exit 1;
+   }
+}
+
+#get and verify inputs
+my ($file, $a1, $a2, $a3, $b1, $b2, $b3, $thresh, $outfile);
+if ($ARGV[0] eq 'tab') {
+   shift @ARGV;
+   $thresh = shift @ARGV;
+   if ($thresh !~ /^\d*\.?\d+$/) {
+      print "Error the threshold must be a number. Got $thresh\n";
+      exit 1;
+   }elsif ($thresh > .3) {
+      print "Error the threshold can not be greater than 0.3 got $thresh\n";
+      exit 1;
+   }
+   $file = shift @ARGV;
+   $outfile = shift @ARGV;
+   $a1 = shift @ARGV;
+   if ($a1 =~ /\D/ or $a1 < 1) {
+      print "Error the column number, must be an integer greater than or equal to 1. Got $a1\n";
+      exit 1;
+   }
+   $a2 = shift @ARGV;
+   if ($a2 =~ /\D/ or $a2 < 1) {
+      print "Error the column number, must be an integer greater than or equal to 1. Got $a2\n";
+      exit 1;
+   }
+   $a3 = shift @ARGV;
+   if ($a3 =~ /\D/ or $a3 < 1) {
+      print "Error the column number, must be an integer greater than or equal to 1. Got $a3\n";
+      exit 1;
+   }
+   $b1 = shift @ARGV;
+   if ($b1 =~ /\D/ or $b1 < 1) {
+      print "Error the column number, must be an integer greater than or equal to 1. Got $b1\n";
+      exit 1;
+   }
+   $b2 = shift @ARGV;
+   if ($b2 =~ /\D/ or $b2 < 1) {
+      print "Error the column number, must be an integer greater than or equal to 1. Got $b2\n";
+      exit 1;
+   }
+   $b3 = shift @ARGV;
+   if ($b3 =~ /\D/ or $b3 < 1) {
+      print "Error the column number, must be an integer greater than or equal to 1. Got $b3\n";
+      exit 1;
+   }
+}else { #snp table convert and assign variables
+   #snpTable.txt #threshold outfile workingdir
+   shift @ARGV;
+   $thresh = shift @ARGV;
+   if ($thresh !~ /^\d*\.?\d+$/) {
+      print "Error the threshold must be a number. Got $thresh\n";
+      exit 1;
+   }elsif ($thresh > .3) {
+      print "Error the threshold can not be greater than 0.3 got $thresh\n";
+      exit 1;
+   }
+   $file = shift @ARGV;
+   $outfile  = shift @ARGV;
+   my $grpFile = shift @ARGV;
+   my @g1;
+   open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n";
+   while (<FH>) {
+      chomp; 
+      if (/^(\d+)\s/) { push(@g1, $1); }
+   }
+   close FH or die "Couldn't close $grpFile, $!\n";
+   $grpFile = shift @ARGV;
+   my @g2;
+   open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n";
+   while (<FH>) {
+      chomp;
+      if (/^(\d+)\s/) { push(@g2, $1); }
+   }
+   close FH or die "Couldn't close $grpFile, $!\n";
+   if ($file =~ /.gz$/) { 
+      open(FH, "zcat $file |") or die "Couldn't read $file, $!\n";
+   }else {
+      open(FH, $file) or die "Couldn't read $file, $!\n";
+   }
+   open(OUT, ">", "snpTable.txt") or die "Couldn't open snpTable.txt, $!\n";
+   my $size;
+   while (<FH>) {
+      chomp;
+      if (/^#/) { next; } #header
+      my @f = split(/\t/);
+      $size = scalar @f;
+      my @gc1 = (0, 0, 0);
+      my @gc2 = (0, 0, 0);
+      foreach my $g (@g1) { 
+         my $i = $g+1; #g is 1 based first col want 0 based snp call column
+         if ($i > $#f) { die "ERROR looking for index $i which is greater than the list $#f\n"; }
+         if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref 
+            $gc1[0]++;
+         }elsif ($f[$i] == 1) { 
+            $gc1[2]++;
+         }elsif ($f[$i] == 0) {
+            $gc1[1]++;
+         }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; }
+      }
+      foreach my $g (@g2) {
+         my $i = $g+1; #g is 1 based first col want 0 based snp call column
+         if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref 
+            $gc2[0]++;
+         }elsif ($f[$i] == 1) {
+            $gc2[2]++;
+         }elsif ($f[$i] == 0) {
+            $gc2[1]++;
+         }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; }
+      }
+      print OUT join("\t", @f), "\t", join("\t", @gc1), "\t", join("\t", @gc2), 
+         "\n";
+   }
+   close FH or die "Couldn't close $file, $!\n";
+   close OUT or die "Couldn't close snpTable.txt, $!\n";
+   my $i = $size + 1; #next 1 based column after input data
+   $a1 = $i++;
+   $a2 = $i++;
+   $a3 = $i++;
+   $b1 = $i++;
+   $b2 = $i++;
+   $b3 = $i++;
+   $file = "snpTable.txt";
+}
+
+#run a fishers exact test (using R) on whole table
+my $cmd = qq|options(warn=-1)
+           tab <- read.table('$file', sep="\t")
+           size <- length(tab[,1])
+           width <- length(tab[1,])
+           x <- 1:size
+           y <- matrix(data=0, nr=size, nc=6)
+           for(i in 1:size) {
+              m <- matrix(c(tab[i,$a1], tab[i,$b1], tab[i,$a2], tab[i,$b2], tab[i,$a3], tab[i,$b3]), nrow=2)
+              t <- fisher.test(m)
+              x[i] <- t\$p.value
+              if (x[i] >= 1) {
+                  x[i] <- .999
+              }
+              n <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3] + tab[i,$b1] + tab[i,$b2] + tab[i,$b3])
+              n_a <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3])
+              y[i,1] <- ((tab[i,$a1] + tab[i,$b1])*(n_a))/n
+              y[i,1] <- round(y[i,1],3)
+              y[i,2] <- ((tab[i,$a2] + tab[i,$b2])*(n_a))/n
+              y[i,2] <- round(y[i,2],3)
+              y[i,3] <- ((tab[i,$a3] + tab[i,$b3])*(n_a))/n
+              y[i,3] <- round(y[i,3],3)
+              n_b <- (tab[i,$b1] + tab[i,$b2] + tab[i,$b3])
+              y[i,4] <- ((tab[i,$a1] + tab[i,$b1])*(n_b))/n
+              y[i,4] <- round(y[i,4],3)
+              y[i,5] <- ((tab[i,$a2] + tab[i,$b2])*(n_b))/n
+              y[i,5] <- round(y[i,5],3)
+              y[i,6] <- ((tab[i,$a3] + tab[i,$b3])*(n_b))/n
+              y[i,6] <- round(y[i,6],3)
+           }|;
+           #results <- data.frame(tab[1:size,1:width], x[1:size])
+           #write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t")
+           #q()|;
+
+#my $cmd2 = qq|suppressPackageStartupMessages(library(lib.loc='/afs/bx.psu.edu/home/giardine/lib/R', qvalue))
+my $cmd2 = qq|suppressPackageStartupMessages(library(qvalue))
+              qobj <- qvalue(x[1:size], lambda=seq(0,0.90,$thresh), pi0.method="bootstrap", fdr.level=0.1, robust=FALSE, smooth.log.pi0 = FALSE)
+              q <- qobj\$qvalues
+              results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size], q[1:size])
+              write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t")
+              q()|;
+
+#for TESTING
+my $pr = qq|results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size])
+            write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t")
+              q()|;
+
+open(FT, "| R --slave --vanilla") 
+   or die "Couldn't call fisher.text, $!\n";
+print FT $cmd, "\n"; #fisher test
+print FT $cmd2, "\n"; #qvalues and results
+#print FT $pr, "\n";
+close FT or die "Couldn't finish fisher.test, $!\n";
+
+exit;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/snpFreqInput.txt	Mon Jul 28 11:56:46 2014 -0400
@@ -0,0 +1,10 @@
+chr1	210	211	38	4	15	56	0	1	x
+chr1	215	216	53	0	4	5	0	51	x
+chr1	228	229	55	0	2	56	0	1	x
+chr1	230	231	46	0	11	55	0	2	x
+chr1	234	235	43	0	14	55	0	2	x
+chr1	236	237	55	0	2	13	10	34	x
+chr1	437	438	55	0	2	46	0	11	x
+chr1	439	440	56	0	1	55	0	2	x
+chr1	449	450	56	0	1	13	20	24	x
+chr1	518	519	56	0	1	38	4	15	x
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/snpFreqTestOut.txt	Mon Jul 28 11:56:46 2014 -0400
@@ -0,0 +1,10 @@
+chr1	210	211	38	4	15	56	0	1	x	47	2	8	47	2	8	1.50219088598917e-05	6.32501425679652e-06
+chr1	215	216	53	0	4	5	0	51	x	29.257	0	27.743	28.743	0	27.257	2.18870370759758e-21	4.60779727915280e-21
+chr1	228	229	55	0	2	56	0	1	x	55.5	0	1.5	55.5	0	1.5	0.999	0.210315789473684
+chr1	230	231	46	0	11	55	0	2	x	50.5	0	6.5	50.5	0	6.5	0.0155644201009862	0.00409590002657532
+chr1	234	235	43	0	14	55	0	2	x	49	0	8	49	0	8	0.00210854461554067	0.000739840215979182
+chr1	236	237	55	0	2	13	10	34	x	34	5	18	34	5	18	6.14613878554783e-17	4.31307984950725e-17
+chr1	437	438	55	0	2	46	0	11	x	50.5	0	6.5	50.5	0	6.5	0.0155644201009862	0.00409590002657532
+chr1	439	440	56	0	1	55	0	2	x	55.5	0	1.5	55.5	0	1.5	0.999	0.210315789473684
+chr1	449	450	56	0	1	13	20	24	x	34.5	10	12.5	34.5	10	12.5	2.25757007974134e-18	2.37638955762246e-18
+chr1	518	519	56	0	1	38	4	15	x	47	2	8	47	2	8	1.50219088598917e-05	6.32501425679652e-06
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Jul 28 11:56:46 2014 -0400
@@ -0,0 +1,9 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="bioc_qvalue" version="1.34.0">
+      <repository changeset_revision="11735242a19e" name="package_bioc_qvalue_1_34_0" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="R" version="2.11.0">
+      <repository changeset_revision="5824d2b3bc8b" name="package_r_2_11_0" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>