Mercurial > repos > devteam > divide_pg_snp

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dividePgSnpAlleles.pl	Wed Sep 25 10:26:17 2013 -0400
@@ -0,0 +1,58 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Divide the alleles of a pgSnp-like file, and the frequency and score that
+# go with each allele, into separate columns.  Any columns beyond the seven
+# pgSnp ones are kept and appended to each output row.
+#
+# Usage: dividePgSnpAlleles.pl [-ref=N] [input_file]
+#   -ref=N      1-based number of a column holding the reference allele.  For
+#               single-allele rows its value is used as the second allele
+#               instead of "N".  -ref=0 means "no reference column".
+#   input_file  tab-separated input; stdin is read when it is omitted.
+# Writes to stdout.
+my $ref;   # 0-based index of the reference allele column, undef when unset
+my $in;    # input file name, undef when reading stdin
+# Anchored so that only a real option, not a file name that merely contains
+# "-ref=N", is consumed here.
+if (@ARGV && $ARGV[0] =~ /^--?ref=(\d+)$/) {
+   $ref = $1 - 1;
+   if ($ref == -1) { undef $ref; }
+   shift @ARGV;
+}
+if (@ARGV) {
+   $in = shift @ARGV;
+}
+
+my $fh;
+if (defined $in) {
+   # 3-arg open: the file name is never interpreted as a mode or a pipe
+   open($fh, '<', $in) or die "Couldn't open $in, $!\n";
+}
+else {
+   $fh = \*STDIN;
+}
+while (my $line = <$fh>) {
+   chomp $line;
+   my @f = split(/\t/, $line);
+   if ($f[4] > 2) { next; } #skip those with more than 2 alleles
+   my @a = split(/\//, $f[3]);
+   my @fr = split(/,/, $f[5]);
+   my @sc = split(/,/, $f[6]);
+   if ($f[4] == 1) { #homozygous add N (or the reference allele), 0, 0
+      # defined(), not truth: index 0 (-ref=1) is a valid column
+      if (defined $ref) { push(@a, $f[$ref]); }
+      else { push(@a, "N"); }
+      push(@fr, 0);
+      push(@sc, 0);
+   }
+   print "$f[0]\t$f[1]\t$f[2]\t$a[0]\t$fr[0]\t$sc[0]\t$a[1]\t$fr[1]\t$sc[1]";
+   if (scalar @f > 7) {
+      splice(@f, 0, 7); #remove first 7
+      print "\t", join("\t", @f), "\n";
+   }else { print "\n"; }
+}
+close $fh;
+
+exit;
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dividePgSnpAlleles.xml	Wed Sep 25 10:26:17 2013 -0400
@@ -0,0 +1,76 @@
+<tool id="dividePgSnp" version="1.0.0" name="Separate pgSnp alleles" hidden="false">
+  <description>into columns</description>
+  <command interpreter="perl">
+    #if $refcol.ref == "yes" #dividePgSnpAlleles.pl -ref=$refcol.ref_column $input1 > $out_file1
+    #else #dividePgSnpAlleles.pl $input1 > $out_file1
+    #end if
+  </command>
+  <inputs>
+    <param format="interval" version="1.0.0" name="input1" type="data" label="pgSnp dataset" />
+    <conditional version="1.0.0" name="refcol">
+      <param version="1.0.0" name="ref" type="select" label="Dataset has a column with the reference allele:">
+        <option value="yes">yes</option>
+        <option value="no" selected="true">no</option>
+      </param>
+      <when value="yes">
+      <param version="1.0.0" name="ref_column" type="data_column" data_ref="input1" label="Column with reference allele" />
+      </when>
+      <when value="no"> <!-- do nothing -->
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+  <data format="interval" version="1.0.0" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name='input1' value='dividePgSnp_input.pgSnp' ftype='interval' />
+      <param name='ref' value='no' />
+      <output version="1.0.0" name="out_file1" file="dividePgSnp_output.txt" />
+    </test>
+  </tests>
+
+  <help>
+**Dataset formats**
+
+The input dataset is of Galaxy datatype interval_ with the columns specified
+for pgSnp_.
+Any additional columns beyond the pgSnp defined columns will be appended to
+the output.
+The output dataset is in interval_ format.  (`Dataset missing?`_)
+
+.. _interval: ./static/formatHelp.html#interval
+.. _Dataset missing?: ./static/formatHelp.html
+.. _pgSnp: ./static/formatHelp.html#pgSnp
+
+**What it does**
+
+This separates the alleles from a pgSnp dataset into separate columns,
+along with the frequencies and scores that go with the alleles.  It skips
+any positions with more than 2 alleles.  If only a single allele is given, "N"
+is used for the second allele, with a frequency and score of zero.  If a
+column with reference alleles is provided, the value in that column is used
+in place of the "N" for single-allele positions.
+
+-----
+
+**Examples**
+
+- input pgSnp file::
+
+   chr1    256     257     A/C     2       3,4     10,20
+   chr1    56100   56101   A       1       5       30
+   chr1    77052   77053   A/G     2       6,7     40,50
+   chr1    110904  110905  A       1       8       60
+   etc.
+
+- output::
+
+   chr1    256     257     A       3       10       C       4       20
+   chr1    56100   56101   A       5       30       N       0       0
+   chr1    77052   77053   A       6       40       G       7       50
+   chr1    110904  110905  A       8       60       N       0       0
+   etc.
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dividePgSnp_input.pgSnp	Wed Sep 25 10:26:17 2013 -0400
@@ -0,0 +1,10 @@
+chr1	256	257	A/C	2	4,5	0,0
+chr1	56100	56101	A	1	8	0
+chr1	77052	77053	A/G	2	3,5	0,0
+chr1	110904	110905	A	1	5	0
+chr1	160592	160593	G	1	3	0
+chr1	640353	640354	G	1	1	0
+chr1	695314	695315	A	1	7	0
+chr1	713681	713682	A	1	8	0
+chr1	713965	713966	A/G	2	3,2	0,0
+chr1	714056	714057	A/G	2	1,5	0,0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dividePgSnp_output.txt	Wed Sep 25 10:26:17 2013 -0400
@@ -0,0 +1,10 @@
+chr1	256	257	A	4	0	C	5	0
+chr1	56100	56101	A	8	0	N	0	0
+chr1	77052	77053	A	3	0	G	5	0
+chr1	110904	110905	A	5	0	N	0	0
+chr1	160592	160593	G	3	0	N	0	0
+chr1	640353	640354	G	1	0	N	0	0
+chr1	695314	695315	A	7	0	N	0	0
+chr1	713681	713682	A	8	0	N	0	0
+chr1	713965	713966	A	3	0	G	2	0
+chr1	714056	714057	A	1	0	G	5	0