diff dpmix.xml @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 15 Jul 2013 10:47:35 -0400
parents 91e835060ad2
children 4188853b940b
line wrap: on
line diff
--- a/dpmix.xml	Mon Jun 03 12:29:29 2013 -0400
+++ b/dpmix.xml	Mon Jul 15 10:47:35 2013 -0400
@@ -1,18 +1,37 @@
 <tool id="gd_dpmix" name="Admixture" version="1.1.0">
-  <description>: Map genomic intervals resembling specified ancestral populations</description>
+  <description>: Map genomic intervals resembling specified source populations</description>
 
   <command interpreter="python">
-    dpmix.py "$input"
+    #import json
+    #import base64
+    #import zlib
+    #set $ind_names = $input.dataset.metadata.individual_names
+    #set $ind_colms = $input.dataset.metadata.individual_columns
+    #set $ind_dict = dict(zip($ind_names, $ind_colms))
+    #set $ind_json = json.dumps($ind_dict, separators=(',',':'))
+    #set $ind_comp = zlib.compress($ind_json, 9)
+    #set $ind_arg = base64.b64encode($ind_comp)
+    dpmix.py '$input'
     #if $input_type.choice == '0'
-      "gd_snp" "$input_type.data_source"
+      'gd_snp' '$input_type.data_source'
     #else if $input_type.choice == '1'
-      "gd_genotype" "1"
+      'gd_genotype' '1'
     #end if
-    "$switch_penalty" "$ap1_input" "$ap2_input" "$p_input" "$output" "$output2" "$output2.files_path" "$input.dataset.metadata.dbkey" "$input.dataset.metadata.ref" "$GALAXY_DATA_INDEX_DIR" "gd.heterochromatic.loc"
-    #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
-      #set $arg = '%s:%s' % ($individual_col, $individual)
-      "$arg"
-    #end for
+    #if $third_pop.choice == '0'
+      #set $ap3_arg = '/dev/null'
+      #set $ap3_name_arg = ''
+    #else if $third_pop.choice == '1'
+      #set $ap3_arg = $third_pop.ap3_input
+      #set $ap3_name_arg = $third_pop.ap3_input.name
+    #end if
+    #if $user_het.choice == '0'
+      #set $het_arg = 'use_installed'
+    #else if $user_het.choice == '1'
+      #set $het_arg = $user_het.het_file
+    #else if $user_het.choice == '2'
+      #set $het_arg = 'use_none'
+    #end if
+    '$switch_penalty' '$ap1_input' '$ap1_input.name' '$ap2_input' '$ap2_input.name' '$ap3_arg' '$ap3_name_arg' '$p_input' '$output' '$output2' '$output2.files_path' '$input.dataset.metadata.dbkey' '$input.dataset.metadata.ref' '$GALAXY_DATA_INDEX_DIR' 'gd.heterochromatic.loc' '$ind_arg' '$het_arg' '1'
   </command>
 
   <inputs>
@@ -38,11 +57,46 @@
       </when>
     </conditional>
 
-    <param name="ap1_input" type="data" format="gd_indivs" label="Ancestral population 1 individuals" />
-    <param name="ap2_input" type="data" format="gd_indivs" label="Ancestral population 2 individuals" />
+    <param name="ap1_input" type="data" format="gd_indivs" label="Source population 1 individuals" />
+    <param name="ap2_input" type="data" format="gd_indivs" label="Source population 2 individuals" />
+
+    <!-- Optional third source population; the ap3_input dataset is only requested when "yes" (1) is chosen. -->
+    <conditional name="third_pop">
+      <param name="choice" type="select" format="integer" label="Include third source population">
+        <option value="0" selected="true">no</option>
+        <option value="1">yes</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="ap3_input" type="data" format="gd_indivs" label="Source population 3 individuals" />
+      </when>
+    </conditional>
+
     <param name="p_input" type="data" format="gd_indivs" label="Potentially admixed individuals" />
 
     <param name="switch_penalty" type="float" min="0" value="10" label="Genotype switch penalty" help="Note: Depends on the density of SNPs.  For instance, with 50,000 SNPs in a vertebrate genome, 1.0 might be appropriate, with millions of SNPs, a value between 10 and 100 might be reasonable."/>
+
+    <!-- Heterochromatin source: 0 = installed, 1 = user-supplied dataset (het_file), 2 = none.  Every option has its own <when> case, even when it adds no inputs. -->
+    <conditional name="user_het">
+      <param name="choice" type="select" format="integer" label="Heterochromatin info">
+        <option value="0" selected="true">use installed</option>
+        <option value="1">use your own</option>
+        <option value="2">use none</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="het_file" type="data" format="txt" label="Heterochromatin dataset" />
+      </when>
+      <when value="2" />
+    </conditional>
+
+    <!--
+    <param name="add_logs" type="select" format="integer" label="Probabilities">
+      <option value="1" selected="true">add logs of probabilities</option>
+      <option value="0">add probabilities</option>
+    </param>
+    -->
+
   </inputs>
 
   <outputs>
@@ -88,27 +142,37 @@
 
 **What it does**
 
-The user specifies two "ancestral" populations (i.e., sources for
-chromosomes) and a set of potentially admixed individuals, and chooses
-between the sequence coverage or the estimated genotypes to measure
-the similarity of genomic intervals in admixed individuals to the two
-classes of ancestral chromosomes.  The user also picks a "genotype switch penalty",
-typically between 10 and 100.  For each potentially admixed individual,
-the program divides the genome into three "genotypes": (0) homozygous
-for the first ancestral population (i.e., both chromosomes from that
-population), (1) heterozygous, or (2) homozygous for the second ancestral
-population.  Parts of a chromosome that are labeled as "heterochromatic"
-are given the non-genotype "3".  Smaller values of the switch penalty
-(corresponding to more ancient admixture events) generally lead to the
-reconstruction of more frequent changes between genotypes.
+The user specifies two or three source populations (i.e., sources
+for chromosomes) and a set of potentially admixed individuals, and
+chooses between the sequence coverage or the estimated genotypes to
+measure the similarity of genomic intervals in admixed individuals to
+the two or three classes of source chromosomes.  The user also specifies a
+"switch penalty", controlling the strength of evidence needed to switch
+between source populations as the program scans along a chromosome.
+Choice of an appropriate value depends on the number of SNPs and, to
+a lesser extent, on the time since the admixture events.  With several
+million SNPs genome-wide, reasonable values might fall between 10
+and 100.  If there are 3 source populations, then for each potentially
+admixed individual the program divides the genome into six "genotypes":
+
+1. homozygous for the first source population (i.e., both chromosomes from that population),
+2. homozygous for the second source population,
+3. homozygous for the third source population,
+4. heterozygous for the first and second populations (i.e., one chromosome from each),
+5. heterozygous for the first and third populations, or
+6. heterozygous for the second and third populations.
+
+Parts of a reference chromosome that are labeled as "heterochromatic"
+are given the "non-genotype" 0.  With two source populations, only
+"genotypes" 1, 2 and 3 are possible, where 3 now means heterozygous in
+the two source populations.
 
 There are two output datasets generated.  A tabular dataset with chromosome,
 start, stop, and pairs of columns containing the "genotypes" from above
 and label from the admixed individual.  The second dataset is a composite
 dataset with general information from the run and a link to a pdf which
-graphically shows the ancestral population along each of the chromosomes.
+graphically shows the source population along each of the chromosomes.
 The second link is to a text file with summary information of the 
 "genotypes" over the whole genome.
-
   </help>
 </tool>