Mercurial > repos > miller-lab > genome_diversity

diff rank_pathways.xml @ 27:8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
author: Richard Burhans <burhans@bx.psu.edu>
date: Mon, 15 Jul 2013 10:47:35 -0400
parents: 95a05c1ef5d5
children: 184d14e4270d
--- a/rank_pathways.xml	Mon Jun 03 12:29:29 2013 -0400
+++ b/rank_pathways.xml	Mon Jul 15 10:47:35 2013 -0400
@@ -1,28 +1,52 @@
-<tool id="gd_calc_freq" name="Rank Pathways" version="1.1.0">
+<tool id="gd_calc_freq" name="Rank Pathways" version="1.2.0">
   <description>: Assess the impact of a gene set on KEGG pathways</description>
 
   <command interpreter="python">
-    #if str($output_format) == 'a'
+    #if $rank_by.choice == 'pct'
       rank_pathways_pct.py
-    #else if str($output_format) == 'b'
+      --input '$rank_by.input1'
+      --columnENSEMBLT '$rank_by.t_col1'
+      --inBckgrndfile '$rank_by.input2'
+      --columnENSEMBLTBckgrnd '$rank_by.t_col2'
+      --columnKEGGBckgrnd '$rank_by.k_col2'
+      --statsTest '$rank_by.stat'
+      --output '$output'
+    #else if $rank_by.choice == 'paths'
       calclenchange.py
+      '--loc_file=${GALAXY_DATA_INDEX_DIR}/gd.rank.loc'
+      '--species=${rank_by.input.metadata.dbkey}'
+      '--input=${rank_by.input}'
+      '--output=${output}'
+      '--posKEGGclmn=${rank_by.kpath}'
+      '--KEGGgeneposcolmn=${rank_by.kgene}'
     #end if
-    "--loc_file=${GALAXY_DATA_INDEX_DIR}/gd.rank.loc"
-    "--species=${input.metadata.dbkey}"
-    "--input=${input}"
-    "--output=${output}"
-    "--posKEGGclmn=${kpath}"
-    "--KEGGgeneposcolmn=${kgene}"
   </command>
 
   <inputs>
-    <param name="input" type="data" format="tab" label="Dataset" />
-    <param name="kgene" type="data_column" data_ref="input" label="Column with KEGG gene ID"  />
-    <param name="kpath" type="data_column" data_ref="input" numerical="false" label="Column with KEGG pathways" />
-    <param name="output_format" type="select" label="Output">
-      <option value="a" selected="true">ranked by percentage of genes affected</option>
-      <option value="b">ranked by change in length and number of paths</option>
-    </param>
+    <conditional name="rank_by">
+      <param name="choice" type="select" label="Rank by">
+        <option value="pct" selected="true">percentage of genes affected</option>
+        <option value="paths">change in length and number of paths</option>
+      </param>
+      <when value="pct">
+        <!-- using fields similar to the Rank Terms tool -->
+        <param name="input1" type="data" format="tabular" label="Query dataset" />
+        <param name="t_col1" type="data_column" data_ref="input1" label="Column with ENSEMBL transcript codes" />
+        <param name="input2" type="data" format="tabular" label="Background dataset" />
+        <param name="t_col2" type="data_column" data_ref="input2" label="Column with ENSEMBL transcript codes" />
+        <param name="k_col2" type="data_column" data_ref="input2" label="Column with KEGG pathways" />
+        <param name="stat" type="select" label="Statistic for determining enrichment/depletion">
+          <option value="fisher" selected="true">two-tailed Fisher's exact test</option>
+          <option value="hypergeometric">hypergeometric test</option>
+          <option value="binomial">binomial probability</option>
+        </param>
+      </when>
+      <when value="paths">
+        <param name="input" type="data" format="tabular" label="Dataset" />
+        <param name="kgene" type="data_column" data_ref="input" label="Column with KEGG gene ID" />
+        <param name="kpath" type="data_column" data_ref="input" numerical="false" label="Column with KEGG pathways" />
+      </when>
+    </conditional>
   </inputs>
 
   <outputs>
@@ -31,11 +55,6 @@
 
   <tests>
     <test>
-      <param name="input" value="test_in/sample.gd_sap" ftype="gd_sap" />
-      <param name="kgene" value="10" />
-      <param name="kpath" value="12" />
-      <param name="output_format" value="a" />
-      <output name="output" file="test_out/rank_pathways/rank_pathways.tabular" />
     </test>
   </tests>
 
@@ -43,9 +62,10 @@
 
 **Dataset formats**
 
-The input and output datasets are in tabular_ format.
+All of the input and output datasets are in tabular_ format.
 The input dataset must have columns with KEGG gene ID and pathways.
-The output dataset is described below.
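+That requirement applies when ranking by change in length and number of paths; when ranking by percentage of genes affected, the query dataset must have a column with ENSEMBL transcript codes, and the background dataset must have columns with ENSEMBL transcript codes and KEGG pathways.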
+The output datasets are described below.
 (`Dataset missing?`_)
 
 .. _tabular: ./static/formatHelp.html#tab
@@ -56,7 +76,8 @@
 **What it does**
 
 This tool produces a table ranking the pathways based on the percentage
-of genes in an input dataset, out of the total in each pathway.
+of each pathway's genes (as listed in the background dataset) that
+appear in the query dataset.
 Alternatively, the tool ranks the pathways based on the change in
 length and number of paths connecting sources and sinks.  This change is
 calculated between graphs representing pathways with and without excluding
@@ -65,14 +86,15 @@
 Sinks are all the nodes representing the final reactants/products in
 the pathway.
 
-If pathways are ranked by percentage of genes affected, the output is
-a tabular dataset with the following columns:
+If pathways are ranked by percentage of genes affected, the output contains
+a row for each KEGG pathway, with the following columns:
 
-1. number of genes in the pathway present in the input dataset
-2. percentage of the total genes in the pathway included in the input dataset
-3. rank of the frequency (from high freq to low freq)
-4. Fisher probability of enrichment/depletion of pathway genes in the input dataset
-5. name of the pathway
+1. count: the number of genes in the query set that are in this pathway
+2. representation: the percentage of this pathway's genes (from the background dataset) that appear in the query set
+3. ranking of this pathway, based on its representation ("1" is highest)
+4. probability of depletion of this pathway in the query dataset
+5. probability of enrichment of this pathway in the query dataset
+6. KEGG pathway
 
 If pathways are ranked by change in length and number of paths, the
 output is a tabular dataset with the following columns:
@@ -97,20 +119,20 @@
    Contig62_chr1_19011969_19012646 265  chr1  19012240 ENSCAFT00000000144   ENSCAFP00000000125   *    161   R    483960  probably damaging    N
    etc.
  
-- output ranked by percentage of genes affected::
+- output ranked by percentage of genes affected [need new sample output with more columns]::
 
-   3       0.25    1       cfa03450=Non-homologous end-joining
-   1       0.25    1       cfa00750=Vitamin B6 metabolism
-   2       0.2     3       cfa00290=Valine, leucine and isoleucine biosynthesis
-   3       0.18    4       cfa00770=Pantothenate and CoA biosynthesis
+   3   0.25   1   cfa03450=Non-homologous end-joining
+   1   0.25   1   cfa00750=Vitamin B6 metabolism
+   2   0.2    3   cfa00290=Valine, leucine and isoleucine biosynthesis
+   3   0.18   4   cfa00770=Pantothenate and CoA biosynthesis
    etc.
 
 - output ranked by change in length and number of paths::
 
-   3.64	  8.44	4.8	2	4	9	5	1	cfa00260=Glycine, serine and threonine metabolism
-   7.6	  9.6	2	1	3	5	2	2	cfa00240=Pyrimidine metabolism
-   0.05	  2.67	2.62	6	1	30	29	3	cfa00982=Drug metabolism - cytochrome P450
-   -0.08  8.33	8.41	84	1	30	29	3	cfa00564=Glycerophospholipid metabolism
+    3.64   8.44   4.8     2   4    9    5   1   cfa00260=Glycine, serine and threonine metabolism
+    7.6    9.6    2       1   3    5    2   2   cfa00240=Pyrimidine metabolism
+    0.05   2.67   2.62    6   1   30   29   3   cfa00982=Drug metabolism - cytochrome P450
+   -0.08   8.33   8.41   84   1   30   29   3   cfa00564=Glycerophospholipid metabolism
    etc.
 
   </help>
author	Richard Burhans <burhans@bx.psu.edu>
date	Mon, 15 Jul 2013 10:47:35 -0400
parents	95a05c1ef5d5
children	184d14e4270d