diff rank_pathways.xml @ 21:d6b961721037

Miller Lab Devshed version 4c04e35b18f6
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 05 Nov 2012 12:44:17 -0500
parents 8ae67e9fb6ff
children 95a05c1ef5d5
line wrap: on
line diff
--- a/rank_pathways.xml	Tue Oct 23 14:38:04 2012 -0400
+++ b/rank_pathways.xml	Mon Nov 05 12:44:17 2012 -0500
@@ -7,19 +7,19 @@
     #else if str($output_format) == 'b'
       calclenchange.py
     #end if
-        "--loc_file=${GALAXY_DATA_INDEX_DIR}/gd.rank.loc"
-        "--species=${input.metadata.dbkey}"
-        "--input=${input}"
-        "--output=${output}"
-        "--posKEGGclmn=${input.metadata.kegg_path}"
-        "--KEGGgeneposcolmn=${input.metadata.kegg_gene}"
+    "--loc_file=${GALAXY_DATA_INDEX_DIR}/gd.rank.loc"
+    "--species=${input.metadata.dbkey}"
+    "--input=${input}"
+    "--output=${output}"
+    "--posKEGGclmn=${kpath}"
+    "--KEGGgeneposcolmn=${kgene}"
   </command>
 
   <inputs>
-    <param name="input" type="data" format="gd_sap" label="Table">
-        <validator type="metadata" check="kegg_gene,kegg_path" message="Missing KEGG gene code column and/or KEGG pathway code/name column metadata.  Click the pencil icon in the history item to edit/save the metadata attributes" />
-    </param>
-    <param name="output_format" type="select" label="Output format">
+    <param name="input" type="data" format="tab" label="Dataset" />
+    <param name="kgene" type="data_column" data_ref="input" label="Column with KEGG gene ID"  />
+    <param name="kpath" type="data_column" data_ref="input" numerical="false" label="Column with KEGG pathways" />
+    <param name="output_format" type="select" label="Output">
       <option value="a" selected="true">ranked by percentage of genes affected</option>
       <option value="b">ranked by change in length and number of paths</option>
     </param>
@@ -32,6 +32,8 @@
   <tests>
     <test>
       <param name="input" value="test_in/sample.gd_sap" ftype="gd_sap" />
+      <param name="kgene" value="10" />
+      <param name="kpath" value="12" />
       <param name="output_format" value="a" />
       <output name="output" file="test_out/rank_pathways/rank_pathways.tabular" />
     </test>
@@ -39,6 +41,18 @@
 
   <help>
 
+**Dataset formats**
+
+The input and output datasets are in tabular_ format.
+The input dataset must have columns with KEGG gene ID and pathways.
+The output dataset is described below.
+(`Dataset missing?`_)
+
+.. _tabular: ./static/formatHelp.html#tab
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
 **What it does**
 
 This tool produces a table ranking the pathways based on the percentage
@@ -54,23 +68,49 @@
 If pathways are ranked by percentage of genes affected, the output is
 a tabular dataset with the following columns:
 
-   1. number of genes in the pathway present in the input dataset
-   2. percentage of the total genes in the pathway included in the input dataset
-   3. rank of the frequency (from high freq to low freq)
-   4. name of the pathway
+1. number of genes in the pathway present in the input dataset
+2. percentage of the total genes in the pathway included in the input dataset
+3. rank of the frequency (from high freq to low freq)
+4. name of the pathway
 
 If pathways are ranked by change in length and number of paths, the
 output is a tabular dataset with the following columns:
 
-   1. change in the mean length of paths between sources and sinks
-   2. mean length of paths between sources and sinks in the pathway including the genes in the input dataset.  If the pathway do not have sources/sinks, the length is assumed to be infinite (I)
-   3. mean length of paths between sources and sinks in the pathway excluding the genes in the input dataset.  If the pathway do not have sources/sinks, the length is assumed to be infinite (I)
-   4. rank of the change in the mean length of paths between sources and sinks (from high change to low change)
-   5. change in the number of paths between sources and sinks
-   6. number of paths between sources and sinks in the pathway including the genes in the input dataset.  If the pathway do not have sources/sinks, it is assumed to be a circuit (C)
-   7. number of paths between sources and sinks in the pathway excluding the genes in the input dataset.  If the pathway do not have sources/sinks, it is assumed to be a circuit (C)
-   8. rank of the change in the number of paths between sources and sinks (from high change to low change)
-   9. name of the pathway
+1. change in the mean length of paths between sources and sinks
+2. mean length of paths between sources and sinks in the pathway including the genes in the input dataset.  If the pathway do not have sources/sinks, the length is assumed to be infinite (I)
+3. mean length of paths between sources and sinks in the pathway excluding the genes in the input dataset.  If the pathway do not have sources/sinks, the length is assumed to be infinite (I)
+4. rank of the change in the mean length of paths between sources and sinks (from high change to low change)
+5. change in the number of paths between sources and sinks
+6. number of paths between sources and sinks in the pathway including the genes in the input dataset.  If the pathway do not have sources/sinks, it is assumed to be a circuit (C)
+7. number of paths between sources and sinks in the pathway excluding the genes in the input dataset.  If the pathway do not have sources/sinks, it is assumed to be a circuit (C)
+8. rank of the change in the number of paths between sources and sinks (from high change to low change)
+9. name of the pathway
+
+-----
+
+**Examples**
+
+- input (column 10 for KEGG gene ID, column 12 for KEGG pathways)::
+ 
+   Contig39_chr1_3261104_3261850   414  chr1  3261546  ENSCAFT00000000001   ENSCAFP00000000001   S    667   F    476153  probably damaging    cfa00230=Purine metabolism.cfa00500=Starch and sucrose metabolism.cfa00740=Riboflavin metabolism.cfa00760=Nicotinate and nicotinamide metabolism.cfa00770=Pantothenate and CoA biosynthesis.cfa01100=Metabolic pathways
+   Contig62_chr1_19011969_19012646 265  chr1  19012240 ENSCAFT00000000144   ENSCAFP00000000125   *    161   R    483960  probably damaging    N
+   etc.
+ 
+- output ranked by percentage of genes affected::
+
+   3       0.25    1       cfa03450=Non-homologous end-joining
+   1       0.25    1       cfa00750=Vitamin B6 metabolism
+   2       0.2     3       cfa00290=Valine, leucine and isoleucine biosynthesis
+   3       0.18    4       cfa00770=Pantothenate and CoA biosynthesis
+   etc.
+
+- output ranked by change in length and number of paths::
+
+   3.64	  8.44	4.8	2	4	9	5	1	cfa00260=Glycine, serine and threonine metabolism
+   7.6	  9.6	2	1	3	5	2	2	cfa00240=Pyrimidine metabolism
+   0.05	  2.67	2.62	6	1	30	29	3	cfa00982=Drug metabolism - cytochrome P450
+   -0.08  8.33	8.41	84	1	30	29	3	cfa00564=Glycerophospholipid metabolism
+   etc.
 
   </help>
 </tool>