diff mageck_pathway.xml @ 0:c871e57e2abb draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mageck commit 71cef018eec5ee7ff7f3853599c027e80e2637fe
author iuc
date Wed, 14 Feb 2018 06:41:39 -0500
parents
children 37127c23a210
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mageck_pathway.xml	Wed Feb 14 06:41:39 2018 -0500
@@ -0,0 +1,156 @@
+<?xml version="1.0"?>
+<tool id="mageck_pathway" name="MAGeCK pathway" version="@VERSION@" >
+    <description>- given a ranked gene list, test whether one pathway is enriched</description>
+    <macros>
+        <import>mageck_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="version" />
+    <command detect_errors="exit_code"><![CDATA[
+## The output prefix is fixed (-n sample1) so the names of the result files are predictable.
+mageck pathway
+## Required inputs.
+--gene-ranking '$gene_ranking'
+--gmt-file '$gmt_file'
+-n sample1
+## Advanced values declared optional may be left empty; their flags are passed only when a value is set.
+#if $adv.single_ranking:
+    --single-ranking
+#end if
+--method $adv.method
+--sort-criteria  $adv.sort_criteria
+#if str($adv.ranking_column) not in ('', 'None')# --ranking-column $adv.ranking_column #end if#
+#if str($adv.ranking_column2) not in ('', 'None')# --ranking-column-2 $adv.ranking_column2 #end if#
+#if str($adv.pathway_alpha) not in ('', 'None')# --pathway-alpha $adv.pathway_alpha #end if#
+#if str($adv.permutation) not in ('', 'None')# --permutation $adv.permutation #end if#
+
+    ]]></command>
+    <inputs><!-- Two required datasets plus an "adv" section; each param's "argument" names the mageck flag it feeds. -->
+        <param name="gene_ranking" argument="--gene-ranking" type="data" format="tabular" label="Gene Ranking file" help="The gene ranking file generated by the gene test step. Only one enrichment comparison will be performed." />
+        <param name="gmt_file" argument="--gmt-file" type="data" format="tabular" label="Pathway GMT file" help="The pathway file in GMT format. See Help below for more information" />
+
+        <section name="adv" title="Advanced Options">
+            <param name="single_ranking" argument="--single-ranking" type="boolean" truevalue="--single-ranking" falsevalue="" checked="false" optional="true"
+                label="Single ranking file"
+                help="The provided file is a (single) gene ranking file, either positive or negative selection. Only one enrichment comparison will be performed. Default: No" />
+            <param name="method" argument="--method" type="select" label="Method for testing pathway enrichment" >
+                <option value="gsea" selected="True">GSEA</option>
+                <option value="rra">RRA</option>
+            </param>
+            <expand macro="sort_criteria" />
+            <param name="ranking_column" argument="--ranking-column" type="data_column" data_ref="gene_ranking" value="2" optional="true"
+                label="Gene Summary file column" help="Column number or label in gene summary file for gene ranking; can be either an integer of column number, or a string of column label. Default: 2 (the 3rd column)" />
+            <param name="ranking_column2" argument="--ranking-column-2" type="data_column" data_ref="gene_ranking" value="8" optional="true"
+                label="Gene Summary file column (positive selection)" help="Column number or label in gene summary file for gene ranking; can be either an integer of column number, or a string of column label. This option is used to determine the column for positive selections and is disabled if --single-ranking is specified. Default: 8 (the 9th column)" />
+            <param name="pathway_alpha" argument="--pathway-alpha" type="float" min="0" value="0.25" optional="true"
+                label="Alpha value for RRA pathway enrichment" help="The default alpha value for RRA pathway enrichment. Default: 0.25" />
+            <param argument="--permutation" type="integer" min="0" value="1000" optional="true" label="Permutation number for GSEA" help="Default: 1000" />
+            <param name="out_log" type="boolean" truevalue="True" falsevalue="" checked="false"
+                label="Output logfile" help="This file includes the logging information during the execution. Default: No" /><!-- not a mageck flag: only controls whether the log dataset is created -->
+        </section>
+
+    </inputs>
+
+    <outputs><!-- File names follow the fixed "-n sample1" prefix given to mageck; the log is plain text, not a table. -->
+        <data name="pathway_summary" format="tabular" from_work_dir="sample1.pathway_summary.txt" label="${tool.name} on ${on_string}: Pathway Summary" />
+        <data name="log" format="txt" from_work_dir="sample1.log" label="${tool.name} on ${on_string}: Log" >
+            <filter>adv['out_log'] is True</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test><!-- Ensure MAGeCK's demo1 test works; the log is requested too and is only compared by size (sim_size) -->
+            <param name="gene_ranking" ftype="tabular" value="out.test.gene_summary.txt"  />
+            <param name="gmt_file" ftype="tabular" value="in.mageckQC.gmt" />
+            <param name="ranking_column" value="2" />
+            <param name="out_log" value="True"/>
+            <output name="pathway_summary" value="out.pathway.pathway_summary.txt" />
+            <output name="log" value="out.pathway.log.txt" compare="sim_size" />
+        </test>
+    </tests>
+
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+
+MAGeCK pathway tests whether a pathway is enriched in one particular gene ranking, using either GSEA or robust ranking aggregation (RRA); see **More Information** below.
+
+-----
+
+**Inputs**
+
+**Gene Ranking files**
+
+A gene ranking file is required as input and can be produced using **mageck test**. An example of the gene ranking file (gene summary file) is as follows:
+
+======= ======= ============= =============== =========== ============ ================= =========== ============= =============== =========== ============ ================= ===========
+**id**  **num** **neg|score** **neg|p-value** **neg|fdr** **neg|rank** **neg|goodsgrna** **neg|lfc** **pos|score** **pos|p-value** **pos|fdr** **pos|rank** **pos|goodsgrna** **pos|lfc**
+------- ------- ------------- --------------- ----------- ------------ ----------------- ----------- ------------- --------------- ----------- ------------ ----------------- -----------
+ESPL1   12      6.4327e-10    7.558e-06       7.9e-05      1           -2.35             11          0.99725       0.99981         0.999992    615          0                 -0.07
+RPL18   12      6.4671e-10    7.558e-06       7.9e-05      2           -2.12             11          0.99799       0.99989         0.999992    620          0                 -0.32
+CDK1    12      2.6439e-09    7.558e-06       7.9e-05      3           -1.93             12          1.0           0.99999         0.999992    655          0                 -0.12
+======= ======= ============= =============== =========== ============ ================= =========== ============= =============== =========== ============ ================= ===========
+
+
+**Pathway file**
+
+MAGeCK pathway also requires a pathway file in GMT format. The GMT (Gene Matrix Transposed) file format is a tab delimited file format that describes gene sets and is consistent with the `GMT file in Gene Set Enrichment Analysis (GSEA)`_. In the GMT format, each row represents a gene set, with the first column containing the gene set name, and the second column containing a description for the gene set, followed by the names or ids of the genes in the gene set. You can download different GMT pathway files directly from the `GSEA MSigDB database`_.  An example of the GMT format is as follows:
+
+=============  =============================================================  =======================
+Gene Set Name  Description                                                    Genes
+-------------  -------------------------------------------------------------  -----------------------
+KEGG_RIBOSOME  http://www.broadinstitute.org/gsea/msigdb/cards/KEGG_RIBOSOME  RPL35   RPL23   RPL3...
+=============  =============================================================  =======================
+
+-----
+
+**Outputs**
+
+**Pathway summary file**
+
+An example of the pathway summary output file is as follows:
+
+============= ======= ============= =========== =============== =========== ============ ================ ============= ============= =========== =============== =========== ============ ================ ===========
+**id**        **num** **neg|score** **neg|rra** **neg|p-value** **neg|fdr** **neg|rank** **neg|goodgene** **neg|lfc**   **pos|score** **pos|rra** **pos|p-value** **pos|fdr** **pos|rank** **pos|goodgene** **pos|lfc**
+------------- ------- ------------- ----------- --------------- ----------- ------------ ---------------- ------------- ------------- ----------- --------------- ----------- ------------ ---------------- -----------
+KEGG_RIBOSOME 88      1             0           0               0           1            0                0             1             0           0               0           1            0                0
+============= ======= ============= =========== =============== =========== ============ ================ ============= ============= =========== =============== =========== ============ ================ ===========
+
+The contents of each column are as follows:
+
+* **id**  Gene ID
+* **num** The number of targeting sgRNAs for each gene
+* **neg|score** The RRA lo value of this gene in negative selection
+* **neg|p-value** The raw p-value (using permutation) of this gene in negative selection
+* **neg|fdr** The false discovery rate of this gene in negative selection
+* **neg|rank**  The ranking of this gene in negative selection
+* **neg|goodsgrna** The number of "good" sgRNAs, i.e., sgRNAs whose ranking is below the alpha cutoff (determined by the --gene-test-fdr-threshold option), in negative selection.
+* **neg|lfc** The log fold change of this gene in negative selection
+* **pos|score** The number of targeting sgRNAs for each gene in positive selection (usually the same as num.neg)
+* **pos|score** The RRA lo value of this gene in positive selection
+* **pos|p-value** The raw p-value of this gene in positive selection
+* **pos|fdr** The false discovery rate of this gene in positive selection
+* **pos|rank**  The ranking of this gene in positive selection
+* **pos|goodsgrna** The number of "good" sgRNAs, i.e., sgRNAs whose ranking is below the alpha cutoff (determined by the --gene-test-fdr-threshold option), in positive selection.
+* **pos|lfc** The log fold change of this gene in positive selection
+
+Genes are ranked by the p.neg field (by default). If you need a ranking by the p.pos, you can use the --sort-criteria option.
+
+-----
+
+**More Information**
+
+**Overview of the MAGeCK algorithm**
+
+Briefly, read counts from different samples are first median-normalized to adjust for the effect of library sizes and read count distributions. Then the variance of read counts is estimated by sharing information across features, and a negative binomial (NB) model is used to test whether sgRNA abundance differs significantly between treatments and controls. This approach is similar to those used for differential RNA-Seq analysis. We rank sgRNAs based on P-values calculated from the NB model, and use a modified robust ranking aggregation (RRA) algorithm named α-RRA to identify positively or negatively selected genes. More specifically, α-RRA assumes that if a gene has no effect on selection, then sgRNAs targeting this gene should be uniformly distributed across the ranked list of all the sgRNAs. α-RRA ranks genes by comparing the skew in rankings to the uniform null model, and prioritizes genes whose sgRNA rankings are consistently higher than expected. α-RRA calculates the statistical significance of the skew by permutation, and a detailed description of the algorithm is presented in the Materials and methods section of the `MAGeCK paper`_. Finally, MAGeCK reports positively and negatively selected pathways by applying α-RRA to the rankings of genes in a pathway.
+
+For more information on using MAGeCK, see the `MAGeCK website here`_.
+
+.. _`GMT file in Gene Set Enrichment Analysis (GSEA)`: http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29
+.. _`GSEA MSigDB database`: http://software.broadinstitute.org/gsea/login.jsp
+.. _`MAGeCK paper`: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0554-4
+.. _`MAGeCK website here`: https://sourceforge.net/p/mageck/wiki/QA/#using-mageck
+
+    ]]></help>
+      <expand macro="citations" />
+</tool>