Mercurial > repos > iuc > goseq
diff goseq.xml @ 0:ade933eff007 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/goseq commit b7dcd020c6a15fa55f392cc09cbc37580d6e75c4
author | iuc |
---|---|
date | Thu, 17 Nov 2016 16:40:19 -0500 |
parents | |
children | 9d1256d9ef0b |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/goseq.xml Thu Nov 17 16:40:19 2016 -0500 @@ -0,0 +1,140 @@ +<tool id="goseq" name="goseq" version="0.2.2"> + <description>tests for overrepresented gene categories</description> + <requirements> + <requirement type="package" version="1.3.2">r-optparse</requirement> + <requirement type="package" version="1.22.0">bioconductor-goseq</requirement> + </requirements> + <stdio> + <regex match="Execution halted" + source="both" + level="fatal" + description="Execution halted." /> + <regex match="Error in" + source="both" + level="fatal" + description="An undefined error occured, please check your input carefully and contact your administrator." /> + <regex match="Fatal error" + source="both" + level="fatal" + description="An undefined error occured, please check your input carefully and contact your administrator." /> + </stdio> + <command><![CDATA[ + Rscript '$__tool_directory__'/goseq.r --dge_file '$dge_file' + --length_file '$length_file' + --category_file '$category_file' + #if $methods['wallenius']: + --wallenius_tab '$wallenius_tab' + #end if + #if $methods['hypergeometric']: + --nobias_tab 'nobias_tab' + #end if + --repcnt '$methods.repcnt' + --sampling_tab '$sampling_tab' + --p_adj_method '$p_adj_method' + --use_genes_without_cat '$use_genes_without_cat' + --make_plots '$make_plots' + --length_bias_plot '$length_bias_plot' + --sample_vs_wallenius_plot '$sample_vs_wallenius_plot' + ]]></command> + <inputs> + <param name="dge_file" help="A tabular file with gene names in the first column, and TRUE or FALSE in the last column. TRUE means a gene is differentially expressed. See help section for details." label="Differentially expressed gene file" type="data" format="tabular" /> + <param name="length_file" label="Gene length file for length bias correction" help="You can calculate the gene length using the get length and gc content tool" type="data" format="tabular" /> + <param name="category_file" label="Gene category file" help="You can obtain a mapping of gene id to gene ontology using the getgo tool" type="data" format="tabular" /> + <param name="use_genes_without_cat" help="For example, a large number of gene may have no GO term annotated. If this option is set to FALSE, those genes will be ignored in the calculation of p-values. If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested" + label="Count genes without any category?" type="boolean"/> + <section name="methods" title="Method options" expanded="True"> + <param name="wallenius" type="boolean" checked="true" label="Use wallenius method" help="See help for details" /> + <param name="hypergeometric" type="boolean" checked="false" label="Use hypergeometric method" help="Does not use gene length information. See help for details" /> + <param name="repcnt" help="Draw this many random control gene sets. Set to 0 to not do sampling. Larger values take a long time" label="sampling depth" size="3" type="integer" min="0" max="10000" value="0" /> + </section> + <param name="p_adj_method" type="select" label="Select a method for multiple hypothesis testing correction"> + <option value="BH" selected="true">Benjamini-Hochberg [FDR] (1995)</option> + <option value="holm">Holm (1979)</option> + <option value="hommel">Hommel (1988)</option> + <option value="hochberg">Hochberg (1988)</option> + <option value="bonferroni">Bonferroni</option> + <option value="BY">Benjamini - Yekutieli (2001)</option> + </param> + <param help="These plots may help you compare the different p-value estimation methods that goseq can use." label="Produce diagnostic plots?" name="make_plots" type="boolean"></param> + </inputs> + <outputs> + <data name="length_bias_plot" format="pdf" label="length bias plot"> + <filter>make_plots</filter> + <filter>methods['hypergeometric']</filter> + </data> + <data name="sample_vs_wallenius_plot" format="pdf" label="Plot P-value from sampling against wallenius distribution"> + <filter>methods['repcnt'] != 0</filter> + <filter>methods['wallenius']</filter> + <filter>make_plots</filter> + </data> + <data name="nobias_tab" format="tabular" label="Ranked category list - no length bias correction"> + <filter>methods['hypergeometric']</filter> + </data> + <data name="sampling_tab" format="tabular" label="Ranked category list - sampling"> + <filter>methods['repcnt'] != 0</filter> + </data> + <data name="wallenius_tab" format="tabular" label="Ranked category list - wallenius method"> + <filter>methods['wallenius']</filter> + </data> + </outputs> + <tests> + <test> + <param name="dge_file" value="dge_list.tab" ftype="tabular"/> + <param name="length_file" value="gene_length.tab" ftype="tabular"/> + <param name="category_file" value="category.tab" ftype="tabular"/> + <param name="use_genes_without_cat" value="true" /> + <output name="wallenius_tab" file="wal.tab" compare="re_match"/> + </test> + </tests> + <help> + + **What it does** + + Detects Gene Ontology and/or other user defined categories which are over/under-represented in RNA-seq data. + + Options map closely to the excellent manual_ + + + **Input files** + + *DGE list:* + goseq needs a tabular file with genes in the first column, and TRUE or FALSE in the last column. + TRUE means the gene should count as differentially expressed, FALSE means it is not differentially expressed. + You can use the "Compute an expression on every row" tool to create a TRUE / FALSE column for your dataset. + + *Gene length file:* + goseq needs information about the length of a gene to correct for potential length bias in differentially expressed genes + using a prodbability weight function (PWF). + The format of this file is tabular, with gene_id in the first column and length in the second column. + The "get length and gc content" tool can produce such a file. + + *Gene category file:* + You will also need a file describing the membership of genes in categories. The format of this file is gene_id in the first column, + category name in the second column. If you are interested in gene ontology categories you can use the getgo file to retrive + gene ontologies for model organisms, or you can construct your own file. + + **Method options** + + 3 methods, "Wallenius", "Sampling" and "Hypergeometric", can be used to calculate the p-values as follows. + + *"Wallenius"* approximates the true distribution of numbers of members of a category amongst DE genes by the Wallenius non-central hypergeometric distribution. + This distribution assumes that within a category all genes have the same probability of being chosen. + Therefore, this approximation works best when the range in probabilities obtained by the probability weighting function is small. + + *"Sampling"* uses random sampling to approximate the true distribution and uses it to calculate the p-values for over (and under) representation of categories. + Although this is the most accurate method given a high enough value of sampling depth, its use quickly becomes computationally prohibitive. + + *"Hypergeometric"* assumes there is no bias in power to detect differential expression at all and calculates the p-values using a standard hypergeometric distribution. + Useful if you wish to test the effect of selection bias on your results. + + CAUTION: "Hypergeometric" should NEVER be used for producing results for biological interpretation. + If there is genuinely no bias in power to detect DE in your experiment, the PWF will reflect this and the other methods will produce accuracte results. + + .. _manual: https://bioconductor.org/packages/release/bioc/vignettes/goseq/inst/doc/goseq.pdf + + + </help> + <citations> + <citation type="doi">10.1186/gb-2010-11-2-r14</citation> + </citations> +</tool>