comparison RBGOA.xml @ 2:5acf9dfdfa27 draft default tip

planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
author cristian
date Wed, 09 Nov 2022 08:57:54 +0000
parents f7287f82602f
children
comparison
equal deleted inserted replaced
1:f7287f82602f 2:5acf9dfdfa27
1 <tool id="RBGOA" name="RBGOA" version="0.2.0" python_template_version="3.5"> 1 <tool id="RBGOA" name="RBGOA" version="0.3.0" python_template_version="3.5">
2 <description>"Rank Based Gene Ontology Analysis"</description> 2 <description>"GO_MWU: a Rank Based Gene Ontology Analysis"</description>
3 <requirements> 3 <requirements>
4 <requirement type="package" version="5.6">r-ape</requirement> 4 <requirement type="package" version="5.6">r-ape</requirement>
5 <requirement type="package" version="1.20.3">r-getopt</requirement> 5 <requirement type="package" version="1.20.3">r-getopt</requirement>
6 </requirements> 6 </requirements>
7 <version_command>Rscript GO_MWU.R -v</version_command> 7 <version_command>Rscript GO_MWU.R -v</version_command>
8 <command detect_errors="exit_code"><![CDATA[ 8 <command detect_errors="exit_code"><![CDATA[
9 ln -s '${input1}' samples.tsv && 9 ln -s '${input1}' samples.tsv &&
10 ln -s '${input2}' annotation.tsv && 10 ln -s '${input2}' annotation.tsv &&
11 Rscript $__tool_directory__/GO_MWU.R -s $__tool_directory__ -i samples.tsv -a annotation.tsv -g $__tool_directory__/go.obo -d '$input3' -c '$input_filter.cluster' -o '$input_filter.over' -m '$input_filter.min' -p '$grouping.pcut' -t '$grouping.hcut' && 11 Rscript $__tool_directory__/GO_MWU.R -i samples.tsv -a annotation.tsv -g $__tool_directory__/go.obo -d '$input3' -c '$input_filter.cluster' -o '$input_filter.over' -m '$input_filter.min' -k '$plot_output.absval' -p '$grouping.pcut' -t '$grouping.hcut' -e '$plot_output.textsize' --l1 '$plot_output.lev1' --l2 '$plot_output.lev2' --l3 '$plot_output.lev3' &&
12 mv samples_${input3}.tsv div_input.tsv && 12 mv samples_${input3}.tsv div_input.tsv &&
13 mv dissim_${input3}_samples_annotation.tsv dissim.tsv && 13 mv dissim_${input3}_samples_annotation.tsv dissim.tsv &&
14 mv MWU_${input3}_samples.tsv mwu_file.tsv 14 mv MWU_${input3}_samples.tsv mwu_file.tsv
15 ]]></command> 15 ]]></command>
16 <inputs> 16 <inputs>
17 <param type="data" name="input1" format="tabular" label="Genes of interest with associated value" /> 17 <param type="data" name="input1" format="tabular" label="Genes of interest with associated value" />
18 <param type="data" name="input2" format="tabular" label="Gene GO annotation file" /> 18 <param type="data" name="input2" format="tabular" label="Gene-GO annotation file" />
19 <param type="select" name="input3" label="GO division" > 19 <param type="select" name="input3" label="GO division" >
20 <option value="BP" selected="true">BP</option> 20 <option value="BP" selected="true">BP</option>
21 <option value="MF">MF</option> 21 <option value="MF">MF</option>
22 <option value="CC">CC</option> 22 <option value="CC">CC</option>
23 </param> 23 </param>
24 <section name="input_filter" title="Input Filtering" expanded="true"> 24 <section name="input_filter" title="Input Filtering" expanded="true">
25 <param type="float" name="over" value="0.1" label="Filter out GO categories that have more than this fraction of total number of genes" /> 25 <param type="float" name="over" value="0.1" label="Filter out GO categories that include more than this fraction of the total number of genes" />
26 <param type="integer" name="min" value="5" label="Consider GO categories that have at least this many genes" /> 26 <param type="integer" name="min" value="5" label="Consider GO categories that have at least this many genes" />
27 <param type="float" name="cluster" value="0.25" label="Threshold for merging similar (gene-sharing) terms" /> 27 <param type="float" name="cluster" value="0.25" label="Threshold for merging similar (gene-sharing) terms" />
28 </section> 28 </section>
29 <section name="grouping" title="Significance and Grouping"> 29 <section name="plot_output" title="Plot tweaking" expanded="true">
30 <param type="float" name="absval" value="1.0" label="absValue" help="Threshold for 'good genes'. Default: 1, to use with log2(foldchange). Read help below!" />
31 <param type="float" name="lev1" value="0.1" label="Level 1" help="Significance level for smallest text" />
32 <param type="float" name="lev2" value="0.05" label="Level 2" help="Significance level for intermediate text" />
33 <param type="float" name="lev3" value="0.01" label="Level 3" help="Significance level for largest text" />
34 <param type="float" name="textsize" value="1.2" label="TextSize for plot labels" />
35 </section>
36 <section name="grouping" title="Significance and Grouping" expanded="true">
30 <param type="float" name="pcut" value="1e-2" label="Adjusted p-value cutoff for representative GO" /> 37 <param type="float" name="pcut" value="1e-2" label="Adjusted p-value cutoff for representative GO" />
31 <param type="float" name="hcut" value="0.9" label="Height at which to cut the GO terms tree to get 'independent groups'" /> 38 <param type="float" name="hcut" value="0.9" label="Height at which to cut the GO terms tree to get 'independent groups'" />
32 </section> 39 </section>
33 </inputs> 40 </inputs>
34 <outputs> 41 <outputs>
35 <data name="graph" format="pdf" from_work_dir="Rplots.pdf" label="Tree output" /> 42 <data name="graph" format="pdf" from_work_dir="Rplots.pdf" label="Plot of GO terms for (${input3})" />
36 <data name="div_input" format="tabular" from_work_dir="div_input.tsv" label="GO Division ${input3}" /> 43 <data name="div_input" format="tabular" from_work_dir="div_input.tsv" label="Augmented ${input3} GO terms for genes" />
37 <data name="dissim" format="tabular" from_work_dir="dissim.tsv" label="Dissimilarity table" /> 44 <data name="dissim" format="tabular" from_work_dir="dissim.tsv" label="Dissimilarity matrix of GO terms" />
38 <data name="mwu" format="tabular" from_work_dir="mwu_file.tsv" label="Delta rank for GO (${input3})" /> 45 <data name="mwu" format="tabular" from_work_dir="mwu_file.tsv" label="MWU test result for (${input3})" />
39 <data name="results" format="tabular" from_work_dir="results.tsv" label="Raw data for graph" /> 46 <data name="results" format="tabular" from_work_dir="results.tsv" label="Raw data for plot" />
40 <data name="best_go" format="tabular" from_work_dir="best_go.tsv" label="Best GO terms" /> 47 <data name="best_go" format="tabular" from_work_dir="best_go.tsv" label="Best GO terms" />
41 </outputs> 48 </outputs>
42 <tests> 49 <tests>
43 <test> 50 <test>
44 <param name="input1" value="heats.csv"/> 51 <param name="input1" value="heats.csv"/>
79 <help><![CDATA[ 86 <help><![CDATA[
80 ========================================================== 87 ==========================================================
81 Rank-based Gene Ontology Analysis with Adaptive Clustering 88 Rank-based Gene Ontology Analysis with Adaptive Clustering
82 ========================================================== 89 ==========================================================
83 90
84 Usage: GO_MWU.R [ 91 What it does
92 ------------
85 93
86 -[-help|h]] 94 In contrast to most other "GO enrichment analysis" methods (e.g., GeneMerge or DAVID), this one does not look for GO categories enriched among "significant" genes.
87 95
88 --scriptdir | -s <character> 96 Instead, it measures whether each GO category is significantly enriched with either up- or down-regulated genes.
97 Basically, the method tests whether the genes belonging to a certain GO category are significantly bunched up near the top or the bottom of the global ranked list of genes, instead of being spread evenly all over it.
98 The test used is called the Mann-Whitney U (MWU) test.
89 99
90 --input | -i <character> 100 The major advantage of this approach is that the experimenter does not have to impose an arbitrary threshold for initial selection of "significant genes", and thus the whole dataset can be used to gain information.
91
92 --goAnnotations | -a <character>
93 101
94 --goDatabase | -g <character> 102 In fact, no preliminary statistical test is required prior to the analysis; the method is best suited to analyze the distribution of raw measures, such as dN/dS values, log-fold-changes of gene expression, or kME (correlation) values from WGCNA.
95 103
96 --goDivision | -d <character> 104 The method can also be run in a traditional mode, looking for GO categories significantly over-represented among "significant genes" (based on Fisher's exact test). To make the method work in this mode, the measure of significance should be binary (1 or 0, i.e., significant or not).
97 105
98 --threads | -t <integer> 106 **"absValue"**: Genes with the measure value exceeding this value will be counted as "good genes".
107 When using signed log(p-values) use the value 1.30103 which corresponds to -log(0.05, 10). Specify the value 0.001 if you are doing
108 Fisher's exact test for standard GO enrichment or analyzing a WGCNA module (all non-zero genes = "good genes").
109 Use a value of 1 if you're using log2(fold-change).
110 This parameter does not affect the statistics; it serves a purely illustrative purpose.
99 111
100 --pcut | -p <double> 112 The method automatically retrieves all the missing parental terms for the lower-level GO categories.
113 Then, fully redundant GO categories (i.e., containing exactly the same genes) are collapsed under the name of the lower-level (more specific) term.
114 Then, highly similar categories are merged according to complete linkage clustering based on the fraction of shared genes.
115 The distance measure for clustering, introduced in Kosiol et al. (2008), is the number of genes shared between the two GO categories within the analyzed dataset divided by the size of the smaller of the two categories.
101 116
102 --hcut | -c <double> 117 The resulting hierarchical tree is then “cut” at the adjustable “height” ('Threshold for merging similar (gene-sharing) terms' parameter) to merge clustered categories.
118 The default for this parameter is 0.25, implying that a group of categories will be merged if the most dissimilar two of them share >75% of genes included in the smaller of the two.
119 The merged categories inherit the name of the largest one.
120 This simplifies the GO hierarchy, generates biologically meaningful groups of categories tailored for the particular dataset, and improves the multiple testing situation.
121
122 In the final plot, the method shows hierarchical clustering of GO categories based on the number of genes shared between them, to indicate which categories might be significant because of the same genes.
123
124 ------
125
126 Output Files
127 ------------
128
129 The plot
130 ^^^^^^^^
131
132 The plot consists of three parts:
133
134 | - Hierarchical clustering tree of significant GO categories based on shared genes in the current dataset.
135 Categories with no branch length between them are subsets of each other and their significance is most likely driven by the same genes.
136
137 | - Category names, plotted in different colors and fonts.
138 Fonts indicate the level of statistical significance, colors indicate enrichment of GO categories with either up- (red) or down- (blue) regulated genes.
139 The category names are preceded by the fraction indicating the number of "good candidates" relative to the total number of genes belonging to this category.
140 The "good candidates" are the genes exceeding an arbitrary **'absValue'** cutoff in their significance measure.
141 Adjust the 'absValue' parameter according to what your measure is.
142 By default it is set to 1, which suits log2(fold-change) measures; if the measure is a signed log p-value, set it to -log(0.05,10) = 1.30103 (so, the "good candidates" would be the ones with raw p-value < 0.05).
143 Ideally we would like to see more than one such gene per displayed GO category.
144 With 'Level 1' set to 1, the script will display all the categories containing "good candidates", which is a good way to summarize the whole GO content of the experiment.
145 Note that the 'absValue' parameter does not affect the statistics; it serves a purely illustrative purpose.
146 In the Fisher-test mode (binary significance measure) and signed WGCNA module analysis the colors are not used; in that case specify absValue=0.001 to make the script display the fraction of genes with non-zero measure within a GO category.
147
148 | - The legend giving the correspondence of the fonts to significance thresholds.
149 The method corrects the p-values using the Benjamini-Hochberg false discovery rate procedure, except when analyzing WGCNA modules; in that case the false discovery rate is determined from ten permutations where significance measures are randomly shuffled among genes.
150 To set different thresholds for plotting, change parameters 'Level 1', 'Level 2' and 'Level 3' in the 'Plot tweaking' section.
151
152 In addition, the script prints out the number of GO categories displayed and the fraction of "good candidates" that these categories account for. This is useful to evaluate whether the generated GO summary really accounts for a substantial portion of what was going on.
153
154 If the labels of the plot are too crowded or too small, you can adjust the 'TextSize for plot labels' parameter and relaunch the analysis.
155
156 The tables
157 ^^^^^^^^^^
158
159 The script generates 5 tables.
160
161 Augmented GO terms for genes
162 Main data table containing reformatted and augmented GO terms for each gene (in addition to the originally listed terms, the script finds all their parental terms if any were missing), and measures of interest.
163
164 Dissimilarity matrix of GO terms
165 Dissimilarity matrix of GO categories based on the number of genes shared between them in the dataset.
166
167 MWU Test
168 The results of the MWU test.
169
170 The raw data for plot
171 The raw data represented in the plot.
172
173 Best GO terms
174 GO terms that best represent *independent* groups of significant GO terms.
175
103 176
104 ]]></help> 177 ]]></help>
105 <citations> 178 <citations>
179 <citation type="doi">10.1186/s12864-015-1540-2</citation>
106 <citation type="bibtex"> 180 <citation type="bibtex">
107 @misc{githubGO_MWU, 181 @misc{githubGO_MWU,
108 author = {LastTODO, FirstTODO}, 182 author = {Matz, Mikhail},
109 year = {TODO}, 183 year = {2021},
110 title = {GO_MWU}, 184 title = {GO_MWU},
111 publisher = {GitHub}, 185 publisher = {GitHub},
112 journal = {GitHub repository}, 186 journal = {GitHub repository},
113 url = {https://github.com/z0on/GO_MWU}, 187 url = {https://github.com/z0on/GO_MWU},
114 }</citation> 188 }</citation>