Mercurial > repos > cristian > rbgoa
comparison RBGOA.xml @ 2:5acf9dfdfa27 draft default tip
planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
author | cristian |
---|---|
date | Wed, 09 Nov 2022 08:57:54 +0000 |
parents | f7287f82602f |
children |
comparison
equal
deleted
inserted
replaced
1:f7287f82602f | 2:5acf9dfdfa27 |
---|---|
1 <tool id="RBGOA" name="RBGOA" version="0.2.0" python_template_version="3.5"> | 1 <tool id="RBGOA" name="RBGOA" version="0.3.0" python_template_version="3.5"> |
2 <description>"Rank Based Gene Ontology Analysis"</description> | 2 <description>"GO_MWU: a Rank Based Gene Ontology Analysis"</description> |
3 <requirements> | 3 <requirements> |
4 <requirement type="package" version="5.6">r-ape</requirement> | 4 <requirement type="package" version="5.6">r-ape</requirement> |
5 <requirement type="package" version="1.20.3">r-getopt</requirement> | 5 <requirement type="package" version="1.20.3">r-getopt</requirement> |
6 </requirements> | 6 </requirements> |
7 <version_command>Rscript GO_MWU.R -v</version_command> | 7 <version_command>Rscript GO_MWU.R -v</version_command> |
8 <command detect_errors="exit_code"><![CDATA[ | 8 <command detect_errors="exit_code"><![CDATA[ |
9 ln -s '${input1}' samples.tsv && | 9 ln -s '${input1}' samples.tsv && |
10 ln -s '${input2}' annotation.tsv && | 10 ln -s '${input2}' annotation.tsv && |
11 Rscript $__tool_directory__/GO_MWU.R -s $__tool_directory__ -i samples.tsv -a annotation.tsv -g $__tool_directory__/go.obo -d '$input3' -c '$input_filter.cluster' -o '$input_filter.over' -m '$input_filter.min' -p '$grouping.pcut' -t '$grouping.hcut' && | 11 Rscript $__tool_directory__/GO_MWU.R -i samples.tsv -a annotation.tsv -g $__tool_directory__/go.obo -d '$input3' -c '$input_filter.cluster' -o '$input_filter.over' -m '$input_filter.min' -k '$plot_output.absval' -p '$grouping.pcut' -t '$grouping.hcut' -e '$plot_output.textsize' --l1 '$plot_output.lev1' --l2 '$plot_output.lev2' --l3 '$plot_output.lev3' && |
12 mv samples_${input3}.tsv div_input.tsv && | 12 mv samples_${input3}.tsv div_input.tsv && |
13 mv dissim_${input3}_samples_annotation.tsv dissim.tsv && | 13 mv dissim_${input3}_samples_annotation.tsv dissim.tsv && |
14 mv MWU_${input3}_samples.tsv mwu_file.tsv | 14 mv MWU_${input3}_samples.tsv mwu_file.tsv |
15 ]]></command> | 15 ]]></command> |
16 <inputs> | 16 <inputs> |
17 <param type="data" name="input1" format="tabular" label="Genes of interest with associated value" /> | 17 <param type="data" name="input1" format="tabular" label="Genes of interest with associated value" /> |
18 <param type="data" name="input2" format="tabular" label="Gene GO annotation file" /> | 18 <param type="data" name="input2" format="tabular" label="Gene-GO annotation file" /> |
19 <param type="select" name="input3" label="GO division" > | 19 <param type="select" name="input3" label="GO division" > |
20 <option value="BP" selected="true">BP</option> | 20 <option value="BP" selected="true">BP</option> |
21 <option value="MF">MF</option> | 21 <option value="MF">MF</option> |
22 <option value="CC">CC</option> | 22 <option value="CC">CC</option> |
23 </param> | 23 </param> |
24 <section name="input_filter" title="Input Filtering" expanded="true"> | 24 <section name="input_filter" title="Input Filtering" expanded="true"> |
25 <param type="float" name="over" value="0.1" label="Filter out GO categories that have more than this fraction of total number of genes" /> | 25 <param type="float" name="over" value="0.1" label="Filter out GO categories that include more than this fraction of the total number of genes" /> |
26 <param type="integer" name="min" value="5" label="Consider GO categories that have at least this many genes" /> | 26 <param type="integer" name="min" value="5" label="Consider GO categories that have at least this many genes" /> |
27 <param type="float" name="cluster" value="0.25" label="Threshold for merging similar (gene-sharing) terms" /> | 27 <param type="float" name="cluster" value="0.25" label="Threshold for merging similar (gene-sharing) terms" /> |
28 </section> | 28 </section> |
29 <section name="grouping" title="Significance and Grouping"> | 29 <section name="plot_output" title="Plot tweaking" expanded="true"> |
30 <param type="float" name="absval" value="1.0" label="absValue" help="Threshold for 'good genes'. Default: 1, to use with log2(foldchange). Read help below!" /> | |
31 <param type="float" name="lev1" value="0.1" label="Level 1" help="Significance level for smallest text" /> | |
32 <param type="float" name="lev2" value="0.05" label="Level 2" help="Significance level for intermediate text" /> | |
33 <param type="float" name="lev3" value="0.01" label="Level 3" help="Significance level for largest text" /> | |
34 <param type="float" name="textsize" value="1.2" label="TextSize for plot labels" /> | |
35 </section> | |
36 <section name="grouping" title="Significance and Grouping" expanded="true"> | |
30 <param type="float" name="pcut" value="1e-2" label="Adjusted p-value cutoff for representative GO" /> | 37 <param type="float" name="pcut" value="1e-2" label="Adjusted p-value cutoff for representative GO" /> |
31 <param type="float" name="hcut" value="0.9" label="Height at which to cut the GO terms tree to get 'independent groups'" /> | 38 <param type="float" name="hcut" value="0.9" label="Height at which to cut the GO terms tree to get 'independent groups'" /> |
32 </section> | 39 </section> |
33 </inputs> | 40 </inputs> |
34 <outputs> | 41 <outputs> |
35 <data name="graph" format="pdf" from_work_dir="Rplots.pdf" label="Tree output" /> | 42 <data name="graph" format="pdf" from_work_dir="Rplots.pdf" label="Plot of GO terms for (${input3})" /> |
36 <data name="div_input" format="tabular" from_work_dir="div_input.tsv" label="GO Division ${input3}" /> | 43 <data name="div_input" format="tabular" from_work_dir="div_input.tsv" label="Augmented ${input3} GO terms for genes" /> |
37 <data name="dissim" format="tabular" from_work_dir="dissim.tsv" label="Dissimilarity table" /> | 44 <data name="dissim" format="tabular" from_work_dir="dissim.tsv" label="Dissimilarity matrix of GO terms" /> |
38 <data name="mwu" format="tabular" from_work_dir="mwu_file.tsv" label="Delta rank for GO (${input3})" /> | 45 <data name="mwu" format="tabular" from_work_dir="mwu_file.tsv" label="MWU test result for (${input3})" /> |
39 <data name="results" format="tabular" from_work_dir="results.tsv" label="Raw data for graph" /> | 46 <data name="results" format="tabular" from_work_dir="results.tsv" label="Raw data for plot" /> |
40 <data name="best_go" format="tabular" from_work_dir="best_go.tsv" label="Best GO terms" /> | 47 <data name="best_go" format="tabular" from_work_dir="best_go.tsv" label="Best GO terms" /> |
41 </outputs> | 48 </outputs> |
42 <tests> | 49 <tests> |
43 <test> | 50 <test> |
44 <param name="input1" value="heats.csv"/> | 51 <param name="input1" value="heats.csv"/> |
79 <help><![CDATA[ | 86 <help><![CDATA[ |
80 ========================================================== | 87 ========================================================== |
81 Rank-based Gene Ontology Analysis with Adaptive Clustering | 88 Rank-based Gene Ontology Analysis with Adaptive Clustering |
82 ========================================================== | 89 ========================================================== |
83 | 90 |
84 Usage: GO_MWU.R [ | 91 What it does |
92 ------------ | |
85 | 93 |
86 -[-help|h]] | 94 In contrast to most other "GO enrichment analysis" methods (e.g., GeneMerge or DAVID), this one does not look for GO categories enriched among "significant" genes. |
87 | 95 |
88 --scriptdir | -s <character> | 96 Instead, it measures whether each GO category is significantly enriched with either up- or down-regulated genes. |
97 Basically, the method tests whether the genes belonging to a certain GO category are significantly bunched up near the top or the bottom of the global ranked list of genes, instead of being spread evenly all over it. | |
98 The test used is called the Mann-Whitney U (MWU) test. | |
89 | 99 |
90 --input | -i <character> | 100 The major advantage of this approach is that the experimenter does not have to impose an arbitrary threshold for initial selection of "significant genes", and thus the whole dataset can be used to gain information. |
91 | |
92 --goAnnotations | -a <character> | |
93 | 101 |
94 --goDatabase | -g <character> | 102 In fact, no preliminary statistical test is required prior to the analysis; the method is best suited to analyze the distribution of raw measures, such as dN/dS values, log-fold-changes of gene expression, or kME (correlation) values from WGCNA. |
95 | 103 |
96 --goDivision | -d <character> | 104 The method can also be run in a traditional mode, looking for GO categories significantly over-represented among "significant genes" (based on Fisher's exact test). To make the method work in this mode, the measure of significance should be binary (1 or 0, i.e., significant or not). |
97 | 105 |
98 --threads | -t <integer> | 106 **"absValue"**: Genes with the measure value exceeding this value will be counted as "good genes". |
107 When using signed log(p-values) use the value 1.30103 which corresponds to -log(0.05, 10). Specify the value 0.001 if you are doing | |
108 Fisher's exact test for standard GO enrichment or analyzing a WGCNA module (all non-zero genes = "good genes"). | |
109 Use a value of 1 if you're using log2(fold-change). | |
110 This parameter does not affect the statistics and serves an illustrative purpose only. | |
99 | 111 |
100 --pcut | -p <double> | 112 The method automatically retrieves all the missing parental terms for the lower-level GO categories. |
113 Then, fully redundant GO categories (i.e., containing exactly the same genes) are collapsed under the name of the lower-level (more specific) term. | |
114 Then, highly similar categories are merged according to complete linkage clustering based on the fraction of shared genes. | |
115 The distance measure for clustering, introduced in Kosiol et al. 2008, is based on the fraction of shared genes, i.e. the number of genes shared between the two GO categories within the analyzed dataset divided by the size of the smaller of the two categories; the distance is one minus this fraction. | |
101 | 116 |
102 --hcut | -c <double> | 117 The resulting hierarchical tree is then “cut” at the adjustable “height” ('Threshold for merging similar (gene-sharing) terms' parameter) to merge clustered categories. |
118 The default for this parameter is 0.25, implying that a group of categories will be merged if the most dissimilar two of them share >75% of genes included in the smaller of the two. | |
119 The merged categories inherit the name of the largest one. | |
120 This simplifies the GO hierarchy, generates biologically meaningful groups of categories tailored for the particular dataset, and improves the multiple testing situation. | |
121 | |
122 In the final plot, the method shows hierarchical clustering of GO categories based on the number of genes shared between them, to indicate which categories might be significant because of the same genes. | |
123 | |
124 ------ | |
125 | |
126 Output Files | |
127 ------------ | |
128 | |
129 The plot | |
130 ^^^^^^^^ | |
131 | |
132 The plot consists of three parts: | |
133 | |
134 | - Hierarchical clustering tree of significant GO categories based on shared genes in the current dataset. | |
135 Categories with no branch length between them are subsets of each other and their significance is most likely driven by the same genes. | |
136 | |
137 | - Category names, plotted in different colors and fonts. | |
138 Fonts indicate the level of statistical significance, colors indicate enrichment of GO categories with either up- (red) or down- (blue) regulated genes. | |
139 The category names are preceded by the fraction indicating the number of "good candidates" relative to the total number of genes belonging to this category. | |
140 The "good candidates" are the genes exceeding an arbitrary **'absValue'** cutoff in their significance measure. | |
141 Adjust 'absValue' parameter according to what your measure is. | |
142 By default it is set to 1, which suits log2(fold-change) measures; set it to 1.30103, i.e. -log(0.05,10), if the measure is a signed log p-value (so that the "good candidates" are the genes with raw p-value < 0.05). | |
143 Ideally we would like to see more than one such gene per displayed GO category. | |
144 With 'Level 1' set to 1, the script will display all the categories containing "good candidates", which is a good way to summarize the whole GO content of the experiment. | |
145 Note that the 'absValue' parameter does not affect the statistics and serves an illustrative purpose only. | |
146 In the Fisher-test mode (binary significance measure) and signed WGCNA module analysis the colors are not used; in that case specify absValue=0.001 to make the script display the fraction of genes with non-zero measure within a GO category. | |
147 | |
148 | - The legend giving the correspondence of the fonts to significance thresholds. | |
149 The method corrects the p-values using the Benjamini-Hochberg false discovery rate procedure, except when analyzing WGCNA modules; in that case the false discovery rate is determined from ten permutations where significance measures are randomly shuffled among genes. | |
150 To set different thresholds for plotting, change parameters 'Level 1', 'Level 2' and 'Level 3' in the 'Plot tweaking' section. | |
151 | |
152 In addition, the script prints out the number of GO categories displayed and the fraction of "good candidates" that these categories account for. This is useful to evaluate whether the generated GO summary really accounts for a substantial portion of what was going on. | |
153 | |
154 If the labels of the plot are too crowded or too small, you can adjust the 'TextSize for plot labels' parameter and relaunch the analysis. | |
155 | |
156 The tables | |
157 ^^^^^^^^^^ | |
158 | |
159 The script generates 5 tables. | |
160 | |
161 Augmented GO terms for genes | |
162 main data table containing reformatted and augmented GO terms for each gene (in addition to the originally listed terms, the script finds all their parental terms if any were missing), and measures of interest. | |
163 | |
164 Dissimilarity table | |
165 dissimilarity matrix of GO categories based on the number of genes shared between them in the dataset. | |
166 | |
167 MWU Test | |
168 The results of MWU test. | |
169 | |
170 The raw data for plot | |
171 The raw data represented in the plot. | |
172 | |
173 Best GO terms | |
174 GO terms that best represent *independent* groups of significant GO terms. | |
175 | |
103 | 176 |
104 ]]></help> | 177 ]]></help> |
105 <citations> | 178 <citations> |
179 <citation type="doi">10.1186/s12864-015-1540-2</citation> | |
106 <citation type="bibtex"> | 180 <citation type="bibtex"> |
107 @misc{githubGO_MWU, | 181 @misc{githubGO_MWU, |
108 author = {LastTODO, FirstTODO}, | 182 author = {Matz, Mikhail}, |
109 year = {TODO}, | 183 year = {2021}, |
110 title = {GO_MWU}, | 184 title = {GO_MWU}, |
111 publisher = {GitHub}, | 185 publisher = {GitHub}, |
112 journal = {GitHub repository}, | 186 journal = {GitHub repository}, |
113 url = {https://github.com/z0on/GO_MWU}, | 187 url = {https://github.com/z0on/GO_MWU}, |
114 }</citation> | 188 }</citation> |