annotate random_forest.xml @ 2:caba07f41453 draft default tip

"planemo upload for repository https://github.com/secimTools/SECIMTools/tree/main/galaxy commit 498abad641099412df56f04ff6e144e4193bbc34-dirty"
author malex
date Thu, 10 Jun 2021 15:41:17 +0000
parents 2e7d47c0b027
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
1 <tool id="secimtools_random_forest" name="Random Forest (RF)" version="@WRAPPER_VERSION@">
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
2 <description>algorithm to select features.</description>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
3 <macros>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
4 <import>macros.xml</import>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
5 </macros>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
6 <expand macro="requirements" />
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
7 <command detect_errors="exit_code"><![CDATA[
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
8 random_forest.py
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
9 --input $input
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
10 --design $design
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
11 --ID $uniqID
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
12 --group $group
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
13 --snum $number_of_estimators
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
14 --num $number_of_factors
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
15 --out $outfile1
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
16 --out2 $outfile2
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
17 --figure $figure
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
18 ]]></command>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
19 <inputs>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
20 <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
21 <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
22 <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
23 <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
24 <param name="number_of_estimators" type="integer" size="30" value="1000" label="Number of trees in the forest" help="Recommend a minimum of 1000 trees."/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
25 <param name="number_of_factors" type="integer" size="30" value="20" label="Number of factors to plot" help="Plots the (Default = 20) most important factors."/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
26 </inputs>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
27 <outputs>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
28 <data format="csv" name="outfile1" label="${tool.name} on ${on_string}: Transformed Data"/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
29 <data format="csv" name="outfile2" label="${tool.name} on ${on_string}: Importance Factors"/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
30 <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Variable Importance Plot"/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
31 </outputs>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
32 <tests>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
33 <test>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
34 <param name="input" value="ST000006_data.tsv"/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
35 <param name="design" value="ST000006_design.tsv"/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
36 <param name="uniqID" value="Retention_Index" />
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
37 <param name="group" value="White_wine_type_and_source" />
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
38 <output name="outfile1" file="ST000006_random_forest_out.tsv" compare="sim_size" delta="10000" />
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
39 <output name="outfile2" file="ST000006_random_forest_out2.tsv" compare="sim_size" delta="10000" />
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
40 <output name="figure" file="ST000006_random_forest_figure.pdf" compare="sim_size" delta="10000" />
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
41 </test>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
42 </tests>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
43 <help><![CDATA[
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
44
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
45 @TIP_AND_WARNING@
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
46
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
47 **Tool Description**
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
48
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
49 The tool identifies features that are different between treatment groups based on the random forest algorithm.
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
50 Based on Classification and Regression Trees (CART), random forests are an ensemble learning method for classification, regression and variable importance evaluation.
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
51 More details about the algorithm can be found in the following article:
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
52
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
53 Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32.
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
54
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
55 **NOTE: The use of machine learning algorithms (including random forest) on datasets with a small number of samples may produce ambiguous results and should be approached with caution.**
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
56
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
57 --------------------------------------------------------------------------------------------------------------
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
58
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
59 **Input**
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
60
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
61 - Two input datasets are required.
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
62
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
63 @WIDE@
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
64
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
65 **NOTE:** The sample IDs must match the sample IDs in the Design File (below).
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
66 Extra columns will automatically be ignored.
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
67
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
68
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
69 @METADATA@
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
70
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
71 @UNIQID@
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
72
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
73 @GROUP@
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
74
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
75 **Number of Trees in the Forest**
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
76
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
77 - Run a minimum of 1000 trees.
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
78
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
79 **Number of factors to plot**
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
80
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
81 - Number of most important factors to display in the variable importance plot (Default = 20).
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
82
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
83 --------------------------------------------------------------------------------
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
84
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
85 **Output**
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
86
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
87 This tool will always output three different files:
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
88
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
89 (1) a TSV file with features ranked according to their relative importance
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
90
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
91 (2) a TSV file where ranked features from the wide format dataset are saved in columns in the order that corresponds to their relative importance
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
92
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
93 (3) and a PDF file with a variable importance plot for the first 50 components. The variable importance plot displays the X (Default = 20) most important features based on the random forest algorithm. The color of each feature changes from the most important (dark blue) to the least important (light blue).
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
94
2
caba07f41453 "planemo upload for repository https://github.com/secimTools/SECIMTools/tree/main/galaxy commit 498abad641099412df56f04ff6e144e4193bbc34-dirty"
malex
parents: 1
diff changeset
95 **NOTE:** The user can take the resulting TSV file and plot any two (or three) features using the Scatter Plot 2D or Scatter Plot 3D tools.
1
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
96
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
97 A plot of the two (or three) most important features is recommended since they are probably the most meaningful, but other features can also be considered for plotting.
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
98
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
99 **To plot the 2 most important features**: use the SECIM Tools 'Scatter Plot 2D' tool on the transformed dataset to plot the features against each other and evaluate separation levels.
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
100
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
101 ]]></help>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
102 <expand macro="citations"/>
2e7d47c0b027 "planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff changeset
103 </tool>