view random_forest.xml @ 2:caba07f41453 draft default tip

"planemo upload for repository https://github.com/secimTools/SECIMTools/tree/main/galaxy commit 498abad641099412df56f04ff6e144e4193bbc34-dirty"
author malex
date Thu, 10 Jun 2021 15:41:17 +0000
parents 2e7d47c0b027
children
line wrap: on
line source

<tool id="secimtools_random_forest" name="Random Forest (RF)" version="@WRAPPER_VERSION@">
    <!-- One-line summary of what the tool does. -->
    <description>algorithm to select features.</description>

    <!-- Pull in the shared macro file; the macros expanded in this wrapper
         are presumably defined there (verify against macros.xml). -->
    <macros>
        <import>macros.xml</import>
    </macros>

    <!-- Tool dependencies come from the shared "requirements" macro. -->
    <expand macro="requirements"/>
    <!-- Command line for random_forest.py.
         Dataset paths and free-text values are single-quoted so that a value
         containing spaces or shell metacharacters reaches the script as one
         argument instead of being split or interpreted by the shell.
         The two integer parameters are left unquoted.
         A non-zero exit code marks the job as failed (detect_errors). -->
    <command detect_errors="exit_code"><![CDATA[
random_forest.py
--input '$input'
--design '$design'
--ID '$uniqID'
--group '$group'
--snum $number_of_estimators
--num $number_of_factors
--out '$outfile1'
--out2 '$outfile2'
--figure '$figure'
    ]]></command>
    <inputs>
        <!-- The two required datasets: the wide-format data table and the design file. -->
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>

        <!-- Column names typed by the user. Both default to an empty string, so
             an empty_field validator rejects the form before an empty value
             can reach the command line and swallow the following flag. -->
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers.">
            <validator type="empty_field" message="Enter the name of the unique feature ID column."/>
        </param>
        <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications.">
            <validator type="empty_field" message="Enter the name of the group/treatment column."/>
        </param>

        <!-- Numeric settings. min="1" rejects zero or negative counts in the form. -->
        <param name="number_of_estimators" type="integer" size="30" value="1000" min="1" label="Number of trees in the forest" help="Recommend a minimum of 1000 trees."/>
        <param name="number_of_factors" type="integer" size="30" value="20" min="1" label="Number of factors to plot" help="Plots the (Default = 20) most important factors."/>
    </inputs>
    <outputs>
        <!-- The two tables are declared as tabular: the help text describes them
             as TSV files, the test fixtures are .tsv, and the help recommends
             passing them to tools that take tabular input.
             NOTE(review): confirm that random_forest.py writes tab-separated output. -->
        <data format="tabular" name="outfile1" label="${tool.name} on ${on_string}: Transformed Data"/>
        <data format="tabular" name="outfile2" label="${tool.name} on ${on_string}: Importance Factors"/>
        <!-- Variable importance plot. -->
        <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Variable Importance Plot"/>
    </outputs>
    <tests>
        <!-- Single run on the ST000006 fixtures. The tree count and the number
             of plotted factors are not set here, so the parameter defaults apply.
             Each output is compared to its reference by file size only
             (sim_size with a delta of 10000). -->
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index"/>
            <param name="group" value="White_wine_type_and_source"/>
            <output name="outfile1" file="ST000006_random_forest_out.tsv" compare="sim_size" delta="10000"/>
            <output name="outfile2" file="ST000006_random_forest_out2.tsv" compare="sim_size" delta="10000"/>
            <output name="figure" file="ST000006_random_forest_figure.pdf" compare="sim_size" delta="10000"/>
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool identifies features that are different between treatment groups based on the random forest algorithm.
Based on Classification and Regression Trees (CART), random forests are an ensemble learning method for classification, regression and variable importance evaluation.
More details about the algorithm can be found in the original paper:

Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32.

**NOTE: Machine learning algorithms (including random forest) can give unreliable results on datasets with a small number of samples, so they should be used with caution on such datasets.**

--------------------------------------------------------------------------------------------------------------

**Input**

    - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File (below).
Extra columns will automatically be ignored.


@METADATA@

@UNIQID@

@GROUP@

**Number of Trees in the Forest**

    - Run a minimum of 1000 trees.

**Number of factors to plot**

    - Plots the X (Default = 20) most important factors.

--------------------------------------------------------------------------------

**Output**

This tool will always output three different files:

(1) a TSV file with features ranked according to their relative importance

(2) a TSV file where ranked features from the wide format dataset are saved in columns in the order that corresponds to their relative importance

(3) a PDF file containing a variable importance plot. The variable importance plot displays the X (Default = 20) most important features based on the random forest algorithm. The color of each feature changes from the most important (dark blue) to the least important (light blue).

    **NOTE:** The user can take the resulting TSV file and plot any two (or three) features using the Scatter Plot 2D or Scatter Plot 3D tools.

Plotting the two (or three) most important features is recommended since they are probably the most meaningful, but other features can also be considered for plotting.

    **To plot the 2 most important features**: use the SECIM Tools 'Scatter Plot 2D' tool on the transformed dataset to plot the features against each other and evaluate separation levels.

    ]]></help>
    <!-- Citation entries come from the shared "citations" macro
         (presumably defined in the imported macros.xml; verify there). -->
    <expand macro="citations"/>
</tool>