view deepmicro.xml @ 2:2d20b3a1babd draft default tip

planemo upload for repository https://github.com/paulzierep/DeepMicro commit 574cb8c241e18a15f006bf307235c7dd23f07c69
author iuc
date Tue, 23 Jul 2024 15:56:32 +0000
parents c58c1a99578b
children
line wrap: on
line source

<tool id="deepmicro" name="DeepMicro" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Representation learning and classification framework</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools" />
    <expand macro="requirements" />
    <expand macro="version" />
    <command detect_errors="exit_code"><![CDATA[
        ## Inputs are linked into data/ and handed to DM.py by bare file name;
        ## the outputs are later collected from results/ and the working directory.
        mkdir data results &&
        ln -s '$features' data/features.csv &&

        #if $mode.mode_type == "only_encoding":

            ## One DM.py call per parameter set, classifier disabled (--no_clf).
            #for $params in $mode.parameter_set:
                #set $rl = $params.rl_type
                #set $choice = str($rl.rl_type_choice)

                DM.py -r 1
                -cd features.csv
                '$choice'
                ## PCA and random projection take no dimension argument
                #if $choice not in ("--pca", "--rp"):
                    -dm '$rl.dm'
                #end if
                --save_rep
                --no_clf
                -t \${GALAXY_SLOTS:-8} &&
            #end for

        #else:

            ln -s '$mode.class_labels' data/labels.csv &&

            ## One DM.py call per parameter set, each cross validating a classifier.
            #for $params in $mode.parameter_set:
                #set $rl = $params.rl_type
                #set $choice = str($rl.rl_type_choice)

                ## arguments shared by every run
                DM.py
                -r '$mode.repeat'
                -cd features.csv
                -cl labels.csv
                --save_rep
                -m '$rl.classifier'
                --scoring '$mode.scoring'
                -f '$mode.folds'
                -t \${GALAXY_SLOTS:-8}

                #if $choice == "no_rl":
                    ## classifier is trained on the raw features, no encoding flag is passed
                    && echo "Only train Clf - no encoding!"
                #else:
                    ## representation learning flag ...
                    '$choice'
                    #if $choice not in ("--pca", "--rp"):
                        ## ... plus the dimension option for the autoencoder variants
                        -dm '$rl.dm'
                    #end if
                #end if
                &&
            #end for

        #end if

        echo Done !
        ]]>
    </command>
    <inputs>
        <param argument="--features" type="data" format="tabular" label="Feature table" help="Dataset containing the features of samples"/>
        <!-- The mode selects between encoding only and encoding followed by a cross-validated classifier -->
        <conditional name="mode">
            <param name="mode_type" type="select" label="Mode" help="The tool can either only create a latent representation of the data or create a latent representation of the data and cross validate a classifier using that encoding.">
                <option value="only_encoding">Create only encoding</option>
                <option value="e_and_c">Create encoding and cross validate a classifier</option>
            </param>
            <when value="only_encoding">
                <!-- Each parameter set triggers one DM.py run; at least one is required, otherwise nothing would be computed -->
                <repeat name="parameter_set" title="Parameter Set" min="1">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" type="select" label="Representation learning type" help="The type of representation learning">
                            <option value="--pca">PCA</option>
                            <option value="--rp">Random Projection</option>
                            <option value="--ae">Shallow Autoencoder or Deep Autoencoder (SAE, DAE)</option>
                            <option value="--vae">Variational Autoencoder (VAE)</option>
                            <option value="--cae">Convolutional Autoencoder (CAE)</option>
                        </param>
                        <!-- PCA and random projection have no further options; the autoencoders expose the dm macro -->
                        <when value="--pca"/>
                        <when value="--rp"/>
                        <when value="--ae">
                            <expand macro="dm" />
                        </when>
                        <when value="--vae">
                            <expand macro="dm" />
                        </when>
                        <when value="--cae">
                            <expand macro="dm" />
                        </when>
                    </conditional>
                </repeat>
            </when>
            <when value="e_and_c">
                <param argument="--class_labels" type="data" format="tabular" label="Class labels" help="Dataset containing the class labels corresponding to the features"/>
                <param name="scoring" type="select" label="Scoring function for the classifier" help="The classifier will be optimized for this scoring function.">
                    <option value="roc_auc">ROC AUC</option>
                    <option value="accuracy">Accuracy</option>
                    <option value="f1">F1 Score</option>
                    <option value="recall">Recall</option>
                    <option value="precision">Precision</option>
                </param>
                <param name="folds" type="integer" value="5" label="Cross-validation folds" min="2" max="10" help="The number of folds for cross-validation in the training set"/>
                <param name="repeat" type="integer" value="1" label="Repeat the experiment with different seed" min="1" max="5" help="Repeat the experiment with different seeds. Leads to a different train / test split each time."/>
                <!-- Each parameter set triggers one DM.py run; at least one is required, otherwise no result file would be written -->
                <repeat name="parameter_set" title="Parameter Set" min="1">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" type="select" label="Representation learning type" help="The type of representation learning. `Train on input` trains the classifier on the input features without representation learning">
                            <option value="--pca">PCA</option>
                            <option value="--rp">Random Projection</option>
                            <option value="--ae">Shallow Autoencoder or Deep Autoencoder (SAE, DAE)</option>
                            <option value="--vae">Variational Autoencoder (VAE)</option>
                            <option value="--cae">Convolutional Autoencoder (CAE)</option>
                            <option value="no_rl">Train on input</option>
                        </param>
                        <!-- Every branch offers the classifier choice; only the autoencoders additionally expose the dm macro -->
                        <when value="no_rl">
                            <expand macro="clfs" />
                        </when>
                        <when value="--pca">
                            <expand macro="clfs" />
                        </when>
                        <when value="--rp">
                            <expand macro="clfs" />
                        </when>
                        <when value="--ae">
                            <expand macro="dm" />
                            <expand macro="clfs" />
                        </when>
                        <when value="--vae">
                            <expand macro="dm" />
                            <expand macro="clfs" />
                        </when>
                        <when value="--cae">
                            <expand macro="dm" />
                            <expand macro="clfs" />
                        </when>
                    </conditional>
                </repeat>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="results" format="tabular" from_work_dir="./results/*_result.txt" label="${tool.name} on ${on_string}: Results">
            <!-- the result table only exists when a classifier is trained -->
            <filter>mode["mode_type"] == "e_and_c"</filter>
        </data>
        <collection name="encoded_features" type="list" label="Encoded Features">
            <!-- Only collected in encoding-only mode: when a classifier is trained, the encoded features generated
            by the tool cover the training set only, which is not very useful, so they are omitted in that mode.
            TODO: change the tool to export the encoded features of the complete dataset when classification is performed as well -->
            <filter>mode["mode_type"] == "only_encoding"</filter>
            <discover_datasets directory="results" pattern="(?P&lt;designation&gt;.*)_rep\.csv" format="tabular" visible="false" />
        </collection>
        <collection name="model" type="list" label="Keras Models">
            <!-- Model files (*.h5) picked up from the working directory, collected in encoding-only mode only.
            The dot of the extension is escaped so that it matches a literal "." instead of any character.
            NOTE(review): confirm whether the models should also be exported when a classifier is trained -->
            <filter>mode["mode_type"] == "only_encoding"</filter>
            <discover_datasets directory="." pattern="(?P&lt;designation&gt;.*)\.h5" format="data" visible="false" />
        </collection>
    </outputs>
    <tests>

        <!-- encoding only -->
        <!-- one parameter set: autoencoder -->
        <test expect_num_outputs="2">
            <param name="features" value="UserDataExample.csv" />
            <conditional name="mode">
                <param name="mode_type" value="only_encoding" />
                <repeat name="parameter_set">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" value="--ae" />
                        <param name="dm" value="40" />
                    </conditional>
                </repeat>
            </conditional>
            <output_collection name="encoded_features" type="list">
                <!-- output is non-deterministic, so only the number of lines is checked -->
                <element name="AE[40]_features" ftype="tabular" >
                    <assert_contents>
                        <has_n_lines n="20"/>
                        <!-- <has_n_columns n="40" sep="," /> -->
                    </assert_contents>
                </element>
            </output_collection>
        </test>

        <!-- one parameter set: PCA (no dimension option) -->
        <test expect_num_outputs="2">
            <param name="features" value="UserDataExample.csv" />
            <conditional name="mode">
                <param name="mode_type" value="only_encoding" />
                <repeat name="parameter_set">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" value="--pca" />
                    </conditional>
                </repeat>
            </conditional>
            <output_collection name="encoded_features" type="list">
                <element name="PCA_features" ftype="tabular" >
                    <assert_contents>
                        <has_n_lines n="20"/>
                        <!-- <has_n_columns n="40" sep="," /> -->
                    </assert_contents>
                </element>
            </output_collection>
        </test>

        <!-- multiple parameter sets: one encoded feature table per set -->
        <test expect_num_outputs="2">
            <param name="features" value="UserDataExample.csv" />
            <conditional name="mode">
                <param name="mode_type" value="only_encoding" />

                <repeat name="parameter_set">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" value="--pca" />
                    </conditional>
                </repeat>

                <repeat name="parameter_set">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" value="--ae" />
                        <param name="dm" value="40" />
                    </conditional>
                </repeat>

            </conditional>

            <output_collection name="encoded_features" type="list">
                <element name="AE[40]_features" ftype="tabular" >
                    <assert_contents>
                        <has_n_lines n="20"/>
                        <!-- <has_n_columns n="40" sep="," /> -->
                    </assert_contents>
                </element>
                <element name="PCA_features" ftype="tabular" >
                    <assert_contents>
                        <has_n_lines n="20"/>
                        <!-- <has_n_columns n="40" sep="," /> -->
                    </assert_contents>
                </element>
            </output_collection>

        </test>

        <!-- encoding and classifier -->
        <!-- one parameter set, also exercising the scoring / folds / repeat parameters -->
        <test expect_num_outputs="1">
            <param name="features" value="UserDataExample.csv" />
            <conditional name="mode">
                <param name="mode_type" value="e_and_c" />
                <param name="class_labels" value="UserLabelExample.csv" />
                <param name="scoring" value="roc_auc" />
                <param name="folds" value="2" />
                <param name="repeat" value="2" />
                <repeat name="parameter_set">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" value="--vae" />
                        <param name="dm" value="40" />
                        <param name="classifier" value="rf" />
                    </conditional>
                </repeat>
            </conditional>
            <output ftype="tabular" name="results" >
                <assert_contents>
                    <has_text text="VAE[40]_rf" />
                </assert_contents>
            </output>

        </test>

        <!-- classifier trained on the input features, no representation learning -->
        <test expect_num_outputs="1">
            <param name="features" value="UserDataExample.csv" />
            <conditional name="mode">
                <param name="mode_type" value="e_and_c" />
                <param name="class_labels" value="UserLabelExample.csv" />
                <repeat name="parameter_set">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" value="no_rl" />
                        <param name="classifier" value="rf" />
                    </conditional>
                </repeat>
            </conditional>
            <output ftype="tabular" name="results" >
                <assert_contents>
                    <has_text text="rf" />
                </assert_contents>
            </output>

        </test>

        <!-- multiple parameter sets: one result line per set -->
        <test expect_num_outputs="1">
            <param name="features" value="UserDataExample.csv" />
            <conditional name="mode">
                <param name="mode_type" value="e_and_c" />
                <param name="class_labels" value="UserLabelExample.csv" />

                <repeat name="parameter_set">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" value="--cae" />
                        <param name="dm" value="20" />
                        <param name="classifier" value="rf" />
                    </conditional>
                </repeat>

                <repeat name="parameter_set">
                    <conditional name="rl_type">
                        <param name="rl_type_choice" value="--vae" />
                        <param name="dm" value="40" />
                        <param name="classifier" value="mlp" />
                    </conditional>
                </repeat>

            </conditional>

            <output ftype="tabular" name="results" >
                <assert_contents>
                    <has_text text="CAE[20]_rf" />
                    <has_text text="VAE[40]_mlp" />
                </assert_contents>
            </output>

        </test>

    </tests>
    <help>
        <![CDATA[
DeepMicro is a deep representation learning framework exploiting various autoencoders 
to learn robust low-dimensional representations from high-dimensional data and training
classification models based on the learned representation.

======================================
Option 1) Only representation learning
====================================== 

The representation learning does not require class labels and can be learned from the features alone. 
The wrapper allows exploring multiple parameter sets (i.e. different modes to
generate the features, please refer to the publication for details); for each
added parameter set the encoded features are generated. Those features can then be passed to subsequent ML tools,
such as `Ensemble methods for classification and regression` or `Split Dataset into training and test subsets`.

=====================================================
Option 2)  Representation learning and classification
=====================================================

The tool itself can also evaluate the performance of the generated representation for different
classifiers internally using k-fold cross-validation (5 folds by default). The wrapper allows exploring multiple parameter sets and classifiers.
Each parameter run will be stored as a line in the result file. If this option is chosen, the latent representation is not
exported as output. To create the latent representation of the complete feature set, run the tool again
with the same parameters using the `Only representation learning` option.
The header of the result file is: 

'{Encoding}_{classifier}, AUC, ACC, Recall, Precision, F1_score, time-end, runtime(sec), classfication time(sec), best hyper-parameter'

The overall schema of the tool is shown in:

.. image:: ML_Workflow.png
        ]]>
    </help>
    <expand macro="citations" />
    <expand macro="creator" />
</tool>