Mercurial > repos > ufz > dfpl_train

<tool id="dfpl_train" name="deepFPlearn train" version="@TOOL_VERSION@+galaxy0" profile="23.0">
    <!-- Short description of what the tool does. -->
    <description>model to predict association of molecular structures to biological targets</description>
    <!-- Shared definitions are imported from macros.xml (not part of this file);
         the "requirements" macro expanded below is presumably defined there. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!-- Job script:
         1. feed the parameter dump ($inputs) through json_flatten.py and
            json_train.py and store the result as config.json,
         2. create the "autoencoder" and "model" working directories,
         3. run "dfpl train" on config.json,
         4. copy the encoder and model weight files to the two Galaxy outputs.
         NOTE(review): the copy of autoencoder/encoder_weights.h5 is unconditional;
         confirm that dfpl writes this file when compression is disabled or when
         encoder weights are loaded from a file. -->
    <command detect_errors="exit_code"><![CDATA[
    set -o pipefail;
    python '$__tool_directory__/json_flatten.py' < '$inputs'
       | python '$__tool_directory__/json_train.py'
       > config.json &&
    mkdir -p 'autoencoder' 'model' &&
    dfpl train --configFile config.json &&
    cp 'autoencoder/encoder_weights.h5' '$output_autoencoder_weights' &&
    cp 'model/${model_configuration.target}/model_weights.h5' '$output_model_weights'
    ]]></command>
    <!-- Parameter dump that the command section reads as $inputs and pipes into
         json_flatten.py. data_style="paths" presumably makes dataset parameters
         appear as file paths in that dump (see the Galaxy tool XML docs). -->
    <configfiles>
        <inputs name="inputs" data_style="paths"/>
    </configfiles>
    <inputs>
        <!-- Input table, target column and molecular representation used for training. -->
        <section name="model_configuration" title="Model Configuration" expanded="true">
            <param argument="--inputFile" type="data" format="csv" optional="false"
                   label="Input File"
                   help="The file containing the data for training in comma separated CSV format. The first column should be smiles"/>
            <param name="target" type="text" optional="false"
                   label="Target"
                   help="The target column in the input file that should be trained for">
                <!-- Validator that rejects an empty value. -->
                <validator type="empty_field" message="A column name must be specified"/>
            </param>
            <param argument="--type" type="select" optional="true"
                   label="Chemical Representation"
                   help="Type of the chemical representation">
                <option value="fp" selected="true">fp</option>
                <option value="smiles">smiles</option>
            </param>
            <param argument="--threshold" type="float" min="0" max="1" value="0.5" optional="true"
                   label="Classification Threshold"
                   help="Threshold for binary classification"/>
            <param argument="--fpType" type="select" optional="true"
                   label="Fingerprint Type"
                   help="The type of fingerprint to be generated/used in input file">
                <option value="topological" selected="true">topological</option>
                <option value="MACCS">MACCS</option>
            </param>
            <param argument="--fpSize" type="integer" min="1" value="2048" optional="true"
                   label="Fingerprint Size"
                   help="Length of the fingerprint that should be generated"/>
            <param argument="--enableMultiLabel" type="boolean" checked="false"
                   label="Multi-Label Classification"
                   help="Train multi-label classification model"/>
        </section>
        <!-- Hyper-parameters and data handling for the feedforward neural network (FNN).
             Labels and option texts follow the same capitalisation as the autoencoder
             parameters further down; option values are unchanged. -->
        <section name="training_configuration" title="Training Configuration" expanded="true">
            <param label="Split Type" argument="--split_type"
                   type="select" optional="true"
                   help="Set how the data is split for the feedforward neural network">
                <option value="scaffold_balanced">Scaffold Balanced</option>
                <option value="random" selected="true">Random</option>
                <option value="molecular_weight">Molecular Weight</option>
            </param>
            <param label="Test Size" argument="--testSize"
                   type="float" min="0" max="1" value="0.2" optional="true"
                   help="Fraction of the dataset that should be used for testing"/>
            <param label="kFolds Cross-Validation" argument="--kFolds"
                   type="integer" value="1" min="1" optional="true"
                   help="Number of folds for cross-validation"/>
            <!-- The checkbox is on by default, so the help describes the enabled state. -->
            <param label="Train FNN" argument="--trainFNN"
                   type="boolean" checked="true"
                   help="Train the feedforward neural network (FNN). Uncheck to deactivate the FNN training"/>
            <param label="Sample Down" argument="--sampleDown"
                   type="boolean"
                   help="Down sampling of the 0-valued samples"/>
            <param label="Sample Fraction Ones" argument="--sampleFractionOnes"
                   type="float" min="0" max="1" value="0.5" optional="true"
                   help="This is the desired fraction 1s/0s. Only works if --sampleDown is enabled"/>
            <param label="Epochs" argument="--epochs"
                   type="integer" min="10" value="100" optional="true"
                   help="Number of epochs for the FNN training"/>
            <param label="Loss Function" argument="--lossFunction"
                   type="select" optional="true"
                   help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy, focal - focal loss">
                <option value="mse">MSE</option>
                <option value="bce" selected="true">BCE</option>
                <option value="focal">Focal</option>
            </param>
            <param label="Optimizer" argument="--optimizer"
                   type="select" optional="true"
                   help="Optimizer of the FNN">
                <option value="Adam" selected="true">Adam</option>
                <option value="SGD">SGD</option>
            </param>
            <param label="Batch Size" argument="--batchSize"
                   type="integer" min="1" value="128" optional="true"
                   help="Batch size in FNN training"/>
            <param label="L2 Regularization" argument="--l2reg"
                   type="float" min="0" value="0.001" optional="true"
                   help="Value for l2 kernel regularizer"/>
            <param label="Dropout" argument="--dropout"
                   type="float" min="0" max="1" value="0.2" optional="true"
                   help="The fraction of data that is dropped out in each dropout layer"/>
            <param label="Learning Rate" argument="--learningRate"
                   type="float" min="0" value="2.2e-05" optional="true"
                   help="Learning rate size in FNN training"/>
            <param label="Learning Rate Decay" argument="--learningRateDecay"
                   type="float" min="0" max="1" value="0.96" optional="true"
                   help="Learning rate decay in FNN training"/>
            <param label="Activation Function" argument="--activationFunction"
                   type="select" optional="true"
                   help="The activation function of the FNN">
                <option value="relu" selected="true">ReLU</option>
                <option value="selu">SELU</option>
            </param>
        </section>
        <!-- Optional fingerprint compression. When enabled, the nested conditional
             chooses between training a new autoencoder and loading encoder weights.
             Help texts are kept on a single line so that no indentation whitespace
             ends up inside the attribute value. -->
        <conditional name="autoencoder">
            <param label="Compress Fingerprints with Autoencoder" argument="--compressFeatures"
                   type="select"
                   help="Compress the fingerprints using an autoencoder. Either uses an already trained autoencoder (requires a weights file) or creates and trains a new autoencoder.">
                <option value="true">Compress fingerprints</option>
                <option value="false">Use raw fingerprints</option>
            </param>
            <when value="true">
                <conditional name="train-autoencoder">
                    <param label="Load / Train Autoencoder" argument="--trainAC"
                           type="select"
                           help="Select if a new autoencoder should be trained or if you want to provide the weights of a trained autoencoder yourself">
                        <option value="true">Train new autoencoder</option>
                        <option value="false">Load autoencoder from file</option>
                    </param>
                    <when value="false">
                        <param label="Encoder Weights File" argument="--ecWeightsFile"
                               type="data" format="h5"
                               help="The .hdf5 file of a trained encoder"/>
                    </when>
                    <when value="true">
                        <param label="Autoencoder Type" argument="--aeType"
                               type="select" optional="true"
                               help="Autoencoder type, variational or deterministic">
                            <option value="variational">Variational</option>
                            <option value="deterministic" selected="true">Deterministic</option>
                        </param>
                        <param label="Epochs" argument="--aeEpochs"
                               type="integer" min="5" value="100" optional="true"
                               help="Number of epochs for autoencoder training"/>
                        <param label="Batch Size" argument="--aeBatchSize"
                               type="integer" min="1" value="512" optional="true"
                               help="Batch size in autoencoder training"/>
                        <param label="Learning Rate" argument="--aeLearningRate"
                               type="float" min="0" value="0.001" optional="true"
                               help="Learning rate for autoencoder training"/>
                        <param label="Learning Rate Decay" argument="--aeLearningRateDecay"
                               type="float" value="0.96" min="0" max="1" optional="true"
                               help="Learning rate decay for autoencoder training"/>
                        <param label="Split Type" argument="--aeSplitType"
                               type="select" optional="true"
                               help="Set how the data is split for the autoencoder">
                            <option value="scaffold_balanced">Scaffold Balanced</option>
                            <option value="random" selected="true">Random</option>
                            <option value="molecular_weight">Molecular Weight</option>
                        </param>
                        <!-- NOTE(review): this FNN option is only offered while a new
                             autoencoder is trained; confirm that this placement is intended. -->
                        <param label="FNN Type" argument="--fnnType"
                               type="select" optional="true"
                               help="The type of the feedforward neural network">
                            <option value="FNN" selected="true">FNN</option>
                            <option value="SNN">SNN</option>
                        </param>
                        <param label="Fingerprint Size" argument="--encFPSize"
                               type="integer" min="1" value="256" optional="true"
                               help="Size of encoded fingerprint (z-layer of autoencoder)"/>
                        <param label="Activation Function" argument="--aeActivationFunction"
                               type="select" optional="true"
                               help="The activation function of the autoencoder">
                            <option value="relu" selected="true">ReLU</option>
                            <option value="selu">SELU</option>
                        </param>
                        <param label="Visualize Latent Space" argument="--visualizeLatent"
                               type="boolean" checked="false"
                               help="UMAP the latent space for exploration"/>
                    </when>
                </conditional>
            </when>
            <when value="false"/>
        </conditional>
        <!-- Logging options; collapsed by default. The Weights and Biases tracking
             parameters below are currently disabled. -->
        <section name="logging_configuration" title="Logging" expanded="false">
            <param argument="--verbose" type="select" optional="true"
                   label="Verbosity Level"
                   help="Verbosity level of output">
                <option value="0">0: No additional output</option>
                <option value="1">1: Some additional output</option>
                <option value="2">2: Full additional output</option>
            </param>
<!--            <section name="tracking_configuration" title="Weights &amp; Biases" expanded="true">-->
<!--                <param label="Target"-->
<!--                       argument="&#45;&#45;wabTarget" type="text" optional="true"-->
<!--                       help="Which endpoint to use for tracking performance via Weights &amp; Biases. Should match the column name"/>-->
<!--                <param label="Track FNN" argument="&#45;&#45;wabTracking"-->
<!--                       type="boolean"-->
<!--                       help="Track FNN performance via Weights &amp; Biases"/>-->
<!--                <param label="Track Autoencoder" argument="&#45;&#45;aeWabTracking"-->
<!--                       type="boolean"-->
<!--                       help="Track autoencoder performance via Weights &amp; Biases"/>-->
<!--            </section>-->
        </section>
    </inputs>
    <!-- Two HDF5 datasets; the command section copies the weight files written
         by dfpl onto them. The csv/svg history outputs are not enabled yet. -->
    <outputs>
        <!--        todo: filter -> let user decide if they want output svg/csv or nothing -->
        <!--        <data name="loss_table" format="csv" from_work_dir="output/" label="${tool.name} on ${on_string}: csv">-->
        <!--        </data>-->
        <!--        <data name="loss_diagram" format="svg" from_work_dir="output/" label="${tool.name} on ${on_string}: svg">-->
        <!--        </data>-->
        <data name="output_model_weights" format="h5"
              label="${tool.name} on ${on_string}: model weights"/>
        <data name="output_autoencoder_weights" format="h5"
              label="${tool.name} on ${on_string}: autoencoder weights"/>
    </outputs>
    <tests>
        <!-- Single end-to-end run: trains a new autoencoder (5 epochs) and an FNN
             (10 epochs) on S_dataset.csv for the "Aromatase" target, then checks the
             HDF5 keys of the model weights and a marker line on stdout. -->
        <test>
            <section name="model_configuration">
                <param name="inputFile" value="S_dataset.csv"/>
                <param name="target" value="Aromatase"/>
                <param name="type" value="smiles"/>
                <param name="fpType" value="topological"/>
                <param name="fpSize" value="2048"/>
                <param name="enableMultiLabel" value="false"/>
                <param name="threshold" value="0.5"/>
            </section>
            <section name="training_configuration">
                <param name="split_type" value="random"/>
                <param name="sampleFractionOnes" value="0"/>
                <param name="sampleDown" value="false"/>
                <param name="trainFNN" value="true"/>
                <param name="kFolds" value="1"/>
                <param name="testSize" value="0.2"/>
                <param name="optimizer" value="Adam"/>
                <param name="lossFunction" value="bce"/>
                <param name="epochs" value="10"/>
                <param name="batchSize" value="128"/>
                <param name="activationFunction" value="selu"/>
                <param name="dropout" value="0.0107"/>
                <param name="learningRate" value="2.2e-06"/>
                <param name="l2reg" value="0.001"/>
            </section>
            <conditional name="autoencoder">
                <param name="compressFeatures" value="true"/>
                <conditional name="train-autoencoder">
                    <param name="trainAC" value="true"/>
                    <param name="encFPSize" value="256"/>
                    <param name="aeSplitType" value="random"/>
                    <param name="aeEpochs" value="5"/>
                    <param name="aeBatchSize" value="351"/>
                    <param name="aeActivationFunction" value="relu"/>
                    <param name="aeLearningRate" value="0.001"/>
                    <param name="aeLearningRateDecay" value="0.0001"/>
                    <param name="aeType" value="deterministic"/>
                    <param name="fnnType" value="FNN"/>
                </conditional>
            </conditional>
            <!-- <param name="aeWabTracking" value="false"/>
            <param name="wabTracking" value="false"/> -->
            <section name="logging_configuration">
                <param name="verbose" value="2"/>
            </section>
            <!--            todo: add tests for svg, csv -->
            <output name="output_model_weights">
                <assert_contents>
                    <has_h5_keys keys="alpha_dropout_18,alpha_dropout_19,alpha_dropout_20,dense_30,dense_31,dense_32,dense_33,top_level_model_weights"/>
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text text="Evaluating trained model"/>
            </assert_stdout>
        </test>
    </tests>
    <!-- User-facing help (reStructuredText). The output list mirrors the two
         datasets declared in the outputs section of this tool. -->
    <help><![CDATA[
    This tool is the train mode of `DeepFPLearn <https://github.com/yigbt/deepFPlearn>`_.
    It's equivalent to running ``dfpl train`` from the command line.

    The train mode is used to train models to predict the association of molecular structures to biological targets.
    The encoding of the molecules is done based on molecular fingerprints.

    The training data contains three targets and you may train models for each with this tool.
    Select the column to train on with the **Target** parameter.

    The tool will generate the following outputs:

    - the weights of the trained FNN model (``.h5``)

    - the weights of the trained autoencoder (``.h5``)

    The training histories (``.csv`` table and ``.svg`` plot) are not yet available as Galaxy outputs.
    ]]></help>
    <!-- Citation entries come from the "citations" macro, presumably defined in the imported macros.xml. -->
    <expand macro="citations"/>
</tool>