Mercurial > repos > ufz > dfpl_train

diff dfpl_train.xml @ 0:e0bb949eac45 draft default tip
planemo upload for repository https://github.com/Helmholtz-UFZ/galaxy-tools/tree/main/tools/dfpl commit 66c6acfeff5441c36fba97787ddc5ee3d6a4a6ec
author: ufz
date: Thu, 19 Dec 2024 12:51:21 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dfpl_train.xml	Thu Dec 19 12:51:21 2024 +0000
@@ -0,0 +1,301 @@
+<tool id="dfpl_train" name="deepFPlearn train" version="@TOOL_VERSION@+galaxy0" profile="23.0">
+    <description>model to predict association of molecular structures to biological targets</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <!--
+        Builds config.json from the Galaxy parameter dump, runs "dfpl train" on it and
+        copies the resulting weight files to the Galaxy output datasets.
+    -->
+    <command detect_errors="exit_code"><![CDATA[
+    ## Convert the parameter JSON written by the inputs configfile into the
+    ## config file passed to dfpl, using the two helper scripts shipped with the tool.
+    set -o pipefail;
+    cat '$inputs'
+       | python '$__tool_directory__/json_flatten.py'
+       | python '$__tool_directory__/json_train.py'
+       > config.json &&
+    mkdir -p 'autoencoder' &&
+    mkdir -p 'model' &&
+    dfpl train --configFile config.json &&
+    ## The encoder weights file may be missing (presumably when raw fingerprints are used
+    ## or no new autoencoder is trained), so only copy it when it exists instead of
+    ## failing the whole job after a successful FNN training.
+    if [ -f 'autoencoder/encoder_weights.h5' ]; then
+        cp 'autoencoder/encoder_weights.h5' '$output_autoencoder_weights';
+    fi &&
+    cp 'model/${model_configuration.target}/model_weights.h5' '$output_model_weights'
+    ]]></command>
+    <configfiles>
+        <!-- Dumps the tool parameters as JSON; data_style="paths" makes datasets appear as file paths. -->
+        <inputs name="inputs" data_style="paths"/>
+    </configfiles>
+    <inputs>
+        <section name="model_configuration" title="Model Configuration" expanded="true">
+            <!-- Training data, target column and the molecular representation used by the model. -->
+            <param argument="--inputFile" type="data" format="csv" optional="false"
+                   label="Input File"
+                   help="The file containing the data for training in comma separated CSV format. The first column should be smiles"/>
+            <param name="target" type="text" optional="false"
+                   label="Target"
+                   help="The target column in the input file that should be trained for">
+                <validator type="empty_field" message="A column name must be specified"/>
+            </param>
+            <param argument="--type" type="select" optional="true"
+                   label="Chemical Representation"
+                   help="Type of the chemical representation">
+                <option value="fp" selected="true">fp</option>
+                <option value="smiles">smiles</option>
+            </param>
+            <param argument="--threshold" type="float" min="0" max="1" value="0.5" optional="true"
+                   label="Classification Threshold"
+                   help="Threshold for binary classification"/>
+            <param argument="--fpType" type="select" optional="true"
+                   label="Fingerprint Type"
+                   help="The type of fingerprint to be generated/used in input file">
+                <option value="topological" selected="true">topological</option>
+                <option value="MACCS">MACCS</option>
+            </param>
+            <param argument="--fpSize" type="integer" min="1" value="2048" optional="true"
+                   label="Fingerprint Size"
+                   help="Length of the fingerprint that should be generated"/>
+            <param argument="--enableMultiLabel" type="boolean" checked="false"
+                   label="Multi-Label Classification"
+                   help="Train multi-label classification model"/>
+        </section>
+        <section name="training_configuration" title="Training Configuration" expanded="true">
+            <!-- Data splitting, sampling and hyperparameters of the feedforward neural network (FNN). -->
+            <param argument="--split_type" type="select" optional="true" label="Split Type"
+                   help="Set how the data is split for the feedforward neural network">
+                <option value="scaffold_balanced">Scaffold Balanced</option>
+                <option value="random" selected="true">Random</option>
+                <option value="molecular_weight">Molecular Weight</option>
+            </param>
+            <param label="Test Size" argument="--testSize"
+                   type="float" min="0" max="1" value="0.2" optional="true"
+                   help="Fraction of the dataset that should be used for testing"/>
+            <param label="kFolds Cross-Validation" argument="--kFolds"
+                   type="integer" value="1" min="1" optional="true"
+                   help="Number of folds for cross-validation"/>
+            <!-- Checked by default: the checkbox enables the FNN training, it does not disable it. -->
+            <param label="Train FNN" argument="--trainFNN"
+                   type="boolean" checked="true"
+                   help="Train the feedforward neural network (FNN). Uncheck to skip the FNN training"/>
+            <param label="Sample Down" argument="--sampleDown"
+                   type="boolean"
+                   help="Down sampling of the 0-valued samples"/>
+            <param label="Sample Fraction Ones" argument="--sampleFractionOnes"
+                   type="float" min="0" max="1" value="0.5" optional="true"
+                   help="The desired fraction of 1s to 0s. Only works if Sample Down (--sampleDown) is enabled"/>
+            <param label="Epochs" argument="--epochs"
+                   type="integer" min="10" value="100" optional="true"
+                   help="Number of epochs for the FNN training"/>
+            <param label="Loss Function" argument="--lossFunction"
+                   type="select" optional="true"
+                   help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy, focal - focal loss">
+                <option value="mse">MSE</option>
+                <option value="bce" selected="true">BCE</option>
+                <option value="focal">Focal</option>
+            </param>
+            <param label="Optimizer" argument="--optimizer"
+                   type="select" optional="true"
+                   help="Optimizer of the FNN">
+                <option value="Adam" selected="true">Adam</option>
+                <option value="SGD">SGD</option>
+            </param>
+            <param label="Batch Size" argument="--batchSize"
+                   type="integer" min="1" value="128" optional="true"
+                   help="Batch size in FNN training"/>
+            <param label="L2 Regularization" argument="--l2reg"
+                   type="float" min="0" value="0.001" optional="true"
+                   help="Value for l2 kernel regularizer"/>
+            <param label="Dropout" argument="--dropout"
+                   type="float" min="0" max="1" value="0.2" optional="true"
+                   help="The fraction of data that is dropped out in each dropout layer"/>
+            <param label="Learning Rate" argument="--learningRate"
+                   type="float" min="0" value="2.2e-05" optional="true"
+                   help="Learning rate size in FNN training"/>
+            <param label="Learning Rate Decay" argument="--learningRateDecay"
+                   type="float" min="0" max="1" value="0.96" optional="true"
+                   help="Learning rate decay in FNN training"/>
+            <param label="Activation Function" argument="--activationFunction"
+                   type="select" optional="true"
+                   help="The activation function of the FNN">
+                <option value="relu" selected="true">ReLU</option>
+                <option value="selu">SELU</option>
+            </param>
+        </section>
+        <!--
+            Fingerprint compression: either raw fingerprints are used, or an autoencoder
+            compresses them. In the latter case the encoder is either loaded from a
+            weights file or trained from scratch with the options below.
+        -->
+        <conditional name="autoencoder">
+            <!-- Help texts are kept on one line: line breaks inside an attribute value
+                 would end up as runs of spaces in the displayed text. -->
+            <param label="Compress Fingerprints with Autoencoder" argument="--compressFeatures"
+                   type="select"
+                   help="Compress the fingerprints using an autoencoder. Either uses an already trained autoencoder (requires a weights file) or creates and trains a new autoencoder.">
+                <option value="true">Compress fingerprints</option>
+                <option value="false">Use raw fingerprints</option>
+            </param>
+            <when value="true">
+                <conditional name="train-autoencoder">
+                    <param label="Load / Train Autoencoder" argument="--trainAC"
+                           type="select"
+                           help="Select if a new autoencoder should be trained or if you want to provide the weights of a trained autoencoder yourself">
+                        <option value="true">Train new autoencoder</option>
+                        <option value="false">Load autoencoder from file</option>
+                    </param>
+                    <when value="false">
+                        <param label="Encoder Weights File" argument="--ecWeightsFile"
+                               type="data" format="h5"
+                               help="The .hdf5 file of a trained encoder"/>
+                    </when>
+                    <when value="true">
+                        <param label="Autoencoder Type" argument="--aeType"
+                               type="select" optional="true"
+                               help="Autoencoder type, variational or deterministic">
+                            <option value="variational">Variational</option>
+                            <option value="deterministic" selected="true">Deterministic</option>
+                        </param>
+                        <param label="Epochs" argument="--aeEpochs"
+                               type="integer" min="5" value="100" optional="true"
+                               help="Number of epochs for autoencoder training"/>
+                        <param label="Batch Size" argument="--aeBatchSize"
+                               type="integer" min="1" value="512" optional="true"
+                               help="Batch size in autoencoder training"/>
+                        <param label="Learning Rate" argument="--aeLearningRate"
+                               type="float" min="0" value="0.001" optional="true"
+                               help="Learning rate for autoencoder training"/>
+                        <param label="Learning Rate Decay" argument="--aeLearningRateDecay"
+                               type="float" value="0.96" min="0" max="1" optional="true"
+                               help="Learning rate decay for autoencoder training"/>
+                        <param label="Split Type" argument="--aeSplitType"
+                               type="select" optional="true"
+                               help="Set how the data is split for the autoencoder">
+                            <option value="scaffold_balanced">Scaffold Balanced</option>
+                            <option value="random" selected="true">Random</option>
+                            <option value="molecular_weight">Molecular Weight</option>
+                        </param>
+                        <!-- NOTE(review): this option describes the FNN but is only offered when a
+                             new autoencoder is trained; confirm that this placement is intended. -->
+                        <param label="FNN Type" argument="--fnnType"
+                               type="select" optional="true"
+                               help="The type of the feedforward neural network">
+                            <option value="FNN" selected="true">FNN</option>
+                            <option value="SNN">SNN</option>
+                        </param>
+                        <param label="Fingerprint Size" argument="--encFPSize"
+                               type="integer" min="1" value="256" optional="true"
+                               help="Size of encoded fingerprint (z-layer of autoencoder)"/>
+                        <param label="Activation Function" argument="--aeActivationFunction"
+                               type="select" optional="true"
+                               help="The activation function of the autoencoder">
+                            <option value="relu" selected="true">ReLU</option>
+                            <option value="selu">SELU</option>
+                        </param>
+                        <param label="Visualize Latent Space" argument="--visualizeLatent"
+                               type="boolean" checked="false"
+                               help="UMAP the latent space for exploration"/>
+                    </when>
+                </conditional>
+            </when>
+            <when value="false"/>
+        </conditional>
+        <section name="logging_configuration" title="Logging" expanded="false">
+            <!-- Amount of log output requested from the training run. -->
+            <param argument="--verbose" type="select" optional="true"
+                   label="Verbosity Level"
+                   help="Verbosity level of output">
+                <option value="0">0: No additional output</option>
+                <option value="1">1: Some additional output</option>
+                <option value="2">2: Full additional output</option>
+            </param>
+            <!-- Currently disabled: Weights &amp; Biases tracking options.
+            <section name="tracking_configuration" title="Weights &amp; Biases" expanded="true">
+                <param label="Target"
+                       argument="&#45;&#45;wabTarget" type="text" optional="true"
+                       help="Which endpoint to use for tracking performance via Weights &amp; Biases. Should match the column name"/>
+                <param label="Track FNN" argument="&#45;&#45;wabTracking"
+                       type="boolean"
+                       help="Track FNN performance via Weights &amp; Biases"/>
+                <param label="Track Autoencoder" argument="&#45;&#45;aeWabTracking"
+                       type="boolean"
+                       help="Track autoencoder performance via Weights &amp; Biases"/>
+            </section>
+            -->
+        </section>
+    </inputs>
+    <outputs>
+        <!-- TODO: add a filter so the user can decide whether the svg/csv outputs are produced or not. -->
+        <!-- Currently disabled csv/svg outputs:
+        <data name="loss_table" format="csv" from_work_dir="output/" label="${tool.name} on ${on_string}: csv">
+        </data>
+        <data name="loss_diagram" format="svg" from_work_dir="output/" label="${tool.name} on ${on_string}: svg">
+        </data>
+        -->
+        <!-- Weight files copied from the working directory by the command block. -->
+        <data name="output_model_weights" format="h5"
+              label="${tool.name} on ${on_string}: model weights"/>
+        <data name="output_autoencoder_weights" format="h5"
+              label="${tool.name} on ${on_string}: autoencoder weights"/>
+    </outputs>
+    <tests>
+        <!-- Trains a new autoencoder (5 epochs) and an FNN (10 epochs) for the Aromatase column of S_dataset.csv. -->
+        <test>
+            <section name="model_configuration">
+                <param name="inputFile" value="S_dataset.csv"/>
+                <param name="target" value="Aromatase"/>
+                <param name="type" value="smiles"/>
+                <param name="threshold" value="0.5"/>
+                <param name="fpType" value="topological"/>
+                <param name="fpSize" value="2048"/>
+                <param name="enableMultiLabel" value="false"/>
+            </section>
+            <section name="training_configuration">
+                <param name="split_type" value="random"/>
+                <param name="testSize" value="0.2"/>
+                <param name="kFolds" value="1"/>
+                <param name="trainFNN" value="true"/>
+                <param name="sampleDown" value="false"/>
+                <param name="sampleFractionOnes" value="0"/>
+                <param name="epochs" value="10"/>
+                <param name="lossFunction" value="bce"/>
+                <param name="optimizer" value="Adam"/>
+                <param name="batchSize" value="128"/>
+                <param name="l2reg" value="0.001"/>
+                <param name="dropout" value="0.0107"/>
+                <param name="learningRate" value="2.2e-06"/>
+                <param name="activationFunction" value="selu"/>
+            </section>
+            <conditional name="autoencoder">
+                <param name="compressFeatures" value="true"/>
+                <conditional name="train-autoencoder">
+                    <param name="trainAC" value="true"/>
+                    <param name="aeType" value="deterministic"/>
+                    <param name="aeEpochs" value="5"/>
+                    <param name="aeBatchSize" value="351"/>
+                    <param name="aeLearningRate" value="0.001"/>
+                    <param name="aeLearningRateDecay" value="0.0001"/>
+                    <param name="aeSplitType" value="random"/>
+                    <param name="fnnType" value="FNN"/>
+                    <param name="encFPSize" value="256"/>
+                    <param name="aeActivationFunction" value="relu"/>
+                </conditional>
+            </conditional>
+            <!-- <param name="aeWabTracking" value="false"/>
+            <param name="wabTracking" value="false"/> -->
+            <section name="logging_configuration">
+                <param name="verbose" value="2"/>
+            </section>
+            <!-- TODO: add tests for svg, csv -->
+            <!-- The model weights file must contain the expected layer groups. -->
+            <output name="output_model_weights">
+                <assert_contents>
+                    <has_h5_keys keys="alpha_dropout_18,alpha_dropout_19,alpha_dropout_20,dense_30,dense_31,dense_32,dense_33,top_level_model_weights"/>
+                </assert_contents>
+            </output>
+            <assert_stdout>
+                <has_text text="Evaluating trained model"/>
+            </assert_stdout>
+        </test>
+    </tests>
+    <help><![CDATA[
+    This tool is the train mode of `DeepFPLearn <https://github.com/yigbt/deepFPlearn>`_.
+    It's equivalent to running ``dfpl train`` from the command line.
+
+    The train mode is used to train models to predict the association of molecular structures to biological targets.
+    The encoding of the molecules is done based on molecular fingerprints.
+
+    If the training data contains several targets, you may train a model for each of them by running this tool once per target column.
+
+    The tool will generate the following outputs:
+
+    - the weights of the trained FNN (``.h5``), if selected
+
+    - the weights of the trained autoencoder (``.h5``), if selected
+
+    The training histories as tabular data (``.csv``) and as a plot (``.svg``) are not yet provided as outputs.
+    ]]></help>
+    <expand macro="citations"/>
+</tool>