Mercurial > repos > ufz > dfpl_train
diff dfpl_train.xml @ 0:e0bb949eac45 draft default tip
planemo upload for repository https://github.com/Helmholtz-UFZ/galaxy-tools/tree/main/tools/dfpl commit 66c6acfeff5441c36fba97787ddc5ee3d6a4a6ec
author | ufz |
---|---|
date | Thu, 19 Dec 2024 12:51:21 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dfpl_train.xml Thu Dec 19 12:51:21 2024 +0000
@@ -0,0 +1,311 @@
+<tool id="dfpl_train" name="deepFPlearn train" version="@TOOL_VERSION@+galaxy0" profile="23.0">
+    <description>model to predict association of molecular structures to biological targets</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <!--
+        The Galaxy inputs JSON is piped through json_flatten.py and json_train.py to build
+        config.json, which is then handed to "dfpl train". Each weights file is only copied
+        when the matching output is enabled by its filter (see the outputs section).
+        NOTE(review): assumes dfpl writes no weights for a disabled training step; confirm.
+    -->
+    <command detect_errors="exit_code"><![CDATA[
+        set -o pipefail;
+        cat '$inputs'
+        | python '$__tool_directory__/json_flatten.py'
+        | python '$__tool_directory__/json_train.py'
+        > config.json &&
+        mkdir -p 'autoencoder' &&
+        mkdir -p 'model' &&
+        dfpl train --configFile config.json
+        #if str($autoencoder.compressFeatures) == 'true'
+            #if str($autoencoder['train-autoencoder'].trainAC) == 'true'
+                && cp 'autoencoder/encoder_weights.h5' '$output_autoencoder_weights'
+            #end if
+        #end if
+        #if $training_configuration.trainFNN
+            && cp 'model/${model_configuration.target}/model_weights.h5' '$output_model_weights'
+        #end if
+    ]]></command>
+    <configfiles>
+        <inputs name="inputs" data_style="paths"/>
+    </configfiles>
+    <inputs>
+        <section name="model_configuration" title="Model Configuration" expanded="true">
+            <param label="Input File" argument="--inputFile"
+                   type="data" format="csv" optional="false"
+                   help="The file containing the data for training in comma separated CSV format. The first column should be smiles"/>
+            <param label="Target" name="target"
+                   type="text" optional="false"
+                   help="The target column in the input file that should be trained for">
+                <validator type="empty_field" message="A column name must be specified"/>
+            </param>
+            <param label="Chemical Representation" argument="--type"
+                   type="select" optional="true"
+                   help="Type of the chemical representation">
+                <option value="fp" selected="true">fp</option>
+                <option value="smiles">smiles</option>
+            </param>
+            <param label="Classification Threshold" argument="--threshold"
+                   type="float" min="0" max="1" value="0.5" optional="true"
+                   help="Threshold for binary classification"/>
+            <param label="Fingerprint Type"
+                   argument="--fpType"
+                   optional="true"
+                   type="select"
+                   help="The type of fingerprint to be generated/used in input file">
+                <option value="topological" selected="true">topological</option>
+                <option value="MACCS">MACCS</option>
+            </param>
+            <param label="Fingerprint Size" argument="--fpSize"
+                   type="integer" min="1" value="2048" optional="true"
+                   help="Length of the fingerprint that should be generated"/>
+            <param label="Multi-Label Classification" argument="--enableMultiLabel"
+                   type="boolean"
+                   checked="false"
+                   help="Train multi-label classification model"/>
+        </section>
+        <section name="training_configuration" title="Training Configuration" expanded="true">
+            <param argument="--split_type" type="select" optional="true" label="Split Type"
+                   help="Set how the data is split for the feedforward neural network">
+                <option value="scaffold_balanced">Scaffold Balanced</option>
+                <option value="random" selected="true">Random</option>
+                <option value="molecular_weight">Molecular Weight</option>
+            </param>
+            <param label="Test Size" argument="--testSize"
+                   type="float" min="0" max="1" value="0.2" optional="true"
+                   help="Fraction of the dataset that should be used for testing"/>
+            <param label="kFolds Cross-Validation" argument="--kFolds"
+                   type="integer" value="1" min="1" optional="true"
+                   help="Number of folds for cross-validation"/>
+            <param label="Train FNN" argument="--trainFNN"
+                   type="boolean" checked="true"
+                   help="Train the feedforward neural network (FNN). Disable to skip the FNN training"/>
+            <param label="Sample Down" argument="--sampleDown"
+                   type="boolean"
+                   help="Down sampling of the 0-valued samples"/>
+            <param label="Sample Fraction Ones" argument="--sampleFractionOnes"
+                   type="float" min="0" max="1" value="0.5" optional="true"
+                   help="The desired fraction of 1s/0s. Only works if --sampleDown is enabled"/>
+            <param label="Epochs" argument="--epochs"
+                   type="integer" min="10" value="100" optional="true"
+                   help="Number of epochs for the FNN training"/>
+            <param label="Loss Function" argument="--lossFunction"
+                   type="select" optional="true"
+                   help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy, focal - focal loss">
+                <option value="mse">MSE</option>
+                <option value="bce" selected="true">BCE</option>
+                <option value="focal">Focal</option>
+            </param>
+            <param label="Optimizer" argument="--optimizer"
+                   type="select" optional="true"
+                   help="Optimizer of the FNN">
+                <option value="Adam" selected="true">Adam</option>
+                <option value="SGD">SGD</option>
+            </param>
+            <param label="Batch Size" argument="--batchSize"
+                   type="integer" min="1" value="128" optional="true"
+                   help="Batch size in FNN training"/>
+            <param label="L2 Regularization" argument="--l2reg"
+                   type="float" min="0" value="0.001" optional="true"
+                   help="Value for l2 kernel regularizer"/>
+            <param label="Dropout" argument="--dropout"
+                   type="float" min="0" max="1" value="0.2" optional="true"
+                   help="The fraction of data that is dropped out in each dropout layer"/>
+            <param label="Learning Rate" argument="--learningRate"
+                   type="float" min="0" value="2.2e-05" optional="true"
+                   help="Learning rate size in FNN training"/>
+            <param label="Learning Rate Decay" argument="--learningRateDecay"
+                   type="float" min="0" max="1" value="0.96" optional="true"
+                   help="Learning rate decay in FNN training"/>
+            <param label="Activation Function" argument="--activationFunction"
+                   type="select" optional="true"
+                   help="The activation function of the FNN">
+                <option value="relu" selected="true">ReLU</option>
+                <option value="selu">SELU</option>
+            </param>
+        </section>
+        <conditional name="autoencoder">
+            <param label="Compress Fingerprints with Autoencoder" argument="--compressFeatures"
+                   type="select"
+                   help="Compress the fingerprints using an autoencoder.
+                   Either uses an already trained autoencoder (requires a weights file)
+                   or creates and trains a new autoencoder.">
+                <option value="true">Compress fingerprints</option>
+                <option value="false">Use raw fingerprints</option>
+            </param>
+            <when value="true">
+                <conditional name="train-autoencoder">
+                    <param label="Load / Train Autoencoder" argument="--trainAC"
+                           type="select"
+                           help="Select if a new autoencoder should be trained
+                           or if you want to provide the weights of a trained autoencoder yourself">
+                        <option value="true">Train new autoencoder</option>
+                        <option value="false">Load autoencoder from file</option>
+                    </param>
+                    <when value="false">
+                        <param label="Encoder Weights File" argument="--ecWeightsFile"
+                               type="data" format="h5"
+                               help="The HDF5 (.h5) weights file of a trained encoder"/>
+                    </when>
+                    <when value="true">
+                        <param label="Autoencoder Type" argument="--aeType"
+                               type="select" optional="true"
+                               help="Autoencoder type, variational or deterministic">
+                            <option value="variational">Variational</option>
+                            <option value="deterministic" selected="true">Deterministic</option>
+                        </param>
+                        <param label="Epochs" argument="--aeEpochs"
+                               type="integer" min="5" value="100" optional="true"
+                               help="Number of epochs for autoencoder training"/>
+                        <param label="Batch Size" argument="--aeBatchSize"
+                               type="integer" min="1" value="512" optional="true"
+                               help="Batch size in autoencoder training"/>
+                        <param label="Learning Rate" argument="--aeLearningRate"
+                               type="float" min="0" value="0.001" optional="true"
+                               help="Learning rate for autoencoder training"/>
+                        <param label="Learning Rate Decay" argument="--aeLearningRateDecay"
+                               type="float" value="0.96" min="0" max="1" optional="true"
+                               help="Learning rate decay for autoencoder training"/>
+                        <param label="Split Type" argument="--aeSplitType"
+                               type="select" optional="true"
+                               help="Set how the data is split for the autoencoder">
+                            <option value="scaffold_balanced">Scaffold Balanced</option>
+                            <option value="random" selected="true">Random</option>
+                            <option value="molecular_weight">Molecular Weight</option>
+                        </param>
+                        <param label="FNN Type" argument="--fnnType"
+                               type="select" optional="true"
+                               help="The type of the feedforward neural network">
+                            <option value="FNN" selected="true">FNN</option>
+                            <option value="SNN">SNN</option>
+                        </param>
+                        <param label="Fingerprint Size" argument="--encFPSize"
+                               type="integer" min="1" value="256" optional="true"
+                               help="Size of encoded fingerprint (z-layer of autoencoder)"/>
+                        <param label="Activation Function" argument="--aeActivationFunction"
+                               type="select" optional="true"
+                               help="The activation function of the autoencoder">
+                            <option value="relu" selected="true">ReLU</option>
+                            <option value="selu">SELU</option>
+                        </param>
+                        <param label="Visualize Latent Space" argument="--visualizeLatent"
+                               type="boolean" checked="false"
+                               help="UMAP the latent space for exploration"/>
+                    </when>
+                </conditional>
+            </when>
+            <when value="false"/>
+        </conditional>
+        <section title="Logging" name="logging_configuration" expanded="false">
+            <param label="Verbosity Level" argument="--verbose"
+                   type="select" optional="true"
+                   help="Verbosity level of output">
+                <option value="0">0: No additional output</option>
+                <option value="1">1: Some additional output</option>
+                <option value="2">2: Full additional output</option>
+            </param>
+<!--        <section name="tracking_configuration" title="Weights &amp; Biases" expanded="true">-->
+<!--            <param label="Target"-->
+<!--                   name="wabTarget" type="text" optional="true"-->
+<!--                   help="Which endpoint to use for tracking performance via Weights &amp; Biases. Should match the column name"/>-->
+<!--            <param label="Track FNN" name="wabTracking"-->
+<!--                   type="boolean"-->
+<!--                   help="Track FNN performance via Weights &amp; Biases"/>-->
+<!--            <param label="Track Autoencoder" name="aeWabTracking"-->
+<!--                   type="boolean"-->
+<!--                   help="Track autoencoder performance via Weights &amp; Biases"/>-->
+<!--        </section>-->
+        </section>
+    </inputs>
+    <outputs>
+        <!-- todo: filter -> let user decide if they want output svg/csv or nothing -->
+        <!-- <data name="loss_table" format="csv" from_work_dir="output/" label="${tool.name} on ${on_string}: csv">-->
+        <!-- </data>-->
+        <!-- <data name="loss_diagram" format="svg" from_work_dir="output/" label="${tool.name} on ${on_string}: svg">-->
+        <!-- </data>-->
+        <data name="output_model_weights" label="${tool.name} on ${on_string}: model weights"
+              format="h5">
+            <filter>training_configuration['trainFNN']</filter>
+        </data>
+        <data name="output_autoencoder_weights" label="${tool.name} on ${on_string}: autoencoder weights"
+              format="h5">
+            <filter>autoencoder['compressFeatures'] == 'true' and autoencoder['train-autoencoder']['trainAC'] == 'true'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <section name="model_configuration">
+                <param name="inputFile" value="S_dataset.csv"/>
+                <param name="target" value="Aromatase"/>
+                <param name="type" value="smiles"/>
+                <param name="fpType" value="topological"/>
+                <param name="fpSize" value="2048"/>
+                <param name="enableMultiLabel" value="false"/>
+                <param name="threshold" value="0.5"/>
+            </section>
+            <section name="training_configuration">
+                <param name="split_type" value="random"/>
+                <param name="sampleFractionOnes" value="0"/>
+                <param name="sampleDown" value="false"/>
+                <param name="trainFNN" value="true"/>
+                <param name="kFolds" value="1"/>
+                <param name="testSize" value="0.2"/>
+                <param name="optimizer" value="Adam"/>
+                <param name="lossFunction" value="bce"/>
+                <param name="epochs" value="10"/>
+                <param name="batchSize" value="128"/>
+                <param name="activationFunction" value="selu"/>
+                <param name="dropout" value="0.0107"/>
+                <param name="learningRate" value="2.2e-06"/>
+                <param name="l2reg" value="0.001"/>
+            </section>
+            <conditional name="autoencoder">
+                <param name="compressFeatures" value="true"/>
+                <conditional name="train-autoencoder">
+                    <param name="trainAC" value="true"/>
+                    <param name="encFPSize" value="256"/>
+                    <param name="aeSplitType" value="random"/>
+                    <param name="aeEpochs" value="5"/>
+                    <param name="aeBatchSize" value="351"/>
+                    <param name="aeActivationFunction" value="relu"/>
+                    <param name="aeLearningRate" value="0.001"/>
+                    <param name="aeLearningRateDecay" value="0.0001"/>
+                    <param name="aeType" value="deterministic"/>
+                    <param name="fnnType" value="FNN"/>
+                </conditional>
+            </conditional>
+            <!-- <param name="aeWabTracking" value="false"/>
+            <param name="wabTracking" value="false"/> -->
+            <section name="logging_configuration">
+                <param name="verbose" value="2"/>
+            </section>
+            <!-- todo: add tests for svg, csv -->
+            <output name="output_model_weights">
+                <assert_contents>
+                    <has_h5_keys keys="alpha_dropout_18,alpha_dropout_19,alpha_dropout_20,dense_30,dense_31,dense_32,dense_33,top_level_model_weights"/>
+                </assert_contents>
+            </output>
+            <assert_stdout>
+                <has_text text="Evaluating trained model"/>
+            </assert_stdout>
+        </test>
+    </tests>
+    <help><![CDATA[
+        This tool is the train mode of `DeepFPLearn <https://github.com/yigbt/deepFPlearn>`_.
+        It's equivalent to running ``dfpl train`` from the command line.
+
+        The train mode is used to train models to predict the association of molecular structures to biological targets.
+        The encoding of the molecules is done based on molecular fingerprints.
+
+        The input file may contain several target columns; one model is trained per run for the column given as *Target*.
+
+        The tool will generate the following outputs:
+
+        - the weights of the trained FNN (``.h5``), if FNN training is enabled
+
+        - the weights of the trained autoencoder (``.h5``), if a new autoencoder is trained
+    ]]></help>
+    <expand macro="citations"/>
+</tool>