comparison dfpl_train.xml @ 0:e0bb949eac45 draft default tip

planemo upload for repository https://github.com/Helmholtz-UFZ/galaxy-tools/tree/main/tools/dfpl commit 66c6acfeff5441c36fba97787ddc5ee3d6a4a6ec
author ufz
date Thu, 19 Dec 2024 12:51:21 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0bb949eac45
1 <tool id="dfpl_train" name="deepFPlearn train" version="@TOOL_VERSION@+galaxy0" profile="23.0">
2 <description>model to predict association of molecular structures to biological targets</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements"/>
<command detect_errors="exit_code"><![CDATA[
    ## Build config.json by piping the Galaxy parameter dump (the "inputs"
    ## configfile) through the two helper scripts shipped with the tool.
    set -o pipefail;
    cat '$inputs'
        | python '$__tool_directory__/json_flatten.py'
        | python '$__tool_directory__/json_train.py'
        > config.json &&
    mkdir -p 'autoencoder' &&
    mkdir -p 'model' &&
    dfpl train --configFile config.json &&
    ## NOTE(review): encoder weights are presumably only written when this run
    ## trains a new autoencoder (not with raw fingerprints or a loaded encoder).
    ## Copy them only if present so those configurations do not fail the job.
    if [ -f 'autoencoder/encoder_weights.h5' ]; then
        cp 'autoencoder/encoder_weights.h5' '$output_autoencoder_weights';
    fi
    ## Model weights are only collected when FNN training is enabled; a missing
    ## file still fails the job in that case.
    #if $training_configuration.trainFNN
        && cp 'model/${model_configuration.target}/model_weights.h5' '$output_model_weights'
    #end if
]]></command>
<configfiles>
    <!-- Parameter dump that the command section reads through $inputs;
         data_style="paths" asks Galaxy to write datasets as file paths. -->
    <inputs data_style="paths" name="inputs"/>
</configfiles>
22 <inputs>
<!-- Input data, target column and molecule representation settings. -->
<section name="model_configuration" title="Model Configuration" expanded="true">
    <param argument="--inputFile" type="data" format="csv" optional="false"
           label="Input File"
           help="The file containing the data for training in comma separated CSV format. The first column should be smiles"/>
    <!-- The target name is also used by the command section to locate the model weights directory. -->
    <param name="target" type="text" optional="false"
           label="Target"
           help="The target column in the input file that should be trained for">
        <validator type="empty_field" message="A column name must be specified"/>
    </param>
    <param argument="--type" type="select" optional="true"
           label="Chemical Representation"
           help="Type of the chemical representation">
        <option value="fp" selected="true">fp</option>
        <option value="smiles">smiles</option>
    </param>
    <param argument="--threshold" type="float" min="0" max="1" value="0.5" optional="true"
           label="Classification Threshold"
           help="Threshold for binary classification"/>
    <param argument="--fpType" type="select" optional="true"
           label="Fingerprint Type"
           help="The type of fingerprint to be generated/used in input file">
        <option value="topological" selected="true">topological</option>
        <option value="MACCS">MACCS</option>
    </param>
    <param argument="--fpSize" type="integer" min="1" value="2048" optional="true"
           label="Fingerprint Size"
           help="Length of the fingerprint that should be generated"/>
    <param argument="--enableMultiLabel" type="boolean" checked="false"
           label="Multi-Label Classification"
           help="Train multi-label classification model"/>
</section>
<!-- Settings for splitting the data and training the feedforward neural network (FNN).
     Labels follow the conventions used by the autoencoder parameters of this tool;
     option values are unchanged. -->
<section name="training_configuration" title="Training Configuration" expanded="true">
    <param label="Split Type" argument="--split_type"
           type="select" optional="true"
           help="Set how the data is split for the feedforward neural network">
        <option value="scaffold_balanced">Scaffold Balanced</option>
        <option value="random" selected="true">Random</option>
        <option value="molecular_weight">Molecular Weight</option>
    </param>
    <param label="Test Size" argument="--testSize"
           type="float" min="0" max="1" value="0.2" optional="true"
           help="Fraction of the dataset that should be used for testing"/>
    <param label="kFolds Cross-Validation" argument="--kFolds"
           type="integer" value="1" min="1" optional="true"
           help="Number of folds for cross-validation"/>
    <!-- Checked by default: the help describes the checked state, not the underlying CLI switch. -->
    <param label="Train FNN" argument="--trainFNN"
           type="boolean" checked="true"
           help="Train the feedforward neural network (FNN). Uncheck to skip the FNN training"/>
    <param label="Sample Down" argument="--sampleDown"
           type="boolean"
           help="Down sampling of the 0-valued samples"/>
    <param label="Sample Fraction Ones" argument="--sampleFractionOnes"
           type="float" min="0" max="1" value="0.5" optional="true"
           help="The desired fraction of 1s/0s. Only works if --sampleDown is enabled"/>
    <param label="Epochs" argument="--epochs"
           type="integer" min="10" value="100" optional="true"
           help="Number of epochs for the FNN training"/>
    <param label="Loss Function" argument="--lossFunction"
           type="select" optional="true"
           help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy, focal - focal loss">
        <option value="mse">MSE</option>
        <option value="bce" selected="true">BCE</option>
        <option value="focal">Focal</option>
    </param>
    <param label="Optimizer" argument="--optimizer"
           type="select" optional="true"
           help="Optimizer of the FNN">
        <option value="Adam" selected="true">Adam</option>
        <option value="SGD">SGD</option>
    </param>
    <param label="Batch Size" argument="--batchSize"
           type="integer" min="1" value="128" optional="true"
           help="Batch size in FNN training"/>
    <param label="L2 Regularization" argument="--l2reg"
           type="float" min="0" value="0.001" optional="true"
           help="Value for l2 kernel regularizer"/>
    <param label="Dropout" argument="--dropout"
           type="float" min="0" max="1" value="0.2" optional="true"
           help="The fraction of data that is dropped out in each dropout layer"/>
    <param label="Learning Rate" argument="--learningRate"
           type="float" min="0" value="2.2e-05" optional="true"
           help="Learning rate size in FNN training"/>
    <param label="Learning Rate Decay" argument="--learningRateDecay"
           type="float" min="0" max="1" value="0.96" optional="true"
           help="Learning rate decay in FNN training"/>
    <param label="Activation Function" argument="--activationFunction"
           type="select" optional="true"
           help="The activation function of the FNN">
        <option value="relu" selected="true">ReLU</option>
        <option value="selu">SELU</option>
    </param>
</section>
<!-- Optional fingerprint compression. The outer switch enables the autoencoder;
     the nested switch chooses between training a new one and loading encoder weights. -->
<conditional name="autoencoder">
    <param argument="--compressFeatures" type="select"
           label="Compress Fingerprints with Autoencoder"
           help="Compress the fingerprints using an autoencoder. Either uses an already trained autoencoder (requires a weights file) or creates and trains a new autoencoder.">
        <option value="true">Compress fingerprints</option>
        <option value="false">Use raw fingerprints</option>
    </param>
    <when value="true">
        <conditional name="train-autoencoder">
            <param argument="--trainAC" type="select"
                   label="Load / Train Autoencoder"
                   help="Select if a new autoencoder should be trained or if you want to provide the weights of a trained autoencoder yourself">
                <option value="true">Train new autoencoder</option>
                <option value="false">Load autoencoder from file</option>
            </param>
            <!-- Existing encoder: only the weights file is requested. -->
            <when value="false">
                <param argument="--ecWeightsFile" type="data" format="h5"
                       label="Encoder Weights File"
                       help="The .hdf5 file of a trained encoder"/>
            </when>
            <!-- New autoencoder: expose its training hyperparameters. -->
            <when value="true">
                <param argument="--aeType" type="select" optional="true"
                       label="Autoencoder Type"
                       help="Autoencoder type, variational or deterministic">
                    <option value="variational">Variational</option>
                    <option value="deterministic" selected="true">Deterministic</option>
                </param>
                <param argument="--aeEpochs" type="integer" min="5" value="100" optional="true"
                       label="Epochs"
                       help="Number of epochs for autoencoder training"/>
                <param argument="--aeBatchSize" type="integer" min="1" value="512" optional="true"
                       label="Batch Size"
                       help="Batch size in autoencoder training"/>
                <param argument="--aeLearningRate" type="float" min="0" value="0.001" optional="true"
                       label="Learning Rate"
                       help="Learning rate for autoencoder training"/>
                <param argument="--aeLearningRateDecay" type="float" min="0" max="1" value="0.96" optional="true"
                       label="Learning Rate Decay"
                       help="Learning rate decay for autoencoder training"/>
                <param argument="--aeSplitType" type="select" optional="true"
                       label="Split Type"
                       help="Set how the data is split for the autoencoder">
                    <option value="scaffold_balanced">Scaffold Balanced</option>
                    <option value="random" selected="true">Random</option>
                    <option value="molecular_weight">Molecular Weight</option>
                </param>
                <!-- NOTE(review): this selects the feedforward network type but is only offered
                     when a new autoencoder is trained; confirm the placement is intended. -->
                <param argument="--fnnType" type="select" optional="true"
                       label="FNN Type"
                       help="The type of the feedforward neural network">
                    <option value="FNN" selected="true">FNN</option>
                    <option value="SNN">SNN</option>
                </param>
                <param argument="--encFPSize" type="integer" min="1" value="256" optional="true"
                       label="Fingerprint Size"
                       help="Size of encoded fingerprint (z-layer of autoencoder)"/>
                <param argument="--aeActivationFunction" type="select" optional="true"
                       label="Activation Function"
                       help="The activation function of the autoencoder">
                    <option value="relu" selected="true">ReLU</option>
                    <option value="selu">SELU</option>
                </param>
                <param argument="--visualizeLatent" type="boolean" checked="false"
                       label="Visualize Latent Space"
                       help="UMAP the latent space for exploration"/>
            </when>
        </conditional>
    </when>
    <!-- Raw fingerprints: no further options. -->
    <when value="false"/>
</conditional>
<!-- Logging options; collapsed by default. The Weights &amp; Biases tracking
     section below is kept commented out. -->
<section name="logging_configuration" title="Logging" expanded="false">
    <param argument="--verbose" type="select" optional="true"
           label="Verbosity Level"
           help="Verbosity level of output">
        <option value="0">0: No additional output</option>
        <option value="1">1: Some additional output</option>
        <option value="2">2: Full additional output</option>
    </param>
    <!-- <section name="tracking_configuration" title="Weights &amp; Biases" expanded="true">-->
    <!--     <param label="Target"-->
    <!--            argument="&#45;&#45;wabTarget" type="text" optional="true"-->
    <!--            help="Which endpoint to use for tracking performance via Weights &amp; Biases. Should match the column name"/>-->
    <!--     <param label="Track FNN" argument="&#45;&#45;wabTracking"-->
    <!--            type="boolean"-->
    <!--            help="Track FNN performance via Weights &amp; Biases"/>-->
    <!--     <param label="Track Autoencoder" argument="&#45;&#45;aeWabTracking"-->
    <!--            type="boolean"-->
    <!--            help="Track autoencoder performance via Weights &amp; Biases"/>-->
    <!-- </section>-->
</section>
209 </inputs>
<!-- Two HDF5 datasets, both filled by the cp steps of the command section. -->
<outputs>
    <!-- todo: filter -> let user decide if they want output svg/csv or nothing -->
    <!-- <data name="loss_table" format="csv" from_work_dir="output/" label="${tool.name} on ${on_string}: csv">-->
    <!-- </data>-->
    <!-- <data name="loss_diagram" format="svg" from_work_dir="output/" label="${tool.name} on ${on_string}: svg">-->
    <!-- </data>-->
    <data name="output_model_weights" format="h5"
          label="${tool.name} on ${on_string}: model weights"/>
    <data name="output_autoencoder_weights" format="h5"
          label="${tool.name} on ${on_string}: autoencoder weights"/>
</outputs>
<tests>
    <!-- Trains a new autoencoder and an FNN on S_dataset.csv for the Aromatase target,
         then checks the layer keys of the model weights and the tool's stdout. -->
    <test>
        <section name="model_configuration">
            <param name="inputFile" value="S_dataset.csv"/>
            <param name="target" value="Aromatase"/>
            <param name="type" value="smiles"/>
            <param name="threshold" value="0.5"/>
            <param name="fpType" value="topological"/>
            <param name="fpSize" value="2048"/>
            <param name="enableMultiLabel" value="false"/>
        </section>
        <section name="training_configuration">
            <param name="split_type" value="random"/>
            <param name="testSize" value="0.2"/>
            <param name="kFolds" value="1"/>
            <param name="trainFNN" value="true"/>
            <param name="sampleDown" value="false"/>
            <param name="sampleFractionOnes" value="0"/>
            <param name="epochs" value="10"/>
            <param name="lossFunction" value="bce"/>
            <param name="optimizer" value="Adam"/>
            <param name="batchSize" value="128"/>
            <param name="l2reg" value="0.001"/>
            <param name="dropout" value="0.0107"/>
            <param name="learningRate" value="2.2e-06"/>
            <param name="activationFunction" value="selu"/>
        </section>
        <conditional name="autoencoder">
            <param name="compressFeatures" value="true"/>
            <conditional name="train-autoencoder">
                <param name="trainAC" value="true"/>
                <param name="aeType" value="deterministic"/>
                <param name="aeEpochs" value="5"/>
                <param name="aeBatchSize" value="351"/>
                <param name="aeLearningRate" value="0.001"/>
                <param name="aeLearningRateDecay" value="0.0001"/>
                <param name="aeSplitType" value="random"/>
                <param name="fnnType" value="FNN"/>
                <param name="encFPSize" value="256"/>
                <param name="aeActivationFunction" value="relu"/>
            </conditional>
        </conditional>
        <!-- <param name="aeWabTracking" value="false"/>
             <param name="wabTracking" value="false"/> -->
        <section name="logging_configuration">
            <param name="verbose" value="2"/>
        </section>
        <!-- todo: add tests for svg, csv -->
        <output name="output_model_weights">
            <assert_contents>
                <has_h5_keys keys="alpha_dropout_18,alpha_dropout_19,alpha_dropout_20,dense_30,dense_31,dense_32,dense_33,top_level_model_weights"/>
            </assert_contents>
        </output>
        <assert_stdout>
            <has_text text="Evaluating trained model"/>
        </assert_stdout>
    </test>
</tests>
<!-- User-facing help (reStructuredText). The output list mirrors the datasets
     declared in the outputs section of this tool. -->
<help><![CDATA[
This tool is the train mode of `deepFPlearn <https://github.com/yigbt/deepFPlearn>`_.
It is equivalent to running ``dfpl train`` from the command line.

The train mode is used to train models to predict the association of molecular structures to biological targets.
The encoding of the molecules is done based on molecular fingerprints.

The input file may contain several target columns. A model is trained for the column given as *Target*;
run the tool once per target to train a model for each of them.

The tool will generate the following outputs:

- the weights of the trained FNN (``.h5``), if selected

- the weights of the trained autoencoder (``.h5``), if selected

The training histories (``.csv`` table and ``.svg`` plot) are not yet provided as outputs.
]]></help>
300 <expand macro="citations"/>
301 </tool>