Mercurial > repos > ufz > dfpl_train
comparison dfpl_train.xml @ 0:e0bb949eac45 draft default tip
planemo upload for repository https://github.com/Helmholtz-UFZ/galaxy-tools/tree/main/tools/dfpl commit 66c6acfeff5441c36fba97787ddc5ee3d6a4a6ec
author | ufz |
---|---|
date | Thu, 19 Dec 2024 12:51:21 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e0bb949eac45 |
---|---|
1 <tool id="dfpl_train" name="deepFPlearn train" version="@TOOL_VERSION@+galaxy0" profile="23.0"> | |
2 <description>model to predict association of molecular structures to biological targets</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <expand macro="requirements"/> | |
7 <!-- Pipes the Galaxy parameter file through json_flatten.py and json_train.py into config.json, runs "dfpl train" on it, then copies the encoder and model weight files to the two outputs. NOTE(review): both cp steps are unconditional; confirm the weight files exist when "Use raw fingerprints" is selected or "Train FNN" is unchecked. --> <command detect_errors="exit_code"><![CDATA[ | |
8 set -o pipefail; | |
9 cat '$inputs' | |
10 | python '$__tool_directory__/json_flatten.py' | |
11 | python '$__tool_directory__/json_train.py' | |
12 > config.json && | |
13 mkdir -p 'autoencoder' && | |
14 mkdir -p 'model' && | |
15 dfpl train --configFile config.json && | |
16 cp 'autoencoder/encoder_weights.h5' '$output_autoencoder_weights' && | |
17 cp 'model/${model_configuration.target}/model_weights.h5' '$output_model_weights' | |
18 ]]></command> | |
19 <configfiles> | |
20 <!-- Parameter file that the command above reads via $inputs --> <inputs name="inputs" data_style="paths"/> | |
21 </configfiles> | |
22 <inputs> | |
23 <!-- Model settings: training data, target column, chemical representation and fingerprint options --> <section name="model_configuration" expanded="true" title="Model Configuration"> | |
24 <param argument="--inputFile" label="Input File" | |
25 format="csv" type="data" optional="false" | |
26 help="The file containing the data for training in comma separated CSV format. The first column should be smiles"/> | |
27 <param name="target" label="Target" | |
28 optional="false" type="text" | |
29 help="The target column in the input file that should be trained for"> | |
30 <validator message="A column name must be specified" type="empty_field"/> | |
31 </param> | |
32 <param argument="--type" label="Chemical Representation" | |
33 optional="true" type="select" | |
34 help="Type of the chemical representation"> | |
35 <option selected="true" value="fp">fp</option> | |
36 <option value="smiles">smiles</option> | |
37 </param> | |
38 <param argument="--threshold" label="Classification Threshold" | |
39 type="float" value="0.5" min="0" max="1" optional="true" | |
40 help="Threshold for binary classification"/> | |
41 <param argument="--fpType" | |
42 label="Fingerprint Type" | |
43 type="select" | |
44 optional="true" | |
45 help="The type of fingerprint to be generated/used in input file"> | |
46 <option selected="true" value="topological">topological</option> | |
47 <option value="MACCS">MACCS</option> | |
48 </param> | |
49 <param argument="--fpSize" label="Fingerprint Size" | |
50 type="integer" value="2048" min="1" optional="true" | |
51 help="Length of the fingerprint that should be generated"/> | |
52 <param argument="--enableMultiLabel" label="Multi-Label Classification" | |
53 checked="false" | |
54 type="boolean" | |
55 help="Train multi-label classification model"/> | |
56 </section> | |
57 <!-- Data splitting and feedforward neural network (FNN) training settings; labels and help texts only are user-facing, param names and option values are what the config file receives --> <section name="training_configuration" title="Training Configuration" expanded="true"> | |
58 <param argument="--split_type" type="select" optional="true" label="Split Type" | |
59 help="Set how the data is split for the feedforward neural network"> | |
60 <option value="scaffold_balanced">Scaffold Balanced</option> | |
61 <option value="random" selected="true">Random</option> | |
62 <option value="molecular_weight">Molecular Weight</option> | |
63 </param> | |
64 <param label="Test Size" argument="--testSize" | |
65 type="float" min="0" max="1" value="0.2" optional="true" | |
66 help="Fraction of the dataset that should be used for testing"/> | |
67 <param label="kFolds Cross-Validation" argument="--kFolds" | |
68 type="integer" value="1" min="1" optional="true" | |
69 help="Number of folds for cross-validation"/> | |
70 <param label="Train FNN" argument="--trainFNN" | |
71 type="boolean" checked="true" | |
72 help="Train the feedforward neural network (FNN). Uncheck to deactivate the FNN training"/> | |
73 <param label="Sample Down" argument="--sampleDown" | |
74 type="boolean" | |
75 help="Down sampling of the 0-valued samples"/> | |
76 <param label="Sample Fraction Ones" argument="--sampleFractionOnes" | |
77 type="float" min="0" max="1" value="0.5" optional="true" | |
78 help="This is the desired fraction 1s/0s. Only works if --sampleDown is enabled"/> | |
79 <param label="Epochs" argument="--epochs" | |
80 type="integer" min="10" value="100" optional="true" | |
81 help="Number of epochs for the FNN training"/> | |
82 <param label="Loss Function" argument="--lossFunction" | |
83 type="select" optional="true" | |
84 help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy, focal - focal loss"> | |
85 <option value="mse">MSE</option> | |
86 <option value="bce" selected="true">BCE</option> | |
87 <option value="focal">Focal</option> | |
88 </param> | |
89 <param label="Optimizer" argument="--optimizer" | |
90 type="select" optional="true" | |
91 help="Optimizer of the FNN"> | |
92 <option value="Adam" selected="true">Adam</option> | |
93 <option value="SGD">SGD</option> | |
94 </param> | |
95 <param label="Batch Size" argument="--batchSize" | |
96 type="integer" min="1" value="128" optional="true" | |
97 help="Batch size in FNN training"/> | |
98 <param label="L2 Regularization" argument="--l2reg" | |
99 type="float" min="0" value="0.001" optional="true" | |
100 help="Value for l2 kernel regularizer"/> | |
101 <param label="Dropout" argument="--dropout" | |
102 type="float" min="0" max="1" value="0.2" optional="true" | |
103 help="The fraction of data that is dropped out in each dropout layer"/> | |
104 <param label="Learning Rate" argument="--learningRate" | |
105 type="float" min="0" value="2.2e-05" optional="true" | |
106 help="Learning rate size in FNN training"/> | |
107 <param label="Learning Rate Decay" argument="--learningRateDecay" | |
108 type="float" min="0" max="1" value="0.96" optional="true" | |
109 help="Learning rate decay in FNN training"/> | |
110 <param label="Activation Function" argument="--activationFunction" | |
111 type="select" optional="true" | |
112 help="The activation function of the FNN"> | |
113 <option value="relu" selected="true">ReLU</option> | |
114 <option value="selu">SELU</option> | |
115 </param> | |
116 </section> | |
117 <!-- Optional fingerprint compression: the outer select switches compression on or off, the inner select chooses between training a new autoencoder and loading encoder weights from a file --> <conditional name="autoencoder"> | |
118 <param argument="--compressFeatures" type="select" | |
119 label="Compress Fingerprints with Autoencoder" | |
120 help="Compress the fingerprints using an autoencoder. | |
121 Either uses an already trained autoencoder (requires a weights file) | |
122 or creates and trains a new autoencoder."> | |
123 <option value="true">Compress fingerprints</option> | |
124 <option value="false">Use raw fingerprints</option> | |
125 </param> | |
126 <when value="true"> | |
127 <conditional name="train-autoencoder"> | |
128 <param argument="--trainAC" type="select" | |
129 label="Load / Train Autoencoder" | |
130 help="Select if a new autoencoder should be trained | |
131 or if you want to provide the weights of a trained autoencoder yourself"> | |
132 <option value="true">Train new autoencoder</option> | |
133 <option value="false">Load autoencoder from file</option> | |
134 </param> | |
135 <when value="false"> | |
136 <param argument="--ecWeightsFile" label="Encoder Weights File" | |
137 format="h5" type="data" | |
138 help="The .hdf5 file of a trained encoder"/> | |
139 </when> | |
140 <when value="true"> | |
141 <param argument="--aeType" label="Autoencoder Type" | |
142 optional="true" type="select" | |
143 help="Autoencoder type, variational or deterministic"> | |
144 <option value="variational">Variational</option> | |
145 <option selected="true" value="deterministic">Deterministic</option> | |
146 </param> | |
147 <param argument="--aeEpochs" label="Epochs" | |
148 type="integer" value="100" min="5" optional="true" | |
149 help="Number of epochs for autoencoder training"/> | |
150 <param argument="--aeBatchSize" label="Batch Size" | |
151 type="integer" value="512" min="1" optional="true" | |
152 help="Batch size in autoencoder training"/> | |
153 <param argument="--aeLearningRate" label="Learning Rate" | |
154 type="float" value="0.001" min="0" optional="true" | |
155 help="Learning rate for autoencoder training"/> | |
156 <param argument="--aeLearningRateDecay" label="Learning Rate Decay" | |
157 type="float" value="0.96" min="0" max="1" optional="true" | |
158 help="Learning rate decay for autoencoder training"/> | |
159 <param argument="--aeSplitType" label="Split Type" | |
160 optional="true" type="select" | |
161 help="Set how the data is split for the autoencoder"> | |
162 <option value="scaffold_balanced">Scaffold Balanced</option> | |
163 <option selected="true" value="random">Random</option> | |
164 <option value="molecular_weight">Molecular Weight</option> | |
165 </param> | |
166 <param argument="--fnnType" label="FNN Type" | |
167 optional="true" type="select" | |
168 help="The type of the feedforward neural network"> | |
169 <option selected="true" value="FNN">FNN</option> | |
170 <option value="SNN">SNN</option> | |
171 </param> | |
172 <param argument="--encFPSize" label="Fingerprint Size" | |
173 type="integer" value="256" min="1" optional="true" | |
174 help="Size of encoded fingerprint (z-layer of autoencoder)"/> | |
175 <param argument="--aeActivationFunction" label="Activation Function" | |
176 optional="true" type="select" | |
177 help="The activation function of the autoencoder"> | |
178 <option selected="true" value="relu">ReLU</option> | |
179 <option value="selu">SELU</option> | |
180 </param> | |
181 <param argument="--visualizeLatent" label="Visualize Latent Space" | |
182 checked="false" type="boolean" | |
183 help="UMAP the latent space for exploration"/> | |
184 </when> | |
185 </conditional> | |
186 </when> | |
187 <when value="false"/> | |
188 </conditional> | |
189 <!-- Logging options. The disabled Weights and Biases params below use name= rather than argument= because a double hyphen is not allowed inside an XML comment --> <section title="Logging" name="logging_configuration" expanded="false"> | |
190 <param label="Verbosity Level" argument="--verbose" | |
191 type="select" optional="true" | |
192 help="Verbosity level of output"> | |
193 <option value="0">0: No additional output</option> | |
194 <option value="1">1: Some additional output</option> | |
195 <option value="2">2: Full additional output</option> | |
196 </param> | |
197 <!-- <section name="tracking_configuration" title="Weights & Biases" expanded="true">--> | |
198 <!-- <param label="Target"--> | |
199 <!-- name="wabTarget" type="text" optional="true"--> | |
200 <!-- help="Which endpoint to use for tracking performance via Weights & Biases. Should match the column name"/>--> | |
201 <!-- <param label="Track FNN" name="wabTracking"--> | |
202 <!-- type="boolean"--> | |
203 <!-- help="Track FNN performance via Weights & Biases"/>--> | |
204 <!-- <param label="Track Autoencoder" name="aeWabTracking"--> | |
205 <!-- type="boolean"--> | |
206 <!-- help="Track autoencoder performance via Weights & Biases"/>--> | |
207 <!-- </section>--> | |
208 </section> | |
209 </inputs> | |
210 <outputs> | |
211 <!-- todo: add a filter so the user can choose svg/csv outputs or none --> | |
212 <!-- <data name="loss_table" format="csv" from_work_dir="output/" label="${tool.name} on ${on_string}: csv">--> | |
213 <!-- </data>--> | |
214 <!-- <data name="loss_diagram" format="svg" from_work_dir="output/" label="${tool.name} on ${on_string}: svg">--> | |
215 <!-- </data>--> | |
216 <data name="output_model_weights" format="h5" | |
217 label="${tool.name} on ${on_string}: model weights"/> | |
218 <data name="output_autoencoder_weights" format="h5" | |
219 label="${tool.name} on ${on_string}: autoencoder weights"/> | |
220 </outputs> | |
221 <tests> | |
222 <!-- Training run on S_dataset.csv for the Aromatase target with a newly trained autoencoder; params are listed in the order the inputs declare them --> <test> | |
223 <section name="model_configuration"> | |
224 <param name="inputFile" value="S_dataset.csv"/> | |
225 <param name="target" value="Aromatase"/> | |
226 <param name="type" value="smiles"/> | |
227 <param name="threshold" value="0.5"/> | |
228 <param name="fpType" value="topological"/> | |
229 <param name="fpSize" value="2048"/> | |
230 <param name="enableMultiLabel" value="false"/> | |
231 </section> | |
232 <section name="training_configuration"> | |
233 <param name="split_type" value="random"/> | |
234 <param name="testSize" value="0.2"/> | |
235 <param name="kFolds" value="1"/> | |
236 <param name="trainFNN" value="true"/> | |
237 <param name="sampleDown" value="false"/> | |
238 <param name="sampleFractionOnes" value="0"/> | |
239 <param name="epochs" value="10"/> | |
240 <param name="lossFunction" value="bce"/> | |
241 <param name="optimizer" value="Adam"/> | |
242 <param name="batchSize" value="128"/> | |
243 <param name="l2reg" value="0.001"/> | |
244 <param name="dropout" value="0.0107"/> | |
245 <param name="learningRate" value="2.2e-06"/> | |
246 <param name="activationFunction" value="selu"/> | |
247 </section> | |
248 <conditional name="autoencoder"> | |
249 <param name="compressFeatures" value="true"/> | |
250 <conditional name="train-autoencoder"> | |
251 <param name="trainAC" value="true"/> | |
252 <param name="aeType" value="deterministic"/> | |
253 <param name="aeEpochs" value="5"/> | |
254 <param name="aeBatchSize" value="351"/> | |
255 <param name="aeLearningRate" value="0.001"/> | |
256 <param name="aeLearningRateDecay" value="0.0001"/> | |
257 <param name="aeSplitType" value="random"/> | |
258 <param name="fnnType" value="FNN"/> | |
259 <param name="encFPSize" value="256"/> | |
260 <param name="aeActivationFunction" value="relu"/> | |
261 </conditional> | |
262 </conditional> | |
263 <!-- <param name="aeWabTracking" value="false"/> | |
264 <param name="wabTracking" value="false"/> --> | |
265 <section name="logging_configuration"> | |
266 <param name="verbose" value="2"/> | |
267 </section> | |
268 <!-- todo: cover the svg and csv outputs once they exist --> | |
269 <output name="output_model_weights"> | |
270 <assert_contents> | |
271 <has_h5_keys keys="alpha_dropout_18,alpha_dropout_19,alpha_dropout_20,dense_30,dense_31,dense_32,dense_33,top_level_model_weights"/> | |
272 </assert_contents> | |
273 </output> | |
274 <assert_stdout> | |
275 <has_text text="Evaluating trained model"/> | |
276 </assert_stdout> | |
277 </test> | |
278 </tests> | |
279 <help><![CDATA[ | |
280 This tool is the train mode of `DeepFPLearn <https://github.com/yigbt/deepFPlearn>`_. | |
281 It's equivalent to running ``dfpl train`` from the command line. | |
282 | |
283 The train mode is used to train models to predict the association of molecular structures to biological targets. | |
284 The encoding of the molecules is done based on molecular fingerprints. | |
285 | |
286 The training data contains three targets and you may train models for each with this tool. | |
287 | |
288 The tool will generate the following outputs: | |
289 | |
290 - the weights of the trained FNN as an HDF5 (``.h5``) dataset | |
291 | |
292 - the weights of the trained autoencoder as an HDF5 (``.h5``) dataset | |
293 | |
294 The training histories as tabular data (``.csv``) and as a plot (``.svg``) | |
295 are not yet provided as outputs of this tool. | |
299 ]]></help> | |
300 <expand macro="citations"/> | |
301 </tool> |