comparison spec2vec_training.xml @ 0:e1e22ada831e draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/spec2vec commit 2e4bdc2fd94445aa5a8d1882a3d092cca727e4b6
author recetox
date Thu, 05 Jan 2023 10:08:12 +0000
parents
children 9d917de87cca
comparison
equal deleted inserted replaced
-1:000000000000 0:e1e22ada831e
1 <tool id="spec2vec_training" name="Spec2Vec Model Training" version="@TOOL_VERSION@-@TOOL_DEV_VERSION@+galaxy0" python_template_version="3.5" profile="21.05">
2 <description>Train a Spec2Vec model for mass spectra similarity scoring</description>
3
4 <macros>
5 <import>macros.xml</import>
6 </macros>
7 <expand macro="creator"/>
8
9 <requirements> <!-- Execution environment: the tool runs in the recetox/spec2vec Docker image; the image tag is the @COMMIT_SHA@ token (presumably supplied by macros.xml; confirm there). -->
10 <container type="docker">recetox/spec2vec:@COMMIT_SHA@</container>
11 </requirements>
12
13 <!-- Links the weights output to a path with a .npy suffix (presumably because the wrapper's weights writer appends that extension; confirm against spec2vec_training_wrapper.py), then runs the rendered CLI script with sh. The script path is single-quoted so a path containing spaces stays one argument. --> <command detect_errors="exit_code"><![CDATA[
14 ln -fs '${weights_filename}' '${weights_filename}.npy' &&
15 sh '${spec2vec_python_cli}'
16 ]]></command>
17
18 <!-- Cheetah template rendered to a shell script that invokes spec2vec_training_wrapper.py. Options that belong to a conditional branch (checkpoints or epochs, cbow_mean, negative, ns_exponent, max_vocab_size, pickle output) are emitted only for the matching branch. Select values are compared as strings: a select wrapper holding '0' is still truthy, so a bare "not" test would never fire. --> <configfiles>
19 <configfile name="spec2vec_python_cli">
20 python3 '${__tool_directory__}/spec2vec_training_wrapper.py' \
21 --spectra_filename '$spectra_filename' \
22 --spectra_fileformat '$spectra_filename.ext' \
23 #if $output_parameters.model_checkpoints.save_checkpoints == 'TRUE'
24 --checkpoints '$output_parameters.model_checkpoints.checkpoints' \
25 #else
26 --epochs $output_parameters.model_checkpoints.epochs \
27 #end if
28 --vector_size $training_parameters.vector_size \
29 --alpha $training_parameters.alpha \
30 --min_alpha $training_parameters.min_alpha \
31 --window $training_parameters.window \
32 --min_count $training_parameters.min_count \
33 --sample $training_parameters.sample \
34 --seed $training_parameters.seed \
35 --sg $training_parameters.sg_param.sg \
36 #if $training_parameters.sg_param.sg == '0'
37 --cbow_mean $training_parameters.sg_param.cbow_mean \
38 #end if
39 --hs $training_parameters.hs_param.hs \
40 #if $training_parameters.hs_param.hs == '0'
41 --negative $training_parameters.hs_param.negative \
42 --ns_exponent $training_parameters.hs_param.ns_exponent \
43 #end if
44 --sorted_vocab $training_parameters.sorted_vocab \
45 --batch_words $training_parameters.batch_words \
46 --shrink_windows $training_parameters.shrink_windows \
47 #if $training_parameters.trim_vocab.max_vocab_size_bool == 'TRUE'
48 --max_vocab_size $training_parameters.trim_vocab.max_vocab_size \
49 #end if
50 --n_decimals $training_parameters.n_decimals \
51 --n_workers \${GALAXY_SLOTS:-1} \
52 #if $output_parameters.as_pickle
53 --model_filename_pickle '$model_filename_pickle' \
54 #end if
55 --model_filename '$model_filename' \
56 --weights_filename '$weights_filename' \
57 </configfile>
58 </configfiles>
59
60 <inputs>
61 <param name="spectra_filename" type="data" format="msp,mgf" label="Training spectra"
62 help="Spectra file to train a Spec2Vec model."/> <!-- Training input; datasets in msp or mgf format are accepted. -->
63
64 <section title="Output parameters" name="output_parameters" expanded="true"> <!-- Controls the optional pickle output and whether training uses a single epoch count or a list of checkpoint epochs. -->
65 <param label="Save model as Pickle file" name="as_pickle" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE"
66 help="Add a Pickle output besides default JSON."/>
67 <conditional name="model_checkpoints">
68 <param label="Model checkpoints" name="save_checkpoints" type="select" display="radio"
69 help="Epochs after which to save a model.">
70 <option value="TRUE">Yes</option>
71 <option value="FALSE" selected="true">No</option>
72 </param>
73 <when value="TRUE">
74 <param label="Number of training epochs with checkpoints" name="checkpoints" type="text" value="10,20,50"
75 help="Comma-separated epoch numbers after which to save a model. The highest number will be used as a total number of epochs for training.">
76 <validator type="empty_field"/>
77 <validator type="regex"
78 message="The input has to be a comma-separated sequence of integers without trailing commas. For example: 10,20,50">^[0-9]+(,[0-9]+)*$</validator>
79 </param>
80 </when>
81 <when value="FALSE">
82 <param label="Number of training epochs" name="epochs" type="integer" value="10" min="1"
83 help="Number of epochs to train the model."/> <!-- min="1": a zero or negative epoch count is rejected in the form instead of failing during training. -->
84 </when>
85 </conditional>
86 </section>
87
88 <section title="Training hyperparameters" name="training_parameters" expanded="true"> <!-- Hyperparameters forwarded to the training wrapper. The sg and hs selects emit the strings 0 or 1; their dependent parameters exist only in the when-branch for value 0. -->
89 <param label="Vector size" name="vector_size" type="integer" value="300"
90 min="1" help="Dimensionality of the feature vectors (i.e., into how many dimensions to encode each m/z and neutral loss peak)."/>
91 <param label="Alpha" name="alpha" type="float" value="0.025"
92 min="0" help="The initial learning rate."/>
93 <param label="Minimum Alpha" name="min_alpha" type="float" value="0.00025"
94 min="0" help="Learning rate will linearly drop to this value as training progresses."/>
95 <param label="Window" name="window" type="integer" value="500"
96 help="Maximum distance between the current and predicted peak within a spectrum."/>
97 <param label="Minimum peak count" name="min_count" type="integer" value="1"
98 min="0" help="Ignores all peaks with absolute frequency lower than this."/>
99 <param label="Sample" name="sample" type="float" value="0.001"
100 help="The threshold for configuring which higher-frequency peaks are randomly downsampled."/>
101 <param label="Seed" name="seed" type="integer" value="1"
102 help="Seed of random number generator for model reproducibility."/>
103 <conditional name="sg_param">
104 <param label="Word-Embedding type" name="sg" type="select"
105 help="Embedding type: Skip-gram or Continuous Bag of Words">
106 <option value="0">CBOW</option>
107 <option value="1">Skip-gram</option>
108 </param>
109 <when value="0">
110 <param label="CBOW mean" name="cbow_mean" type="select"
111 help="Whether to use the sum of the context word vectors or their mean.">
112 <option value="0">Sum</option>
113 <option value="1" selected="true">Mean</option>
114 </param>
115 </when>
116 </conditional>
117 <conditional name="hs_param">
118 <param label="Last Layer Activation" name="hs" type="select"
119 help="Activation function of the last layer of the neural network. Negative sampling is more computationally efficient.">
120 <option value="0">Negative Sampling</option>
121 <option value="1">Hierarchical Softmax</option>
122 </param>
123 <when value="0">
124 <param label="Negative Samples" name="negative" type="integer" value="5"
125 min="1" help="Specify how many 'negative' examples should be drawn for each peak and neutral loss (usually between 5-20).">
126 <validator type="in_range" min="1" message="The value must be larger than 0."/>
127 </param>
128 <param label="Negative Sample Exponent" name="ns_exponent" type="float" value="0.75"
129 help="The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion to the frequencies,
130 0.0 samples all peaks and neutral losses equally, while a negative value samples low-frequency peaks more often than high-frequency peaks.">
131 <validator type="in_range" min="-1.0" max="1.0" message="The value must be within -1.0 and 1.0 range."/>
132 </param>
133 </when>
134 </conditional>
135 <param label="Sort the vocabulary of spectra" name="sorted_vocab" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE"
136 help="If true, sort the vocabulary by descending frequency before assigning peak and neutral loss indices."/>
137 <param label="Batch size" name="batch_words" type="integer" value="10000"
138 help="Target size (in peaks and neutral losses) for batches of examples passed to worker threads (and thus cython routines).
139 Larger batches will be passed if individual peak sequences are longer than 10000 words, but the standard cython code truncates to that maximum."/>
140 <param label="Shrink windows" name="shrink_windows" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE"
141 help="EXPERIMENTAL. If true, the effective window size is uniformly sampled in range [1,Window] for each target peak during training."/>
142 <conditional name="trim_vocab">
143 <param label="Limit unique peaks and neutral losses in the spectral vocabulary" name="max_vocab_size_bool" type="select" display="radio"
144 help="Limits the RAM during vocabulary building; if there are more unique peaks and neutral losses than this, then prune the infrequent ones. Disable for no limit (default).">
145 <option value="FALSE">No limit</option>
146 <option value="TRUE">Limit</option>
147 </param>
148 <when value="TRUE">
149 <param label="Maximum unique peaks and neutral losses" name="max_vocab_size" type="integer" value="100000" min="1"/>
150 </when>
151 </conditional>
152 <param label="Number of decimals to round m/z values" name="n_decimals" type="integer" value="2"
153 min="0" max="5" help="Rounds peak position to this number of decimals."/>
154 </section>
155 </inputs>
156
157 <outputs> <!-- The JSON model and the weights are always produced. The pickle dataset exists only when as_pickle is set, and the checkpoint collection only when save_checkpoints is TRUE; its elements are discovered with the __name_and_ext__ pattern. -->
158 <data name="model_filename" format="json" label="Spec2Vec model on ${on_string}"/>
159 <data name="weights_filename" format="binary" label="Spec2Vec weights on ${on_string}"/>
160 <data name="model_filename_pickle" format="binary" label="Spec2Vec pickle model on ${on_string}">
161 <filter>output_parameters['as_pickle']</filter>
162 </data>
163 <collection name="model_checkpoints" label="Spec2Vec model checkpoints on ${on_string}" type="list">
164 <discover_datasets pattern="__name_and_ext__"/>
165 <filter>output_parameters['model_checkpoints']['save_checkpoints'] == 'TRUE'</filter>
166 </collection>
167 </outputs>
168
169 <tests> <!-- Every test declares expect_num_outputs because two outputs are filtered: 2 for model plus weights, 3 when the pickle dataset or the checkpoint collection is also produced. -->
170 <test expect_num_outputs="2"> <!-- Test 1: with default parameters -->
171 <param name="spectra_filename" value="RECETOX_Exposome_pesticides_HR_MS_normalized_20220323.msp" ftype="msp"/>
172 <output name="model_filename" file="model.json" ftype="json"/>
173 <output name="weights_filename" ftype="binary">
174 <assert_contents>
175 <has_size value="1708000" delta="1000"/>
176 <has_text text="'shape': (1423, 300)" n="1"/>
177 </assert_contents>
178 </output>
179 </test>
180 <test expect_num_outputs="3"> <!-- Test 2: pickle output -->
181 <param name="spectra_filename" value="RECETOX_Exposome_pesticides_HR_MS_normalized_20220323.msp" ftype="msp"/>
182 <param name="as_pickle" value="TRUE"/>
183 <output name="model_filename" file="model.json" ftype="json"/>
184 <output name="weights_filename" ftype="binary">
185 <assert_contents>
186 <has_size value="1708000" delta="1000"/>
187 <has_text text="'shape': (1423, 300)" n="1"/>
188 </assert_contents>
189 </output>
190 <output name="model_filename_pickle" ftype="binary">
191 <assert_contents>
192 <has_size value="3468000" delta="1000" />
193 <has_text text="gensim.models.word2vec"/>
194 <has_text text="peak@" n="1423"/>
195 </assert_contents>
196 </output>
197 </test>
198 <test expect_num_outputs="3"> <!-- Test 3: model checkpoints -->
199 <param name="spectra_filename" value="RECETOX_Exposome_pesticides_HR_MS_normalized_20220323.msp" ftype="msp"/>
200 <conditional name="model_checkpoints">
201 <param name="save_checkpoints" value="TRUE"/>
202 <param name="checkpoints" value="1,5,8,10"/>
203 </conditional>
204 <output name="model_filename" file="model.json" ftype="json"/>
205 <output name="weights_filename" ftype="binary">
206 <assert_contents>
207 <has_size value="1708000" delta="1000"/>
208 <has_text text="'shape': (1423, 300)" n="1"/>
209 </assert_contents>
210 </output>
211 <output_collection name="model_checkpoints" type="list" count="3">
212 <element name="spec2vec_iter_1">
213 <assert_contents>
214 <has_size value="3468000" delta="1000" />
215 <has_text text="gensim.models.word2vec" />
216 <has_text text="peak@" n="1423" />
217 </assert_contents>
218 </element>
219 <element name="spec2vec_iter_5">
220 <assert_contents>
221 <has_size value="3468000" delta="1000" />
222 <has_text text="gensim.models.word2vec" />
223 <has_text text="peak@" n="1423" />
224 </assert_contents>
225 </element>
226 <element name="spec2vec_iter_8">
227 <assert_contents>
228 <has_size value="3468000" delta="1000" />
229 <has_text text="gensim.models.word2vec" />
230 <has_text text="peak@" n="1423" />
231 </assert_contents>
232 </element>
233 </output_collection>
234 </test>
235 <test expect_num_outputs="2"> <!-- Test 4: embeddings size in output corresponds to `vector_size` param -->
236 <param name="spectra_filename" value="RECETOX_Exposome_pesticides_HR_MS_normalized_20220323.msp" ftype="msp"/>
237 <param name="vector_size" value="100"/>
238 <output name="model_filename" file="model_vector_size_100.json" ftype="json"/>
239 <output name="weights_filename" ftype="binary">
240 <assert_contents>
241 <has_size value="569000" delta="1000"/>
242 <has_text text="'shape': (1423, 100)" n="1"/>
243 </assert_contents>
244 </output>
245 </test>
246 </tests>
247
248 <help><![CDATA[
249 **Spec2vec** is a spectral similarity score inspired by a natural language processing algorithm – Word2Vec.
250 Where Word2Vec learns relationships between words in sentences, spec2vec does so for mass fragments and neutral losses in MS/MS spectra.
251 The spectral similarity score is based on spectral embeddings learnt from the fragmental relationships within a large set of spectral data.
252 ]]></help>
253
254 <citations> <!-- Reference to cite when using this tool, given as a DOI. -->
255 <citation type="doi">10.1371/journal.pcbi.1008724</citation>
256 </citations>
257 </tool>