Mercurial > repos > recetox > ramclustr
comparison macros.xml @ 0:36104baf75da draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/ramclustr commit 4d2ac914c951166e386a94d8ebb8cb1becfac122"
author | recetox |
---|---|
date | Tue, 22 Mar 2022 16:09:16 +0000 |
parents | |
children | 9a0d83c1b4b3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:36104baf75da |
---|---|
1 <macros> | |
2 <token name="@TOOL_VERSION@">1.2.2</token> | |
3 | |
4 <xml name="creator"> | |
5 <creator> | |
6 <person | |
7 givenName="Helge" | |
8 familyName="Hecht" | |
9 url="https://github.com/hechth" | |
10 identifier="0000-0001-6744-996X" /> | |
11 <person | |
12 givenName="Maksym" | |
13 familyName="Skoryk" | |
14 url="https://github.com/maximskorik" | |
15 identifier="0000-0003-2056-8018" /> | |
16 <person | |
17 givenName="Matej" | |
18 familyName="Troják" | |
19 url="https://github.com/xtrojak" | |
20 identifier="0000-0003-0841-2707" /> | |
21 <person | |
22 givenName="Martin" | |
23 familyName="Čech" | |
24 url="https://github.com/martenson" | |
25 identifier="0000-0002-9318-1781" /> | |
26 <organization | |
27 url="https://www.recetox.muni.cz/" | |
28 email="GalaxyToolsDevelopmentandDeployment@space.muni.cz" | |
29 name="RECETOX MUNI"/> | |
30 </creator> | |
31 </xml> | |
32 | |
33 <xml name="parameters_csv"> | |
34 <section name="ms_csv" title="Input MS Data as CSV" expanded="true"> | |
35 <param label="Input CSV" name="ms" type="data" format="csv" | |
36 help="Features as columns, rows as samples. Column header in format mz_rt."/> | |
37 <param label="idMSMS" name="idmsms" type="data" format="csv" optional="true" | |
38 help="Optional idMSMS / MSe csv data. Same dimension and names as in input CSV are required."/> | |
39 </section> | |
40 </xml> | |
41 | |
42 <xml name="parameters_xcms"> | |
43 <section name="xcms" title="Input MS Data as XCMS" expanded="true"> | |
44 <param name="input_xcms" label="Input XCMS" type="data" format="rdata.xcms.fillpeaks" | |
45 help="Grouped feature data for clustering." /> | |
46 <param label="Preserve phenotype" name="usePheno" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" | |
47 help="Transfer phenotype data from XCMS object to Spec abundance file."/> | |
48 </section> | |
49 </xml> | |
50 | |
51 <xml name="parameters_required"> | |
52 <param label="Sigma r" name="sr" type="float" value="0.5" help="Correlational similarity between features."/> | |
53 <param label="Correlation method" name="cor_method" type="select" display="radio" | |
54 help="Choose correlational method to be used - see [1] for details."> | |
55 <option value="pearson" selected="true">pearson</option> | |
56 <option value="everything">everything</option> | |
57 <option value="spearman">spearman</option> | |
58 <option value="kendall">kendall</option> | |
59 </param> | |
60 <param label="Maximum RT difference" name="maxt" value="60" type="float" | |
61 help="Maximum difference to calculate RT similarity - values beyond this are assigned zero similarity."/> | |
62 </xml> | |
63 | |
64 <xml name="main_parameters"> | |
65 <section name="clustering" title="Clustering" expanded="true"> | |
66 <param label="Clustering linkage method" name="linkage" type="select" display="radio" | |
67 help="Choose hierarchical clustering linkage method - see [2] for details."> | |
68 <option value="average" selected="true">average</option> | |
69 <option value="ward.D">ward.D</option> | |
70 <option value="ward.D2">ward.D2</option> | |
71 <option value="single">single</option> | |
72 <option value="complete">complete</option> | |
73 <option value="mcquitty">mcquitty</option> | |
74 <option value="median">median</option> | |
75 <option value="centroid">centroid</option> | |
76 </param> | |
77 <param label="Minimal cluster size" name="minModuleSize" type="integer" value="2" | |
78 help="Minimal size (number of features) of a cluster."/> | |
79 <param label="Maximal tree height" name="hmax" type="float" value="0.3" | |
80 help="Cut the Hierarchical Cluster Analysis tree at this height, see [3] for details."/> | |
81 <param label="Use deepSplit" name="deepSplit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" | |
82 help="Check to produce more smaller clusters, uncheck for fewer bigger clusters, see [3] for details."/> | |
83 </section> | |
84 | |
85 <section name="normalisation" title="Normalisation" expanded="true"> | |
86 <conditional name="normalisation_method"> | |
87 <param label="Normalisation method" name="normalize" type="select" display="radio" | |
88 help="Choose method for normalization of feature intensities."> | |
89 <option value="none" selected="true">none</option> | |
90 <option value="TIC">TIC</option> | |
91 <option value="quantile">quantile</option> | |
92 <option value="batch.qc">batch.qc</option> | |
93 </param> | |
94 <when value="batch.qc"> | |
95 <param label="Metadata details" name="batch_order_qc" type="data" format="csv" optional="true" | |
96 help="CSV with sample names (or indices, currently not handled) on rows and columns with: | |
97 batch number ('batch'), position in sequence ('order'), and whether it is a QC sample or not | |
98 ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')."/> | |
99 <param label="QC injection range" name="qc_inj_range" type="integer" value="20" | |
100 help="How many injections around each injection are to be scanned for presence of QC samples? | |
101 A good rule of thumb is between 1 and 3 times the typical | |
102 injection span between QC injections. i.e. if you inject QC every 7 samples, set this to | |
103 between 7 and 21. Smaller values provide more local precision but make normalization sensitive | |
104 to individual poor outliers (though these are first removed using the boxplot function outlier | |
105 detection), while wider values provide less local precision in normalization but better | |
106 stability to individual peak areas."/> | |
107 </when> | |
108 </conditional> | |
109 </section> | |
110 | |
111 <section name="performance" title="Performance"> | |
112 <param label="Blocksize" name="blocksize" type="integer" value="2000" | |
113 help="Number of features processed in one block."/> | |
114 <param label="Blocksize factor" name="mult" type="integer" value="5" | |
115 help="Factor to scale blocksize to influence processing speed."/> | |
116 </section> | |
117 | |
118 <section name="msp_output_details" title="MSP output"> | |
119 <param label="Merge MSP Files" name="merge_msp" type="boolean" truevalue="TRUE" falsevalue="FALSE" | |
120 checked="true" help="Merge all MSP in one file or export one MSP per spectrum."/> | |
121 <param label="m/z decimal places" name="mzdec" type="integer" value="6" | |
122 help="Number of decimal places used in printing m/z values."/> | |
123 <!-- | |
124 Currently not forwarded because the MSP is exported always manually afterwards | |
125 <param label="mspout" name="mspout" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="write msp formatted spectra to file?" /> | |
126 --> | |
127 </section> | |
128 | |
129 <section name="extras" title="Extras"> | |
130 <param label="RT only low n" name="rt_only_low_n" type="boolean" truevalue="TRUE" falsevalue="FALSE" | |
131 checked="true" | |
132 help="At low injection numbers, correlational relationships of peak intensities may be unreliable. | |
133 By default, RAMClustR will simply ignore the correlational Sigma r value and cluster on retention time alone. | |
134 If you wish to use correlation with n less than 5, set this value to FALSE."/> | |
135 <param label="Replace zeros" name="replace_zeros" type="boolean" truevalue="TRUE" falsevalue="FALSE" | |
136 checked="true" | |
137 help="NA, NaN, and Inf values are replaced with zero, and zero values are sometimes returned from | |
138 peak picking. When TRUE, zero values will be replaced with a small amount of noise, with noise level | |
139 set based on the detected signal intensities for that feature."/> | |
140 <param label="Experimental design metadata" name="ExpDes" type="data" format="csv" optional="true" | |
141 help="Definition of experimental design in CSV format." /> | |
142 </section> | |
143 </xml> | |
144 | |
145 <xml name="output_msp"> | |
146 <collection label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra_collection" type="list"> | |
147 <discover_datasets pattern="__name_and_ext__" directory="spectra" recurse="true" ext="msp"/> | |
148 <filter>not msp_output_details['merge_msp']</filter> | |
149 </collection> | |
150 <data label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra_merged" format="msp"> | |
151 <filter>msp_output_details['merge_msp']</filter> | |
152 </data> | |
153 </xml> | |
154 | |
155 <xml name="citations"> | |
156 <citations> | |
157 <!-- Example of annotating a citation using a BibTex entry. --> | |
158 <citation type="bibtex"> | |
159 @article{Broeckling2014e, | |
160 abstract = {Metabolomic data are frequently acquired using chromatographically coupled mass spectrometry | |
161 (MS) platforms. For such datasets, the first step in data analysis relies on feature detection, where a | |
162 feature is defined by a mass and retention time. While a feature typically is derived from a single | |
163 compound, a spectrum of mass signals is more a more-accurate representation of the mass spectrometric | |
164 signal for a given metabolite. Here, we report a novel feature grouping method that operates in an | |
165 unsupervised manner to group signals from MS data into spectra without relying on predictability of the | |
166 in-source phenomenon. We additionally address a fundamental bottleneck in metabolomics, annotation of MS | |
167 level signals, by incorporating indiscriminant MS/MS (idMS/MS) data implicitly: feature detection is | |
168 performed on both MS and idMS/MS data, and feature-feature relationships are determined simultaneously | |
169 from the MS and idMS/MS data. This approach facilitates identification of metabolites using in-source MS | |
170 and/or idMS/MS spectra from a single experiment, reduces quantitative analytical variation compared to | |
171 single-feature measures, and decreases false positive annotations of unpredictable phenomenon as novel | |
172 compounds. This tool is released as a freely available R package, called RAMClustR, and is sufficiently | |
173 versatile to group features from any chromatographic-spectrometric platform or feature-finding software. | |
174 {\textcopyright} 2014 American Chemical Society.}, | |
175 author = {Broeckling, C. D. and Afsar, F. A. and Neumann, S. and Ben-Hur, A. and Prenni, J. E.}, | |
176 doi = {10.1021/ac501530d}, | |
177 issn = {15206882}, | |
178 journal = {Analytical Chemistry}, | |
179 number = {14}, | |
180 pages = {6812--6817}, | |
181 pmid = {24927477}, | |
182 title = {{RAMClust: A novel feature clustering method enables spectral-matching-based annotation for | |
183 metabolomics data}}, | |
184 volume = {86}, | |
185 year = {2014} | |
186 } | |
187 </citation> | |
188 </citations> | |
189 </xml> | |
190 | |
191 <token name="@HELP@"> | |
192 <![CDATA[ | |
193 Documentation | |
194 For documentation on the tool see https://github.com/cbroeckl/RAMClustR/blob/master/vignettes/RAMClustR.Rmd | |
195 | |
196 Upstream Tools | |
197 +------------------------------+-------------------------------+----------------------+---------------------+ | |
198 | Name | Output File | Format | Parameter | | |
199 +==============================+===============================+======================+=====================+ | |
200 | xcms | xset.fillPeaks.RData | rdata.xcms.fillpeaks | xcmsObj | | |
201 +------------------------------+-------------------------------+----------------------+---------------------+ | |
202 | RAMClustR define experiment | Table with experiment details | csv | Experimental design | | |
203 +------------------------------+-------------------------------+----------------------+---------------------+ | |
204 | |
205 The tool takes an **xcmsSet** object as input and extracts all relevant information. | |
206 | |
207 +-------+------------------------+--------+------------+ | |
208 | Name | Output File | Format | Parameter | | |
209 +=======+========================+========+============+ | |
210 | ??? | Feature Table with MS1 | csv | ms | | |
211 +-------+------------------------+--------+------------+ | |
212 | ??? | Feature Table with MS2 | csv | idmsms | | |
213 +-------+------------------------+--------+------------+ | |
214 | |
215 Alternatively, the tool takes a **csv** table as input which has to fulfill the following requirements | |
216 | |
217 (1) no more than one sample (or file) name column and one feature name row; | |
218 (2) feature names that contain the mass and retention times, separated by a constant delimiter; and | |
219 (3) features in columns and samples in rows. | |
220 | |
221 +----------------------+-------------------+-------------------+--------------------+--------------------+ | |
222 | sample | 100.88_262.464 | 100.01_423.699 | 100.003_128.313 | 100.0057_154.686 | | |
223 +======================+===================+===================+====================+====================+ | |
224 | 10_qc_16x_dil_milliq | 0 | 195953.6376 | 0 | 0 | | |
225 +----------------------+-------------------+-------------------+--------------------+--------------------+ | |
226 | 11_qc_8x_dil_milliq | 0 | 117742.1828 | 4247300.664 | 0 | | |
227 +----------------------+-------------------+-------------------+--------------------+--------------------+ | |
228 | 12_qc_32x_dil_milliq | 4470859.38 | 0 | 2206092.112 | 0 | | |
229 +----------------------+-------------------+-------------------+--------------------+--------------------+ | |
230 | 15_qc_16x_dil_milliq | 0 | 0 | 2767477.481 | 0 | | |
231 +----------------------+-------------------+-------------------+--------------------+--------------------+ | |
232 | |
233 | |
234 Downstream Tools | |
235 The output is an msp file or a collection of msp files, with an additional Spec Abundance file. | |
236 | |
237 +---------+--------------+----------------------+ | |
238 | Name | Output File | Format | | |
239 +=========+==============+======================+ | |
240 | matchMS | Mass Spectra | collection (tgz/msp) | | |
241 +---------+--------------+----------------------+ | |
242 | |
243 @GENERAL_HELP@ | |
244 ]]> | |
245 </token> | |
246 | |
247 <token name="@GENERAL_HELP@"> | |
248 Background | |
249 Metabolomics | |
250 Metabolomics is frequently performed using chromatographically coupled mass spectrometry, with gas | |
251 chromatography, liquid chromatography, and capillary electrophoresis being the most frequently utilized | |
252 methods of separation. The coupling of chromatography to mass spectrometry is enabled with an | |
253 appropriate ionization source - electron impact (EI) for gas phase separations and electrospray | |
254 ionization (ESI) for liquid phase separations. XCMS is a commonly used tool to detect all the signals | |
255 from a metabolomics dataset, generating aligned features, where a feature is represented by a mass and | |
256 retention time. Each feature is presumed to derive from a single compound. However, each compound is | |
257 represented by several features. With any ionization method, isotopic peaks will be observed reflective | |
258 of the elemental composition of the analyte. In EI, fragmentation is a byproduct of ionization, and has | |
259 driven the generation of large mass spectral libraries. In ESI, in-source fragmentation frequently | |
260 occurs, the magnitude of which is compound dependent, with more labile compounds being more prone to | |
261 in-source fragmentation. ESI can also produce multiple adduct forms (protonated, potassiated, sodiated, | |
262 ammoniated...), and can produce multimers (i.e. [2M+H]+, [3M+K]+, etc) and multiple charged species | |
263 ([M+2H]++). This can become further complicated by considering combinations of these phenomena. For | |
264 example [2M+3H]+++ (triply charged dimer) or an in-source fragment of a dimer. | |
265 | |
266 RAMClustR approach | |
267 RAMClustR was designed to group features derived from the same compound using an approach which is | |
268 **1.** unsupervised, **2.** platform agnostic, and **3.** devoid of curated rules, as the depth of | |
269 understanding of these processes is insufficient to enable accurate curation/prediction of all phenomena | |
270 that may occur. We achieve this by making two assumptions. The first is that two features derived | |
271 from the same compound will have (approximately) the same retention time. The second is that two | |
272 features derived from the same compound will have (approximately) the same quantitative trend across | |
273 all samples in the xcms sample set. From these assumptions, we can calculate a retention time | |
274 similarity score and a correlational similarity score for each feature pair. A high similarity score | |
275 for both retention time and correlation indicates a strong probability that two features derive from | |
276 the same compound. Since both conditions must be met, the product of the two similarity scores provides | |
277 the best approximation of the total similarity score - i.e. a feature pair with retention time similarity | |
278 of 1 and correlational similarity of 0 is unlikely to derive from one compound - 1 x 0 = 0, the final | |
279 similarity score is zero, indicating the two features represent two different compounds. Similarly, a | |
280 feature pair with retention time similarity of 0 and correlational similarity of 1 is unlikely to derive | |
281 from one compound - 0 x 1 = 0. Alternatively - a feature pair with retention time similarity of 1 and | |
282 correlational similarity of 1 is likely to derive from one compound - 1 x 1 = 1. | |
283 | |
284 The RAMClustR algorithm is built on creating similarity scores for all pairs of features, submitting | |
285 this score matrix for hierarchical clustering, and then cutting the resulting dendrogram into neat | |
286 chunks using the dynamicTreeCut package - where each 'chunk' of the dendrogram results in a group of | |
287 features likely to be derived from a single compound. Importantly, this is achieved without looking for | |
288 specific phenomena (e.g. sodiation), meaning that grouping can be performed on any dataset, whether it | |
289 is positive or negative ionization mode, EI or ESI, LC-MS GC-MS or CE-MS, in-source fragment or complex | |
290 adduction event, and predictable or unpredictable signals. | |
291 </token> | |
292 | |
293 <token name="@HELP_experiment@"> | |
294 <![CDATA[ | |
295 Create an Experimental Design specification for RAMClustR experiment. | |
296 | |
297 Downstream Tools | |
298 +-----------+-----------------------+--------+ | |
299 | Name | Output File | Format | | |
300 +===========+=======================+========+ | |
301 | RAMClustR | Experiment definition | csv | | |
302 +-----------+-----------------------+--------+ | |
303 | |
304 ]]> | |
305 </token> | |
306 </macros> |