Galaxy |

Changeset 0:cfd2e19f00a9 (2023-06-26)

Next changeset 1:2c6fa447f6a0 (2023-07-20)

Commit message:
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox-xmsannotator commit 1ab1a1dabfcebe11720de1411927a7438c1b64c1

added:
macros.xml
recetox_xmsannotator_advanced.xml
utils.R

diff -r 000000000000 -r cfd2e19f00a9 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jun 26 13:55:56 2023 +0000

[

@@ -0,0 +1,144 @@
+<macros>
+<token name="@TOOL_VERSION@">0.10.0</token>
+
+<xml name="requirements">
+ <requirements>
+ <requirement type="package" version="@TOOL_VERSION@">r-recetox-xmsannotator</requirement>
+ </requirements>
+</xml>
+
+<xml name="creator">
+ <creator>
+ <person
+ givenName="Jiří"
+ familyName="Novotný"
+ url="https://github.com/xtracko"
+ identifier="0000-0001-5449-3523" />
+ <person
+ givenName="Martin"
+ familyName="Čech"
+ url="https://github.com/martenson"
+ identifier="0000-0002-9318-1781" />
+ <person
+ givenName="Matej"
+ familyName="Troják"
+ url="https://github.com/xtrojak"
+ identifier="0000-0003-0841-2707" />
+ <organization
+ url="https://www.recetox.muni.cz/"
+ email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"
+ name="RECETOX MUNI" />
+ </creator>
+</xml>
+
+<xml name="inputs">
+ <param name="metadata_table" type="data" format="parquet,csv">
+ <label>Metadata table</label>
+ <help><![CDATA[
+ Peak metadata table*.
+ ]]></help>
+ </param>
+ <param name="intensity_table" type="data" format="parquet,csv">
+ <label>Intensity table</label>
+ <help><![CDATA[
+ Table with intensities** for features (rows) across samples (columns).
+ ]]></help>
+ </param>
+ <param name="compound_table" type="data" format="parquet,csv">
+ <label>Compound database</label>
+ <help><![CDATA[
+ Database of compounds according to which the annotation is performed.
+ The database is required to contain the fields compound_id, monoisotopic_mass, and molecular_formula.
+ ]]></help>
+ </param>
+ <param name="adduct_table" type="data" format="parquet,csv" optional="true">
+ <label>Adduct database</label>
+ <help><![CDATA[
+ Database of adducts which is combined with the database of compounds to form molecule-adduct pairs.
+ The database is required to contain adduct, charge, mass, and n_molecules.
+ ]]></help>
+ </param>
+ <param name="adduct_weights" type="data" format="parquet,csv" optional="true">
+ <label>Adduct weights</label>
+ <help>
+ A weight-by-adduct table.
+ </help>
+ </param>
+</xml>
+
+<xml name="outputs">
+ <data name="output_file" format="parquet">
+ <change_format>
+ <when input="metadata_table.ext" value="csv" format="csv" />
+ </change_format>
+ </data>
+</xml>
+
+<xml name="tolerance">
+ <param name="mass_tolerance_ppm" type="integer" min="0" value="5">
+ <label>Mass tolerance [ppm]</label>
+ <help>Mass tolerance in ppm for database matching.</help>
+ </param>
+ <yield/>
+</xml>
+
+<token name="@HELP@">
+Description
+===========
+
+Annotate the peak intensity table (e.g. from an apLCMS run) with compounds from the compounds database
+using advanced methods.
+
+The annotation process generates all possible compound-adduct pairs and matches those pairs to the measured
+peaks. A compound-adduct pair is declared a match to a certain peak when the difference of their masses is
+within some tolerance.
+
+Then, a score and a confidence level are assigned to each match based on peak correlation
+clustering, metabolite pathway associations, adducts expectations, and isotope conformations.
+
+Input tables description
+------------------------
+
+(*) Metadata table
+~~~~~~~~~~~~~~~~~~
+
+The output from recetox-aplcms tool.
+The `npeaks` column denotes the number of peaks which have been grouped into this feature.
+The columns with the sample names indicate whether this feature is present in the sample.
+Only id, mz, and rt columns are required to be present.
+
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+| id | mz | mzmin | mzmax | rt | rtmin | rtmax | npeaks | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq |
++=======+==============+==============+===============+================+===============+===============+===========+========================+========================+========================+
+| 1 | 70.03707021 | 70.037066 | 70.0370750 | 294.1038014 | 294.0634942 | 294.149985 | 3 | 1 | 1 | 1 |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+| 2 | 70.06505677 | 70.065045 | 70.0650676 | 141.9560055 | 140.5762528 | 143.335758 | 2 | 1 | 0 | 1 |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+| 57 | 78.04643252 | 78.046429 | 78.0464325 | 294.0063397 | 293.9406777 | 294.072001 | 2 | 1 | 1 | 0 |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+
+(**) Intensity table
+~~~~~~~~~~~~~~~~~~~~
+
+The output from recetox-aplcms tool.
+This table contains the peak area for aligned features in all samples.
+
++-------+------------------------+------------------------+------------------------+
+| id | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq |
++=======+========================+========================+========================+
+| 1 | 13187487.20482895 | 7957395.699119729 | 11700594.397257797 |
++-------+------------------------+------------------------+------------------------+
+| 2 | 2075168.6398983458 | 0 | 2574362.159289044 |
++-------+------------------------+------------------------+------------------------+
+| 57 | 2934524.4406785755 | 1333044.5065971944 | 0 |
++-------+------------------------+------------------------+------------------------+
+| ... | ... | ... | ... |
++-------+------------------------+------------------------+------------------------+
+</token>
+
+<xml name="citations">
+ <citation type="doi">10.1021/acs.analchem.6b01214</citation>
+</xml>
+</macros>

diff -r 000000000000 -r cfd2e19f00a9 recetox_xmsannotator_advanced.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/recetox_xmsannotator_advanced.xml Mon Jun 26 13:55:56 2023 +0000

[

b'@@ -0,0 +1,169 @@\n+<tool id="recetox_xmsannotator_advanced" name="recetox-xMSannotator" version="@TOOL_VERSION@+galaxy0">\r\n+\r\n+ <description>annotate peak intensity table including scores and confidence levels</description>\r\n+ <macros>\r\n+ <import>macros.xml</import>\r\n+ </macros>\r\n+ <expand macro="creator"/>\r\n+ <xrefs>\r\n+ <xref type="bio.tools">recetox-xmsannotator</xref>\r\n+ </xrefs>\r\n+ <expand macro="requirements" />\r\n+ <command detect_errors="aggressive"><![CDATA[\r\n+ Rscript -e \'source("${__tool_directory__}/utils.R")\' -e "n_workers <- \\${GALAXY_SLOTS:-1}" -e "source(\'${wrapper}\')"\r\n+ ]]></command>\r\n+\r\n+ <configfiles>\r\n+ <configfile name="wrapper"><![CDATA[\r\n+ metadata_table <- load_table("$metadata_table", "$metadata_table.ext")\r\n+ intensity_table <- load_table("$intensity_table", "$intensity_table.ext")\r\n+ peak_table <- create_peak_table(metadata_table, intensity_table)\r\n+\r\n+ filter_by <- create_filter_by_adducts("$filter_by")\r\n+\r\n+ annotation <- advanced_annotation(\r\n+ peak_table = peak_table,\r\n+ adduct_table = load_table("$adduct_table", "$adduct_table.ext"),\r\n+ adduct_weights = load_table("$adduct_weights", "$adduct_weights.ext"),\r\n+ compound_table = load_table("$compound_table", "$compound_table.ext"),\r\n+ mass_tolerance = 1e-6 * ${mass_tolerance_ppm},\r\n+ time_tolerance = $time_tolerance,\r\n+ correlation_threshold = as.double($clustering.correlation_threshold),\r\n+ min_cluster_size = as.integer($clustering.min_cluster_size),\r\n+ deep_split = as.integer($clustering.deep_split),\r\n+ network_type = "$clustering.network_type",\r\n+ redundancy_filtering = $scoring.redundancy_filtering,\r\n+ n_workers = n_workers,\r\n+ intensity_deviation_tolerance = as.double($intensity_deviation_tolerance),\r\n+ mass_defect_tolerance = as.double($mass_defect_tolerance),\r\n+ mass_defect_precision = as.double($mass_defect_precision),\r\n+ peak_rt_width = as.integer($peak_rt_width),\r\n+ maximum_isotopes = 
as.integer($maximum_isotopes),\r\n+ min_ions_per_chemical = as.integer($min_ions_per_chemical),\r\n+ filter_by = filter_by\r\n+ )\r\n+\r\n+ save_table(annotation, "$output_file", "$output_file.ext")\r\n+ ]]></configfile>\r\n+ </configfiles>\r\n+\r\n+ <inputs>\r\n+ <expand macro="inputs"/>\r\n+ <expand macro="tolerance">\r\n+ <param name="time_tolerance" type="float" value="10" min="0">\r\n+ <label>Retention time tolerance [s]</label>\r\n+ <help>\r\n+ Retention time tolerance in seconds for finding peaks derived from the same parent compound.\r\n+ </help>\r\n+ </param>\r\n+ </expand>\r\n+ <section name="clustering" title="Clustering">\r\n+ <param name="correlation_threshold" type="float" value="0.7">\r\n+ <label>Correlation threshold</label>\r\n+ <help>Correlation threshold between peaks to qualify as adducts/isotopes of the same metabolite.</help>\r\n+ </param>\r\n+ <param name="min_cluster_size" type="integer" value="10" min="1">\r\n+ <label>Minimum cluster size</label>\r\n+ <help>The minimum number of nodes to be considered as a cluster.</help>\r\n+ </param>\r\n+ <param name="deep_split" type="integer" value="2" min="0" max="4">\r\n+ <label>Deep split</label>\r\n+ <help>\r\n+ Deep split provides a rough control over sensitivity to cluster splitting. The higher the value,\r\n+ the more and smaller clusters will be produced (see WGCNA package documentation).\r\n+ </help>\r\n+ </param'..b'rict boosting</label>\r\n+ <help>\r\n+ Boost the scores of metabolites that not only belongs to the same pathway but also to the same\r\n+ cluster. 
Otherwise, do not account for cluster membership.\r\n+ </help>\r\n+ </param>\r\n+ <param name="min_isp" type="integer" min="0" value="1">\r\n+ <label>Minimum number of expected isotopes</label>\r\n+ <help>\r\n+ Minimum number of adducts/isotopes to be present for a match to be considered as a high confidence match.\r\n+ </help>\r\n+ </param>\r\n+ <param name="max_isp" type="integer" min="0" value="5">\r\n+ <label>Maximum number of expected isotopes</label>\r\n+ <help>\r\n+ Maximum number of adducts/isotopes to be present for a match to be considered as a high confidence match.\r\n+ </help>\r\n+ </param>\r\n+ <param name="redundancy_filtering" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE">\r\n+ <label>Redundancy filtering</label>\r\n+ <help>Whether to filter out low-scored multiple matcher or not.</help>\r\n+ </param>\r\n+ </section>\r\n+ <param name="intensity_deviation_tolerance" type="float" value="0.1">\r\n+ <label>Tolerance of intensity deviation</label>\r\n+ <help>A numeric threshold by which an intensity ratio of two isotopic peaks may differ from their actual abundance ratio.</help>\r\n+ </param>\r\n+ <param name="mass_defect_tolerance" type="float" value="0.1">\r\n+ <label>Tolerance of mass defect</label>\r\n+ <help>Maximum difference in mass defect between two peaks of the same compound.</help>\r\n+ </param>\r\n+ <param name="mass_defect_precision" type="float" value="0.01">\r\n+ <label>Precision for computing mass defect</label>\r\n+ </param>\r\n+ <param name="peak_rt_width" type="integer" value="1">\r\n+ <label>Estimated chromatographic peak width</label>\r\n+ </param>\r\n+ <param name="maximum_isotopes" type="integer" value="10">\r\n+ <label>Maximum isotopes</label>\r\n+ </param>\r\n+ <param name="min_ions_per_chemical" type="integer" value="2">\r\n+ <label>Minimum ions per chemical</label>\r\n+ </param>\r\n+ <param name="filter_by" type="select" label="Adducts to filter by" multiple="true" optional="true">\r\n+ <option value="M-H" 
selected="true">M-H</option>\r\n+ <option value="M+H" selected="true">M+H</option>\r\n+ <option value="2M-H">2M-H</option>\r\n+ <option value="M-2H">M-2H</option>\r\n+ </param>\r\n+ </inputs>\r\n+\r\n+ <outputs>\r\n+ <expand macro="outputs"/>\r\n+ </outputs>\r\n+\r\n+ <tests>\r\n+ <test>\r\n+ <param name="metadata_table" value="metadata_table.parquet" ftype="parquet" />\r\n+ <param name="intensity_table" value="intensity_table.parquet" ftype="parquet" />\r\n+ <param name="compound_table" value="database.parquet" ftype="parquet" />\r\n+ <param name="adduct_table" value="adduct_table.parquet" ftype="parquet" />\r\n+ <output name="output_file" file="expected_output.parquet" ftype="parquet"/>\r\n+ </test>\r\n+ <test>\r\n+ <param name="metadata_table" value="metadata_table.csv" ftype="csv" />\r\n+ <param name="intensity_table" value="intensity_table.csv" ftype="csv" />\r\n+ <param name="compound_table" value="database.csv" ftype="csv" />\r\n+ <param name="adduct_table" value="adduct_table.csv" ftype="csv" />\r\n+ <output name="output_file" file="expected_output.csv" ftype="csv"/>\r\n+ </test>\r\n+ </tests>\r\n+\r\n+ <help>\r\n+ <![CDATA[\r\n+ @HELP@\r\n+ ]]>\r\n+ </help>\r\n+\r\n+ <citations>\r\n+ <expand macro="citations"/>\r\n+ </citations>\r\n+</tool>\r\n'

diff -r 000000000000 -r cfd2e19f00a9 utils.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.R Mon Jun 26 13:55:56 2023 +0000

[

@@ -0,0 +1,37 @@
+library(recetox.xmsannotator)
+library(dplyr)
+
+# Read a table from disk and return it as a plain data.frame.
+#
+# filename: path of the file to read. The literal string "None" means that no
+#           table was supplied (presumably how the Galaxy wrapper renders an
+#           unset optional dataset -- confirm against the tool XML).
+# filetype: "csv" selects read.csv(); any other value is read as parquet.
+#
+# Returns NULL for the "None" sentinel, otherwise a data.frame.
+load_table <- function(filename, filetype) {
+    if (filename == "None") {
+        return(NULL)
+    }
+    loaded <- if (filetype == "csv") {
+        read.csv(filename)
+    } else {
+        arrow::read_parquet(filename)
+    }
+    # Both readers are normalised to a base data.frame.
+    as.data.frame(loaded)
+}
+
+# Write a table to disk in the requested format.
+#
+# table:    the data to write.
+# filename: destination path.
+# filetype: "csv" writes with write.csv() and no row names; any other value
+#           writes a parquet file via arrow::write_parquet().
+save_table <- function(table, filename, filetype) {
+    if (filetype != "csv") {
+        arrow::write_parquet(table, filename)
+    } else {
+        write.csv(table, filename, row.names = FALSE)
+    }
+}
+
+# Turn a comma-separated list of adduct names into a character vector.
+#
+# comma_separated_values: a single string such as "M-H,M+H". The literal
+#                         string "None" means that nothing was selected.
+#
+# Returns NA for the "None" sentinel (presumably meaning "no adduct filter";
+# confirm against recetox.xmsannotator), otherwise a character vector with
+# one entry per adduct.
+create_filter_by_adducts <- function(comma_separated_values) {
+    if (comma_separated_values == "None") {
+        return(NA)
+    }
+    adducts <- strsplit(comma_separated_values, ",", fixed = TRUE)[[1]]
+    # Trim each entry rather than the whole string, so that whitespace around
+    # the separators ("M-H, M+H") does not leak into the adduct names.
+    trimws(adducts)
+}
+
+# Combine the peak metadata and the intensity table into one peak table.
+#
+# metadata_table:  data frame that must contain the columns id, mz and rt;
+#                  all other columns are dropped.
+# intensity_table: data frame with an id column plus the intensity columns.
+#
+# Rows are matched on id with an inner join, so ids missing from either table
+# are dropped. The id column is renamed to peak and coerced to integer.
+create_peak_table <- function(metadata_table, intensity_table) {
+    peak_table <- metadata_table %>%
+        select(id, mz, rt) %>%
+        inner_join(intensity_table, by = "id") %>%
+        rename(peak = id)
+    peak_table$peak <- as.integer(peak_table$peak)
+    peak_table
+}