Mercurial > repos > recetox > recetox_xmsannotator_advanced
changeset 0:cfd2e19f00a9 draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox-xmsannotator commit 1ab1a1dabfcebe11720de1411927a7438c1b64c1
author | recetox |
---|---|
date | Mon, 26 Jun 2023 13:55:56 +0000 |
parents | |
children | 2c6fa447f6a0 |
files | macros.xml recetox_xmsannotator_advanced.xml utils.R |
diffstat | 3 files changed, 350 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Jun 26 13:55:56 2023 +0000 @@ -0,0 +1,144 @@ +<macros> +<token name="@TOOL_VERSION@">0.10.0</token> + +<xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">r-recetox-xmsannotator</requirement> + </requirements> +</xml> + +<xml name="creator"> + <creator> + <person + givenName="Jiří" + familyName="Novotný" + url="https://github.com/xtracko" + identifier="0000-0001-5449-3523" /> + <person + givenName="Martin" + familyName="Čech" + url="https://github.com/martenson" + identifier="0000-0002-9318-1781" /> + <person + givenName="Matej" + familyName="Troják" + url="https://github.com/xtrojak" + identifier="0000-0003-0841-2707" /> + <organization + url="https://www.recetox.muni.cz/" + email="GalaxyToolsDevelopmentandDeployment@space.muni.cz" + name="RECETOX MUNI" /> + </creator> +</xml> + +<xml name="inputs"> + <param name="metadata_table" type="data" format="parquet,csv"> + <label>Metadata table</label> + <help><![CDATA[ + Peak metadata table*. + ]]></help> + </param> + <param name="intensity_table" type="data" format="parquet,csv"> + <label>Intensity table</label> + <help><![CDATA[ + Table with intensities** for features (rows) across samples (columns). + ]]></help> + </param> + <param name="compound_table" type="data" format="parquet,csv"> + <label>Compound database</label> + <help><![CDATA[ + Database of compounds according to which the annotation is performed. + The database is required to contain the fields <em>compound_id</em>, <em>monoisotopic_mass</em>, and <em>molecular_formula</em>. + ]]></help> + </param> + <param name="adduct_table" type="data" format="parquet,csv" optional="true"> + <label>Adduct database</label> + <help><![CDATA[ + Database of adducts, which is combined with the compound database to form molecule-adduct pairs. + The database is required to contain <em>adduct</em>, <em>charge</em>, <em>mass</em>, and <em>n_molecules</em>. 
+ ]]></help> + </param> + <param name="adduct_weights" type="data" format="parquet,csv" optional="true"> + <label>Adduct weights</label> + <help> + A weight-by-adduct table. + </help> + </param> +</xml> + +<xml name="outputs"> + <data name="output_file" format="parquet"> + <change_format> + <when input="metadata_table.ext" value="csv" format="csv" /> + </change_format> + </data> +</xml> + +<xml name="tolerance"> + <param name="mass_tolerance_ppm" type="integer" min="0" value="5"> + <label>Mass tolerance [ppm]</label> + <help>Mass tolerance in ppm for database matching.</help> + </param> + <yield/> +</xml> + +<token name="@HELP@"> +Description +=========== + +Annotate the peak intensity table (e.g. from an apLCMS run) with compounds from the compounds database +using advanced methods. + +The annotation process generates all possible compound-adduct pairs and matches those pairs to the measured +peaks. A compound-adduct pair is considered a match to a certain peak when the difference of their masses is +within some tolerance. + +Then, a score and a confidence level are assigned to each match based on peak correlation +clustering, metabolite pathway associations, adduct expectations, and isotope conformations. + +Input tables description +------------------------ + +(*) Metadata table +~~~~~~~~~~~~~~~~~~ + +The output from recetox-aplcms tool. +The `npeaks` column denotes the number of peaks which have been grouped into this feature. +The columns with the sample names indicate whether this feature is present in the sample. +Only id, mz, and rt columns are required to be present. 
+ ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ +| id | mz | mzmin | mzmax | rt | rtmin | rtmax | npeaks | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq | ++=======+==============+==============+===============+================+===============+===============+===========+========================+========================+========================+ +| 1 | 70.03707021 | 70.037066 | 70.0370750 | 294.1038014 | 294.0634942 | 294.149985 | 3 | 1 | 1 | 1 | ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ +| 2 | 70.06505677 | 70.065045 | 70.0650676 | 141.9560055 | 140.5762528 | 143.335758 | 2 | 1 | 0 | 1 | ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ +| 57 | 78.04643252 | 78.046429 | 78.0464325 | 294.0063397 | 293.9406777 | 294.072001 | 2 | 1 | 1 | 0 | ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ +| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ + +(**) Intensity table +~~~~~~~~~~~~~~~~~~~~ + +The output from recetox-aplcms tool. +This table contains the peak area for aligned features in all samples. 
+ ++-------+------------------------+------------------------+------------------------+ +| id | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq | ++=======+========================+========================+========================+ +| 1 | 13187487.20482895 | 7957395.699119729 | 11700594.397257797 | ++-------+------------------------+------------------------+------------------------+ +| 2 | 2075168.6398983458 | 0 | 2574362.159289044 | ++-------+------------------------+------------------------+------------------------+ +| 57 | 2934524.4406785755 | 1333044.5065971944 | 0 | ++-------+------------------------+------------------------+------------------------+ +| ... | ... | ... | ... | ++-------+------------------------+------------------------+------------------------+ +</token> + +<xml name="citations"> + <citation type="doi">10.1021/acs.analchem.6b01214</citation> +</xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/recetox_xmsannotator_advanced.xml Mon Jun 26 13:55:56 2023 +0000 @@ -0,0 +1,169 @@ +<tool id="recetox_xmsannotator_advanced" name="recetox-xMSannotator" version="@TOOL_VERSION@+galaxy0"> + + <description>annotate peak intensity table including scores and confidence levels</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="creator"/> + <xrefs> + <xref type="bio.tools">recetox-xmsannotator</xref> + </xrefs> + <expand macro="requirements" /> + <command detect_errors="aggressive"><![CDATA[ + Rscript -e 'source("${__tool_directory__}/utils.R")' -e "n_workers <- \${GALAXY_SLOTS:-1}" -e "source('${wrapper}')" + ]]></command> + + <configfiles> + <configfile name="wrapper"><![CDATA[ + metadata_table <- load_table("$metadata_table", "$metadata_table.ext") + intensity_table <- load_table("$intensity_table", "$intensity_table.ext") + peak_table <- create_peak_table(metadata_table, intensity_table) + + filter_by <- create_filter_by_adducts("$filter_by") + + annotation <- advanced_annotation( + peak_table = peak_table, + adduct_table = load_table("$adduct_table", "$adduct_table.ext"), + adduct_weights = load_table("$adduct_weights", "$adduct_weights.ext"), + compound_table = load_table("$compound_table", "$compound_table.ext"), + mass_tolerance = 1e-6 * ${mass_tolerance_ppm}, + time_tolerance = $time_tolerance, + correlation_threshold = as.double($clustering.correlation_threshold), + min_cluster_size = as.integer($clustering.min_cluster_size), + deep_split = as.integer($clustering.deep_split), + network_type = "$clustering.network_type", + redundancy_filtering = $scoring.redundancy_filtering, + n_workers = n_workers, + intensity_deviation_tolerance = as.double($intensity_deviation_tolerance), + mass_defect_tolerance = as.double($mass_defect_tolerance), + mass_defect_precision = as.double($mass_defect_precision), + peak_rt_width = as.integer($peak_rt_width), + maximum_isotopes = 
as.integer($maximum_isotopes), + min_ions_per_chemical = as.integer($min_ions_per_chemical), + filter_by = filter_by + ) + + save_table(annotation, "$output_file", "$output_file.ext") + ]]></configfile> + </configfiles> + + <inputs> + <expand macro="inputs"/> + <expand macro="tolerance"> + <param name="time_tolerance" type="float" value="10" min="0"> + <label>Retention time tolerance [s]</label> + <help> + Retention time tolerance in seconds for finding peaks derived from the same parent compound. + </help> + </param> + </expand> + <section name="clustering" title="Clustering"> + <param name="correlation_threshold" type="float" value="0.7"> + <label>Correlation threshold</label> + <help>Correlation threshold between peaks to qualify as adducts/isotopes of the same metabolite.</help> + </param> + <param name="min_cluster_size" type="integer" value="10" min="1"> + <label>Minimum cluster size</label> + <help>The minimum number of nodes to be considered as a cluster.</help> + </param> + <param name="deep_split" type="integer" value="2" min="0" max="4"> + <label>Deep split</label> + <help> + Deep split provides a rough control over sensitivity to cluster splitting. The higher the value, + the more and smaller clusters will be produced (see WGCNA package documentation). + </help> + </param> + <param name="network_type" type="select" display="radio"> + <label>Network type</label> + <help> + Network type parameter affects how the network's adjacency matrix is created from the correlation + matrix (see WGCNA package documentation). + </help> + <option value="signed">Signed</option> + <option value="unsigned" selected="true">Unsigned</option> + </param> + </section> + <section name="scoring" title="Scoring" expanded="true"> + <param name="strict_boosting" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE"> + <label>Strict boosting</label> + <help> + Boost the scores of metabolites that belong not only to the same pathway but also to the same + cluster. 
Otherwise, do not account for cluster membership. + </help> + </param> + <param name="min_isp" type="integer" min="0" value="1"> + <label>Minimum number of expected isotopes</label> + <help> + Minimum number of adducts/isotopes to be present for a match to be considered a high-confidence match. + </help> + </param> + <param name="max_isp" type="integer" min="0" value="5"> + <label>Maximum number of expected isotopes</label> + <help> + Maximum number of adducts/isotopes to be present for a match to be considered a high-confidence match. + </help> + </param> + <param name="redundancy_filtering" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE"> + <label>Redundancy filtering</label> + <help>Whether to filter out low-scoring multiple matches or not.</help> + </param> + </section> + <param name="intensity_deviation_tolerance" type="float" value="0.1"> + <label>Tolerance of intensity deviation</label> + <help>A numeric threshold by which an intensity ratio of two isotopic peaks may differ from their actual abundance ratio.</help> + </param> + <param name="mass_defect_tolerance" type="float" value="0.1"> + <label>Tolerance of mass defect</label> + <help>Maximum difference in mass defect between two peaks of the same compound.</help> + </param> + <param name="mass_defect_precision" type="float" value="0.01"> + <label>Precision for computing mass defect</label> + </param> + <param name="peak_rt_width" type="integer" value="1"> + <label>Estimated chromatographic peak width</label> + </param> + <param name="maximum_isotopes" type="integer" value="10"> + <label>Maximum isotopes</label> + </param> + <param name="min_ions_per_chemical" type="integer" value="2"> + <label>Minimum ions per chemical</label> + </param> + <param name="filter_by" type="select" label="Adducts to filter by" multiple="true" optional="true"> + <option value="M-H" selected="true">M-H</option> + <option value="M+H" selected="true">M+H</option> + <option value="2M-H">2M-H</option> + 
<option value="M-2H">M-2H</option> + </param> + </inputs> + + <outputs> + <expand macro="outputs"/> + </outputs> + + <tests> + <test> + <param name="metadata_table" value="metadata_table.parquet" ftype="parquet" /> + <param name="intensity_table" value="intensity_table.parquet" ftype="parquet" /> + <param name="compound_table" value="database.parquet" ftype="parquet" /> + <param name="adduct_table" value="adduct_table.parquet" ftype="parquet" /> + <output name="output_file" file="expected_output.parquet" ftype="parquet"/> + </test> + <test> + <param name="metadata_table" value="metadata_table.csv" ftype="csv" /> + <param name="intensity_table" value="intensity_table.csv" ftype="csv" /> + <param name="compound_table" value="database.csv" ftype="csv" /> + <param name="adduct_table" value="adduct_table.csv" ftype="csv" /> + <output name="output_file" file="expected_output.csv" ftype="csv"/> + </test> + </tests> + + <help> + <![CDATA[ + @HELP@ + ]]> + </help> + + <citations> + <expand macro="citations"/> + </citations> +</tool>
# utils.R -- helper functions for the recetox-xMSannotator Galaxy wrapper
# (new file in this changeset).
library(recetox.xmsannotator)
library(dplyr)

#' Load a table from disk.
#'
#' @param filename Path to the input file, or the literal string "None"
#'   (presumably what the Galaxy template renders for an unset optional
#'   dataset -- confirm against the tool wrapper).
#' @param filetype "csv" selects `read.csv()`; any other value is read as
#'   Parquet via `arrow::read_parquet()`.
#' @return A data.frame, or `NULL` when `filename` is "None".
load_table <- function(filename, filetype) {
  if (filename == "None") {
    return(NULL)
  }
  if (filetype == "csv") {
    loaded <- read.csv(filename)
  } else {
    loaded <- arrow::read_parquet(filename)
  }
  as.data.frame(loaded)
}

#' Write a table to disk.
#'
#' @param table The table to write.
#' @param filename Destination path.
#' @param filetype "csv" writes with `write.csv()` (no row names); any other
#'   value writes Parquet via `arrow::write_parquet()`.
save_table <- function(table, filename, filetype) {
  if (filetype == "csv") {
    write.csv(table, filename, row.names = FALSE)
  } else {
    arrow::write_parquet(table, filename)
  }
}

#' Parse a comma-separated list of adduct names.
#'
#' Each entry is trimmed individually, so "M-H, M+H" yields
#' `c("M-H", "M+H")` rather than leaving a padded " M+H" entry behind, and
#' empty entries (e.g. from "M-H,,M+H") are dropped.
#'
#' @param comma_separated_values A single string such as "M-H,M+H", or the
#'   literal string "None".
#' @return `NA` when the input is "None", otherwise a character vector of
#'   the non-empty, whitespace-trimmed entries.
create_filter_by_adducts <- function(comma_separated_values) {
  if (comma_separated_values == "None") {
    return(NA)
  }
  entries <- strsplit(comma_separated_values, ",", fixed = TRUE)[[1]]
  entries <- trimws(entries)
  entries[nzchar(entries)]
}

#' Build the peak table from the metadata and intensity tables.
#'
#' Keeps only the `id`, `mz` and `rt` columns of the metadata table, inner
#' joins the intensity table on `id`, renames `id` to `peak` and coerces it
#' to integer.
#'
#' The dplyr verbs are namespace-qualified so that the result does not depend
#' on which attached package happens to provide `select()` or `rename()`.
#'
#' @param metadata_table Data frame with at least `id`, `mz` and `rt`.
#' @param intensity_table Data frame with an `id` column; its remaining
#'   columns are carried over by the join.
#' @return The joined table with an integer `peak` column in place of `id`.
create_peak_table <- function(metadata_table, intensity_table) {
  peak_metadata <- dplyr::select(metadata_table, id, mz, rt)
  peak_table <- dplyr::inner_join(peak_metadata, intensity_table, by = "id")
  peak_table <- dplyr::rename(peak_table, peak = id)
  peak_table$peak <- as.integer(peak_table$peak)
  peak_table
}