# HG changeset patch
# User recetox
# Date 1676284115 0
# Node ID 472dc85ce7c57558f0a0671f00c104407ac1f46b
# Parent f9fb9d8fb710aec147de8173bd28ad538d8ff8e2
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit 506df2aef355b3791567283e1a175914f06b405a
diff -r f9fb9d8fb710 -r 472dc85ce7c5 help.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/help.xml Mon Feb 13 10:28:35 2023 +0000
@@ -0,0 +1,255 @@
+
+
+
+General Information
+===================
+
+Overview
+--------
+
+recetox-aplcms is a software package for peak detection in high resolution mass spectrometry (HRMS) data.
+It supports reading .mzml files in raw profile mode and uses a bi-Gaussian chromatographic peak shape for feature detection and quantification.
+
+recetox-aplcms is based on the apLCMS package developed by Tianwei Yu at Emory University - see the citations and the apLCMS section beneath.
+This version includes various software updates and is actively developed and maintained on `GitHub`_.
+Please submit eventual bug reports as `issues`_ on the repository.
+
+.. _GitHub: https://github.com/RECETOX/recetox-aplcms
+.. _issues: https://github.com/RECETOX/recetox-aplcms/issues/new
+
+
+Workflow
+--------
+
+.. image:: https://raw.githubusercontent.com/RECETOX/galaxytools/aee0dd6cf6c05936269efe4337c50e27cc68e86b/tools/recetox_aplcms/images/scheme.png
+ :width: 2560
+ :height: 788
+ :scale: 40
+ :alt: A picture of a workflow diagram.
+
+The individual steps of the recetox-aplcms package can be combined in 2 separate workflows processing HRMS data in an unsupervised manner or by including a-priori knowledge.
+The workflows consist of the following building blocks:
+
+(1) remove noise - denoise the raw data and extract the EIC
+(2) generate feature table - group features in EIC into peaks using peak-shape model
+(3) compute clusters - compute mz and rt clusters across samples
+(4) compute template - find the template for rt correction
+(5) correct time - correct the rt across samples using splines
+(6) align features - align identical features across samples
+(7) recover weaker signals - recover missed features in samples based on the aligned features
+(8) merge known table - add known features to detected features table and vice versa
+
+For detailed documentation on the individual steps please see the individual tool wrappers.
+
+
+apLCMS (Original Reference)
+---------------------------
+
+apLCMS is a software which generates a feature table from a batch of LC/MS spectra. The m/z and retention time
+tolerance levels are estimated from the data. A run-filter is used to detect peaks and remove noise.
+Non-parametric statistical methods are used to find-tune peak selection and grouping. After retention time
+correction, a feature table is generated by aligning peaks across spectra. For further information on apLCMS
+please refer to https://mypage.cuhk.edu.cn/academics/yutianwei/apLCMS/.
+
+
+
+recetox-aplcms - remove noise
+=============================
+
+This tool is the first step of recetox-aplcms.
+It removes noise from the raw data and performs a first clustering step of points with close m/z values into the extracted ion chromatograms (EICs).
+Only peaks with a minimum elution length of `min_run` seconds are kept.
+
+Example Output
+--------------
+The raw data points contained in the scans of the `mzml` file are filtered for noise and grouped into clusters based on m/z values.
+See an example output in the table below. The `group_number` column indicates the cluster index.
+
++----------------------+-------------------+-----------------------+--------------------+
+| mz | rt | intensity | group_number |
++======================+===================+=======================+====================+
+| 70.01060119055192 | 350.58654 | 21178.330810546875 | 5 |
++----------------------+-------------------+-----------------------+--------------------+
+| 70.02334120404554 | 130.175262 | 287869.5478515625 | 10 |
++----------------------+-------------------+-----------------------+--------------------+
+| 70.0287408273165 | 134.801352 | 60883.15185546875 | 11 |
++----------------------+-------------------+-----------------------+--------------------+
+| 70.02872416715464 | 183.991896 | 9201.574584960938 | 11 |
++----------------------+-------------------+-----------------------+--------------------+
+| ... | ... | ... | ... |
++----------------------+-------------------+-----------------------+--------------------+
+
+
+
+recetox-aplcms - generate feature table
+=======================================
+The second step in the recetox-aplcms workflow performing peak shape parameter estimation.
+
+This tool takes the grouped features created with `recetox-aplcms-remove-noise` and computes the peak shape in `rt` domain and integrates the peak area.
+
+
+Example Output
+--------------
+The output contains the `mz` and `rt` of the peaks as well as the standard deviation in both direction of the peak for the bi-gaussian peak shape.
+
++----------------------+-------------------+-----------------+-------------------+----------------------+
+| mz | rt | sd1 | sd2 | area |
++======================+===================+=================+===================+======================+
+| 70.02317542938793 | 142.36033 | 11.436659559 | 14.592754933 | 4159269.24595184 |
++----------------------+-------------------+-----------------+-------------------+----------------------+
+| 70.02869594233522 | 205.48765 | 0.263230763 | 0.285101428707 | 8849767.11861127 |
++----------------------+-------------------+-----------------+-------------------+----------------------+
+| 78.04643252598305 | 294.01713 | 0.51677558617 | 1.317028944141 | 1333044.50659719 |
++----------------------+-------------------+-----------------+-------------------+----------------------+
+| ... | ... | ... | ... | ... |
++----------------------+-------------------+-----------------+-------------------+----------------------+
+
+
+
+recetox-aplcms - compute clusters
+=================================
+
+Group features with `mz` and `rt` using tolerances within the tolerance into clusters, creating larger features from raw data points.
+Custom tolerances for `mz` and `rt` are computed based on the given parameters.
+The tool takes a collection of all detected features and computes the clusters over a global feature table, adding the `sample_id` and `cluster` columns to the table.
+
+Example Output
+--------------
+
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| mz | rt | sd1 | sd2 | area | sample_id | cluster |
++======================+===================+=================+===================+======================+=====================+===============+
+| 70.02317542938793 | 142.36033 | 11.436659559 | 14.592754933 | 4159269.245951841 | 21_qc_no_dil_milliq | 7 |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 70.02869594233522 | 205.48765 | 0.263230763 | 0.285101428707 | 8849767.11861127 | 21_qc_no_dil_milliq | 9 |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 78.04643252598305 | 294.01713 | 0.51677558617 | 1.317028944141 | 1333044.506597194 | 21_qc_no_dil_milliq | 13 |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| ... | ... | ... | ... | ... | ... | ... |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+
+
+
+recetox-aplcms - correct time
+=============================
+
+Apply spline-based retention time correction to a feature table given the template table and the computed `mz` and `rt` tolerances.
+
+Example Output
+--------------
+The output has the same format as `compute clusters` but the retention time values are corrected based on the template table.
+
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| mz | rt | sd1 | sd2 | area | sample_id | cluster |
++======================+===================+=================+===================+======================+=====================+===============+
+| 70.02317542938793 | 142.36033 | 11.436659559 | 14.592754933 | 4159269.245951841 | 21_qc_no_dil_milliq | 7 |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 70.02869594233522 | 205.48765 | 0.263230763 | 0.285101428707 | 8849767.11861127 | 21_qc_no_dil_milliq | 9 |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 78.04643252598305 | 294.01713 | 0.51677558617 | 1.317028944141 | 1333044.506597194 | 21_qc_no_dil_milliq | 13 |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| ... | ... | ... | ... | ... | ... | ... |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+
+
+recetox-aplcms - compute template
+=================================
+Compute the template from a set of feature tables, choosing the one with the most features as the template.
+
+
+
+recetox-aplcms - recover weaker signals
+=======================================
+Second stage peak detection based on the aligned feature table from the `feature alignment` step.
+If a feature is contained in the aligned feature table, this step revisits the raw data and searches
+for this feature at the retention time obtained by mapping the corrected retention time back to the original sample.
+
+This recovers features which are present in a sample but might have been filtered out initially as noise due to low signal intensity.
+
+Example Output
+--------------
+The table has the same format as the `compute clusters` output but might contain additional features which have been extracted based
+on their presence in the aligned feature table.
+
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| mz | rt | sd1 | sd2 | area | sample_id | cluster |
++======================+===================+=================+===================+======================+=====================+===============+
+| 70.02317542938793 | 142.36033 | 11.436659559 | 14.592754933 | 4159269.245951841 | 21_qc_no_dil_milliq | 7 |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 70.02869594233522 | 205.48765 | 0.263230763 | 0.285101428707 | 8849767.11861127 | 21_qc_no_dil_milliq | 9 |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 78.04643252598305 | 294.01713 | 0.51677558617 | 1.317028944141 | 1333044.506597194 | 21_qc_no_dil_milliq | 13 |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| ... | ... | ... | ... | ... | ... | ... |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+
+
+
+recetox-aplcms - align features
+===============================
+This step performs feature alignment after clustering and retention time correction.
+The peaks clustered across samples are grouped based on the given tolerances to create an aligned feature table, connecting identical features across samples.
+The parameter controls in how many samples a feature has to be detected at least in order to be included in the aligned feature table.
+
+Example Output
+--------------
+The tool outputs 3 tables: the peak related `metadata`, the `retention times` and the `intensities` for all features across all samples.
+
+Metadata Table
+~~~~~~~~~~~~~~
+The `npeaks` column denotes the number of peaks which have been grouped into this feature. The columns with the sample names indicate whether this feature is present in the sample.
+
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+| id | mz | mzmin | mzmax | rt | rtmin | rtmax | npeaks | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq |
++=======+==============+==============+===============+================+===============+===============+===========+========================+========================+========================+
+| 1 | 70.03707021 | 70.037066 | 70.0370750 | 294.1038014 | 294.0634942 | 294.149985 | 3 | 1 | 1 | 1 |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+| 2 | 70.06505677 | 70.065045 | 70.0650676 | 141.9560055 | 140.5762528 | 143.335758 | 2 | 1 | 0 | 1 |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+| 57 | 78.04643252 | 78.046429 | 78.0464325 | 294.0063397 | 293.9406777 | 294.072001 | 2 | 1 | 1 | 0 |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+
+Intensity Table
+~~~~~~~~~~~~~~~
+This table contains the peak area for aligned features in all samples.
+
++-------+------------------------+------------------------+------------------------+
+| id | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq |
++=======+========================+========================+========================+
+| 1 | 13187487.20482895 | 7957395.699119729 | 11700594.397257797 |
++-------+------------------------+------------------------+------------------------+
+| 2 | 2075168.6398983458 | 0 | 2574362.159289044 |
++-------+------------------------+------------------------+------------------------+
+| 57 | 2934524.4406785755 | 1333044.5065971944 | 0 |
++-------+------------------------+------------------------+------------------------+
+| ... | ... | ... | ... |
++-------+------------------------+------------------------+------------------------+
+
+Retention Time Table
+~~~~~~~~~~~~~~~~~~~~
+This table contains the retention times for all aligned features in all samples.
+
++-------+------------------------+------------------------+------------------------+
+| id | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq |
++=======+========================+========================+========================+
+| 1 | 294.09792478513236 | 294.1499853056912 | 294.0634942428341 |
++-------+------------------------+------------------------+------------------------+
+| 2 | 140.57625284242982 | 0 | 143.33575827589172 |
++-------+------------------------+------------------------+------------------------+
+| 57 | 294.07200187644435 | 293.9406777222317 | 0 |
++-------+------------------------+------------------------+------------------------+
+| ... | ... | ... | ... |
++-------+------------------------+------------------------+------------------------+
+
+
+
+recetox-aplcms - merge known table
+==================================
+
+This tool allows merging the detected features back into the table of known features and vice versa.
+It is used in the hybrid version of recetox-aplcms to augment the aligned feature table with the suspect peaks
+and to augment this table with successfully detected features.
+
+
diff -r f9fb9d8fb710 -r 472dc85ce7c5 images/scheme.png
Binary file images/scheme.png has changed
diff -r f9fb9d8fb710 -r 472dc85ce7c5 macros.xml
--- a/macros.xml Thu Jun 16 10:27:28 2022 +0000
+++ b/macros.xml Mon Feb 13 10:28:35 2023 +0000
@@ -1,11 +1,9 @@
- 0.9.4
+ 0.10.1
- r-base
- r-arrow
r-recetox-aplcms
- r-dplyr
+ pymzml
@@ -31,6 +29,11 @@
familyName="Novotný"
url="https://github.com/xtracko"
identifier="0000-0001-5449-3523" />
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
@@ -197,51 +147,8 @@
10.1186/1471-2105-11-559
10.1021/pr301053d
10.1093/bioinformatics/btu430
+ 10.1038/s41598-020-70850-0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- apLCMS is a software which generates a feature table from a batch of LC/MS spectra. The m/z and retention time
- tolerance levels are estimated from the data. A run-filter is used to detect peaks and remove noise.
- Non-parametric statistical methods are used to find-tune peak selection and grouping. After retention time
- correction, a feature table is generated by aligning peaks across spectra. For further information on apLCMS
- please refer to https://mypage.cuhk.edu.cn/academics/yutianwei/apLCMS/.
-
diff -r f9fb9d8fb710 -r 472dc85ce7c5 macros_split.xml
--- a/macros_split.xml Thu Jun 16 10:27:28 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,23 +0,0 @@
-
-
-
-
-
\ No newline at end of file
diff -r f9fb9d8fb710 -r 472dc85ce7c5 main.R
--- a/main.R Thu Jun 16 10:27:28 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,123 +0,0 @@
-library(recetox.aplcms)
-library(dplyr)
-
-save_extracted_features <- function(df, filename) {
- df <- as.data.frame(df)
- columns <- c("mz", "pos", "sd1", "sd2", "area")
- arrow::write_parquet(df[columns], filename)
-}
-
-save_aligned_feature_table <- function(df, filename) {
- columns <- c("feature", "mz", "rt", "sample", "sample_rt", "sample_intensity")
- arrow::write_parquet(df[columns], filename)
-}
-
-save_recovered_feature_table <- function(df, filename, out_format) {
- columns <- c("feature", "mz", "rt", "sample", "sample_rt", "sample_intensity")
- if (out_format == "recetox") {
- peak_table <- df[columns]
- recetox_peak_table <- rcx_aplcms_to_rcx_xmsannotator(peak_table)
- arrow::write_parquet(recetox_peak_table, filename)
- } else {
- arrow::write_parquet(df[columns], filename)
- }
-}
-
-rcx_aplcms_to_rcx_xmsannotator <- function(peak_table) {
- col_base <- c("feature", "mz", "rt")
- output_table <- peak_table %>% distinct(across(any_of(col_base)))
-
- for (level in levels(factor(peak_table$sample))) {
- subdata <- peak_table %>%
- filter(sample == level) %>%
- select(any_of(c(col_base, "sample_intensity"))) %>%
- rename(!!level := "sample_intensity")
- output_table <- inner_join(output_table, subdata, by = col_base)
- }
- output_table <- output_table %>% rename(peak = feature)
- return(output_table)
-}
-
-known_table_columns <- function() {
- c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type",
- "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max",
- "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)",
- "int_min(log)", "int_max(log)")
-}
-
-save_known_table <- function(df, filename) {
- columns <- known_table_columns()
- arrow::write_parquet(df[columns], filename)
-}
-
-read_known_table <- function(filename) {
- arrow::read_parquet(filename, col_select = known_table_columns())
-}
-
-save_pairing <- function(df, filename) {
- write.table(df, filename, row.names = FALSE, col.names = c("new", "old"))
-}
-
-save_all_extracted_features <- function(dfs, filenames) {
- filenames <- tools::file_path_sans_ext(basename(filenames))
- filenames <- paste0(filenames, ".parquet")
- filenames <- file.path("extracted", filenames)
- dir.create("extracted")
- mapply(save_extracted_features, dfs, filenames)
-}
-
-save_all_corrected_features <- function(dfs, filenames) {
- filenames <- tools::file_path_sans_ext(basename(filenames))
- filenames <- paste0(filenames, ".parquet")
- filenames <- file.path("corrected", filenames)
- dir.create("corrected")
- mapply(save_extracted_features, dfs, filenames)
-}
-
-unsupervised_main <- function(sample_files, aligned_file, recovered_file, out_format, ...) {
- sample_files <- sort_samples_by_acquisition_number(sample_files)
-
- res <- unsupervised(filenames = sample_files, ...)
-
- save_all_features(res, sample_files)
- save_all_feature_tables(res$aligned_feature_sample_table, res$recovered_feature_sample_table, aligned_file, recovered_file, out_format)
-}
-
-hybrid_main <- function(sample_files, known_table_file, updated_known_table_file, pairing_file, aligned_file, recovered_file, out_format, ...) {
- sample_files <- sort_samples_by_acquisition_number(sample_files)
-
- known <- read_known_table(known_table_file)
- res <- hybrid(filenames = sample_files, known_table = known, ...)
-
- save_known_table(res$updated_known_table, updated_known_table_file)
- save_pairing(res$features_known_table_pairing, pairing_file)
-
- save_all_features(res, sample_files)
- save_all_feature_tables(res$aligned_feature_sample_table, res$recovered_feature_sample_table, aligned_file, recovered_file, out_format)
-}
-
-save_all_features <- function(result, sample_files) {
- save_all_extracted_features(result$extracted_features, sample_files)
- save_all_corrected_features(result$corrected_features, sample_files)
-}
-
-save_all_feature_tables <- function(aligned_feature_sample_table,
- recovered_feature_sample_table,
- aligned_file,
- recovered_file,
- out_format) {
- save_aligned_feature_table(aligned_feature_sample_table, aligned_file)
- save_recovered_feature_table(recovered_feature_sample_table, recovered_file, out_format)
-}
-
-two_step_hybrid_main <- function(sample_files, known_table_file, updated_known_table_file, recovered_file, aligned_file, out_format, metadata, ...) {
- sample_files <- sort_samples_by_acquisition_number(sample_files)
- metadata <- read.table(metadata, sep = ",", header = TRUE)
-
- known_table <- read_known_table(known_table_file)
- res <- two.step.hybrid(filenames = sample_files, known.table = known_table, work_dir = getwd(), metadata = metadata, ...)
-
- save_known_table(res$known_table, updated_known_table_file)
- save_aligned_feature_table(res$aligned_features, aligned_file)
- save_recovered_feature_table(res$final_features, recovered_file, out_format)
-}
diff -r f9fb9d8fb710 -r 472dc85ce7c5 mzml_id_getter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mzml_id_getter.py Mon Feb 13 10:28:35 2023 +0000
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import argparse
+import sys
+
+from pymzml.run import Reader
+
+
+def main(argv):
+ parser = argparse.ArgumentParser(description='Get run ID from an mzML file.')
+ parser.add_argument('mzml_file', help='Path to an mzML file to get run ID from.')
+ args = parser.parse_args()
+
+ mzml = Reader(args.mzml_file)
+ id = mzml.info['run_id']
+
+ if id is not None:
+ with open("sample_name.txt", mode='x') as f:
+ f.write(id)
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r f9fb9d8fb710 -r 472dc85ce7c5 recetox_aplcms_recover_weaker_signals.xml
--- a/recetox_aplcms_recover_weaker_signals.xml Thu Jun 16 10:27:28 2022 +0000
+++ b/recetox_aplcms_recover_weaker_signals.xml Mon Feb 13 10:28:35 2023 +0000
@@ -1,143 +1,116 @@
-
- recover weaker signals from LC/MS spectra
+
+ recover weaker signals from raw data using an aligned feature table
macros.xml
- macros_split.xml
+ help.xml
+
-
-
- #for $infile in $ms_files
- ln -s '${infile}' '${infile.element_identifier}'
- #end for
- #for $infile in $extracted_files
- ln -s '${infile}' 'extracted_${infile.element_identifier}'
- #end for
- #for $infile in $corrected_files
- ln -s '${infile}' '${infile.element_identifier}'
- #end for
-
-
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r f9fb9d8fb710 -r 472dc85ce7c5 utils.R
--- a/utils.R Thu Jun 16 10:27:28 2022 +0000
+++ b/utils.R Mon Feb 13 10:28:35 2023 +0000
@@ -1,149 +1,125 @@
library(recetox.aplcms)
-align_features <- function(sample_names, ...) {
- aligned <- feature.align(...)
- feature_names <- seq_len(nrow(aligned$pk.times))
-
- list(
- mz_tolerance = as.numeric(aligned$mz.tol),
- rt_tolerance = as.numeric(aligned$chr.tol),
- rt_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$pk.times),
- int_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$aligned.ftrs)
- )
+get_env_sample_name <- function() {
+ sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA)
+ if (nchar(sample_name) == 0) {
+ sample_name <- NA
+ }
+ if (is.na(sample_name)) {
+ message("The mzML file does not contain run ID.")
+ }
+ return(sample_name)
}
-get_sample_name <- function(filename) {
- tools::file_path_sans_ext(basename(filename))
-}
-
-as_feature_crosstab <- function(feature_names, sample_names, data) {
- colnames(data) <- c("mz", "rt", "mz_min", "mz_max", sample_names)
- rownames(data) <- feature_names
- as.data.frame(data)
+save_sample_name <- function(df, sample_name) {
+ attr(df, "sample_name") <- sample_name
+ return(df)
}
-as_feature_sample_table <- function(rt_crosstab, int_crosstab) {
- feature_names <- rownames(rt_crosstab)
- sample_names <- colnames(rt_crosstab)[- (1:4)]
-
- feature_table <- data.frame(
- feature = feature_names,
- mz = rt_crosstab[, 1],
- rt = rt_crosstab[, 2]
- )
-
- # series of conversions to produce a table type from data.frame
- rt_crosstab <- as.table(as.matrix(rt_crosstab[, - (1:4)]))
- int_crosstab <- as.table(as.matrix(int_crosstab[, - (1:4)]))
-
- crosstab_axes <- list(feature = feature_names, sample = sample_names)
- dimnames(rt_crosstab) <- dimnames(int_crosstab) <- crosstab_axes
-
- x <- as.data.frame(rt_crosstab, responseName = "sample_rt")
- y <- as.data.frame(int_crosstab, responseName = "sample_intensity")
-
- data <- merge(x, y, by = c("feature", "sample"))
- data <- merge(feature_table, data, by = "feature")
- data
+load_sample_name <- function(df) {
+ sample_name <- attr(df, "sample_name")
+ if (is.null(sample_name)) {
+ return(NA)
+ } else {
+ return(sample_name)
+ }
}
-load_features <- function(files) {
- files_list <- sort_samples_by_acquisition_number(files)
- features <- lapply(files_list, arrow::read_parquet)
- features <- lapply(features, as.matrix)
+save_data_as_parquet_file <- function(data, filename) {
+ arrow::write_parquet(data, filename)
+}
+
+load_data_from_parquet_file <- function(filename) {
+ return(arrow::read_parquet(filename))
+}
+
+load_parquet_collection <- function(files) {
+ features <- lapply(files, arrow::read_parquet)
+ features <- lapply(features, tibble::as_tibble)
return(features)
}
-save_data_as_parquet_files <- function(data, subdir) {
- dir.create(subdir)
- for (i in 0:(length(data) - 1)) {
- filename <- file.path(subdir, paste0(subdir, "_features_", i, ".parquet"))
- arrow::write_parquet(as.data.frame(data[i + 1]), filename)
- }
+save_parquet_collection <- function(table, sample_names, subdir) {
+ dir.create(subdir)
+ for (i in seq_len(length(table$feature_tables))) {
+ filename <- file.path(subdir, paste0(subdir, "_", sample_names[i], ".parquet"))
+ feature_table <- as.data.frame(table$feature_tables[[i]])
+ feature_table <- save_sample_name(feature_table, sample_names[i])
+ arrow::write_parquet(feature_table, filename)
+ }
+}
+
+sort_by_sample_name <- function(tables, sample_names) {
+ return(tables[order(sample_names)])
}
-save_aligned_features <- function(aligned, rt_file, int_file, tol_file) {
- arrow::write_parquet(as.data.frame(aligned$rt_crosstab), rt_file)
- arrow::write_parquet(as.data.frame(aligned$int_crosstab), int_file)
-
- mz_tolerance <- c(aligned$mz_tolerance)
- rt_tolerance <- c(aligned$rt_tolerance)
- arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file)
+save_tolerances <- function(table, tol_file) {
+ mz_tolerance <- c(table$mz_tol_relative)
+ rt_tolerance <- c(table$rt_tol_relative)
+ arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file)
}
-load_aligned_features <- function(rt_file, int_file, tol_file) {
- rt_cross_table <- arrow::read_parquet(rt_file)
- int_cross_table <- arrow::read_parquet(int_file)
- tolerances_table <- arrow::read_parquet(tol_file)
+get_mz_tol <- function(tolerances) {
+ return(tolerances$mz_tolerance)
+}
- result <- list()
- result$mz_tolerance <- tolerances_table$mz_tolerance
- result$rt_tolerance <- tolerances_table$rt_tolerance
- result$rt_crosstab <- rt_cross_table
- result$int_crosstab <- int_cross_table
- return(result)
+get_rt_tol <- function(tolerances) {
+ return(tolerances$rt_tolerance)
+}
+
+save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) {
+ save_data_as_parquet_file(aligned_features$metadata, metadata_file)
+ save_data_as_parquet_file(aligned_features$rt, rt_file)
+ save_data_as_parquet_file(aligned_features$intensity, intensity_file)
}
-recover_signals <- function(cluster,
- filenames,
- extracted,
- corrected,
- aligned,
- mz_tol = 1e-05,
- mz_range = NA,
- rt_range = NA,
- use_observed_range = TRUE,
- min_bandwidth = NA,
- max_bandwidth = NA,
- recover_min_count = 3) {
- if (!is(cluster, "cluster")) {
- cluster <- parallel::makeCluster(cluster)
- on.exit(parallel::stopCluster(cluster))
- }
-
- clusterExport(cluster, c("extracted", "corrected", "aligned", "recover.weaker"))
- clusterEvalQ(cluster, library("splines"))
+select_table_with_sample_name <- function(tables, sample_name) {
+ sample_names <- lapply(tables, load_sample_name)
+ index <- which(sample_names == sample_name)
+ if (length(index) > 0) {
+ return(tables[[index]])
+ } else {
+ stop(sprintf("Mismatch - sample name '%s' not present in %s",
+ sample_name, paste(sample_names, collapse = ", ")))
+ }
+}
- recovered <- parLapply(cluster, seq_along(filenames), function(i) {
- recover.weaker(
- loc = i,
- filename = filenames[[i]],
- this.f1 = extracted[[i]],
- this.f2 = corrected[[i]],
- pk.times = aligned$rt_crosstab,
- aligned.ftrs = aligned$int_crosstab,
- orig.tol = mz_tol,
- align.mz.tol = aligned$mz_tolerance,
- align.chr.tol = aligned$rt_tolerance,
- mz.range = mz_range,
- chr.range = rt_range,
- use.observed.range = use_observed_range,
- bandwidth = 0.5,
- min.bw = min_bandwidth,
- max.bw = max_bandwidth,
- recover.min.count = recover_min_count
- )
- })
+select_adjusted <- function(recovered_features) {
+ return(recovered_features$adjusted_features)
+}
- feature_table <- aligned$rt_crosstab[, 1:4]
- rt_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.times))
- int_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.ftrs))
-
- feature_names <- rownames(feature_table)
- sample_names <- colnames(aligned$rt_crosstab[, - (1:4)])
-
- list(
- extracted_features = lapply(recovered, function(x) x$this.f1),
- corrected_features = lapply(recovered, function(x) x$this.f2),
- rt_crosstab = as_feature_crosstab(feature_names, sample_names, rt_crosstab),
- int_crosstab = as_feature_crosstab(feature_names, sample_names, int_crosstab)
- )
+known_table_columns <- function() {
+ c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type",
+ "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max",
+ "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)",
+ "int_min(log)", "int_max(log)")
}
-create_feature_sample_table <- function(features) {
- table <- as_feature_sample_table(
- rt_crosstab = features$rt_crosstab,
- int_crosstab = features$int_crosstab
- )
- return(table)
+save_known_table <- function(table, filename) {
+ columns <- known_table_columns()
+ arrow::write_parquet(table$known_table[columns], filename)
+}
+
+read_known_table <- function(filename) {
+ arrow::read_parquet(filename, col_select = known_table_columns())
+}
+
+save_pairing <- function(table, filename) {
+ df <- table$pairing %>% as_tibble() %>% setNames(c("new", "old"))
+ arrow::write_parquet(df, filename)
}
+
+join_tables_to_list <- function(metadata, rt_table, intensity_table) {
+ features <- new("list")
+ features$metadata <- metadata
+ features$intensity <- intensity_table
+ features$rt <- rt_table
+ return(features)
+}
+
+validate_sample_names <- function(sample_names) {
+ if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) {
+ stop(sprintf("Sample names absent or not unique - provided sample names: %s",
+ paste(sample_names, collapse = ", ")))
+ }
+}