# HG changeset patch # User recetox # Date 1676284115 0 # Node ID 472dc85ce7c57558f0a0671f00c104407ac1f46b # Parent f9fb9d8fb710aec147de8173bd28ad538d8ff8e2 planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit 506df2aef355b3791567283e1a175914f06b405a diff -r f9fb9d8fb710 -r 472dc85ce7c5 help.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/help.xml Mon Feb 13 10:28:35 2023 +0000 @@ -0,0 +1,255 @@ + + + +General Information +=================== + +Overview +-------- + +recetox-aplcms is a software package for peak detection in high resolution mass spectrometry (HRMS) data. +It supports reading .mzml files in raw profile mode and uses a bi-Gaussian chromatographic peak shape for feature detection and quantification. + +recetox-aplcms is based on the apLCMS package developed by Tianwei Yu at Emory University - see the citations and the apLCMS section beneath. +This version includes various software updates and is actively developed and maintained on `GitHub`_. +Please submit eventual bug reports as `issues`_ on the repository. + +.. _GitHub: https://github.com/RECETOX/recetox-aplcms +.. _issues: https://github.com/RECETOX/recetox-aplcms/issues/new + + +Workflow +-------- + +.. image:: https://raw.githubusercontent.com/RECETOX/galaxytools/aee0dd6cf6c05936269efe4337c50e27cc68e86b/tools/recetox_aplcms/images/scheme.png + :width: 2560 + :height: 788 + :scale: 40 + :alt: A picture of a workflow diagram. + +The individual steps of the recetox-aplcms package can be combined in 2 separate workflows processing HRMS data in an unsupervised manner or by including a-priori knowledge. +The workflows consist of the following building blocks: + +(1) remove noise - denoise the raw data and extract the EIC +(2) generate feature table - group features in EIC into peaks using peak-shape model +(3) compute clusters - compute mz and rt clusters across samples +(4) compute template - find the template for rt correction +(5) correct time - correct the rt across samples using splines +(6) align features - align identical features across samples +(7) recover weaker signals - recover missed features in samples based on the aligned features +(8) merge known table - add known features to detected features table and vice versa + +For detailed documentation on the individual steps please see the individual tool wrappers. + + +apLCMS (Original Reference) +--------------------------- + +apLCMS is a software which generates a feature table from a batch of LC/MS spectra. The m/z and retention time +tolerance levels are estimated from the data. A run-filter is used to detect peaks and remove noise. +Non-parametric statistical methods are used to find-tune peak selection and grouping. After retention time +correction, a feature table is generated by aligning peaks across spectra. For further information on apLCMS +please refer to https://mypage.cuhk.edu.cn/academics/yutianwei/apLCMS/. + + + +recetox-aplcms - remove noise +============================= + +This tool is the first step of recetox-aplcms. +It removes noise from the raw data and performs a first clustering step of points with close m/z values into the extracted ion chromatograms (EICs). +Only peaks with a minimum elution length of `min_run` seconds are kept. + +Example Output +-------------- +The raw data points contained in the scans of the `mzml` file are filtered for noise and grouped into clusters based on m/z values. +See an example output in the table below. The `group_number` column indicates the cluster index. + ++----------------------+-------------------+-----------------------+--------------------+ +| mz | rt | intensity | group_number | ++======================+===================+=======================+====================+ +| 70.01060119055192 | 350.58654 | 21178.330810546875 | 5 | ++----------------------+-------------------+-----------------------+--------------------+ +| 70.02334120404554 | 130.175262 | 287869.5478515625 | 10 | ++----------------------+-------------------+-----------------------+--------------------+ +| 70.0287408273165 | 134.801352 | 60883.15185546875 | 11 | ++----------------------+-------------------+-----------------------+--------------------+ +| 70.02872416715464 | 183.991896 | 9201.574584960938 | 11 | ++----------------------+-------------------+-----------------------+--------------------+ +| ... | ... | ... | ... | ++----------------------+-------------------+-----------------------+--------------------+ + + + +recetox-aplcms - generate feature table +======================================= +The second step in the recetox-aplcms workflow performing peak shape parameter estimation. + +This tool takes the grouped features created with `recetox-aplcms-remove-noise` and computes the peak shape in `rt` domain and integrates the peak area. + + +Example Output +-------------- +The output contains the `mz` and `rt` of the peaks as well as the standard deviation in both direction of the peak for the bi-gaussian peak shape. + ++----------------------+-------------------+-----------------+-------------------+----------------------+ +| mz | rt | sd1 | sd2 | area | ++======================+===================+=================+===================+======================+ +| 70.02317542938793 | 142.36033 | 11.436659559 | 14.592754933 | 4159269.24595184 | ++----------------------+-------------------+-----------------+-------------------+----------------------+ +| 70.02869594233522 | 205.48765 | 0.263230763 | 0.285101428707 | 8849767.11861127 | ++----------------------+-------------------+-----------------+-------------------+----------------------+ +| 78.04643252598305 | 294.01713 | 0.51677558617 | 1.317028944141 | 1333044.50659719 | ++----------------------+-------------------+-----------------+-------------------+----------------------+ +| ... | ... | ... | ... | ... | ++----------------------+-------------------+-----------------+-------------------+----------------------+ + + + +recetox-aplcms - compute clusters +================================= + +Group features with `mz` and `rt` using tolerances within the tolerance into clusters, creating larger features from raw data points. +Custom tolerances for `mz` and `rt` are computed based on the given parameters. +The tool takes a collection of all detected features and computes the clusters over a global feature table, adding the `sample_id` and `cluster` columns to the table. + +Example Output +-------------- + ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| mz | rt | sd1 | sd2 | area | sample_id | cluster | ++======================+===================+=================+===================+======================+=====================+===============+ +| 70.02317542938793 | 142.36033 | 11.436659559 | 14.592754933 | 4159269.245951841 | 21_qc_no_dil_milliq | 7 | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| 70.02869594233522 | 205.48765 | 0.263230763 | 0.285101428707 | 8849767.11861127 | 21_qc_no_dil_milliq | 9 | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| 78.04643252598305 | 294.01713 | 0.51677558617 | 1.317028944141 | 1333044.506597194 | 21_qc_no_dil_milliq | 13 | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| ... | ... | ... | ... | ... | ... | ... | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ + + + +recetox-aplcms - correct time +============================= + +Apply spline-based retention time correction to a feature table given the template table and the computed `mz` and `rt` tolerances. + +Example Output +-------------- +The output has the same format as `compute clusters` but the retention time values are corrected based on the template table. + ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| mz | rt | sd1 | sd2 | area | sample_id | cluster | ++======================+===================+=================+===================+======================+=====================+===============+ +| 70.02317542938793 | 142.36033 | 11.436659559 | 14.592754933 | 4159269.245951841 | 21_qc_no_dil_milliq | 7 | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| 70.02869594233522 | 205.48765 | 0.263230763 | 0.285101428707 | 8849767.11861127 | 21_qc_no_dil_milliq | 9 | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| 78.04643252598305 | 294.01713 | 0.51677558617 | 1.317028944141 | 1333044.506597194 | 21_qc_no_dil_milliq | 13 | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| ... | ... | ... | ... | ... | ... | ... | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ + + +recetox-aplcms - compute template +================================= +Compute the template from a set of feature tables, choosing the one with the most features as the template. + + + +recetox-aplcms - recover weaker signals +======================================= +Second stage peak detection based on the aligned feature table from the `feature alignment` step. +If a feature is contained in the aligned feature table, this step revisits the raw data and searches +for this feature at the retention time obtained by mapping the corrected retention time back to the original sample. + +This recovers features which are present in a sample but might have been filtered out initially as noise due to low signal intensity. + +Example Output +-------------- +The table has the same format as the `compute clusters` output but might contain additional features which have been extracted based +on their presence in the aligned feature table. + ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| mz | rt | sd1 | sd2 | area | sample_id | cluster | ++======================+===================+=================+===================+======================+=====================+===============+ +| 70.02317542938793 | 142.36033 | 11.436659559 | 14.592754933 | 4159269.245951841 | 21_qc_no_dil_milliq | 7 | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| 70.02869594233522 | 205.48765 | 0.263230763 | 0.285101428707 | 8849767.11861127 | 21_qc_no_dil_milliq | 9 | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| 78.04643252598305 | 294.01713 | 0.51677558617 | 1.317028944141 | 1333044.506597194 | 21_qc_no_dil_milliq | 13 | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ +| ... | ... | ... | ... | ... | ... | ... | ++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+ + + + +recetox-aplcms - align features +=============================== +This step performs feature alignment after clustering and retention time correction. +The peaks clustered across samples are grouped based on the given tolerances to create an aligned feature table, connecting identical features across samples. +The parameter controls in how many samples a feature has to be detected at least in order to be included in the aligned feature table. + +Example Output +-------------- +The tool outputs 3 tables: the peak related `metadata`, the `retention times` and the `intensities` for all features across all samples. + +Metadata Table +~~~~~~~~~~~~~~ +The `npeaks` column denotes the number of peaks which have been grouped into this feature. The columns with the sample names indicate whether this feature is present in the sample. + ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ +| id | mz | mzmin | mzmax | rt | rtmin | rtmax | npeaks | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq | ++=======+==============+==============+===============+================+===============+===============+===========+========================+========================+========================+ +| 1 | 70.03707021 | 70.037066 | 70.0370750 | 294.1038014 | 294.0634942 | 294.149985 | 3 | 1 | 1 | 1 | ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ +| 2 | 70.06505677 | 70.065045 | 70.0650676 | 141.9560055 | 140.5762528 | 143.335758 | 2 | 1 | 0 | 1 | ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ +| 57 | 78.04643252 | 78.046429 | 78.0464325 | 294.0063397 | 293.9406777 | 294.072001 | 2 | 1 | 1 | 0 | ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ +| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+ + +Intensity Table +~~~~~~~~~~~~~~~ +This table contains the peak area for aligned features in all samples. + ++-------+------------------------+------------------------+------------------------+ +| id | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq | ++=======+========================+========================+========================+ +| 1 | 13187487.20482895 | 7957395.699119729 | 11700594.397257797 | ++-------+------------------------+------------------------+------------------------+ +| 2 | 2075168.6398983458 | 0 | 2574362.159289044 | ++-------+------------------------+------------------------+------------------------+ +| 57 | 2934524.4406785755 | 1333044.5065971944 | 0 | ++-------+------------------------+------------------------+------------------------+ +| ... | ... | ... | ... | ++-------+------------------------+------------------------+------------------------+ + +Retention Time Table +~~~~~~~~~~~~~~~~~~~~ +This table contains the retention times for all aligned features in all samples. + ++-------+------------------------+------------------------+------------------------+ +| id | 21_qc_no_dil_milliq | 29_qc_no_dil_milliq | 8_qc_no_dil_milliq | ++=======+========================+========================+========================+ +| 1 | 294.09792478513236 | 294.1499853056912 | 294.0634942428341 | ++-------+------------------------+------------------------+------------------------+ +| 2 | 140.57625284242982 | 0 | 143.33575827589172 | ++-------+------------------------+------------------------+------------------------+ +| 57 | 294.07200187644435 | 293.9406777222317 | 0 | ++-------+------------------------+------------------------+------------------------+ +| ... | ... | ... | ... | ++-------+------------------------+------------------------+------------------------+ + + + +recetox-aplcms - merge known table +================================== + +This tool allows merging the detected features back into the table of known features and vice versa. +It is used in the hybrid version of recetox-aplcms to augment the aligned feature table with the suspect peaks +and to augment this table with successfully detected features. + + diff -r f9fb9d8fb710 -r 472dc85ce7c5 images/scheme.png Binary file images/scheme.png has changed diff -r f9fb9d8fb710 -r 472dc85ce7c5 macros.xml --- a/macros.xml Thu Jun 16 10:27:28 2022 +0000 +++ b/macros.xml Mon Feb 13 10:28:35 2023 +0000 @@ -1,11 +1,9 @@ - 0.9.4 + 0.10.1 - r-base - r-arrow r-recetox-aplcms - r-dplyr + pymzml @@ -31,6 +29,11 @@ familyName="Novotný" url="https://github.com/xtracko" identifier="0000-0001-5449-3523" /> + - - - - - - - - - -

- - -

+ + + + + + - -

- - - - - - - -

- - - -

- + + + + - - - - - - - - - - - - - -

+ + + + + + + + + + + + - -

- - - -

+ + + + + + + + + + + + + + + + - -

- - - - -

- - -

- - - - -

- - - + + + + + + - -

- -

- - - - - diff -r f9fb9d8fb710 -r 472dc85ce7c5 macros_split.xml --- a/macros_split.xml Thu Jun 16 10:27:28 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - -

- - - - - - -

- - \ No newline at end of file diff -r f9fb9d8fb710 -r 472dc85ce7c5 main.R --- a/main.R Thu Jun 16 10:27:28 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ -library(recetox.aplcms) -library(dplyr) - -save_extracted_features <- function(df, filename) { - df <- as.data.frame(df) - columns <- c("mz", "pos", "sd1", "sd2", "area") - arrow::write_parquet(df[columns], filename) -} - -save_aligned_feature_table <- function(df, filename) { - columns <- c("feature", "mz", "rt", "sample", "sample_rt", "sample_intensity") - arrow::write_parquet(df[columns], filename) -} - -save_recovered_feature_table <- function(df, filename, out_format) { - columns <- c("feature", "mz", "rt", "sample", "sample_rt", "sample_intensity") - if (out_format == "recetox") { - peak_table <- df[columns] - recetox_peak_table <- rcx_aplcms_to_rcx_xmsannotator(peak_table) - arrow::write_parquet(recetox_peak_table, filename) - } else { - arrow::write_parquet(df[columns], filename) - } -} - -rcx_aplcms_to_rcx_xmsannotator <- function(peak_table) { - col_base <- c("feature", "mz", "rt") - output_table <- peak_table %>% distinct(across(any_of(col_base))) - - for (level in levels(factor(peak_table$sample))) { - subdata <- peak_table %>% - filter(sample == level) %>% - select(any_of(c(col_base, "sample_intensity"))) %>% - rename(!!level := "sample_intensity") - output_table <- inner_join(output_table, subdata, by = col_base) - } - output_table <- output_table %>% rename(peak = feature) - return(output_table) -} - -known_table_columns <- function() { - c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type", - "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max", - "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)", - "int_min(log)", "int_max(log)") -} - -save_known_table <- function(df, filename) { - columns <- known_table_columns() - arrow::write_parquet(df[columns], filename) -} - -read_known_table <- function(filename) { - arrow::read_parquet(filename, col_select = known_table_columns()) -} - -save_pairing <- function(df, filename) { - write.table(df, filename, row.names = FALSE, col.names = c("new", "old")) -} - -save_all_extracted_features <- function(dfs, filenames) { - filenames <- tools::file_path_sans_ext(basename(filenames)) - filenames <- paste0(filenames, ".parquet") - filenames <- file.path("extracted", filenames) - dir.create("extracted") - mapply(save_extracted_features, dfs, filenames) -} - -save_all_corrected_features <- function(dfs, filenames) { - filenames <- tools::file_path_sans_ext(basename(filenames)) - filenames <- paste0(filenames, ".parquet") - filenames <- file.path("corrected", filenames) - dir.create("corrected") - mapply(save_extracted_features, dfs, filenames) -} - -unsupervised_main <- function(sample_files, aligned_file, recovered_file, out_format, ...) { - sample_files <- sort_samples_by_acquisition_number(sample_files) - - res <- unsupervised(filenames = sample_files, ...) - - save_all_features(res, sample_files) - save_all_feature_tables(res$aligned_feature_sample_table, res$recovered_feature_sample_table, aligned_file, recovered_file, out_format) -} - -hybrid_main <- function(sample_files, known_table_file, updated_known_table_file, pairing_file, aligned_file, recovered_file, out_format, ...) { - sample_files <- sort_samples_by_acquisition_number(sample_files) - - known <- read_known_table(known_table_file) - res <- hybrid(filenames = sample_files, known_table = known, ...) - - save_known_table(res$updated_known_table, updated_known_table_file) - save_pairing(res$features_known_table_pairing, pairing_file) - - save_all_features(res, sample_files) - save_all_feature_tables(res$aligned_feature_sample_table, res$recovered_feature_sample_table, aligned_file, recovered_file, out_format) -} - -save_all_features <- function(result, sample_files) { - save_all_extracted_features(result$extracted_features, sample_files) - save_all_corrected_features(result$corrected_features, sample_files) -} - -save_all_feature_tables <- function(aligned_feature_sample_table, - recovered_feature_sample_table, - aligned_file, - recovered_file, - out_format) { - save_aligned_feature_table(aligned_feature_sample_table, aligned_file) - save_recovered_feature_table(recovered_feature_sample_table, recovered_file, out_format) -} - -two_step_hybrid_main <- function(sample_files, known_table_file, updated_known_table_file, recovered_file, aligned_file, out_format, metadata, ...) { - sample_files <- sort_samples_by_acquisition_number(sample_files) - metadata <- read.table(metadata, sep = ",", header = TRUE) - - known_table <- read_known_table(known_table_file) - res <- two.step.hybrid(filenames = sample_files, known.table = known_table, work_dir = getwd(), metadata = metadata, ...) - - save_known_table(res$known_table, updated_known_table_file) - save_aligned_feature_table(res$aligned_features, aligned_file) - save_recovered_feature_table(res$final_features, recovered_file, out_format) -} diff -r f9fb9d8fb710 -r 472dc85ce7c5 mzml_id_getter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mzml_id_getter.py Mon Feb 13 10:28:35 2023 +0000 @@ -0,0 +1,23 @@ +#!/usr/bin/env python + +import argparse +import sys + +from pymzml.run import Reader + + +def main(argv): + parser = argparse.ArgumentParser(description='Get run ID from an mzML file.') + parser.add_argument('mzml_file', help='Path to an mzML file to get run ID from.') + args = parser.parse_args() + + mzml = Reader(args.mzml_file) + id = mzml.info['run_id'] + + if id is not None: + with open("sample_name.txt", mode='x') as f: + f.write(id) + + +if __name__ == '__main__': + main(sys.argv[1:]) diff -r f9fb9d8fb710 -r 472dc85ce7c5 recetox_aplcms_recover_weaker_signals.xml --- a/recetox_aplcms_recover_weaker_signals.xml Thu Jun 16 10:27:28 2022 +0000 +++ b/recetox_aplcms_recover_weaker_signals.xml Mon Feb 13 10:28:35 2023 +0000 @@ -1,143 +1,116 @@ - - recover weaker signals from LC/MS spectra + + recover weaker signals from raw data using an aligned feature table macros.xml - macros_split.xml + help.xml + - - - #for $infile in $ms_files - ln -s '${infile}' '${infile.element_identifier}' - #end for - #for $infile in $extracted_files - ln -s '${infile}' 'extracted_${infile.element_identifier}' - #end for - #for $infile in $corrected_files - ln -s '${infile}' '${infile.element_identifier}' - #end for - - + - - - - - - - - + + + + + + - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r f9fb9d8fb710 -r 472dc85ce7c5 utils.R --- a/utils.R Thu Jun 16 10:27:28 2022 +0000 +++ b/utils.R Mon Feb 13 10:28:35 2023 +0000 @@ -1,149 +1,125 @@ library(recetox.aplcms) -align_features <- function(sample_names, ...) { - aligned <- feature.align(...) - feature_names <- seq_len(nrow(aligned$pk.times)) - - list( - mz_tolerance = as.numeric(aligned$mz.tol), - rt_tolerance = as.numeric(aligned$chr.tol), - rt_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$pk.times), - int_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$aligned.ftrs) - ) +get_env_sample_name <- function() { + sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA) + if (nchar(sample_name) == 0) { + sample_name <- NA + } + if (is.na(sample_name)) { + message("The mzML file does not contain run ID.") + } + return(sample_name) } -get_sample_name <- function(filename) { - tools::file_path_sans_ext(basename(filename)) -} - -as_feature_crosstab <- function(feature_names, sample_names, data) { - colnames(data) <- c("mz", "rt", "mz_min", "mz_max", sample_names) - rownames(data) <- feature_names - as.data.frame(data) +save_sample_name <- function(df, sample_name) { + attr(df, "sample_name") <- sample_name + return(df) } -as_feature_sample_table <- function(rt_crosstab, int_crosstab) { - feature_names <- rownames(rt_crosstab) - sample_names <- colnames(rt_crosstab)[- (1:4)] - - feature_table <- data.frame( - feature = feature_names, - mz = rt_crosstab[, 1], - rt = rt_crosstab[, 2] - ) - - # series of conversions to produce a table type from data.frame - rt_crosstab <- as.table(as.matrix(rt_crosstab[, - (1:4)])) - int_crosstab <- as.table(as.matrix(int_crosstab[, - (1:4)])) - - crosstab_axes <- list(feature = feature_names, sample = sample_names) - dimnames(rt_crosstab) <- dimnames(int_crosstab) <- crosstab_axes - - x <- as.data.frame(rt_crosstab, responseName = "sample_rt") - y <- as.data.frame(int_crosstab, responseName = "sample_intensity") - - data <- merge(x, y, by = c("feature", "sample")) - data <- merge(feature_table, data, by = "feature") - data +load_sample_name <- function(df) { + sample_name <- attr(df, "sample_name") + if (is.null(sample_name)) { + return(NA) + } else { + return(sample_name) + } } -load_features <- function(files) { - files_list <- sort_samples_by_acquisition_number(files) - features <- lapply(files_list, arrow::read_parquet) - features <- lapply(features, as.matrix) +save_data_as_parquet_file <- function(data, filename) { + arrow::write_parquet(data, filename) +} + +load_data_from_parquet_file <- function(filename) { + return(arrow::read_parquet(filename)) +} + +load_parquet_collection <- function(files) { + features <- lapply(files, arrow::read_parquet) + features <- lapply(features, tibble::as_tibble) return(features) } -save_data_as_parquet_files <- function(data, subdir) { - dir.create(subdir) - for (i in 0:(length(data) - 1)) { - filename <- file.path(subdir, paste0(subdir, "_features_", i, ".parquet")) - arrow::write_parquet(as.data.frame(data[i + 1]), filename) - } +save_parquet_collection <- function(table, sample_names, subdir) { + dir.create(subdir) + for (i in seq_len(length(table$feature_tables))) { + filename <- file.path(subdir, paste0(subdir, "_", sample_names[i], ".parquet")) + feature_table <- as.data.frame(table$feature_tables[[i]]) + feature_table <- save_sample_name(feature_table, sample_names[i]) + arrow::write_parquet(feature_table, filename) + } +} + +sort_by_sample_name <- function(tables, sample_names) { + return(tables[order(sample_names)]) } -save_aligned_features <- function(aligned, rt_file, int_file, tol_file) { - arrow::write_parquet(as.data.frame(aligned$rt_crosstab), rt_file) - arrow::write_parquet(as.data.frame(aligned$int_crosstab), int_file) - - mz_tolerance <- c(aligned$mz_tolerance) - rt_tolerance <- c(aligned$rt_tolerance) - arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file) +save_tolerances <- function(table, tol_file) { + mz_tolerance <- c(table$mz_tol_relative) + rt_tolerance <- c(table$rt_tol_relative) + arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file) } -load_aligned_features <- function(rt_file, int_file, tol_file) { - rt_cross_table <- arrow::read_parquet(rt_file) - int_cross_table <- arrow::read_parquet(int_file) - tolerances_table <- arrow::read_parquet(tol_file) +get_mz_tol <- function(tolerances) { + return(tolerances$mz_tolerance) +} - result <- list() - result$mz_tolerance <- tolerances_table$mz_tolerance - result$rt_tolerance <- tolerances_table$rt_tolerance - result$rt_crosstab <- rt_cross_table - result$int_crosstab <- int_cross_table - return(result) +get_rt_tol <- function(tolerances) { + return(tolerances$rt_tolerance) +} + +save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) { + save_data_as_parquet_file(aligned_features$metadata, metadata_file) + save_data_as_parquet_file(aligned_features$rt, rt_file) + save_data_as_parquet_file(aligned_features$intensity, intensity_file) } -recover_signals <- function(cluster, - filenames, - extracted, - corrected, - aligned, - mz_tol = 1e-05, - mz_range = NA, - rt_range = NA, - use_observed_range = TRUE, - min_bandwidth = NA, - max_bandwidth = NA, - recover_min_count = 3) { - if (!is(cluster, "cluster")) { - cluster <- parallel::makeCluster(cluster) - on.exit(parallel::stopCluster(cluster)) - } - - clusterExport(cluster, c("extracted", "corrected", "aligned", "recover.weaker")) - clusterEvalQ(cluster, library("splines")) +select_table_with_sample_name <- function(tables, sample_name) { + sample_names <- lapply(tables, load_sample_name) + index <- which(sample_names == sample_name) + if (length(index) > 0) { + return(tables[[index]]) + } else { + stop(sprintf("Mismatch - sample name '%s' not present in %s", + sample_name, paste(sample_names, collapse = ", "))) + } +} - recovered <- parLapply(cluster, seq_along(filenames), function(i) { - recover.weaker( - loc = i, - filename = filenames[[i]], - this.f1 = extracted[[i]], - this.f2 = corrected[[i]], - pk.times = aligned$rt_crosstab, - aligned.ftrs = aligned$int_crosstab, - orig.tol = mz_tol, - align.mz.tol = aligned$mz_tolerance, - align.chr.tol = aligned$rt_tolerance, - mz.range = mz_range, - chr.range = rt_range, - use.observed.range = use_observed_range, - bandwidth = 0.5, - min.bw = min_bandwidth, - max.bw = max_bandwidth, - recover.min.count = recover_min_count - ) - }) +select_adjusted <- function(recovered_features) { + return(recovered_features$adjusted_features) +} - feature_table <- aligned$rt_crosstab[, 1:4] - rt_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.times)) - int_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.ftrs)) - - feature_names <- rownames(feature_table) - sample_names <- colnames(aligned$rt_crosstab[, - (1:4)]) - - list( - extracted_features = lapply(recovered, function(x) x$this.f1), - corrected_features = lapply(recovered, function(x) x$this.f2), - rt_crosstab = as_feature_crosstab(feature_names, sample_names, rt_crosstab), - int_crosstab = as_feature_crosstab(feature_names, sample_names, int_crosstab) - ) +known_table_columns <- function() { + c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type", + "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max", + "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)", + "int_min(log)", "int_max(log)") } -create_feature_sample_table <- function(features) { - table <- as_feature_sample_table( - rt_crosstab = features$rt_crosstab, - int_crosstab = features$int_crosstab - ) - return(table) +save_known_table <- function(table, filename) { + columns <- known_table_columns() + arrow::write_parquet(table$known_table[columns], filename) +} + +read_known_table <- function(filename) { + arrow::read_parquet(filename, col_select = known_table_columns()) +} + +save_pairing <- function(table, filename) { + df <- table$pairing %>% as_tibble() %>% setNames(c("new", "old")) + arrow::write_parquet(df, filename) } + +join_tables_to_list <- function(metadata, rt_table, intensity_table) { + features <- new("list") + features$metadata <- metadata + features$intensity <- intensity_table + features$rt <- rt_table + return(features) +} + +validate_sample_names <- function(sample_names) { + if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) { + stop(sprintf("Sample names absent or not unique - provided sample names: %s", + paste(sample_names, collapse = ", "))) + } +}