Repository 'recetox_aplcms_recover_weaker_signals'
hg clone https://toolshed.g2.bx.psu.edu/repos/recetox/recetox_aplcms_recover_weaker_signals

Changeset 0:067a308223e3 (2022-06-10)
Next changeset 1:f9fb9d8fb710 (2022-06-16)
Commit message:
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit 19de0924a65bc65cbbf7c1fc17e9b5348305f95c
added:
macros.xml
macros_split.xml
main.R
recetox_aplcms_recover_weaker_signals.xml
test-data/corrected_expected/corrected_0.parquet
test-data/corrected_expected/corrected_1.parquet
test-data/corrected_expected/corrected_2.parquet
test-data/extracted_expected/extracted_0.parquet
test-data/extracted_expected/extracted_1.parquet
test-data/extracted_expected/extracted_2.parquet
test-data/hybrid.recetox.parquet
test-data/hybrid_recovered_feature_sample_table.parquet
test-data/int_cross_table.parquet
test-data/known_table.parquet
test-data/mbr_test0.mzml
test-data/mbr_test0_copy.mzml
test-data/mbr_test1.mzml
test-data/mbr_test2.mzml
test-data/rt_cross_table.parquet
test-data/tolerances.parquet
test-data/two_step_hybrid.recetox.parquet
test-data/two_step_hybrid_info.csv
test-data/unsupervised_output/corrected_features_0.parquet
test-data/unsupervised_output/corrected_features_1.parquet
test-data/unsupervised_output/corrected_features_2.parquet
test-data/unsupervised_output/extracted_features_0.parquet
test-data/unsupervised_output/extracted_features_1.parquet
test-data/unsupervised_output/extracted_features_2.parquet
test-data/unsupervised_output/unsupervised.recetox.parquet
test-data/unsupervised_output/unsupervised_aligned_feature_sample_table.parquet
test-data/unsupervised_output/unsupervised_recovered_feature_sample_table.parquet
utils.R
b
diff -r 000000000000 -r 067a308223e3 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Fri Jun 10 10:18:24 2022 +0000
[
b'@@ -0,0 +1,247 @@\n+<macros>\r\n+    <token name="@TOOL_VERSION@">0.9.4</token>\r\n+    <xml name="requirements">\r\n+        <requirements>\r\n+            <requirement type="package" version="4.1.0">r-base</requirement>\r\n+            <requirement type="package" version="4.0.1">r-arrow</requirement>\r\n+            <requirement type="package" version="@TOOL_VERSION@">r-recetox-aplcms</requirement>\r\n+            <requirement type="package" version="1.0.7">r-dplyr</requirement>\r\n+        </requirements>\r\n+    </xml>\r\n+\r\n+    <xml name="creator">\r\n+        <creator>\r\n+            <person\r\n+                givenName="Maksym"\r\n+                familyName="Skoryk"\r\n+                url="https://github.com/maximskorik"\r\n+                identifier="0000-0003-2056-8018" />\r\n+            <person\r\n+                givenName="Matej"\r\n+                familyName="Troj\xc3\xa1k"\r\n+                url="https://github.com/xtrojak"\r\n+                identifier="0000-0003-0841-2707" />\r\n+            <person\r\n+                givenName="Martin"\r\n+                familyName="\xc4\x8cech"\r\n+                url="https://github.com/martenson"\r\n+                identifier="0000-0002-9318-1781" />\r\n+            <person\r\n+                givenName="Ji\xc5\x99\xc3\xad"\r\n+                familyName="Novotn\xc3\xbd"\r\n+                url="https://github.com/xtracko"\r\n+                identifier="0000-0001-5449-3523" />\r\n+            <organization\r\n+                url="https://www.recetox.muni.cz/"\r\n+                email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"\r\n+                name="RECETOX MUNI"/>\r\n+        </creator>\r\n+    </xml>\r\n+\r\n+    <xml name="inputs">\r\n+        <inputs>\r\n+            <param name="files" type="data" format="mzdata,mzml,mzxml,netcdf" multiple="true" min="3" label="data"\r\n+                   help="Mass spectrometry files for peak extraction." 
/>\r\n+            <yield />\r\n+       </inputs>\r\n+    </xml>\r\n+\r\n+    <xml name="history_db">\r\n+        <param name="known_table" type="data" format="parquet" label="known_table"\r\n+               help="A data table containing the known metabolite ions and previously found features. The table must contain these 18 columns: chemical_formula (optional), HMDB_ID (optional), KEGG_compound_ID (optional), neutral.mass (optional), ion.type (the ion form - optional), m.z (either theoretical or mean observed m/z value of previously found features), Number_profiles_processed (the total number of processed samples to build this database), Percent_found (the percentage of historically processed samples in which the feature appeared), mz_min (minimum  observed m/z value), mz_max (maximum observed m/z value), RT_mean (mean observed retention time), RT_sd (standard deviation of observed retention time), RT_min (minimum observed retention time), RT_max (maximum observed retention time), int_mean.log. (mean observed log intensity), int_sd.log. (standard deviation of observed log intensity), int_min.log. (minimum observed log intensity), int_max.log. (maximum observed log intensity)." />\r\n+        <section name="history_db" title="Known-Table settings">\r\n+            <param name="match_tol_ppm" type="integer" optional="true" min="0" label="match_tol_ppm (optional)"\r\n+                   help="The ppm tolerance to match identified features to known metabolites/features." />\r\n+            <param name="new_feature_min_count" type="integer" value="2" min="1" label="new_feature_min_count"\r\n+                   help="The minimum number of occurrences of a historically unseen (unknown) feature to add this feature into the database of known features." 
/>\r\n+        </section>\r\n+    </xml>\r\n+\r\n+    <xml name="noise_filtering">\r\n+        <section name="noise_filtering" title="Noise filtering and peak detection">\r\n+            <yield />\r\n+            <param name="min_pres" type="float" value="0.5"\r\n+                   label="min_pres"\r\n+                   help="The minimum proportion of presence in the time period for a series of signals grouped by m/z'..b'">\r\n+        <param name="mz_tol" type="float" value="1e-05" label="mz_tol"\r\n+               help="The m/z tolerance level for the grouping of data points. This value is expressed as the\r\n+               fraction of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level.\r\n+               The recommended value is the machine\'s nominal accuracy level. Divide the ppm value by 1e6.\r\n+               For FTMS, 1e-5 is recommended." />\r\n+    </xml>\r\n+\r\n+    <xml name="output_format">\r\n+       <section name="output_format" title="Output Format">\r\n+              <param name="out_format" type="boolean" checked="false" truevalue="recetox" falsevalue="original" label="Use custom RECETOX output format?" 
/>\r\n+       </section>\r\n+    </xml>\r\n+\r\n+    <xml name="unsupervised_outputs">\r\n+        <data name="recovered_feature_sample_table" format="parquet" label="${tool.name} recovered_feature_sample_table on ${on_string}" />\r\n+        <data name="aligned_feature_sample_table" format="parquet" label="${tool.name} aligned_feature_sample_table on ${on_string}" hidden="true" />\r\n+        <yield />\r\n+    </xml>\r\n+\r\n+    <xml name="citations">\r\n+        <citations>\r\n+            <citation type="doi">10.1093/bioinformatics/btp291</citation>\r\n+            <citation type="doi">10.1186/1471-2105-11-559</citation>\r\n+            <citation type="doi">10.1021/pr301053d</citation>\r\n+            <citation type="doi">10.1093/bioinformatics/btu430</citation>\r\n+            <yield />\r\n+        </citations>\r\n+    </xml>\r\n+\r\n+    <token name="@HELP_hybrid@">\r\n+        <![CDATA[\r\n+            This is the Hybrid version of apLCMS which is incorporating the knowledge of known metabolites and historically\r\n+            detected features on the same machinery to help detect and quantify lower-intensity peaks.\r\n+\r\n+            CAUTION: To use such knowledge, especially historical data, you must keep using (1) the same chromatography\r\n+            system (otherwise the retention time will not match), and (2) the same type of samples with similar extraction\r\n+            technique, such as human serum.\r\n+\r\n+            @GENERAL_HELP@\r\n+        ]]>\r\n+    </token>\r\n+\r\n+    <token name="@HELP_unsupervised@">\r\n+        <![CDATA[\r\n+            This is the Unsupervised version of apLCMS which is not relying on any existing knowledge about metabolites or\r\n+            any historically detected features. 
For such functionality please use the Hybrid version of apLCMS.\r\n+\r\n+            @GENERAL_HELP@\r\n+        ]]>\r\n+    </token>\r\n+\r\n+    <token name="@HELP_two-step-hybrid@">\r\n+        <![CDATA[\r\n+              This is the **Two-Step Hybrid** version of **apLCMS**. This tool is improved upon the Hybrid version by accounting for the batch\r\n+              effects in multi-batch experiments. As in the Hybrid version, this tool incorporates the knowledge of known metabolites and\r\n+              historically detected features on the same machinery to help detect and quantify lower-intensity peaks.\r\n+\r\n+              **CAUTION**: To use such knowledge, especially historical data, you must keep using (1) the same chromatography\r\n+              system (otherwise the retention time will not match), and (2) the same type of samples with similar extraction\r\n+              technique, such as human serum.\r\n+\r\n+            @GENERAL_HELP@\r\n+        ]]>\r\n+    </token>\r\n+\r\n+    <token name="@GENERAL_HELP@">\r\n+        apLCMS is a software which generates a feature table from a batch of LC/MS spectra. The m/z and retention time\r\n+        tolerance levels are estimated from the data. A run-filter is used to detect peaks and remove noise.\r\n+        Non-parametric statistical methods are used to fine-tune peak selection and grouping. After retention time\r\n+        correction, a feature table is generated by aligning peaks across spectra. For further information on apLCMS\r\n+        please refer to https://mypage.cuhk.edu.cn/academics/yutianwei/apLCMS/.\r\n+    </token>\r\n+</macros>\r\n'
b
diff -r 000000000000 -r 067a308223e3 macros_split.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros_split.xml Fri Jun 10 10:18:24 2022 +0000
b
@@ -0,0 +1,23 @@
+<macros>
+    <xml name="noise_filtering_split">
+        <section name="noise_filtering" title="Noise filtering and peak detection">
+            <param name="min_pres" type="float" value="0.5"
+                   label="min_pres"
+                   help="The minimum proportion of presence in the time period for a series of signals grouped by m/z to be considered a peak." />
+            <param name="min_run" type="float" value="12"
+                   label="min_run"
+                   help="The minimum length of elution time for a series of signals grouped by m/z to be considered a peak." />
+            <param name="mz_tol" type="float" value="1e-05"
+                   label="mz_tol"
+                   help="The m/z tolerance level for the grouping of data points. This value is expressed as the fraction of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended." />
+            <param name="baseline_correct" type="float" value="0" label="baseline_correct"
+                   help="After grouping the observations, the highest intensity in each group is found. If the highest is lower than this value, the entire group will be deleted. The default value is NA, in which case the program uses a percentile of the height of the noise groups. If given a value, the value will be used as the threshold, and baseline_correct_noise_percentile will be ignored." />
+            <param name="baseline_correct_noise_percentile" type="float" value="0.05"
+                   label="baseline_correct_noise_percentile"
+                   help="The percentile of signal strength of those EIC that don't pass the run filter, to be used as the baseline threshold of signal strength." />
+            <param name="intensity_weighted" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE"
+                   label="intensity_weighted"
+                   help="Whether to weight the local density by signal intensities in initial peak detection." />
+        </section>
+    </xml>
+</macros>
\ No newline at end of file
b
diff -r 000000000000 -r 067a308223e3 main.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/main.R Fri Jun 10 10:18:24 2022 +0000
[
@@ -0,0 +1,123 @@
+library(recetox.aplcms)
+library(dplyr)
+
+# Write one table of extracted features to a parquet file.
+#
+# `df` is coerced with as.data.frame(); only the columns mz, pos, sd1, sd2
+# and area are written, in that order, to `filename`.
+save_extracted_features <- function(df, filename) {
+  feature_columns <- c("mz", "pos", "sd1", "sd2", "area")
+  features <- as.data.frame(df)[feature_columns]
+  arrow::write_parquet(features, filename)
+}
+
+# Write the aligned feature/sample table to a parquet file.
+#
+# Only the six columns listed below are kept, in this order; `df` must
+# already contain all of them.
+save_aligned_feature_table <- function(df, filename) {
+  kept_columns <- c(
+    "feature", "mz", "rt",
+    "sample", "sample_rt", "sample_intensity"
+  )
+  aligned <- df[kept_columns]
+  arrow::write_parquet(aligned, filename)
+}
+
+# Write the recovered feature/sample table to a parquet file.
+#
+# When `out_format` is "recetox" the selected columns are first reshaped by
+# rcx_aplcms_to_rcx_xmsannotator(); for any other value they are written
+# unchanged.
+save_recovered_feature_table <- function(df, filename, out_format) {
+  kept_columns <- c(
+    "feature", "mz", "rt",
+    "sample", "sample_rt", "sample_intensity"
+  )
+  output <- if (out_format == "recetox") {
+    rcx_aplcms_to_rcx_xmsannotator(df[kept_columns])
+  } else {
+    df[kept_columns]
+  }
+  arrow::write_parquet(output, filename)
+}
+
+# Reshape a long peak table into a wide table with one column per sample.
+#
+# Starts from the distinct (feature, mz, rt) combinations and, for every
+# value of `sample`, inner-joins that sample's `sample_intensity` as a column
+# named after the sample. Because the join is an inner join, a feature that is
+# absent from any sample is dropped. Finally `feature` is renamed to `peak`.
+rcx_aplcms_to_rcx_xmsannotator <- function(peak_table) {
+  key_columns <- c("feature", "mz", "rt")
+  wide_table <- distinct(peak_table, across(any_of(key_columns)))
+
+  for (level in levels(factor(peak_table$sample))) {
+    sample_rows <- filter(peak_table, sample == level)
+    sample_column <- select(
+      sample_rows,
+      any_of(c(key_columns, "sample_intensity"))
+    )
+    sample_column <- rename(sample_column, !!level := "sample_intensity")
+    wide_table <- inner_join(wide_table, sample_column, by = key_columns)
+  }
+
+  wide_table <- rename(wide_table, peak = feature)
+  return(wide_table)
+}
+
+# Names of the 18 known-table columns, in the order used when the table is
+# read from and written to parquet.
+known_table_columns <- function() {
+  stat_names <- c("mean", "sd", "min", "max")
+  c(
+    "chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type",
+    "m.z",
+    "Number_profiles_processed", "Percent_found",
+    "mz_min", "mz_max",
+    paste0("RT_", stat_names),
+    paste0("int_", stat_names, "(log)")
+  )
+}
+
+# Write the known table to a parquet file, restricted to (and ordered by)
+# the columns returned by known_table_columns().
+save_known_table <- function(df, filename) {
+  known_subset <- df[known_table_columns()]
+  arrow::write_parquet(known_subset, filename)
+}
+
+# Read a known table from a parquet file, selecting only the columns
+# returned by known_table_columns().
+read_known_table <- function(filename) {
+  arrow::read_parquet(
+    file = filename,
+    col_select = known_table_columns()
+  )
+}
+
+# Write the pairing table as text with the header "new" "old" and without
+# row names (write.table defaults otherwise).
+save_pairing <- function(df, filename) {
+  header <- c("new", "old")
+  write.table(
+    x = df,
+    file = filename,
+    row.names = FALSE,
+    col.names = header
+  )
+}
+
+# Shared implementation for the two save_all_*_features() functions below.
+#
+# Writes each table in `dfs` to `<directory>/<name>.parquet`, where <name> is
+# the base name of the matching entry in `filenames` with its extension
+# removed. Tables are written with save_extracted_features().
+#
+# dfs       List of feature tables, one per sample.
+# filenames Sample file paths, parallel to `dfs`.
+# directory Output directory, created in the working directory if missing.
+save_feature_tables_to_dir <- function(dfs, filenames, directory) {
+  # mapply() recycles shorter arguments, which would silently write a table
+  # under the wrong sample name; fail loudly instead.
+  stopifnot(length(dfs) == length(filenames))
+
+  filenames <- tools::file_path_sans_ext(basename(filenames))
+  filenames <- paste0(filenames, ".parquet")
+  filenames <- file.path(directory, filenames)
+
+  # An already existing directory is fine; do not emit a warning for it.
+  dir.create(directory, showWarnings = FALSE)
+  mapply(save_extracted_features, dfs, filenames)
+}
+
+# Save every extracted feature table into the "extracted" directory,
+# one parquet file per sample.
+save_all_extracted_features <- function(dfs, filenames) {
+  save_feature_tables_to_dir(dfs, filenames, "extracted")
+}
+
+# Save every corrected feature table into the "corrected" directory,
+# one parquet file per sample (same column layout as extracted features).
+save_all_corrected_features <- function(dfs, filenames) {
+  save_feature_tables_to_dir(dfs, filenames, "corrected")
+}
+
+# Run the unsupervised workflow on `sample_files` and save its outputs.
+#
+# The samples are first ordered by sort_samples_by_acquisition_number()
+# (not defined in this file); `...` is forwarded to unsupervised(). The
+# per-sample feature tables and both feature/sample tables are then written.
+unsupervised_main <- function(sample_files, aligned_file, recovered_file, out_format, ...) {
+  ordered_files <- sort_samples_by_acquisition_number(sample_files)
+
+  result <- unsupervised(filenames = ordered_files, ...)
+
+  save_all_features(result, ordered_files)
+  save_all_feature_tables(
+    result$aligned_feature_sample_table,
+    result$recovered_feature_sample_table,
+    aligned_file,
+    recovered_file,
+    out_format
+  )
+}
+
+# Run the hybrid workflow on `sample_files` and save its outputs.
+#
+# Reads the known table from `known_table_file`, forwards `...` to hybrid(),
+# then writes the updated known table, the feature/known-table pairing, the
+# per-sample feature tables and both feature/sample tables.
+hybrid_main <- function(sample_files, known_table_file, updated_known_table_file, pairing_file, aligned_file, recovered_file, out_format, ...) {
+  ordered_files <- sort_samples_by_acquisition_number(sample_files)
+  known_features <- read_known_table(known_table_file)
+
+  result <- hybrid(filenames = ordered_files, known_table = known_features, ...)
+
+  save_known_table(result$updated_known_table, updated_known_table_file)
+  save_pairing(result$features_known_table_pairing, pairing_file)
+  save_all_features(result, ordered_files)
+  save_all_feature_tables(
+    result$aligned_feature_sample_table,
+    result$recovered_feature_sample_table,
+    aligned_file,
+    recovered_file,
+    out_format
+  )
+}
+
+# Save the per-sample extracted and corrected feature tables found in
+# `result` (elements `extracted_features` and `corrected_features`).
+save_all_features <- function(result, sample_files) {
+  extracted <- result$extracted_features
+  save_all_extracted_features(extracted, sample_files)
+
+  corrected <- result$corrected_features
+  save_all_corrected_features(corrected, sample_files)
+}
+
+# Write the aligned table to `aligned_file` and the recovered table to
+# `recovered_file`; `out_format` only affects the recovered table.
+save_all_feature_tables <- function(aligned_feature_sample_table,
+                                    recovered_feature_sample_table,
+                                    aligned_file,
+                                    recovered_file,
+                                    out_format) {
+  save_aligned_feature_table(
+    df = aligned_feature_sample_table,
+    filename = aligned_file
+  )
+  save_recovered_feature_table(
+    df = recovered_feature_sample_table,
+    filename = recovered_file,
+    out_format = out_format
+  )
+}
+
+# Run the two-step hybrid workflow on `sample_files` and save its outputs.
+#
+# `metadata` is the path of a comma-separated file with a header row. The
+# current working directory is passed as `work_dir` and `...` is forwarded to
+# two.step.hybrid(). Note that `recovered_file` precedes `aligned_file` in
+# this signature.
+two_step_hybrid_main <- function(sample_files, known_table_file, updated_known_table_file, recovered_file, aligned_file, out_format, metadata, ...) {
+  ordered_files <- sort_samples_by_acquisition_number(sample_files)
+  metadata_table <- read.table(metadata, sep = ",", header = TRUE)
+  known_features <- read_known_table(known_table_file)
+
+  result <- two.step.hybrid(
+    filenames = ordered_files,
+    known.table = known_features,
+    work_dir = getwd(),
+    metadata = metadata_table,
+    ...
+  )
+
+  save_known_table(result$known_table, updated_known_table_file)
+  save_aligned_feature_table(result$aligned_features, aligned_file)
+  save_recovered_feature_table(result$final_features, recovered_file, out_format)
+}
b
diff -r 000000000000 -r 067a308223e3 recetox_aplcms_recover_weaker_signals.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/recetox_aplcms_recover_weaker_signals.xml Fri Jun 10 10:18:24 2022 +0000
[
@@ -0,0 +1,131 @@
+<tool id="recetox_aplcms_recover_weaker_signals" name="RECETOX apLCMS - recover weaker signals" version="@TOOL_VERSION@+galaxy0">
+    <description>recover weaker signals from LC/MS spectra</description>
+    <macros>
+        <import>macros.xml</import>
+        <import>macros_split.xml</import>
+    </macros>
+    <expand macro="creator"/>
+
+    <expand macro="requirements"/>
+    <command detect_errors="aggressive"><![CDATA[
+        sh ${symlink_inputs} &&
+        Rscript -e 'source("${__tool_directory__}/utils.R")' -e 'source("${__tool_directory__}/main.R")' -e 'source("${run_script}")'
+    ]]></command>
+    <configfiles>
+        <configfile name="symlink_inputs">
+            #for $infile in $ms_files
+                ln -s '${infile}' '${infile.element_identifier}'
+            #end for
+            #for $infile in $extracted_files
+                ln -s '${infile}' '${infile.element_identifier}'
+            #end for
+            #for $infile in $corrected_files
+                ln -s '${infile}' '${infile.element_identifier}'
+            #end for
+        </configfile>
+        <configfile name="run_script"><![CDATA[
+            #set filenames = str("', '").join([str($f.element_identifier) for $f in $ms_files])
+            filenames <- sort_samples_by_acquisition_number(c('$filenames'))
+
+            #set extracted_files = str("', '").join([str($f.element_identifier) for $f in $extracted_files])
+            extracted <- load_features(c('$extracted_files'))
+
+            #set corrected_files = str("', '").join([str($f.element_identifier) for $f in $corrected_files])
+            corrected <- load_features(c('$corrected_files'))
+
+            aligned <- load_aligned_features('$rt_cross_table_file', '$int_cross_table_file', '$tolerances_file')
+
+            cluster <- as.integer(Sys.getenv('GALAXY_SLOTS', unset = 1))
+
+            recovered <- recover_signals(cluster,
+                                         filenames,
+                                         extracted,
+                                         corrected,
+                                         aligned,
+                                         $mz_tol,
+                                         $weak_signal_recovery.recover_mz_range,
+                                         $weak_signal_recovery.recover_chr_range,
+                                         $weak_signal_recovery.use_observed_range,
+                                         $min_bandwidth,
+                                         $max_bandwidth,
+                                         $weak_signal_recovery.recover_min_count
+            )
+
+            aligned_feature_sample_table <- create_feature_sample_table(aligned)
+            recovered_feature_sample_table <- create_feature_sample_table(recovered)
+
+            save_all_features(recovered, filenames)
+            save_all_feature_tables(aligned_feature_sample_table, recovered_feature_sample_table,
+                                    '${aligned_feature_sample_table}', '${recovered_feature_sample_table}',
+                                    '$output_format.out_format')
+        ]]></configfile>
+    </configfiles>
+
+    <inputs>
+        <param name="ms_files" type="data" format="mzdata,mzml,mzxml,netcdf" multiple="true" min="2" label="Input data"
+               help="Mass spectrometry files for peak extraction." />
+        <param name="extracted_files" type="data" format="parquet" multiple="true" min="2" label="Input extracted feature samples"
+               help="Mass spectrometry files containing feature samples." />
+        <param name="corrected_files" type="data" format="parquet" multiple="true" min="2" label="Input corrected feature samples"
+               help="Mass spectrometry files containing corrected feature samples." />
+        <param name="tolerances_file" type="data" format="parquet" label="Input tolerances" help="TBD"/>
+        <param name="rt_cross_table_file" type="data" format="parquet" label="Input rt cross table" help="TBD"/>
+        <param name="int_cross_table_file" type="data" format="parquet" label="Input int cross table" help="TBD"/>
+        <expand macro="mz_tol_macro"/>
+        <param name="min_bandwidth" type="float" optional="true" label="min_bandwidth (optional)"
+               help="The minimum bandwidth to use in the kernel smoother." />
+        <param name="max_bandwidth" type="float" optional="true" label="max_bandwidth (optional)"
+               help="The maximum bandwidth to use in the kernel smoother." />
+        <expand macro="weak_signal_recovery"/>
+        <expand macro="output_format"/>
+    </inputs>
+
+    <outputs>
+        <expand macro="unsupervised_outputs">
+            <collection  name="extracted_features" type="list" label="${tool.name} extracted_features on ${on_string}">
+                <discover_datasets pattern="__designation__" directory="extracted" format="parquet" />
+            </collection >
+            <collection  name="corrected_features" type="list" label="${tool.name} corrected_features on ${on_string}">
+                <discover_datasets pattern="__designation__" directory="corrected" format="parquet" />
+            </collection >
+        </expand>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="ms_files" value="mbr_test2.mzml,mbr_test1.mzml,mbr_test0.mzml" ftype="mzml"/>
+            <param name="extracted_files" ftype="parquet"
+                   value="extracted_expected/extracted_0.parquet,extracted_expected/extracted_1.parquet,extracted_expected/extracted_2.parquet"/>
+            <param name="corrected_files" ftype="parquet"
+                   value="corrected_expected/corrected_0.parquet,corrected_expected/corrected_1.parquet,corrected_expected/corrected_2.parquet"/>
+            <param name="tolerances_file" value="tolerances.parquet" ftype="parquet"/>
+            <param name="rt_cross_table_file" value="rt_cross_table.parquet" ftype="parquet"/>
+            <param name="int_cross_table_file" value="int_cross_table.parquet" ftype="parquet"/>
+
+            <output name="recovered_feature_sample_table" ftype="parquet"
+                    file="unsupervised_output/unsupervised_recovered_feature_sample_table.parquet"/>
+            <output name="aligned_feature_sample_table" ftype="parquet"
+                    file="unsupervised_output/unsupervised_aligned_feature_sample_table.parquet"/>
+            <output_collection name="corrected_features" type="list">
+                <element name="mbr_test0.parquet" file="unsupervised_output/corrected_features_0.parquet" ftype="parquet"/>
+                <element name="mbr_test1.parquet" file="unsupervised_output/corrected_features_1.parquet" ftype="parquet"/>
+                <element name="mbr_test2.parquet" file="unsupervised_output/corrected_features_2.parquet" ftype="parquet"/>
+            </output_collection>
+            <output_collection name="extracted_features" type="list">
+                <element name="mbr_test0.parquet" file="unsupervised_output/extracted_features_0.parquet" ftype="parquet"/>
+                <element name="mbr_test1.parquet" file="unsupervised_output/extracted_features_1.parquet" ftype="parquet"/>
+                <element name="mbr_test2.parquet" file="unsupervised_output/extracted_features_2.parquet" ftype="parquet"/>
+            </output_collection>
+        </test>
+    </tests>
+
+    <help>
+        <![CDATA[
+            This is a tool which runs apLCMS recovery of weaker signals.
+
+            @GENERAL_HELP@
+        ]]>
+    </help>
+
+    <expand macro="citations"/>
+</tool>
b
diff -r 000000000000 -r 067a308223e3 test-data/corrected_expected/corrected_0.parquet
b
Binary file test-data/corrected_expected/corrected_0.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/corrected_expected/corrected_1.parquet
b
Binary file test-data/corrected_expected/corrected_1.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/corrected_expected/corrected_2.parquet
b
Binary file test-data/corrected_expected/corrected_2.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/extracted_expected/extracted_0.parquet
b
Binary file test-data/extracted_expected/extracted_0.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/extracted_expected/extracted_1.parquet
b
Binary file test-data/extracted_expected/extracted_1.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/extracted_expected/extracted_2.parquet
b
Binary file test-data/extracted_expected/extracted_2.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/hybrid.recetox.parquet
b
Binary file test-data/hybrid.recetox.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/hybrid_recovered_feature_sample_table.parquet
b
Binary file test-data/hybrid_recovered_feature_sample_table.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/int_cross_table.parquet
b
Binary file test-data/int_cross_table.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/known_table.parquet
b
Binary file test-data/known_table.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/mbr_test0.mzml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mbr_test0.mzml Fri Jun 10 10:18:24 2022 +0000
b
b'@@ -0,0 +1,26908 @@\n+<?xml version="1.0" encoding="utf-8"?>\n+<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">\n+  <mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="2016_Jan_12_QE2_47" version="1.1.0">\n+    <cvList count="2">\n+      <cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="3.74.0" URI="http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"/>\n+      <cv id="UO" fullName="Unit Ontology" version="12:10:2011" URI="http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"/>\n+    </cvList>\n+    <fileDescription>\n+      <fileContent>\n+        <cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000580" name="MSn spectrum" value=""/>\n+      </fileContent>\n+      <sourceFileList count="3">\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.raw" name="2016_Jan_12_QE2_47.raw" location="">\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="6f65a1287b2d7522c5fb90a8a5304212b3568b67"/>\n+          <cvParam cvRef="MS" accession="MS:1000563" name="Thermo RAW format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000768" name="Thermo nativeID format" value=""/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzXML" name="2016_Jan_12_QE2_47.mzXML" location="file:///">\n+          <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML 
format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="d9c6278c1c6a8f8ca6c69133bacd00efc5f2e706"/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzML" name="2016_Jan_12_QE2_47.mzML" location="file:////Users/kumar207/Documents/Projects/Pratik/script/../mzML">\n+        </sourceFile>\n+      </sourceFileList>\n+    </fileDescription>\n+    <softwareList count="4">\n+      <software id="Xcalibur_x0020_software" version="2.6-264001/2.6.0.2640">\n+        <cvParam cvRef="MS" accession="MS:1000532" name="Xcalibur" value=""/>\n+      </software>\n+      <software id="ReAdW_x0020_software" version="2015.1.0(build Jun 29 2015 14:09:50)">\n+        <cvParam cvRef="MS" accession="MS:1000541" name="ReAdW" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.8990" version="3.0.8990">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.7680" version="3.0.7680">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+    </softwareList>\n+    <instrumentConfigurationList count="1">\n+      <instrumentConfiguration id="IC1">\n+        <userParam name="msManufacturer" value="Thermo Finnigan"/>\n+        <userParam name="msModel" value="unknown"/>\n+        <componentList count="3">\n+          <source order="1">\n+            <userParam name="msIonisation" value="NSI"/>\n+          </source>\n+          <analyzer order="1">\n+            <cvParam cvRef="MS" accession="MS:1000079" name="fourier transform ion cyclotron resonance mass spectrometer" value=""/>\n+          </analyzer>\n+          <detector order="1">\n+            <userParam name="msDetector" value="unknown"/>\n+          </detector>\n+        </componentList>\n+        <softwareRef ref="Xcalibur_x0020_software"/>\n+      </instrumentConfiguration>\n+    </instrumentConfigurationList>\n+    
<dataProcessingList count="3">\n+      <dataProcessing id="dataProces'..b'ntrollerNumber=1 scan=90261">2414719</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90324">2418981</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90325">2422771</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90389">2426417</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90825">2430097</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90896">2434045</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90952">2438179</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90967">2441823</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91020">2445963</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91036">2449647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91084">2453875</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91100">2457676</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91151">2462056</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91167">2465804</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91234">2469798</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91304">2473755</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91379">2477398</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91464">2481055</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92113">2484679</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92168">2488404</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92183">2492017</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92238">2495975</offset>\n+      <offset 
idRef="controllerType=0 controllerNumber=1 scan=92249">2499652</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92303">2503647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92318">2507303</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92389">2511184</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94116">2514904</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94183">2518552</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94258">2522503</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94332">2526517</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94729">2530126</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94857">2534084</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94919">2538185</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94980">2541937</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=95063">2545506</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96694">2549319</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96764">2552999</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96836">2556647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96918">2560122</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96993">2563742</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=97169">2567598</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99329">2571216</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99385">2574908</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99446">2578708</offset>\n+    </index>\n+    <index name="chromatogram">\n+    </index>\n+  
</indexList>\n+  <indexListOffset>2582386</indexListOffset>\n+  <fileChecksum>23b8703ddaf744e69e1ba72be2584fd3df6f2dc4</fileChecksum>\n+</indexedmzML>\n'
b
diff -r 000000000000 -r 067a308223e3 test-data/mbr_test0_copy.mzml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mbr_test0_copy.mzml Fri Jun 10 10:18:24 2022 +0000
b
b'@@ -0,0 +1,26908 @@\n+<?xml version="1.0" encoding="utf-8"?>\n+<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">\n+  <mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="2016_Jan_12_QE2_47" version="1.1.0">\n+    <cvList count="2">\n+      <cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="3.74.0" URI="http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"/>\n+      <cv id="UO" fullName="Unit Ontology" version="12:10:2011" URI="http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"/>\n+    </cvList>\n+    <fileDescription>\n+      <fileContent>\n+        <cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000580" name="MSn spectrum" value=""/>\n+      </fileContent>\n+      <sourceFileList count="3">\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.raw" name="2016_Jan_12_QE2_47.raw" location="">\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="6f65a1287b2d7522c5fb90a8a5304212b3568b67"/>\n+          <cvParam cvRef="MS" accession="MS:1000563" name="Thermo RAW format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000768" name="Thermo nativeID format" value=""/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzXML" name="2016_Jan_12_QE2_47.mzXML" location="file:///">\n+          <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML 
format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="d9c6278c1c6a8f8ca6c69133bacd00efc5f2e706"/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzML" name="2016_Jan_12_QE2_47.mzML" location="file:////Users/kumar207/Documents/Projects/Pratik/script/../mzML">\n+        </sourceFile>\n+      </sourceFileList>\n+    </fileDescription>\n+    <softwareList count="4">\n+      <software id="Xcalibur_x0020_software" version="2.6-264001/2.6.0.2640">\n+        <cvParam cvRef="MS" accession="MS:1000532" name="Xcalibur" value=""/>\n+      </software>\n+      <software id="ReAdW_x0020_software" version="2015.1.0(build Jun 29 2015 14:09:50)">\n+        <cvParam cvRef="MS" accession="MS:1000541" name="ReAdW" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.8990" version="3.0.8990">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.7680" version="3.0.7680">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+    </softwareList>\n+    <instrumentConfigurationList count="1">\n+      <instrumentConfiguration id="IC1">\n+        <userParam name="msManufacturer" value="Thermo Finnigan"/>\n+        <userParam name="msModel" value="unknown"/>\n+        <componentList count="3">\n+          <source order="1">\n+            <userParam name="msIonisation" value="NSI"/>\n+          </source>\n+          <analyzer order="1">\n+            <cvParam cvRef="MS" accession="MS:1000079" name="fourier transform ion cyclotron resonance mass spectrometer" value=""/>\n+          </analyzer>\n+          <detector order="1">\n+            <userParam name="msDetector" value="unknown"/>\n+          </detector>\n+        </componentList>\n+        <softwareRef ref="Xcalibur_x0020_software"/>\n+      </instrumentConfiguration>\n+    </instrumentConfigurationList>\n+    
<dataProcessingList count="3">\n+      <dataProcessing id="dataProces'..b'ntrollerNumber=1 scan=90261">2414719</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90324">2418981</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90325">2422771</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90389">2426417</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90825">2430097</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90896">2434045</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90952">2438179</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90967">2441823</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91020">2445963</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91036">2449647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91084">2453875</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91100">2457676</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91151">2462056</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91167">2465804</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91234">2469798</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91304">2473755</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91379">2477398</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91464">2481055</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92113">2484679</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92168">2488404</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92183">2492017</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92238">2495975</offset>\n+      <offset 
idRef="controllerType=0 controllerNumber=1 scan=92249">2499652</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92303">2503647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92318">2507303</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92389">2511184</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94116">2514904</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94183">2518552</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94258">2522503</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94332">2526517</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94729">2530126</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94857">2534084</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94919">2538185</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94980">2541937</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=95063">2545506</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96694">2549319</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96764">2552999</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96836">2556647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96918">2560122</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96993">2563742</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=97169">2567598</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99329">2571216</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99385">2574908</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99446">2578708</offset>\n+    </index>\n+    <index name="chromatogram">\n+    </index>\n+  
</indexList>\n+  <indexListOffset>2582386</indexListOffset>\n+  <fileChecksum>23b8703ddaf744e69e1ba72be2584fd3df6f2dc4</fileChecksum>\n+</indexedmzML>\n'
b
diff -r 000000000000 -r 067a308223e3 test-data/mbr_test1.mzml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mbr_test1.mzml Fri Jun 10 10:18:24 2022 +0000
b
b'@@ -0,0 +1,26908 @@\n+<?xml version="1.0" encoding="utf-8"?>\n+<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">\n+  <mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="2016_Jan_12_QE2_47" version="1.1.0">\n+    <cvList count="2">\n+      <cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="3.74.0" URI="http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"/>\n+      <cv id="UO" fullName="Unit Ontology" version="12:10:2011" URI="http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"/>\n+    </cvList>\n+    <fileDescription>\n+      <fileContent>\n+        <cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000580" name="MSn spectrum" value=""/>\n+      </fileContent>\n+      <sourceFileList count="3">\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.raw" name="2016_Jan_12_QE2_47.raw" location="">\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="6f65a1287b2d7522c5fb90a8a5304212b3568b67"/>\n+          <cvParam cvRef="MS" accession="MS:1000563" name="Thermo RAW format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000768" name="Thermo nativeID format" value=""/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzXML" name="2016_Jan_12_QE2_47.mzXML" location="file:///">\n+          <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML 
format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="d9c6278c1c6a8f8ca6c69133bacd00efc5f2e706"/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzML" name="2016_Jan_12_QE2_47.mzML" location="file:////Users/kumar207/Documents/Projects/Pratik/script/../mzML">\n+        </sourceFile>\n+      </sourceFileList>\n+    </fileDescription>\n+    <softwareList count="4">\n+      <software id="Xcalibur_x0020_software" version="2.6-264001/2.6.0.2640">\n+        <cvParam cvRef="MS" accession="MS:1000532" name="Xcalibur" value=""/>\n+      </software>\n+      <software id="ReAdW_x0020_software" version="2015.1.0(build Jun 29 2015 14:09:50)">\n+        <cvParam cvRef="MS" accession="MS:1000541" name="ReAdW" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.8990" version="3.0.8990">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.7680" version="3.0.7680">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+    </softwareList>\n+    <instrumentConfigurationList count="1">\n+      <instrumentConfiguration id="IC1">\n+        <userParam name="msManufacturer" value="Thermo Finnigan"/>\n+        <userParam name="msModel" value="unknown"/>\n+        <componentList count="3">\n+          <source order="1">\n+            <userParam name="msIonisation" value="NSI"/>\n+          </source>\n+          <analyzer order="1">\n+            <cvParam cvRef="MS" accession="MS:1000079" name="fourier transform ion cyclotron resonance mass spectrometer" value=""/>\n+          </analyzer>\n+          <detector order="1">\n+            <userParam name="msDetector" value="unknown"/>\n+          </detector>\n+        </componentList>\n+        <softwareRef ref="Xcalibur_x0020_software"/>\n+      </instrumentConfiguration>\n+    </instrumentConfigurationList>\n+    
<dataProcessingList count="3">\n+      <dataProcessing id="dataProces'..b'ntrollerNumber=1 scan=90261">2414719</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90324">2418981</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90325">2422771</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90389">2426417</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90825">2430097</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90896">2434045</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90952">2438179</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90967">2441823</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91020">2445963</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91036">2449647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91084">2453875</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91100">2457676</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91151">2462056</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91167">2465804</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91234">2469798</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91304">2473755</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91379">2477398</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91464">2481055</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92113">2484679</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92168">2488404</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92183">2492017</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92238">2495975</offset>\n+      <offset 
idRef="controllerType=0 controllerNumber=1 scan=92249">2499652</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92303">2503647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92318">2507303</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92389">2511184</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94116">2514904</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94183">2518552</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94258">2522503</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94332">2526517</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94729">2530126</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94857">2534084</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94919">2538185</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94980">2541937</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=95063">2545506</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96694">2549319</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96764">2552999</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96836">2556647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96918">2560122</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96993">2563742</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=97169">2567598</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99329">2571216</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99385">2574908</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99446">2578708</offset>\n+    </index>\n+    <index name="chromatogram">\n+    </index>\n+  
</indexList>\n+  <indexListOffset>2582386</indexListOffset>\n+  <fileChecksum>23b8703ddaf744e69e1ba72be2584fd3df6f2dc4</fileChecksum>\n+</indexedmzML>\n'
b
diff -r 000000000000 -r 067a308223e3 test-data/mbr_test2.mzml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mbr_test2.mzml Fri Jun 10 10:18:24 2022 +0000
b
b'@@ -0,0 +1,26908 @@\n+<?xml version="1.0" encoding="utf-8"?>\n+<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">\n+  <mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="2016_Jan_12_QE2_47" version="1.1.0">\n+    <cvList count="2">\n+      <cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="3.74.0" URI="http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"/>\n+      <cv id="UO" fullName="Unit Ontology" version="12:10:2011" URI="http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"/>\n+    </cvList>\n+    <fileDescription>\n+      <fileContent>\n+        <cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000580" name="MSn spectrum" value=""/>\n+      </fileContent>\n+      <sourceFileList count="3">\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.raw" name="2016_Jan_12_QE2_47.raw" location="">\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="6f65a1287b2d7522c5fb90a8a5304212b3568b67"/>\n+          <cvParam cvRef="MS" accession="MS:1000563" name="Thermo RAW format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000768" name="Thermo nativeID format" value=""/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzXML" name="2016_Jan_12_QE2_47.mzXML" location="file:///">\n+          <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML 
format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="d9c6278c1c6a8f8ca6c69133bacd00efc5f2e706"/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzML" name="2016_Jan_12_QE2_47.mzML" location="file:////Users/kumar207/Documents/Projects/Pratik/script/../mzML">\n+        </sourceFile>\n+      </sourceFileList>\n+    </fileDescription>\n+    <softwareList count="4">\n+      <software id="Xcalibur_x0020_software" version="2.6-264001/2.6.0.2640">\n+        <cvParam cvRef="MS" accession="MS:1000532" name="Xcalibur" value=""/>\n+      </software>\n+      <software id="ReAdW_x0020_software" version="2015.1.0(build Jun 29 2015 14:09:50)">\n+        <cvParam cvRef="MS" accession="MS:1000541" name="ReAdW" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.8990" version="3.0.8990">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.7680" version="3.0.7680">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+    </softwareList>\n+    <instrumentConfigurationList count="1">\n+      <instrumentConfiguration id="IC1">\n+        <userParam name="msManufacturer" value="Thermo Finnigan"/>\n+        <userParam name="msModel" value="unknown"/>\n+        <componentList count="3">\n+          <source order="1">\n+            <userParam name="msIonisation" value="NSI"/>\n+          </source>\n+          <analyzer order="1">\n+            <cvParam cvRef="MS" accession="MS:1000079" name="fourier transform ion cyclotron resonance mass spectrometer" value=""/>\n+          </analyzer>\n+          <detector order="1">\n+            <userParam name="msDetector" value="unknown"/>\n+          </detector>\n+        </componentList>\n+        <softwareRef ref="Xcalibur_x0020_software"/>\n+      </instrumentConfiguration>\n+    </instrumentConfigurationList>\n+    
<dataProcessingList count="3">\n+      <dataProcessing id="dataProces'..b'ntrollerNumber=1 scan=90261">2414719</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90324">2418981</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90325">2422771</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90389">2426417</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90825">2430097</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90896">2434045</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90952">2438179</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90967">2441823</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91020">2445963</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91036">2449647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91084">2453875</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91100">2457676</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91151">2462056</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91167">2465804</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91234">2469798</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91304">2473755</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91379">2477398</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91464">2481055</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92113">2484679</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92168">2488404</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92183">2492017</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92238">2495975</offset>\n+      <offset 
idRef="controllerType=0 controllerNumber=1 scan=92249">2499652</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92303">2503647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92318">2507303</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92389">2511184</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94116">2514904</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94183">2518552</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94258">2522503</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94332">2526517</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94729">2530126</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94857">2534084</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94919">2538185</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94980">2541937</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=95063">2545506</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96694">2549319</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96764">2552999</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96836">2556647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96918">2560122</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96993">2563742</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=97169">2567598</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99329">2571216</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99385">2574908</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99446">2578708</offset>\n+    </index>\n+    <index name="chromatogram">\n+    </index>\n+  
</indexList>\n+  <indexListOffset>2582386</indexListOffset>\n+  <fileChecksum>23b8703ddaf744e69e1ba72be2584fd3df6f2dc4</fileChecksum>\n+</indexedmzML>\n'
b
diff -r 000000000000 -r 067a308223e3 test-data/rt_cross_table.parquet
b
Binary file test-data/rt_cross_table.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/tolerances.parquet
b
Binary file test-data/tolerances.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/two_step_hybrid.recetox.parquet
b
Binary file test-data/two_step_hybrid.recetox.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/two_step_hybrid_info.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/two_step_hybrid_info.csv Fri Jun 10 10:18:24 2022 +0000
b
@@ -0,0 +1,5 @@
+sample_name, batch
+mbr_test0, 1
+mbr_test1, 1
+mbr_test2, 2
+mbr_test0_copy, 2
b
diff -r 000000000000 -r 067a308223e3 test-data/unsupervised_output/corrected_features_0.parquet
b
Binary file test-data/unsupervised_output/corrected_features_0.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/unsupervised_output/corrected_features_1.parquet
b
Binary file test-data/unsupervised_output/corrected_features_1.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/unsupervised_output/corrected_features_2.parquet
b
Binary file test-data/unsupervised_output/corrected_features_2.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/unsupervised_output/extracted_features_0.parquet
b
Binary file test-data/unsupervised_output/extracted_features_0.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/unsupervised_output/extracted_features_1.parquet
b
Binary file test-data/unsupervised_output/extracted_features_1.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/unsupervised_output/extracted_features_2.parquet
b
Binary file test-data/unsupervised_output/extracted_features_2.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/unsupervised_output/unsupervised.recetox.parquet
b
Binary file test-data/unsupervised_output/unsupervised.recetox.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/unsupervised_output/unsupervised_aligned_feature_sample_table.parquet
b
Binary file test-data/unsupervised_output/unsupervised_aligned_feature_sample_table.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 test-data/unsupervised_output/unsupervised_recovered_feature_sample_table.parquet
b
Binary file test-data/unsupervised_output/unsupervised_recovered_feature_sample_table.parquet has changed
b
diff -r 000000000000 -r 067a308223e3 utils.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.R Fri Jun 10 10:18:24 2022 +0000
[
@@ -0,0 +1,149 @@
+library(recetox.aplcms)
+
+# Align features across samples and repackage the result.
+#
+# Every argument in `...` is forwarded unchanged to `feature.align()`.
+# `sample_names` labels the per-sample columns of both crosstabs.
+# Returns a list holding `mz_tolerance` and `rt_tolerance` (coerced to
+# numeric) and the `rt_crosstab` / `int_crosstab` data frames produced by
+# `as_feature_crosstab()`.
+align_features <- function(sample_names, ...) {
+  alignment <- feature.align(...)
+  # Features are identified by their row position in the alignment output.
+  feature_ids <- seq_len(nrow(alignment$pk.times))
+
+  mz_tol <- as.numeric(alignment$mz.tol)
+  rt_tol <- as.numeric(alignment$chr.tol)
+  rt_table <- as_feature_crosstab(
+    feature_ids, sample_names, alignment$pk.times
+  )
+  int_table <- as_feature_crosstab(
+    feature_ids, sample_names, alignment$aligned.ftrs
+  )
+
+  list(
+    mz_tolerance = mz_tol,
+    rt_tolerance = rt_tol,
+    rt_crosstab = rt_table,
+    int_crosstab = int_table
+  )
+}
+
+# Derive a sample name from a file path: drop the directory part first,
+# then strip the file extension.
+get_sample_name <- function(filename) {
+  file_only <- basename(filename)
+  tools::file_path_sans_ext(file_only)
+}
+
+# Label a feature crosstab and return it as a data frame.
+#
+# The first four columns are named "mz", "rt", "mz_min" and "mz_max"; the
+# remaining columns take `sample_names`. Rows are labelled with
+# `feature_names`.
+as_feature_crosstab <- function(feature_names, sample_names, data) {
+  fixed_columns <- c("mz", "rt", "mz_min", "mz_max")
+  rownames(data) <- feature_names
+  colnames(data) <- c(fixed_columns, sample_names)
+  as.data.frame(data)
+}
+
+# Reshape the retention-time and intensity crosstabs into one long table
+# with a row per (feature, sample) pair.
+#
+# Both crosstabs are expected to carry four leading per-feature columns
+# followed by one column per sample. Feature names come from the row names
+# of `rt_crosstab`; `mz` and `rt` come from its first two columns.
+# Returns a data frame with columns feature, mz, rt, sample, sample_rt and
+# sample_intensity.
+as_feature_sample_table <- function(rt_crosstab, int_crosstab) {
+  leading <- seq_len(4)
+  features <- rownames(rt_crosstab)
+  samples <- colnames(rt_crosstab)[-leading]
+  axes <- list(feature = features, sample = samples)
+
+  # data.frame -> matrix -> table, so that as.data.frame() on the table
+  # yields the long (feature, sample, value) layout.
+  to_long <- function(crosstab, value_name) {
+    tab <- as.table(as.matrix(crosstab[, -leading]))
+    dimnames(tab) <- axes
+    as.data.frame(tab, responseName = value_name)
+  }
+
+  per_feature <- data.frame(
+    feature = features,
+    mz = rt_crosstab[, 1],
+    rt = rt_crosstab[, 2]
+  )
+  long_rt <- to_long(rt_crosstab, "sample_rt")
+  long_int <- to_long(int_crosstab, "sample_intensity")
+
+  per_pair <- merge(long_rt, long_int, by = c("feature", "sample"))
+  merge(per_feature, per_pair, by = "feature")
+}
+
+# Read a set of parquet feature files and return them as a list of
+# matrices, one per file.
+#
+# The files are first passed through `sort_samples_by_acquisition_number()`
+# (defined outside this section; presumably it orders them by acquisition
+# number - confirm against its definition).
+load_features <- function(files) {
+  ordered_files <- sort_samples_by_acquisition_number(files)
+  tables <- lapply(ordered_files, arrow::read_parquet)
+  lapply(tables, as.matrix)
+}
+
+# Write each element of `data` to its own parquet file inside `subdir`.
+#
+# `subdir` is created first. Files are named
+# "<subdir>_features_<k>.parquet" with a zero-based index k, so the first
+# element of `data` goes to ..._features_0.parquet. When `data` is empty
+# the directory is created and no files are written.
+save_data_as_parquet_files <- function(data, subdir) {
+  dir.create(subdir)
+  # seq_along() yields nothing for empty input, whereas
+  # 0:(length(data) - 1) would count down through 0 and -1.
+  for (position in seq_along(data)) {
+    # Integer arithmetic keeps the file suffix formatted as a plain integer.
+    filename <- file.path(
+      subdir,
+      paste0(subdir, "_features_", position - 1L, ".parquet")
+    )
+    # Single-bracket subsetting is kept on purpose: the one-element list is
+    # what gets converted to a data frame.
+    arrow::write_parquet(as.data.frame(data[position]), filename)
+  }
+}
+
+# Persist an alignment result as three parquet files.
+#
+# `aligned$rt_crosstab` goes to `rt_file` and `aligned$int_crosstab` to
+# `int_file`. The two tolerances are written to `tol_file` as a data frame
+# with columns `mz_tolerance` and `rt_tolerance`.
+save_aligned_features <- function(aligned, rt_file, int_file, tol_file) {
+  rt_frame <- as.data.frame(aligned$rt_crosstab)
+  arrow::write_parquet(rt_frame, rt_file)
+
+  int_frame <- as.data.frame(aligned$int_crosstab)
+  arrow::write_parquet(int_frame, int_file)
+
+  # The local names double as the column names of the tolerance table.
+  mz_tolerance <- c(aligned$mz_tolerance)
+  rt_tolerance <- c(aligned$rt_tolerance)
+  tolerances <- data.frame(mz_tolerance, rt_tolerance)
+  arrow::write_parquet(tolerances, tol_file)
+}
+
+# Load an alignment result back from the three parquet files written by
+# save_aligned_features().
+#
+# Returns a list with mz_tolerance, rt_tolerance, rt_crosstab and
+# int_crosstab (in that order).
+load_aligned_features <- function(rt_file, int_file, tol_file) {
+  rt_table <- arrow::read_parquet(rt_file)
+  int_table <- arrow::read_parquet(int_file)
+  tolerances <- arrow::read_parquet(tol_file)
+
+  aligned <- list()
+  aligned[["mz_tolerance"]] <- tolerances$mz_tolerance
+  aligned[["rt_tolerance"]] <- tolerances$rt_tolerance
+  aligned[["rt_crosstab"]] <- rt_table
+  aligned[["int_crosstab"]] <- int_table
+  aligned
+}
+
+# Run recover.weaker() for every sample on a parallel cluster and rebuild
+# the retention-time and intensity crosstabs from the per-sample results.
+#
+# Args:
+#   cluster: an existing cluster object, or a specification passed to
+#     parallel::makeCluster(); a cluster created here is stopped on exit.
+#   filenames: per-sample input file names.
+#   extracted, corrected: per-sample feature tables, indexed like
+#     `filenames`.
+#   aligned: list with rt_crosstab, int_crosstab, mz_tolerance and
+#     rt_tolerance (four descriptor columns, then one column per sample).
+#   mz_tol, mz_range, rt_range, use_observed_range, min_bandwidth,
+#     max_bandwidth, recover_min_count: forwarded to recover.weaker().
+#
+# Returns:
+#   list(extracted_features, corrected_features, rt_crosstab, int_crosstab).
+recover_signals <- function(cluster,
+                            filenames,
+                            extracted,
+                            corrected,
+                            aligned,
+                            mz_tol = 1e-05,
+                            mz_range = NA,
+                            rt_range = NA,
+                            use_observed_range = TRUE,
+                            min_bandwidth = NA,
+                            max_bandwidth = NA,
+                            recover_min_count = 3) {
+  if (!inherits(cluster, "cluster")) {
+    cluster <- parallel::makeCluster(cluster)
+    on.exit(parallel::stopCluster(cluster), add = TRUE)
+  }
+
+  # Export from this call frame. The default (.GlobalEnv) would ship
+  # whatever globals happen to share these names, or fail if none exist.
+  # recover.weaker is still resolved through the enclosing environments.
+  parallel::clusterExport(
+    cluster,
+    c("extracted", "corrected", "aligned", "recover.weaker"),
+    envir = environment()
+  )
+  parallel::clusterEvalQ(cluster, library("splines"))
+
+  recovered <- parallel::parLapply(
+    cluster,
+    seq_along(filenames),
+    function(i) {
+      recover.weaker(
+        loc = i,
+        filename = filenames[[i]],
+        this.f1 = extracted[[i]],
+        this.f2 = corrected[[i]],
+        pk.times = aligned$rt_crosstab,
+        aligned.ftrs = aligned$int_crosstab,
+        orig.tol = mz_tol,
+        align.mz.tol = aligned$mz_tolerance,
+        align.chr.tol = aligned$rt_tolerance,
+        mz.range = mz_range,
+        chr.range = rt_range,
+        use.observed.range = use_observed_range,
+        bandwidth = 0.5,
+        min.bw = min_bandwidth,
+        max.bw = max_bandwidth,
+        recover.min.count = recover_min_count
+      )
+    }
+  )
+
+  # One column per sample. cbind() over the list keeps a matrix even when
+  # there is only a single feature row, where sapply() would collapse the
+  # result to a plain vector and corrupt the crosstab.
+  recovered_rt <- do.call(cbind, lapply(recovered, function(x) x$this.times))
+  recovered_int <- do.call(cbind, lapply(recovered, function(x) x$this.ftrs))
+
+  feature_table <- aligned$rt_crosstab[, 1:4]
+  rt_crosstab <- cbind(feature_table, recovered_rt)
+  int_crosstab <- cbind(feature_table, recovered_int)
+
+  feature_names <- rownames(feature_table)
+  # Index the names rather than the table, so a single sample column in a
+  # base data.frame cannot drop to an unnamed vector.
+  sample_names <- colnames(aligned$rt_crosstab)[-(1:4)]
+
+  list(
+    extracted_features = lapply(recovered, function(x) x$this.f1),
+    corrected_features = lapply(recovered, function(x) x$this.f2),
+    rt_crosstab = as_feature_crosstab(
+      feature_names, sample_names, rt_crosstab
+    ),
+    int_crosstab = as_feature_crosstab(
+      feature_names, sample_names, int_crosstab
+    )
+  )
+}
+
+# Build the long feature/sample table from a `features` list that holds
+# `rt_crosstab` and `int_crosstab`; delegates to as_feature_sample_table().
+create_feature_sample_table <- function(features) {
+  as_feature_sample_table(features$rt_crosstab, features$int_crosstab)
+}