comparison utils.R @ 2:abe783e0daca draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit 506df2aef355b3791567283e1a175914f06b405a
author recetox
date Mon, 13 Feb 2023 10:26:59 +0000
parents 57c644d3f24c
children 1e2a13bcb5a7
comparison
equal deleted inserted replaced
1:b07fd3d7ffd0 2:abe783e0daca
1 library(recetox.aplcms) 1 library(recetox.aplcms)
2 2
3 align_features <- function(sample_names, ...) { 3 get_env_sample_name <- function() {
4 aligned <- feature.align(...) 4 sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA)
5 feature_names <- seq_len(nrow(aligned$pk.times)) 5 if (nchar(sample_name) == 0) {
6 6 sample_name <- NA
7 list( 7 }
8 mz_tolerance = as.numeric(aligned$mz.tol), 8 if (is.na(sample_name)) {
9 rt_tolerance = as.numeric(aligned$chr.tol), 9 message("The mzML file does not contain run ID.")
10 rt_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$pk.times), 10 }
11 int_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$aligned.ftrs) 11 return(sample_name)
12 )
13 } 12 }
14 13
15 get_sample_name <- function(filename) { 14 save_sample_name <- function(df, sample_name) {
16 tools::file_path_sans_ext(basename(filename)) 15 attr(df, "sample_name") <- sample_name
16 return(df)
17 } 17 }
18 18
19 as_feature_crosstab <- function(feature_names, sample_names, data) { 19 load_sample_name <- function(df) {
20 colnames(data) <- c("mz", "rt", "mz_min", "mz_max", sample_names) 20 sample_name <- attr(df, "sample_name")
21 rownames(data) <- feature_names 21 if (is.null(sample_name)) {
22 as.data.frame(data) 22 return(NA)
23 } else {
24 return(sample_name)
25 }
23 } 26 }
24 27
25 as_feature_sample_table <- function(rt_crosstab, int_crosstab) { 28 save_data_as_parquet_file <- function(data, filename) {
26 feature_names <- rownames(rt_crosstab) 29 arrow::write_parquet(data, filename)
27 sample_names <- colnames(rt_crosstab)[- (1:4)]
28
29 feature_table <- data.frame(
30 feature = feature_names,
31 mz = rt_crosstab[, 1],
32 rt = rt_crosstab[, 2]
33 )
34
35 # series of conversions to produce a table type from data.frame
36 rt_crosstab <- as.table(as.matrix(rt_crosstab[, - (1:4)]))
37 int_crosstab <- as.table(as.matrix(int_crosstab[, - (1:4)]))
38
39 crosstab_axes <- list(feature = feature_names, sample = sample_names)
40 dimnames(rt_crosstab) <- dimnames(int_crosstab) <- crosstab_axes
41
42 x <- as.data.frame(rt_crosstab, responseName = "sample_rt")
43 y <- as.data.frame(int_crosstab, responseName = "sample_intensity")
44
45 data <- merge(x, y, by = c("feature", "sample"))
46 data <- merge(feature_table, data, by = "feature")
47 data
48 } 30 }
49 31
50 load_features <- function(files) { 32 load_data_from_parquet_file <- function(filename) {
51 files_list <- sort_samples_by_acquisition_number(files) 33 return(arrow::read_parquet(filename))
52 features <- lapply(files_list, arrow::read_parquet) 34 }
53 features <- lapply(features, as.matrix) 35
36 load_parquet_collection <- function(files) {
37 features <- lapply(files, arrow::read_parquet)
38 features <- lapply(features, tibble::as_tibble)
54 return(features) 39 return(features)
55 } 40 }
56 41
57 save_data_as_parquet_files <- function(data, subdir) { 42 save_parquet_collection <- function(table, sample_names, subdir) {
58 dir.create(subdir) 43 dir.create(subdir)
59 for (i in 0:(length(data) - 1)) { 44 for (i in seq_len(length(table$feature_tables))) {
60 filename <- file.path(subdir, paste0(subdir, "_features_", i, ".parquet")) 45 filename <- file.path(subdir, paste0(subdir, "_", sample_names[i], ".parquet"))
61 arrow::write_parquet(as.data.frame(data[i + 1]), filename) 46 feature_table <- as.data.frame(table$feature_tables[[i]])
62 } 47 feature_table <- save_sample_name(feature_table, sample_names[i])
48 arrow::write_parquet(feature_table, filename)
49 }
63 } 50 }
64 51
65 save_aligned_features <- function(aligned, rt_file, int_file, tol_file) { 52 sort_by_sample_name <- function(tables, sample_names) {
66 arrow::write_parquet(as.data.frame(aligned$rt_crosstab), rt_file) 53 return(tables[order(sample_names)])
67 arrow::write_parquet(as.data.frame(aligned$int_crosstab), int_file)
68
69 mz_tolerance <- c(aligned$mz_tolerance)
70 rt_tolerance <- c(aligned$rt_tolerance)
71 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file)
72 } 54 }
73 55
74 load_aligned_features <- function(rt_file, int_file, tol_file) { 56 save_tolerances <- function(table, tol_file) {
75 rt_cross_table <- arrow::read_parquet(rt_file) 57 mz_tolerance <- c(table$mz_tol_relative)
76 int_cross_table <- arrow::read_parquet(int_file) 58 rt_tolerance <- c(table$rt_tol_relative)
77 tolerances_table <- arrow::read_parquet(tol_file) 59 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file)
78
79 result <- list()
80 result$mz_tolerance <- tolerances_table$mz_tolerance
81 result$rt_tolerance <- tolerances_table$rt_tolerance
82 result$rt_crosstab <- rt_cross_table
83 result$int_crosstab <- int_cross_table
84 return(result)
85 } 60 }
86 61
87 recover_signals <- function(cluster, 62 get_mz_tol <- function(tolerances) {
88 filenames, 63 return(tolerances$mz_tolerance)
89 extracted,
90 corrected,
91 aligned,
92 mz_tol = 1e-05,
93 mz_range = NA,
94 rt_range = NA,
95 use_observed_range = TRUE,
96 min_bandwidth = NA,
97 max_bandwidth = NA,
98 recover_min_count = 3) {
99 if (!is(cluster, "cluster")) {
100 cluster <- parallel::makeCluster(cluster)
101 on.exit(parallel::stopCluster(cluster))
102 }
103
104 clusterExport(cluster, c("extracted", "corrected", "aligned", "recover.weaker"))
105 clusterEvalQ(cluster, library("splines"))
106
107 recovered <- parLapply(cluster, seq_along(filenames), function(i) {
108 recover.weaker(
109 loc = i,
110 filename = filenames[[i]],
111 this.f1 = extracted[[i]],
112 this.f2 = corrected[[i]],
113 pk.times = aligned$rt_crosstab,
114 aligned.ftrs = aligned$int_crosstab,
115 orig.tol = mz_tol,
116 align.mz.tol = aligned$mz_tolerance,
117 align.chr.tol = aligned$rt_tolerance,
118 mz.range = mz_range,
119 chr.range = rt_range,
120 use.observed.range = use_observed_range,
121 bandwidth = 0.5,
122 min.bw = min_bandwidth,
123 max.bw = max_bandwidth,
124 recover.min.count = recover_min_count
125 )
126 })
127
128 feature_table <- aligned$rt_crosstab[, 1:4]
129 rt_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.times))
130 int_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.ftrs))
131
132 feature_names <- rownames(feature_table)
133 sample_names <- colnames(aligned$rt_crosstab[, - (1:4)])
134
135 list(
136 extracted_features = lapply(recovered, function(x) x$this.f1),
137 corrected_features = lapply(recovered, function(x) x$this.f2),
138 rt_crosstab = as_feature_crosstab(feature_names, sample_names, rt_crosstab),
139 int_crosstab = as_feature_crosstab(feature_names, sample_names, int_crosstab)
140 )
141 } 64 }
142 65
143 create_feature_sample_table <- function(features) { 66 get_rt_tol <- function(tolerances) {
144 table <- as_feature_sample_table( 67 return(tolerances$rt_tolerance)
145 rt_crosstab = features$rt_crosstab,
146 int_crosstab = features$int_crosstab
147 )
148 return(table)
149 } 68 }
69
70 save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) {
71 save_data_as_parquet_file(aligned_features$metadata, metadata_file)
72 save_data_as_parquet_file(aligned_features$rt, rt_file)
73 save_data_as_parquet_file(aligned_features$intensity, intensity_file)
74 }
75
76 select_table_with_sample_name <- function(tables, sample_name) {
77 sample_names <- lapply(tables, load_sample_name)
78 index <- which(sample_names == sample_name)
79 if (length(index) > 0) {
80 return(tables[[index]])
81 } else {
82 stop(sprintf("Mismatch - sample name '%s' not present in %s",
83 sample_name, paste(sample_names, collapse = ", ")))
84 }
85 }
86
87 select_adjusted <- function(recovered_features) {
88 return(recovered_features$adjusted_features)
89 }
90
91 known_table_columns <- function() {
92 c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type",
93 "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max",
94 "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)",
95 "int_min(log)", "int_max(log)")
96 }
97
98 save_known_table <- function(table, filename) {
99 columns <- known_table_columns()
100 arrow::write_parquet(table$known_table[columns], filename)
101 }
102
103 read_known_table <- function(filename) {
104 arrow::read_parquet(filename, col_select = known_table_columns())
105 }
106
107 save_pairing <- function(table, filename) {
108 df <- table$pairing %>% as_tibble() %>% setNames(c("new", "old"))
109 arrow::write_parquet(df, filename)
110 }
111
112 join_tables_to_list <- function(metadata, rt_table, intensity_table) {
113 features <- new("list")
114 features$metadata <- metadata
115 features$intensity <- intensity_table
116 features$rt <- rt_table
117 return(features)
118 }
119
120 validate_sample_names <- function(sample_names) {
121 if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) {
122 stop(sprintf("Sample names absent or not unique - provided sample names: %s",
123 paste(sample_names, collapse = ", ")))
124 }
125 }