comparison utils.R @ 8:4963811b6dec draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit bc3445f7c41271b0062c7674108f57708d08dd28
author recetox
date Thu, 30 May 2024 14:53:31 +0000
parents e2dc815e1b2c
children
comparison
equal deleted inserted replaced
7:e2dc815e1b2c 8:4963811b6dec
1 library(recetox.aplcms) 1 library(recetox.aplcms)
2 2
3 get_env_sample_name <- function() { 3 get_env_sample_name <- function() {
4 sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA) 4 sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA)
5 if (nchar(sample_name) == 0) { 5 if (nchar(sample_name) == 0) {
6 sample_name <- NA 6 sample_name <- NA
7 } 7 }
8 if (is.na(sample_name)) { 8 if (is.na(sample_name)) {
9 message("The mzML file does not contain run ID.") 9 message("The mzML file does not contain run ID.")
10 } 10 }
11 return(sample_name) 11 return(sample_name)
12 } 12 }
13 13
14 save_sample_name <- function(df, sample_name) { 14 save_sample_name <- function(df, sample_name) {
15 attr(df, "sample_name") <- sample_name 15 attr(df, "sample_name") <- sample_name
16 return(df) 16 return(df)
17 } 17 }
18 18
19 restore_sample_name <- function(df) { 19 restore_sample_name <- function(df) {
20 return(df$sample_id[1]) 20 return(df$sample_id[1])
21 } 21 }
22 22
23 load_sample_name <- function(df) { 23 load_sample_name <- function(df) {
24 sample_name <- attr(df, "sample_name") 24 sample_name <- attr(df, "sample_name")
25 if (is.null(sample_name)) { 25 if (is.null(sample_name)) {
26 return(NA) 26 return(NA)
27 } else { 27 } else {
28 return(sample_name) 28 return(sample_name)
29 } 29 }
30 } 30 }
31 31
32 save_data_as_parquet_file <- function(data, filename) { 32 save_data_as_parquet_file <- function(data, filename) {
33 arrow::write_parquet(data, filename) 33 arrow::write_parquet(data, filename)
34 } 34 }
35 35
36 load_data_from_parquet_file <- function(filename) { 36 load_data_from_parquet_file <- function(filename) {
37 return(arrow::read_parquet(filename)) 37 return(arrow::read_parquet(filename))
38 } 38 }
39 39
40 load_parquet_collection <- function(files) { 40 load_parquet_collection <- function(files) {
41 features <- lapply(files, arrow::read_parquet) 41 features <- lapply(files, arrow::read_parquet)
42 features <- lapply(features, tibble::as_tibble) 42 features <- lapply(features, tibble::as_tibble)
43 return(features) 43 return(features)
44 } 44 }
45 45
46 save_parquet_collection <- function(feature_tables, sample_names, subdir) { 46 save_parquet_collection <- function(feature_tables, sample_names, subdir) {
47 dir.create(subdir) 47 dir.create(subdir)
48 for (i in seq_len(length(feature_tables))) { 48 for (i in seq_len(length(feature_tables))) {
49 filename <- file.path(subdir, paste0(sample_names[i], ".parquet")) 49 filename <- file.path(subdir, paste0(sample_names[i], ".parquet"))
50 feature_table <- as.data.frame(feature_tables[[i]]) 50 feature_table <- as.data.frame(feature_tables[[i]])
51 feature_table <- save_sample_name(feature_table, sample_names[i]) 51 feature_table <- save_sample_name(feature_table, sample_names[i])
52 arrow::write_parquet(feature_table, filename) 52 arrow::write_parquet(feature_table, filename)
53 } 53 }
54 } 54 }
55 55
56 sort_by_sample_name <- function(tables, sample_names) { 56 sort_by_sample_name <- function(tables, sample_names) {
57 return(tables[order(sample_names)]) 57 return(tables[order(sample_names)])
58 } 58 }
59 59
60 save_tolerances <- function(table, tol_file) { 60 save_tolerances <- function(table, tol_file) {
61 mz_tolerance <- c(table$mz_tol_relative) 61 mz_tolerance <- c(table$mz_tol_relative)
62 rt_tolerance <- c(table$rt_tol_relative) 62 rt_tolerance <- c(table$rt_tol_relative)
63 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file) 63 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file)
64 } 64 }
65 65
66 save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) { 66 save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) {
67 save_data_as_parquet_file(aligned_features$metadata, metadata_file) 67 save_data_as_parquet_file(aligned_features$metadata, metadata_file)
68 save_data_as_parquet_file(aligned_features$rt, rt_file) 68 save_data_as_parquet_file(aligned_features$rt, rt_file)
69 save_data_as_parquet_file(aligned_features$intensity, intensity_file) 69 save_data_as_parquet_file(aligned_features$intensity, intensity_file)
70 } 70 }
71 71
72 select_table_with_sample_name <- function(tables, sample_name) { 72 select_table_with_sample_name <- function(tables, sample_name) {
73 sample_names <- lapply(tables, load_sample_name) 73 sample_names <- lapply(tables, load_sample_name)
74 index <- which(sample_names == sample_name) 74 index <- which(sample_names == sample_name)
75 if (length(index) > 0) { 75 if (length(index) > 0) {
76 return(tables[[index]]) 76 return(tables[[index]])
77 } else { 77 } else {
78 stop(sprintf( 78 stop(sprintf(
79 "Mismatch - sample name '%s' not present in %s", 79 "Mismatch - sample name '%s' not present in %s",
80 sample_name, paste(sample_names, collapse = ", ") 80 sample_name, paste(sample_names, collapse = ", ")
81 )) 81 ))
82 } 82 }
83 } 83 }
84 84
85 select_adjusted <- function(recovered_features) { 85 select_adjusted <- function(recovered_features) {
86 return(recovered_features$adjusted_features) 86 return(recovered_features$adjusted_features)
87 } 87 }
88 88
89 known_table_columns <- function() { 89 known_table_columns <- function() {
90 c( 90 c(
91 "chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type", 91 "chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type",
92 "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max", 92 "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max",
93 "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)", 93 "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)",
94 "int_min(log)", "int_max(log)" 94 "int_min(log)", "int_max(log)"
95 ) 95 )
96 } 96 }
97 97
98 save_known_table <- function(table, filename) { 98 save_known_table <- function(table, filename) {
99 columns <- known_table_columns() 99 columns <- known_table_columns()
100 arrow::write_parquet(table$known_table[columns], filename) 100 arrow::write_parquet(table$known_table[columns], filename)
101 } 101 }
102 102
103 read_known_table <- function(filename) { 103 read_known_table <- function(filename) {
104 arrow::read_parquet(filename, col_select = known_table_columns()) 104 arrow::read_parquet(filename, col_select = known_table_columns())
105 } 105 }
106 106
107 save_pairing <- function(table, filename) { 107 save_pairing <- function(table, filename) {
108 df <- table$pairing %>% 108 df <- table$pairing %>%
109 as_tibble() %>% 109 as_tibble() %>%
110 setNames(c("new", "old")) 110 setNames(c("new", "old"))
111 arrow::write_parquet(df, filename) 111 arrow::write_parquet(df, filename)
112 } 112 }
113 113
114 join_tables_to_list <- function(metadata, rt_table, intensity_table) { 114 join_tables_to_list <- function(metadata, rt_table, intensity_table) {
115 features <- new("list") 115 features <- new("list")
116 features$metadata <- metadata 116 features$metadata <- metadata
117 features$intensity <- intensity_table 117 features$intensity <- intensity_table
118 features$rt <- rt_table 118 features$rt <- rt_table
119 return(features) 119 return(features)
120 } 120 }
121 121
122 validate_sample_names <- function(sample_names) { 122 validate_sample_names <- function(sample_names) {
123 if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) { 123 if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) {
124 stop(sprintf( 124 stop(sprintf(
125 "Sample names absent or not unique - provided sample names: %s", 125 "Sample names absent or not unique - provided sample names: %s",
126 paste(sample_names, collapse = ", ") 126 paste(sample_names, collapse = ", ")
127 )) 127 ))
128 } 128 }
129 } 129 }
130 130
131 determine_sigma_ratios <- function(sigma_ratio_lim_min = NA, sigma_ratio_lim_max = NA) { 131 determine_sigma_ratios <- function(sigma_ratio_lim_min = NA, sigma_ratio_lim_max = NA) {
132 if (is.na(sigma_ratio_lim_min)) { 132 if (is.na(sigma_ratio_lim_min)) {
133 sigma_ratio_lim_min <- 0 133 sigma_ratio_lim_min <- 0
134 } 134 }
135 if (is.na(sigma_ratio_lim_max)) { 135 if (is.na(sigma_ratio_lim_max)) {
136 sigma_ratio_lim_max <- Inf 136 sigma_ratio_lim_max <- Inf
137 } 137 }
138 return(c(sigma_ratio_lim_min, sigma_ratio_lim_max)) 138 return(c(sigma_ratio_lim_min, sigma_ratio_lim_max))
139 } 139 }