Mercurial > repos > lain > xseekerpreparator
diff galaxy/tools/LC-MSMS/XSeekerPreparator.R @ 11:f4fc4a0f41e2 draft
" master branch Updating"
author | lain |
---|---|
date | Thu, 18 Mar 2021 10:46:01 +0000 |
parents | bb9ebd6365ea |
children | bdb2878ee189 |
line wrap: on
line diff
--- a/galaxy/tools/LC-MSMS/XSeekerPreparator.R Thu Jan 21 15:00:39 2021 +0000 +++ b/galaxy/tools/LC-MSMS/XSeekerPreparator.R Thu Mar 18 10:46:01 2021 +0000 @@ -1,7 +1,7 @@ TOOL_NAME <- "XSeekerPreparator" -VERSION <- "1.1.6" +VERSION <- "1.2.0" OUTPUT_SPECIFIC_TOOL <- "XSeeker_Galaxy" @@ -309,7 +309,7 @@ dummy_adduct$set_ips(adduct[[i <- i+1]]) dummy_adduct$set_formula_add(adduct[[i <- i+1]]) dummy_adduct$set_formula_ded(adduct[[i <- i+1]]) - dummy_adduct$save() + invisible(dummy_adduct$save()) dummy_adduct$clear(unset_id=TRUE) } message("Adducts created") @@ -339,15 +339,13 @@ dummy_compound$set_name(compounds[i, "name"]) dummy_compound$set_common_name(compounds[i, "common_name"]) dummy_compound$set_formula(compounds[i, "formula"]) - # dummy_compound$set_mz(compounds[i, "mz"]) - # dummy_compound$set_mz(compounds[i, "mz"]) compound_list[[length(compound_list)+1]] <- as.list( dummy_compound, c("mz", "name", "common_name", "formula") ) dummy_compound$clear(unset_id=TRUE) } - dummy_compound$save(bulk=compound_list) + invisible(dummy_compound$save(bulk=compound_list)) } translate_compounds <- function(compounds) { @@ -371,7 +369,7 @@ guess_translator <- function(header) { result <- list( - # HMDB_ID=NULL,< + # HMDB_ID=NULL, mz=NULL, name=NULL, common_name=NULL, @@ -413,13 +411,10 @@ csv_header_translator <- function(translation_table, csv) { header_names <- names(translation_table) result <- data.frame(1:nrow(csv)) - # colnames(result) <- header_names for (i in seq_along(header_names)) { result[, header_names[[i]]] <- csv[, translation_table[[i]]] } - print(result[, "mz"]) result[, "mz"] <- as.numeric(result[, "mz"]) - print(result[, "mz"]) return (result) } @@ -481,7 +476,18 @@ stop("Malformed variableMetada.") } + context <- new.env() + context$samples <- list() + context$peaks <- rdata$xa@xcmsSet@peaks + context$groupidx <- rdata$xa@xcmsSet@groupidx + xcms_set <- rdata$xa@xcmsSet + singlefile <- rdata$singlefile process_arg_list <- rdata$listOFlistArguments + var_meta <- rdata$variableMetadata + ## We needed to get rid of the rdata, which is very big. + ## So we gathered all variable assignment from rdata here, and got rid of it. + rm(rdata) + process_params <- list() for (list_name in names(process_arg_list)) { param_list <- list() @@ -492,19 +498,10 @@ } message("Parameters from previous processes extracted.") - var_meta <- rdata$variableMetadata - align_group <- rep(0, nrow(var_meta)) - var_meta <- cbind(var_meta, align_group) - context <- new.env() - context$clusters <- list() - context$groupidx <- rdata$xa@xcmsSet@groupidx - context$peaks <- rdata$xa@xcmsSet@peaks - context$show_percent <- show_percent indices <- as.numeric(unique(var_meta[, file_grouping_var])) smol_xcms_set <- orm$smol_xcms_set() mz_tab_info <- new.env() - xcms_set <- rdata$xa@xcmsSet g <- xcms::groups(xcms_set) mz_tab_info$group_length <- nrow(g) mz_tab_info$dataset_path <- xcms::filepaths(xcms_set) @@ -514,10 +511,15 @@ mz_tab_info$mzmed <- g[,"mzmed"] mz_tab_info$smallmolecule_abundance_assay <- xcms::groupval(xcms_set, value="into") blogified <- blob::blob(fst::compress_fst(serialize(mz_tab_info, NULL), compression=100)) - smol_xcms_set$set_raw(blogified)$save() + rm(mz_tab_info) + + invisible(smol_xcms_set$set_raw(blogified)$save()) + smol_xcms_set_id <- smol_xcms_set$get_id() + rm(smol_xcms_set) + for (no in indices) { - sample_name <- names(rdata$singlefile)[[no]] - sample_path <- rdata$singlefile[[no]] + sample_name <- names(singlefile)[[no]] + sample_path <- singlefile[[no]] if ( is.na(no) || is.null(sample_path) @@ -525,15 +527,18 @@ ) { next } - ms_file=xcms::xcmsRaw(sample_path) env <- new.env() - env$variableMetadata <- var_meta[var_meta[, file_grouping_var]==no,] + ms_file <- xcms::xcmsRaw(sample_path) env$tic <- ms_file@tic env$mz <- ms_file@env$mz env$scanindex <- ms_file@scanindex env$scantime <- ms_file@scantime env$intensity <- ms_file@env$intensity env$polarity <- as.character(ms_file@polarity[[1]]) + + ## Again, ms file is huge, so we get rid of it quickly. + rm(ms_file) + env$sample_name <- sample_name env$dataset_path <- sample_path env$process_params <- process_params @@ -541,11 +546,20 @@ env$enriched_rdata_version <- ENRICHED_RDATA_VERSION env$tool_name <- TOOL_NAME env$enriched_rdata_doc <- ENRICHED_RDATA_DOC - context$sample_no <- no - add_sample_to_database(orm, env, context, smol_xcms_set) + sample <- add_sample_to_database(orm, env, context, smol_xcms_set_id) + rm (env) + context$samples[no] <- sample$get_id() + rm (sample) } + context$clusters <- list() + context$show_percent <- show_percent + context$cluster_mean_rt_abundance <- list() + context$central_feature <- list() + load_variable_metadata(orm, var_meta, context) + clusters <- context$clusters + rm(context) message("Features enrichment") - complete_features(orm, context) + complete_features(orm, clusters, show_percent) message("Features enrichment done.") return (NULL) } @@ -571,7 +585,7 @@ return (classes[[1]]) } -add_sample_to_database <- function(orm, env, context, smol_xcms_set) { +add_sample_to_database <- function(orm, env, context, smol_xcms_set_id) { message(sprintf("Processing sample %s", env$sample_name)) sample <- ( orm$sample() @@ -582,30 +596,30 @@ if (is.null(env$polarity) || identical(env$polarity, character(0))) "" else env$polarity ) - $set_smol_xcms_set(smol_xcms_set) $set_raw(blob::blob(fst::compress_fst( serialize(env, NULL), compression=100 ))) - $save() ) - load_variable_metadata(orm, sample, env$variableMetadata, context) + sample[["smol_xcms_set_id"]] <- smol_xcms_set_id + sample$modified__[["smol_xcms_set_id"]] <- smol_xcms_set_id + sample <- sample$save() load_process_params(orm, sample, env$process_params) message(sprintf("Sample %s inserted.", env$sample_name)) return (sample) } -load_variable_metadata <- function(orm, sample, var_meta, context) { +load_variable_metadata <- function(orm, var_meta, context) { all_clusters <- orm$cluster()$all() - next_feature_id <- get_next_id(orm$feature()$all(), "featureID") - next_cluster_id <- get_next_id(all_clusters, "clusterID") + next_feature_id <- get_next_id(orm$feature()$all(), "featureID") + 1 + next_cluster_id <- 0 next_pc_group <- get_next_id(all_clusters, "pc_group") - next_align_group <- get_next_id(all_clusters, "align_group") + next_align_group <- get_next_id(all_clusters, "align_group") + 1 message("Extracting features") invisible(create_features( - orm, sample, var_meta, context, + orm, var_meta, context, next_feature_id, next_cluster_id, next_pc_group, next_align_group )) @@ -615,13 +629,13 @@ get_next_id <- function(models, attribute) { if ((id <- models$max(attribute)) == Inf || id == -Inf) { - return (1) + return (0) } - return (id + 1) + return (id) } create_features <- function( - orm, sample, var_meta, context, + orm, var_meta, context, next_feature_id, next_cluster_id, next_pc_group, next_align_group ) { @@ -643,11 +657,32 @@ curent_var_meta <- var_meta[row, ] + + set_feature_fields_from_var_meta(dummy_feature, curent_var_meta) + + dummy_feature$set_featureID(next_feature_id) + next_feature_id <- next_feature_id + 1 + fake_iso <- dummy_feature$get_iso() + iso <- extract_iso(fake_iso) + clusterID <- extract_clusterID(fake_iso, next_cluster_id) + context$clusterID <- clusterID + dummy_feature$set_iso(iso) + + peak_list <- context$peaks[context$groupidx[[row]], ] if (! ("matrix" %in% class(peak_list))) { peak_list <- matrix(peak_list, nrow=1, ncol=length(peak_list), dimnames=list(c(), names(peak_list))) } - sample_peak_list <- peak_list[as.integer(peak_list[, "sample"]) == context$sample_no, , drop=FALSE] + + clusterID <- as.character(clusterID) + if (is.null(context$central_feature[[clusterID]])) { + int_o <- extract_peak_var(peak_list, "into") + context$central_feature[[clusterID]] <- ( + peak_list[peak_list[, "into"] == int_o,]["sample"] + ) + } + + sample_peak_list <- peak_list[as.integer(peak_list[, "sample"]) == context$central_feature[[clusterID]], , drop=FALSE] if (!identical(sample_peak_list, numeric(0)) && !is.null(nrow(sample_peak_list)) && nrow(sample_peak_list) != 0) { if (!is.na(int_o <- extract_peak_var(sample_peak_list, "into"))) { dummy_feature$set_int_o(int_o) @@ -659,18 +694,9 @@ dummy_feature$set_max_o(max_o) } } - - set_feature_fields_from_var_meta(dummy_feature, curent_var_meta) - - dummy_feature$set_featureID(next_feature_id) - next_feature_id <- next_feature_id + 1 - fake_iso <- dummy_feature$get_iso() - iso <- extract_iso(fake_iso) - clusterID <- extract_clusterID(fake_iso, next_cluster_id) - context$clusterID <- clusterID - dummy_feature$set_iso(iso) create_associated_cluster( - sample, dummy_feature, clusterID, + context$central_feature[[clusterID]], + dummy_feature, clusterID, context, curent_var_meta, next_pc_group, next_align_group ) @@ -680,7 +706,8 @@ } message("")## +\n for previous message message("Saving features") - dummy_feature$save(bulk=features) + rm(var_meta) + invisible(dummy_feature$save(bulk=features)) message("Saved.") return (context$clusters) } @@ -734,22 +761,27 @@ } create_associated_cluster <- function( - sample, feature, grouping_variable, + sample_no, feature, clusterID, context, curent_var_meta, next_pc_group, next_align_group ) { pcgroup <- as.numeric(curent_var_meta[["pcgroup"]]) adduct <- as.character(curent_var_meta[["adduct"]]) annotation <- curent_var_meta[["isotopes"]] - grouping_variable <- as.character(grouping_variable) - if (is.null(cluster <- context$clusters[[grouping_variable]])) { - cluster <- context$clusters[[grouping_variable]] <- orm$cluster( + clusterID <- as.character(clusterID) + if (is.null(cluster <- context$clusters[[clusterID]])) { + cluster <- context$clusters[[clusterID]] <- orm$cluster( pc_group=pcgroup + next_pc_group, adduct=adduct, align_group=next_align_group, # curent_group=curent_group, clusterID=context$clusterID, annotation=annotation - )$set_sample(sample) + ) + ## Crappy hack to assign sample id to cluster without loading the sample. + ## Samples are too big (their sample$env) and slows the process, and eat all the menory + ## so we dont't want to load them. + cluster[["sample_id"]] <- context$samples[sample_no][[1]] + cluster$modified__[["sample_id"]] <- cluster[["sample_id"]] } else { if (context$clusterID != 0 && cluster$get_clusterID() == 0) { cluster$set_clusterID(context$clusterID) @@ -760,8 +792,16 @@ return (feature) } -complete_features <- function(orm, context) { - for (cluster in context$clusters) { +complete_features <- function(orm, clusters, show_percent) { + total <- length(clusters) + percent <- -1 + i <- 0 + for (cluster in clusters) { + i <- i+1 + if (show_percent && (i / total) * 100 > percent) { + percent <- percent + 1 + message("\r", sprintf("\r%d %%", percent), appendLF=FALSE) + } features <- orm$feature()$load_by(cluster_id=cluster$get_id()) if (features$any()) { if (!is.null(rt <- features$mean("rt"))) {