diff galaxy/tools/LC-MSMS/XSeekerPreparator.R @ 11:f4fc4a0f41e2 draft

" master branch Updating"
author lain
date Thu, 18 Mar 2021 10:46:01 +0000
parents bb9ebd6365ea
children bdb2878ee189
line wrap: on
line diff
--- a/galaxy/tools/LC-MSMS/XSeekerPreparator.R	Thu Jan 21 15:00:39 2021 +0000
+++ b/galaxy/tools/LC-MSMS/XSeekerPreparator.R	Thu Mar 18 10:46:01 2021 +0000
@@ -1,7 +1,7 @@
 
 
 TOOL_NAME <- "XSeekerPreparator"
-VERSION <- "1.1.6"
+VERSION <- "1.2.0"
 
 OUTPUT_SPECIFIC_TOOL <- "XSeeker_Galaxy"
 
@@ -309,7 +309,7 @@
         dummy_adduct$set_ips(adduct[[i <- i+1]])
         dummy_adduct$set_formula_add(adduct[[i <- i+1]])
         dummy_adduct$set_formula_ded(adduct[[i <- i+1]])
-        dummy_adduct$save()
+        invisible(dummy_adduct$save())
         dummy_adduct$clear(unset_id=TRUE)
     }
     message("Adducts created")
@@ -339,15 +339,13 @@
         dummy_compound$set_name(compounds[i, "name"])
         dummy_compound$set_common_name(compounds[i, "common_name"])
         dummy_compound$set_formula(compounds[i, "formula"])
-        # dummy_compound$set_mz(compounds[i, "mz"])
-        # dummy_compound$set_mz(compounds[i, "mz"])
         compound_list[[length(compound_list)+1]] <- as.list(
             dummy_compound,
             c("mz", "name", "common_name", "formula")
         )
         dummy_compound$clear(unset_id=TRUE)
     }
-    dummy_compound$save(bulk=compound_list)
+    invisible(dummy_compound$save(bulk=compound_list))
 }
 
 translate_compounds <- function(compounds) {
@@ -371,7 +369,7 @@
 
 guess_translator <- function(header) {
     result <- list(
-        # HMDB_ID=NULL,<
+        # HMDB_ID=NULL,
         mz=NULL,
         name=NULL,
         common_name=NULL,
@@ -413,13 +411,10 @@
 csv_header_translator <- function(translation_table, csv) {
     header_names <- names(translation_table)
     result <- data.frame(1:nrow(csv))
-    # colnames(result) <- header_names
     for (i in seq_along(header_names)) {
         result[, header_names[[i]]] <- csv[, translation_table[[i]]]
     }
-    print(result[, "mz"])
     result[, "mz"] <- as.numeric(result[, "mz"])
-    print(result[, "mz"])
     return (result)
 }
 
@@ -481,7 +476,18 @@
         stop("Malformed variableMetada.")
     }
 
+    context <- new.env()
+    context$samples <- list()
+    context$peaks <- rdata$xa@xcmsSet@peaks
+    context$groupidx <- rdata$xa@xcmsSet@groupidx
+    xcms_set <- rdata$xa@xcmsSet
+    singlefile <- rdata$singlefile
     process_arg_list <- rdata$listOFlistArguments
+    var_meta <- rdata$variableMetadata
+    ## We needed to get rid of the rdata, which is very big.
+    ## So we gathered all variable assignment from rdata here, and got rid of it.
+    rm(rdata)
+
     process_params <- list()
     for (list_name in names(process_arg_list)) {
         param_list <- list()
@@ -492,19 +498,10 @@
     }
     message("Parameters from previous processes extracted.")
 
-    var_meta <- rdata$variableMetadata
-    align_group <- rep(0, nrow(var_meta))
-    var_meta <- cbind(var_meta, align_group)
-    context <- new.env()
-    context$clusters <- list()
-    context$groupidx <- rdata$xa@xcmsSet@groupidx
-    context$peaks <- rdata$xa@xcmsSet@peaks
-    context$show_percent <- show_percent
 
     indices <- as.numeric(unique(var_meta[, file_grouping_var]))
     smol_xcms_set <- orm$smol_xcms_set()
     mz_tab_info <- new.env()
-    xcms_set <- rdata$xa@xcmsSet
     g <- xcms::groups(xcms_set)
     mz_tab_info$group_length <- nrow(g)
     mz_tab_info$dataset_path <- xcms::filepaths(xcms_set)
@@ -514,10 +511,15 @@
     mz_tab_info$mzmed <- g[,"mzmed"]
     mz_tab_info$smallmolecule_abundance_assay <- xcms::groupval(xcms_set, value="into")
     blogified <- blob::blob(fst::compress_fst(serialize(mz_tab_info, NULL), compression=100))
-    smol_xcms_set$set_raw(blogified)$save()
+    rm(mz_tab_info)
+
+    invisible(smol_xcms_set$set_raw(blogified)$save())
+    smol_xcms_set_id <- smol_xcms_set$get_id()
+    rm(smol_xcms_set)
+
     for (no in indices) {
-        sample_name <- names(rdata$singlefile)[[no]]
-        sample_path <- rdata$singlefile[[no]]
+        sample_name <- names(singlefile)[[no]]
+        sample_path <- singlefile[[no]]
         if (
             is.na(no)
             || is.null(sample_path)
@@ -525,15 +527,18 @@
         ) {
             next
         }
-        ms_file=xcms::xcmsRaw(sample_path)
         env <- new.env()
-        env$variableMetadata <- var_meta[var_meta[, file_grouping_var]==no,]
+        ms_file <- xcms::xcmsRaw(sample_path)
         env$tic <- ms_file@tic
         env$mz <- ms_file@env$mz
         env$scanindex <- ms_file@scanindex
         env$scantime <- ms_file@scantime
         env$intensity <- ms_file@env$intensity
         env$polarity <- as.character(ms_file@polarity[[1]])
+
+        ## Again, ms file is huge, so we get rid of it quickly.
+        rm(ms_file)
+
         env$sample_name <- sample_name
         env$dataset_path <- sample_path
         env$process_params <- process_params
@@ -541,11 +546,20 @@
         env$enriched_rdata_version <- ENRICHED_RDATA_VERSION
         env$tool_name <- TOOL_NAME
         env$enriched_rdata_doc <- ENRICHED_RDATA_DOC
-        context$sample_no <- no
-        add_sample_to_database(orm, env, context, smol_xcms_set)
+        sample <- add_sample_to_database(orm, env, context, smol_xcms_set_id)
+        rm (env)
+        context$samples[no] <- sample$get_id()
+        rm (sample)
     }
+    context$clusters <- list()
+    context$show_percent <- show_percent
+    context$cluster_mean_rt_abundance <- list()
+    context$central_feature <- list()
+    load_variable_metadata(orm, var_meta, context)
+    clusters <- context$clusters
+    rm(context)
     message("Features enrichment")
-    complete_features(orm, context)
+    complete_features(orm, clusters, show_percent)
     message("Features enrichment done.")
     return (NULL)
 }
@@ -571,7 +585,7 @@
     return (classes[[1]])
 }
 
-add_sample_to_database <- function(orm, env, context, smol_xcms_set) {
+add_sample_to_database <- function(orm, env, context, smol_xcms_set_id) {
     message(sprintf("Processing sample %s", env$sample_name))
     sample <- (
         orm$sample()
@@ -582,30 +596,30 @@
             if (is.null(env$polarity) || identical(env$polarity, character(0))) ""
             else env$polarity
         )
-        $set_smol_xcms_set(smol_xcms_set)
         $set_raw(blob::blob(fst::compress_fst(
             serialize(env, NULL),
             compression=100
         )))
-        $save()
     )
-    load_variable_metadata(orm, sample, env$variableMetadata, context)
+    sample[["smol_xcms_set_id"]] <- smol_xcms_set_id
+    sample$modified__[["smol_xcms_set_id"]] <- smol_xcms_set_id
+    sample <- sample$save()
     load_process_params(orm, sample, env$process_params)
     message(sprintf("Sample %s inserted.", env$sample_name))
     return (sample)
 }
 
 
-load_variable_metadata <- function(orm, sample, var_meta, context) {
+load_variable_metadata <- function(orm, var_meta, context) {
     all_clusters <- orm$cluster()$all()
 
-    next_feature_id <- get_next_id(orm$feature()$all(), "featureID")
-    next_cluster_id <- get_next_id(all_clusters, "clusterID")
+    next_feature_id <- get_next_id(orm$feature()$all(), "featureID") + 1
+    next_cluster_id <- 0
     next_pc_group <- get_next_id(all_clusters, "pc_group")
-    next_align_group <- get_next_id(all_clusters, "align_group")
+    next_align_group <- get_next_id(all_clusters, "align_group") + 1
     message("Extracting features")
     invisible(create_features(
-        orm, sample, var_meta, context,
+        orm, var_meta, context,
         next_feature_id, next_cluster_id,
         next_pc_group, next_align_group
     ))
@@ -615,13 +629,13 @@
 
 get_next_id <- function(models, attribute) {
     if ((id <- models$max(attribute)) == Inf || id == -Inf) {
-        return (1)
+        return (0)
     }
-    return (id + 1)
+    return (id)
 }
 
 create_features <- function(
-    orm, sample, var_meta, context,
+    orm, var_meta, context,
     next_feature_id, next_cluster_id,
     next_pc_group, next_align_group
 ) {
@@ -643,11 +657,32 @@
 
         curent_var_meta <- var_meta[row, ]
 
+
+        set_feature_fields_from_var_meta(dummy_feature, curent_var_meta)
+
+        dummy_feature$set_featureID(next_feature_id)
+        next_feature_id <- next_feature_id + 1
+        fake_iso <- dummy_feature$get_iso()
+        iso <- extract_iso(fake_iso)
+        clusterID <- extract_clusterID(fake_iso, next_cluster_id)
+        context$clusterID <- clusterID
+        dummy_feature$set_iso(iso)
+
+
         peak_list <- context$peaks[context$groupidx[[row]], ]
         if (! ("matrix" %in% class(peak_list))) {
             peak_list <- matrix(peak_list, nrow=1, ncol=length(peak_list), dimnames=list(c(), names(peak_list)))
         }
-        sample_peak_list <- peak_list[as.integer(peak_list[, "sample"]) == context$sample_no, , drop=FALSE]
+
+        clusterID <- as.character(clusterID)
+        if (is.null(context$central_feature[[clusterID]])) {
+            int_o <- extract_peak_var(peak_list, "into")
+            context$central_feature[[clusterID]] <- (
+                peak_list[peak_list[, "into"] == int_o,]["sample"]
+            )
+        }
+
+        sample_peak_list <- peak_list[as.integer(peak_list[, "sample"]) == context$central_feature[[clusterID]], , drop=FALSE]
         if (!identical(sample_peak_list, numeric(0)) && !is.null(nrow(sample_peak_list)) && nrow(sample_peak_list) != 0) {
             if (!is.na(int_o <- extract_peak_var(sample_peak_list, "into"))) {
                 dummy_feature$set_int_o(int_o)
@@ -659,18 +694,9 @@
                 dummy_feature$set_max_o(max_o)
             }
         }
-
-        set_feature_fields_from_var_meta(dummy_feature, curent_var_meta)
-
-        dummy_feature$set_featureID(next_feature_id)
-        next_feature_id <- next_feature_id + 1
-        fake_iso <- dummy_feature$get_iso()
-        iso <- extract_iso(fake_iso)
-        clusterID <- extract_clusterID(fake_iso, next_cluster_id)
-        context$clusterID <- clusterID
-        dummy_feature$set_iso(iso)
         create_associated_cluster(
-            sample, dummy_feature, clusterID,
+            context$central_feature[[clusterID]],
+            dummy_feature, clusterID,
             context, curent_var_meta, next_pc_group,
             next_align_group
         )
@@ -680,7 +706,8 @@
     }
     message("")## +\n for previous message 
     message("Saving features")
-    dummy_feature$save(bulk=features)
+    rm(var_meta)
+    invisible(dummy_feature$save(bulk=features))
     message("Saved.")
     return (context$clusters)
 }
@@ -734,22 +761,27 @@
 }
 
 create_associated_cluster <- function(
-    sample, feature, grouping_variable,
+    sample_no, feature, clusterID,
     context, curent_var_meta, next_pc_group, next_align_group
 ) {
     pcgroup <- as.numeric(curent_var_meta[["pcgroup"]])
     adduct <- as.character(curent_var_meta[["adduct"]])
     annotation <- curent_var_meta[["isotopes"]]
-    grouping_variable <- as.character(grouping_variable)
-    if (is.null(cluster <- context$clusters[[grouping_variable]])) {
-        cluster <- context$clusters[[grouping_variable]] <- orm$cluster(
+    clusterID <- as.character(clusterID)
+    if (is.null(cluster <- context$clusters[[clusterID]])) {
+        cluster <- context$clusters[[clusterID]] <- orm$cluster(
             pc_group=pcgroup + next_pc_group,
             adduct=adduct,
             align_group=next_align_group,
             # curent_group=curent_group,
             clusterID=context$clusterID,
             annotation=annotation
-        )$set_sample(sample)
+        )
+        ## Crappy hack to assign sample id to cluster without loading the sample.
+        ## Samples are too big (their sample$env) and slows the process, and eat all the menory
+        ## so we dont't want to load them.
+        cluster[["sample_id"]] <- context$samples[sample_no][[1]]
+        cluster$modified__[["sample_id"]] <- cluster[["sample_id"]]
     } else {
         if (context$clusterID != 0 && cluster$get_clusterID() == 0) {
             cluster$set_clusterID(context$clusterID)
@@ -760,8 +792,16 @@
     return (feature)
 }
 
-complete_features <- function(orm, context) {
-    for (cluster in context$clusters) {
+complete_features <- function(orm, clusters, show_percent) {
+    total <- length(clusters)
+    percent <- -1
+    i <- 0
+    for (cluster in clusters) {
+        i <- i+1
+        if (show_percent && (i / total) * 100 > percent) {
+            percent <- percent + 1
+            message("\r", sprintf("\r%d %%", percent), appendLF=FALSE)
+        }
         features <- orm$feature()$load_by(cluster_id=cluster$get_id())
         if (features$any()) {
             if (!is.null(rt <- features$mean("rt"))) {