xseekerpreparator: XSeekerPreparator.R comparison

comparison XSeekerPreparator.R @ 11:f4fc4a0f41e2 draft

" master branch Updating"

author	lain
date	Thu, 18 Mar 2021 10:46:01 +0000
parents	bb9ebd6365ea
children	bdb2878ee189

comparison

equal deleted inserted replaced

-:bb9ebd6365ea
+:f4fc4a0f41e2
 TOOL_NAME <- "XSeekerPreparator"
-VERSION <- "1.1.6"
+VERSION <- "1.2.0"
 OUTPUT_SPECIFIC_TOOL <- "XSeeker_Galaxy"
 ENRICHED_RDATA_VERSION <- paste("1.1.2", OUTPUT_SPECIFIC_TOOL, sep="-")
 ENRICHED_RDATA_DOC <- sprintf("
 dummy_adduct$set_oidscore(adduct[[i <- i+1]])
 dummy_adduct$set_quasi(adduct[[i <- i+1]])
 dummy_adduct$set_ips(adduct[[i <- i+1]])
 dummy_adduct$set_formula_add(adduct[[i <- i+1]])
 dummy_adduct$set_formula_ded(adduct[[i <- i+1]])
-dummy_adduct$save()
+invisible(dummy_adduct$save())
 dummy_adduct$clear(unset_id=TRUE)
 }
 message("Adducts created")
 }
 for (i in seq_len(nrow(compounds))) {
 dummy_compound$set_mz(compounds[i, "mz"])
 dummy_compound$set_name(compounds[i, "name"])
 dummy_compound$set_common_name(compounds[i, "common_name"])
 dummy_compound$set_formula(compounds[i, "formula"])
-# dummy_compound$set_mz(compounds[i, "mz"])
-# dummy_compound$set_mz(compounds[i, "mz"])
 compound_list[[length(compound_list)+1]] <- as.list(
 dummy_compound,
 c("mz", "name", "common_name", "formula")
 )
 dummy_compound$clear(unset_id=TRUE)
 }
-dummy_compound$save(bulk=compound_list)
+invisible(dummy_compound$save(bulk=compound_list))
 }
 translate_compounds <- function(compounds) {
 recognized_headers <- list(
 c("HMDB_ID", "MzBank", "X.M.H..", "X.M.H...1", "MetName", "ChemFormula", "INChIkey")
 return (csv_header_translator(translator, compounds))
 }
 guess_translator <- function(header) {
 result <- list(
-# HMDB_ID=NULL,<
+# HMDB_ID=NULL,
 mz=NULL,
 name=NULL,
 common_name=NULL,
 formula=NULL,
 # inchi_key=NULL
 }
 csv_header_translator <- function(translation_table, csv) {
 header_names <- names(translation_table)
 result <- data.frame(1:nrow(csv))
-# colnames(result) <- header_names
 for (i in seq_along(header_names)) {
 result[, header_names[[i]]] <- csv[, translation_table[[i]]]
 }
-print(result[, "mz"])
 result[, "mz"] <- as.numeric(result[, "mz"])
-print(result[, "mz"])
 return (result)
 }
 set_database_version <- function(orm, version) {
 orm$set_tag(
 message(sprintf("File grouping variable: %s", file_grouping_var))
 if(is.null(file_grouping_var)) {
 stop("Malformed variableMetada.")
 }
+context <- new.env()
+context$samples <- list()
+context$peaks <- rdata$xa@xcmsSet@peaks
+context$groupidx <- rdata$xa@xcmsSet@groupidx
+xcms_set <- rdata$xa@xcmsSet
+singlefile <- rdata$singlefile
 process_arg_list <- rdata$listOFlistArguments
+var_meta <- rdata$variableMetadata
+## We needed to get rid of the rdata, which is very big.
+## So we gathered all variable assignment from rdata here, and got rid of it.
+rm(rdata)
 process_params <- list()
 for (list_name in names(process_arg_list)) {
 param_list <- list()
 for (param_name in names(process_arg_list[[list_name]])) {
 param_list[[param_name]] <- process_arg_list[[list_name]][[param_name]]
 }
 process_params[[length(process_params)+1]] <- param_list
 }
 message("Parameters from previous processes extracted.")
-var_meta <- rdata$variableMetadata
-align_group <- rep(0, nrow(var_meta))
-var_meta <- cbind(var_meta, align_group)
-context <- new.env()
-context$clusters <- list()
-context$groupidx <- rdata$xa@xcmsSet@groupidx
-context$peaks <- rdata$xa@xcmsSet@peaks
-context$show_percent <- show_percent
 indices <- as.numeric(unique(var_meta[, file_grouping_var]))
 smol_xcms_set <- orm$smol_xcms_set()
 mz_tab_info <- new.env()
-xcms_set <- rdata$xa@xcmsSet
 g <- xcms::groups(xcms_set)
 mz_tab_info$group_length <- nrow(g)
 mz_tab_info$dataset_path <- xcms::filepaths(xcms_set)
 mz_tab_info$sampnames <- xcms::sampnames(xcms_set)
 mz_tab_info$sampclass <- xcms::sampclass(xcms_set)
 mz_tab_info$rtmed <- g[,"rtmed"]
 mz_tab_info$mzmed <- g[,"mzmed"]
 mz_tab_info$smallmolecule_abundance_assay <- xcms::groupval(xcms_set, value="into")
 blogified <- blob::blob(fst::compress_fst(serialize(mz_tab_info, NULL), compression=100))
-smol_xcms_set$set_raw(blogified)$save()
+rm(mz_tab_info)
+invisible(smol_xcms_set$set_raw(blogified)$save())
+smol_xcms_set_id <- smol_xcms_set$get_id()
+rm(smol_xcms_set)
 for (no in indices) {
-sample_name <- names(rdata$singlefile)[[no]]
+sample_name <- names(singlefile)[[no]]
-sample_path <- rdata$singlefile[[no]]
+sample_path <- singlefile[[no]]
 if (
 is.na(no)
 || is.null(sample_path)
 || !(sample_name %in% sample_names)
 ) {
 next
 }
-ms_file=xcms::xcmsRaw(sample_path)
 env <- new.env()
-env$variableMetadata <- var_meta[var_meta[, file_grouping_var]==no,]
+ms_file <- xcms::xcmsRaw(sample_path)
 env$tic <- ms_file@tic
 env$mz <- ms_file@env$mz
 env$scanindex <- ms_file@scanindex
 env$scantime <- ms_file@scantime
 env$intensity <- ms_file@env$intensity
 env$polarity <- as.character(ms_file@polarity[[1]])
+## Again, ms file is huge, so we get rid of it quickly.
+rm(ms_file)
 env$sample_name <- sample_name
 env$dataset_path <- sample_path
 env$process_params <- process_params
 env$enriched_rdata <- TRUE
 env$enriched_rdata_version <- ENRICHED_RDATA_VERSION
 env$tool_name <- TOOL_NAME
 env$enriched_rdata_doc <- ENRICHED_RDATA_DOC
-context$sample_no <- no
+sample <- add_sample_to_database(orm, env, context, smol_xcms_set_id)
-add_sample_to_database(orm, env, context, smol_xcms_set)
+rm (env)
-}
+context$samples[no] <- sample$get_id()
+rm (sample)
+}
+context$clusters <- list()
+context$show_percent <- show_percent
+context$cluster_mean_rt_abundance <- list()
+context$central_feature <- list()
+load_variable_metadata(orm, var_meta, context)
+clusters <- context$clusters
+rm(context)
 message("Features enrichment")
-complete_features(orm, context)
+complete_features(orm, clusters, show_percent)
 message("Features enrichment done.")
 return (NULL)
 }
 find_grouping_var <- function(var_meta) {
 stop("Could not find any class column in your variableMetadata.")
 }
 return (classes[[1]])
 }
-add_sample_to_database <- function(orm, env, context, smol_xcms_set) {
+add_sample_to_database <- function(orm, env, context, smol_xcms_set_id) {
 message(sprintf("Processing sample %s", env$sample_name))
 sample <- (
 orm$sample()
 $set_name(env$sample_name)
 $set_path(env$dataset_path)
 $set_kind("enriched_rdata")
 $set_polarity(
 if (is.null(env$polarity) || identical(env$polarity, character(0))) ""
 else env$polarity
 )
-$set_smol_xcms_set(smol_xcms_set)
 $set_raw(blob::blob(fst::compress_fst(
 serialize(env, NULL),
 compression=100
 )))
-$save()
 )
-load_variable_metadata(orm, sample, env$variableMetadata, context)
+sample[["smol_xcms_set_id"]] <- smol_xcms_set_id
+sample$modified__[["smol_xcms_set_id"]] <- smol_xcms_set_id
+sample <- sample$save()
 load_process_params(orm, sample, env$process_params)
 message(sprintf("Sample %s inserted.", env$sample_name))
 return (sample)
 }
-load_variable_metadata <- function(orm, sample, var_meta, context) {
+load_variable_metadata <- function(orm, var_meta, context) {
 all_clusters <- orm$cluster()$all()
-next_feature_id <- get_next_id(orm$feature()$all(), "featureID")
+next_feature_id <- get_next_id(orm$feature()$all(), "featureID") + 1
-next_cluster_id <- get_next_id(all_clusters, "clusterID")
+next_cluster_id <- 0
 next_pc_group <- get_next_id(all_clusters, "pc_group")
-next_align_group <- get_next_id(all_clusters, "align_group")
+next_align_group <- get_next_id(all_clusters, "align_group") + 1
 message("Extracting features")
 invisible(create_features(
-orm, sample, var_meta, context,
+orm, var_meta, context,
 next_feature_id, next_cluster_id,
 next_pc_group, next_align_group
 ))
 message("Extracting features done.")
 return (NULL)
 }
 get_next_id <- function(models, attribute) {
 if ((id <- models$max(attribute)) == Inf || id == -Inf) {
-return (1)
+return (0)
 }
-return (id + 1)
+return (id)
 }
 create_features <- function(
-orm, sample, var_meta, context,
+orm, var_meta, context,
 next_feature_id, next_cluster_id,
 next_pc_group, next_align_group
 ) {
 field_names <- as.list(names(orm$feature()$fields__))
 field_names[field_names=="id"] <- NULL
 message("\r", sprintf("\r%d %%", percent), appendLF=FALSE)
 }
 curent_var_meta <- var_meta[row, ]
+set_feature_fields_from_var_meta(dummy_feature, curent_var_meta)
+dummy_feature$set_featureID(next_feature_id)
+next_feature_id <- next_feature_id + 1
+fake_iso <- dummy_feature$get_iso()
+iso <- extract_iso(fake_iso)
+clusterID <- extract_clusterID(fake_iso, next_cluster_id)
+context$clusterID <- clusterID
+dummy_feature$set_iso(iso)
 peak_list <- context$peaks[context$groupidx[[row]], ]
 if (! ("matrix" %in% class(peak_list))) {
 peak_list <- matrix(peak_list, nrow=1, ncol=length(peak_list), dimnames=list(c(), names(peak_list)))
 }
-sample_peak_list <- peak_list[as.integer(peak_list[, "sample"]) == context$sample_no, , drop=FALSE]
+clusterID <- as.character(clusterID)
+if (is.null(context$central_feature[[clusterID]])) {
+int_o <- extract_peak_var(peak_list, "into")
+context$central_feature[[clusterID]] <- (
+peak_list[peak_list[, "into"] == int_o,]["sample"]
+)
+}
+sample_peak_list <- peak_list[as.integer(peak_list[, "sample"]) == context$central_feature[[clusterID]], , drop=FALSE]
 if (!identical(sample_peak_list, numeric(0)) && !is.null(nrow(sample_peak_list)) && nrow(sample_peak_list) != 0) {
 if (!is.na(int_o <- extract_peak_var(sample_peak_list, "into"))) {
 dummy_feature$set_int_o(int_o)
 }
 if (!is.na(int_b <- extract_peak_var(sample_peak_list, "intb"))) {
 }
 if (!is.na(max_o <- extract_peak_var(sample_peak_list, "maxo"))) {
 dummy_feature$set_max_o(max_o)
 }
 }
-set_feature_fields_from_var_meta(dummy_feature, curent_var_meta)
-dummy_feature$set_featureID(next_feature_id)
-next_feature_id <- next_feature_id + 1
-fake_iso <- dummy_feature$get_iso()
-iso <- extract_iso(fake_iso)
-clusterID <- extract_clusterID(fake_iso, next_cluster_id)
-context$clusterID <- clusterID
-dummy_feature$set_iso(iso)
 create_associated_cluster(
-sample, dummy_feature, clusterID,
+context$central_feature[[clusterID]],
+dummy_feature, clusterID,
 context, curent_var_meta, next_pc_group,
 next_align_group
 )
 next_align_group <- next_align_group + 1
 features[[length(features)+1]] <- as.list(dummy_feature, field_names)
 dummy_feature$clear()
 }
 message("")## +\n for previous message
 message("Saving features")
-dummy_feature$save(bulk=features)
+rm(var_meta)
+invisible(dummy_feature$save(bulk=features))
 message("Saved.")
 return (context$clusters)
 }
 extract_peak_var <- function(peak_list, var_name, selector=max) {
 }
 return (clusterID + next_cluster_id)
 }
 create_associated_cluster <- function(
-sample, feature, grouping_variable,
+sample_no, feature, clusterID,
 context, curent_var_meta, next_pc_group, next_align_group
 ) {
 pcgroup <- as.numeric(curent_var_meta[["pcgroup"]])
 adduct <- as.character(curent_var_meta[["adduct"]])
 annotation <- curent_var_meta[["isotopes"]]
-grouping_variable <- as.character(grouping_variable)
+clusterID <- as.character(clusterID)
-if (is.null(cluster <- context$clusters[[grouping_variable]])) {
+if (is.null(cluster <- context$clusters[[clusterID]])) {
-cluster <- context$clusters[[grouping_variable]] <- orm$cluster(
+cluster <- context$clusters[[clusterID]] <- orm$cluster(
 pc_group=pcgroup + next_pc_group,
 adduct=adduct,
 align_group=next_align_group,
 # curent_group=curent_group,
 clusterID=context$clusterID,
 annotation=annotation
-)$set_sample(sample)
+)
+## Crappy hack to assign sample id to cluster without loading the sample.
+## Samples are too big (their sample$env) and slows the process, and eat all the menory
+## so we dont't want to load them.
+cluster[["sample_id"]] <- context$samples[sample_no][[1]]
+cluster$modified__[["sample_id"]] <- cluster[["sample_id"]]
 } else {
 if (context$clusterID != 0 && cluster$get_clusterID() == 0) {
 cluster$set_clusterID(context$clusterID)
 }
 }
 cluster$save()
 feature$set_cluster(cluster)
 return (feature)
 }
-complete_features <- function(orm, context) {
+complete_features <- function(orm, clusters, show_percent) {
-for (cluster in context$clusters) {
+total <- length(clusters)
+percent <- -1
+i <- 0
+for (cluster in clusters) {
+i <- i+1
+if (show_percent && (i / total) * 100 > percent) {
+percent <- percent + 1
+message("\r", sprintf("\r%d %%", percent), appendLF=FALSE)
+}
 features <- orm$feature()$load_by(cluster_id=cluster$get_id())
 if (features$any()) {
 if (!is.null(rt <- features$mean("rt"))) {
 cluster$set_mean_rt(rt)$save()
 }

Mercurial > repos > lain > xseekerpreparator

comparison XSeekerPreparator.R @ 11:f4fc4a0f41e2 draft