Mercurial > repos > azomics > meta_autocluster
comparison metacyto_autocluster.R @ 0:c744db871f90 draft default tip
"planemo upload for repository https://github.com/AstraZeneca-Omics/immport-galaxy-tools/tree/master/flowtools/metacyto_autocluster commit 3cc1083d473530ed4f7439d590568baa51a46857"
| author | azomics |
|---|---|
| date | Tue, 27 Jul 2021 23:00:49 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c744db871f90 |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 ###################################################################### | |
| 3 # Copyright (c) 2018 Northrop Grumman. | |
| 4 # All rights reserved. | |
| 5 ###################################################################### | |
| 6 # | |
| 7 # Version 1 - January 2018 | |
| 8 # Author: Cristel Thomas | |
| 9 # | |
| 10 # | |
| 11 | |
| 12 library(flowCore) | |
| 13 library(MetaCyto) | |
| 14 | |
| 15 check_cluster_def <- function(cl_def) { | |
| 16 if (cl_def == "" || cl_def == "None") { | |
| 17 quit(save = "no", status = 14, runLast = FALSE) | |
| 18 } else { | |
| 19 tmp <- gsub(" ", "", cl_def, fixed = TRUE) | |
| 20 clean_def <- gsub(",", "|", tmp, fixed = TRUE) | |
| 21 return(toupper(clean_def)) | |
| 22 } | |
| 23 } | |
| 24 | |
| 25 path_to_group_file <- function(path_to_result) { | |
| 26 grp <- basename(dirname(path_to_result)) | |
| 27 return(paste(grp, "fcs", sep = ".", collapse = NULL)) | |
| 28 } | |
| 29 | |
| 30 group_file_to_group_name <- function(result_file) { | |
| 31 return(strsplit(result_file, ".", fixed = TRUE)[[1]][1]) | |
| 32 } | |
| 33 | |
| 34 auto_cluster_panels <- function(params, df, fcspaths, fcsnames, quant=0.95, | |
| 35 events=0.05, cluster_algorithm="FlowSOM", | |
| 36 clusters=vector(), outdir="", list_clust="", | |
| 37 metacluster=40, xdim=10, ydim=10, seed=42, uc="") { | |
| 38 | |
| 39 working_dir <- "tmp_metacyto" | |
| 40 working_out <- "tmp_metacyto_out" | |
| 41 dir.create(working_dir) | |
| 42 dir.create(outdir) | |
| 43 | |
| 44 # get nb of groups | |
| 45 nb_groups <- length(fcsnames) | |
| 46 | |
| 47 # reformat summary -- expects csv + 'fcs_names' && 'fcs_files' | |
| 48 new_df <- file.path(working_dir, "processed_sample_summary.csv") | |
| 49 df$fcs_names <- df$filenames | |
| 50 df$fcs_files <- df$filenames | |
| 51 write.csv(df, file = new_df, row.names = F) | |
| 52 | |
| 53 # move && rename FCS files to same directory | |
| 54 for (i in seq_len(length(fcspaths))) { | |
| 55 new_file <- file.path(working_dir, fcsnames[[i]]) | |
| 56 if (!grepl(".fcs$", new_file)) { | |
| 57 new_file <- paste0(new_file, ".fcs") | |
| 58 } | |
| 59 file.copy(fcspaths[[i]], new_file) | |
| 60 } | |
| 61 | |
| 62 #### will need to add other parameters when Zicheng has them working. | |
| 63 if (cluster_algorithm == "FlowSOM") { | |
| 64 cluster_label <- autoCluster.batch(preprocessOutputFolder = working_dir, | |
| 65 excludeClusterParameters = params, | |
| 66 labelQuantile = quant, | |
| 67 clusterFunction = flowSOM.MC, | |
| 68 minPercent = events, | |
| 69 k = metacluster, | |
| 70 xdim = xdim, | |
| 71 ydim = ydim, | |
| 72 seed = seed) | |
| 73 | |
| 74 } else { | |
| 75 cluster_label <- autoCluster.batch(preprocessOutputFolder = working_dir, | |
| 76 excludeClusterParameters = params, | |
| 77 labelQuantile = quant, | |
| 78 clusterFunction = flowHC, | |
| 79 minPercent = events) | |
| 80 } | |
| 81 | |
| 82 | |
| 83 # Add potential user-defined label to cluster definitions | |
| 84 if (length(clusters) > 1) { | |
| 85 cluster_label <- c(cluster_label, clusters) | |
| 86 } | |
| 87 write.table(cluster_label, list_clust, quote = F, row.names = F, col.names = F) | |
| 88 | |
| 89 # Derive summary statistics for the clusters | |
| 90 # Result will be written out to the directory speficied by the "outpath" argument | |
| 91 searchCluster.batch(preprocessOutputFolder = working_dir, | |
| 92 outpath = working_out, | |
| 93 clusterLabel = cluster_label) | |
| 94 | |
| 95 result_files <- list.files(working_out, | |
| 96 pattern = "cluster_stats_in_each_sample", | |
| 97 recursive = T, | |
| 98 full.names = T) | |
| 99 no_results <- vector() | |
| 100 if (length(result_files) != nb_groups) { | |
| 101 groups_with_results <- sapply(result_files, path_to_group_file) | |
| 102 ## one or more groups with no results, figure out which | |
| 103 no_results <- setdiff(fcsnames, groups_with_results) | |
| 104 } | |
| 105 | |
| 106 if (length(no_results) == nb_groups) { | |
| 107 sink(uc) | |
| 108 cat("No clusters were found in none of the groups.") | |
| 109 sink() | |
| 110 } else { | |
| 111 unused_clrs <- list() | |
| 112 if (length(no_results > 0)) { | |
| 113 grp_no_results <- sapply(no_results, group_file_to_group_name) | |
| 114 unused_clrs <- data.frame("cluster_label" = "any", "not_found_in" = grp_no_results) | |
| 115 } | |
| 116 for (result in result_files) { | |
| 117 group_name <- strsplit(result, .Platform$file.sep)[[1]][2] | |
| 118 new_filename <- paste(c(group_name, "cluster_stats.txt"), collapse = "_") | |
| 119 new_path <- file.path(outdir, new_filename) | |
| 120 tmp_df <- read.csv(result) | |
| 121 | |
| 122 used_clr <- as.character(unique(tmp_df$label)) | |
| 123 if (length(used_clr) != length(cluster_label)) { | |
| 124 unused <- setdiff(cluster_label, used_clr) | |
| 125 tmp_udf <- data.frame("cluster_label" = unused, "not_found_in" = group_name) | |
| 126 unused_clrs <- rbind(unused_clrs, tmp_udf) | |
| 127 } | |
| 128 colnames(tmp_df)[[1]] <- "group_name" | |
| 129 write.table(tmp_df, new_path, quote = F, row.names = F, col.names = T, sep = "\t") | |
| 130 } | |
| 131 | |
| 132 if (is.null(dim(unused_clrs))) { | |
| 133 sink(uc) | |
| 134 cat("All provided cluster definition were found in provided FCS files.") | |
| 135 sink() | |
| 136 } else { | |
| 137 write.table(unused_clrs, uc, quote = F, row.names = F, col.names = T, sep = "\t") | |
| 138 } | |
| 139 } | |
| 140 } | |
| 141 | |
| 142 check_input <- function(params = vector(), report = "", fcs_files = list(), | |
| 143 grp_names = list(), quant = 0.95, events = 0.05, | |
| 144 cluster_algorithm = "FlowSOM", clusters = vector(), outdir = "", | |
| 145 list_clust = "", metacluster = 40, xdim = 10, ydim = 10, | |
| 146 seed = 42, unused = "") { | |
| 147 # check FCS files | |
| 148 fcspaths <- unlist(fcs_files) | |
| 149 fcsnames <- unlist(grp_names) | |
| 150 ct_files <- 0 | |
| 151 some_pb <- FALSE | |
| 152 for (i in seq_len(length(fcspaths))) { | |
| 153 is_file_valid <- FALSE | |
| 154 tryCatch({ | |
| 155 fcs <- read.FCS(fcspaths[[i]], transformation = FALSE) | |
| 156 is_file_valid <- TRUE | |
| 157 }, error = function(ex) { | |
| 158 print(paste("File is not a valid FCS file:", fcsnames[[i]], ex)) | |
| 159 }) | |
| 160 if (is_file_valid) { | |
| 161 metacyto_pp_check <- if ("sample_id" %in% colnames(fcs)) TRUE else FALSE | |
| 162 if (metacyto_pp_check) { | |
| 163 idx <- length(colnames(fcs)) | |
| 164 ct_files <- ct_files + max(fcs@exprs[, idx]) | |
| 165 } else { | |
| 166 quit(save = "no", status = 11, runLast = FALSE) | |
| 167 } | |
| 168 } else { | |
| 169 some_pb <- TRUE | |
| 170 } | |
| 171 } | |
| 172 # check summary file format | |
| 173 df <- read.table(report, sep = "\t", header = T, colClasses = "character") | |
| 174 nm <- colnames(df) | |
| 175 check_ab <- if ("antibodies" %in% nm) TRUE else FALSE | |
| 176 check_sdy <- if ("study_id" %in% nm) TRUE else FALSE | |
| 177 | |
| 178 if (check_sdy && check_ab) { | |
| 179 # check that summary index compatible with FCSs in collection - by number of files == index nb? | |
| 180 if (ct_files != length(df$antibodies)) { | |
| 181 quit(save = "no", status = 12, runLast = FALSE) | |
| 182 } | |
| 183 } else { | |
| 184 quit(save = "no", status = 13, runLast = FALSE) | |
| 185 } | |
| 186 | |
| 187 if (some_pb) { | |
| 188 quit(save = "no", status = 10, runLast = FALSE) | |
| 189 } else { | |
| 190 auto_cluster_panels(params, df, fcspaths, fcsnames, quant, events, | |
| 191 cluster_algorithm, clusters, outdir, list_clust, | |
| 192 metacluster, xdim, ydim, seed, unused) | |
| 193 } | |
| 194 } | |
| 195 | |
| 196 ################################################################################ | |
| 197 ################################################################################ | |
| 198 args <- commandArgs(trailingOnly = TRUE) | |
| 199 | |
| 200 ex_param <- c("FSC-A", "FSC-W", "FSC-H", "FSC", "SSC", "SSC-A", "SSC-W", | |
| 201 "SSC-H", "Time", "Cell_length", "cell_length", "CELL_LENGTH") | |
| 202 | |
| 203 if (args[5] != "" && args[5] != "None") { | |
| 204 tmp <- gsub(" ", "", args[5], fixed = TRUE) | |
| 205 eparam <- unlist(strsplit(tmp, ",")) | |
| 206 ex_param <- toupper(eparam) | |
| 207 } | |
| 208 | |
| 209 i <- grep(args, pattern = "PARAM") | |
| 210 ii <- grep(args, pattern = "FCS_FILES") | |
| 211 | |
| 212 cluster_def <- vector() | |
| 213 if (i > 8) { | |
| 214 id <- i - 1 | |
| 215 cl_df <- args[8:id] | |
| 216 cluster_def <- sapply(cl_df, check_cluster_def) | |
| 217 } | |
| 218 | |
| 219 metacluster <- 40 | |
| 220 xdim <- 10 | |
| 221 ydim <- 10 | |
| 222 seed <- 42 | |
| 223 if (i + 1 != ii) { | |
| 224 metacluster <- as.numeric(args[i + 1]) | |
| 225 xdim <- as.numeric(args[i + 2]) | |
| 226 ydim <- as.numeric(args[i + 3]) | |
| 227 seed <- as.numeric(args[i + 4]) | |
| 228 } | |
| 229 | |
| 230 fcs_files <- list() | |
| 231 fcs_names <- list() | |
| 232 j <- 1 | |
| 233 m <- ii + 1 | |
| 234 n <- length(args) - 1 | |
| 235 tmp_fcs <- args[m:n] | |
| 236 | |
| 237 for (k in seq_len(length(tmp_fcs))) { | |
| 238 if (k %% 2) { | |
| 239 fcs_files[[j]] <- tmp_fcs[[k]] | |
| 240 fcs_names[[j]] <- tmp_fcs[[k + 1]] | |
| 241 j <- j + 1 | |
| 242 } | |
| 243 } | |
| 244 | |
| 245 check_input(ex_param, args[1], fcs_files, fcs_names, as.numeric(args[3]), | |
| 246 as.numeric(args[4]), args[2], cluster_def, args[6], args[7], | |
| 247 metacluster, xdim, ydim, seed, args[length(args)]) |
