Mercurial > repos > azomics > metacyto_preprocess
comparison metacyto_preprocess.R @ 0:bf6470882a15 draft default tip
"planemo upload for repository https://github.com/AstraZeneca-Omics/immport-galaxy-tools/tree/master/flowtools/metacyto_preprocess commit c3d761b4fca140636c3f22ef0fdbb855f3ecbdb8"
| author | azomics |
|---|---|
| date | Sun, 25 Jul 2021 10:36:03 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:bf6470882a15 |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 ###################################################################### | |
| 3 # Copyright (c) 2018 Northrop Grumman. | |
| 4 # All rights reserved. | |
| 5 ###################################################################### | |
| 6 # | |
| 7 # Version 1 - January 2018 | |
| 8 # Author: Cristel Thomas | |
| 9 # | |
| 10 # | |
| 11 | |
| 12 library(flowCore) | |
| 13 library(MetaCyto) | |
| 14 | |
| 15 compare_lists <- function(m1, m2) { | |
| 16 list_check <- T | |
| 17 if (is.na(all(m1 == m2))) { | |
| 18 mm1 <- is.na(m1) | |
| 19 mm2 <- is.na(m2) | |
| 20 if (all(mm1 == mm2)) { | |
| 21 if (!all(m1 == m2, na.rm = TRUE)) { | |
| 22 list_check <- F | |
| 23 } | |
| 24 } else { | |
| 25 list_check <- F | |
| 26 } | |
| 27 } else if (!all(m1 == m2)) { | |
| 28 list_check <- F | |
| 29 } | |
| 30 return(list_check) | |
| 31 } | |
| 32 | |
| 33 | |
| 34 run_batch_processing <- function(sampling_size = 5000, flag_default = T, | |
| 35 to_exclude, outdir = "", outfile = "", | |
| 36 labels, assays, factors, fcspaths, fcsnames) { | |
| 37 # Create meta_data object | |
| 38 fp <- unlist(fcspaths) | |
| 39 file_counts <- lengths(fcspaths) | |
| 40 group_names <- rep(labels, times = file_counts) | |
| 41 group_bs <- rep(factors, times = file_counts) | |
| 42 group_types <- rep(assays, times = file_counts) | |
| 43 | |
| 44 meta_data <- data.frame(fcs_files = fp, study_id = group_names) | |
| 45 | |
| 46 # excluded_parameters | |
| 47 default_param <- c("FSC-A", "FSC-H", "FSC-W", "FSC", "SSC-A", "SSC-H", | |
| 48 "SSC-W", "SSC", "Time", "Cell_length", "cell_length", | |
| 49 "CELL_LENGTH") | |
| 50 excluded_parameters <- if (flag_default) default_param else to_exclude | |
| 51 # Run preprocessing.batch | |
| 52 preprocessing.batch(inputMeta = meta_data, | |
| 53 assay = group_types, | |
| 54 b = group_bs, | |
| 55 fileSampleSize = sampling_size, | |
| 56 outpath = outdir, | |
| 57 excludeTransformParameters = excluded_parameters) | |
| 58 | |
| 59 # deal with outputs | |
| 60 # output[2]: a csv file summarizing the pre-processing result. | |
| 61 ## -> open file to pull info and print out filenames rather than path. | |
| 62 tmp_csv <- file.path(outdir, "processed_sample_summary.csv") | |
| 63 tmp <- read.csv(tmp_csv) | |
| 64 tmp$old_index <- seq(1, length(tmp$fcs_names)) | |
| 65 | |
| 66 fn <- unlist(fcsnames) | |
| 67 df <- data.frame(fcs_files = fp, filenames = fn) | |
| 68 | |
| 69 # merge two data frames by ID | |
| 70 total <- merge(tmp, df, by = "fcs_files") | |
| 71 total2 <- total[order(total$old_index), ] | |
| 72 to_drop <- c("fcs_names", "fcs_files", "old_index") | |
| 73 newdf <- total2[, !(names(total2) %in% to_drop)] | |
| 74 write.table(newdf, file = outfile, quote = F, row.names = F, col.names = T, sep = "\t") | |
| 75 | |
| 76 file.remove(tmp_csv) | |
| 77 } | |
| 78 | |
| 79 check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude, | |
| 80 outdir = "", outfile = "", labels, assays, factors, | |
| 81 fcspaths, fcsnames) { | |
| 82 | |
| 83 if (length(labels) > length(unique(labels))) { | |
| 84 # we have repeated group names, all group names need to be different | |
| 85 print("ERROR: repeated labels among groups, make sure that labels are all different for groups.") | |
| 86 print("The following labels are repeated") | |
| 87 table(labels)[table(labels) > 1] | |
| 88 quit(save = "no", status = 13, runLast = FALSE) | |
| 89 } | |
| 90 | |
| 91 marker_pb <- FALSE | |
| 92 for (i in seq_len(length(fcspaths))) { | |
| 93 for (n in seq_len(length(fcspaths[[i]]))) { | |
| 94 marker_check <- FALSE | |
| 95 marker_channel <- FALSE | |
| 96 tryCatch({ | |
| 97 fcs <- read.FCS(fcspaths[[i]][[n]], transformation = FALSE) | |
| 98 }, error = function(ex) { | |
| 99 print(paste("File is not a valid FCS file:", fnames[[i]][[n]], ex)) | |
| 100 quit(save = "no", status = 10, runLast = FALSE) | |
| 101 }) | |
| 102 | |
| 103 if (n == 1) { | |
| 104 m1 <- as.vector(pData(parameters(fcs))$desc) | |
| 105 c1 <- colnames(fcs) | |
| 106 } else { | |
| 107 m2 <- as.vector(pData(parameters(fcs))$desc) | |
| 108 c2 <- colnames(fcs) | |
| 109 marker_check <- compare_lists(m1, m2) | |
| 110 marker_channel <- compare_lists(c1, c2) | |
| 111 } | |
| 112 if (n > 1 && marker_check == F) { | |
| 113 marker_pb <- TRUE | |
| 114 print(paste("Marker discrepancy detected in markers -- group", labels[[i]])) | |
| 115 } else if (n > 1 && marker_channel == F) { | |
| 116 marker_pb <- TRUE | |
| 117 print(paste("Marker discrepancy detected in channels -- group", labels[[i]])) | |
| 118 } | |
| 119 } | |
| 120 } | |
| 121 | |
| 122 if (marker_pb) { | |
| 123 quit(save = "no", status = 12, runLast = FALSE) | |
| 124 } else { | |
| 125 run_batch_processing(sampling, flag_default, to_exclude, outdir, outfile, | |
| 126 labels, assays, factors, fcspaths, fcsnames) | |
| 127 } | |
| 128 } | |
| 129 | |
| 130 ################################################################################ | |
| 131 ################################################################################ | |
| 132 args <- commandArgs(trailingOnly = TRUE) | |
| 133 | |
| 134 # Arg 1: sub_sampling number | |
| 135 # Arg 2: output dir for processed FCS files for check_fcs and run_batch_processing | |
| 136 # Arg 3: Main output file (text file) | |
| 137 # Arg 4: excluded params | |
| 138 # Arg 5: Group 1 Name | |
| 139 # Arg 6: Group 1 format | |
| 140 # Arg 7: Group 1 Scaling factor | |
| 141 # Cycle through files in group 1 | |
| 142 # Arg : file path in Galaxy | |
| 143 # Arg : desired real file name | |
| 144 # Cycle through at at least one additional group | |
| 145 # Arg : 'new_panel' - used as some sort of delimiter | |
| 146 # Arg : Group n+1 Name | |
| 147 # Arg : Group n+1 format | |
| 148 # Arg : Group n+1 Scaling factor | |
| 149 ## Cycle through files in that group | |
| 150 ## Arg : file path in Galaxy | |
| 151 ## Arg : desired real file path | |
| 152 | |
| 153 sub_sampling <- NULL | |
| 154 if (as.numeric(args[1]) > 0) { | |
| 155 sub_sampling <- as.numeric(args[1]) | |
| 156 } | |
| 157 | |
| 158 # parameters to exclude => args[4] | |
| 159 to_exclude <- vector() | |
| 160 flag_default <- FALSE | |
| 161 i <- 1 | |
| 162 if (args[4] == "None" || args[4] == "") { | |
| 163 flag_default <- TRUE | |
| 164 } else { | |
| 165 excluded <- unlist(strsplit(args[4], ",")) | |
| 166 for (channel in excluded) { | |
| 167 stripped_chan <- gsub(" ", "", channel, fixed = TRUE) | |
| 168 if (!is.na(stripped_chan)) { | |
| 169 to_exclude[[i]] <- stripped_chan | |
| 170 } | |
| 171 i <- i + 1 | |
| 172 } | |
| 173 } | |
| 174 | |
| 175 # handle group cycle in arguments to produce iterable panels | |
| 176 tot_args <- length(args) | |
| 177 tmpargs <- paste(args[5:tot_args], collapse = "=%=") | |
| 178 tmppanels <- strsplit(tmpargs, "=%=new_panel=%=") | |
| 179 nb_panel <- length(tmppanels[[1]]) | |
| 180 | |
| 181 labels <- vector(mode = "character", length = nb_panel) | |
| 182 assay_types <- vector(mode = "character", length = nb_panel) | |
| 183 scaling_factors <- vector(mode = "numeric", length = nb_panel) | |
| 184 filepaths <- list() | |
| 185 filenames <- list() | |
| 186 | |
| 187 # iterate over panels (groups of fcs files) | |
| 188 j <- 1 | |
| 189 for (pnl in tmppanels[[1]]) { | |
| 190 tmppanel <- strsplit(pnl, "=%=") | |
| 191 # number of FCS files | |
| 192 nb_files <- (length(tmppanel[[1]]) - 3) / 2 | |
| 193 tmplist <- character(nb_files) | |
| 194 tmpnames <- character(nb_files) | |
| 195 if (tmppanel[[1]][[1]] == "None" || tmppanel[[1]][[1]] == "") { | |
| 196 print(paste("ERROR: Empty group name/label for group ", j)) | |
| 197 quit(save = "no", status = 11, runLast = FALSE) | |
| 198 } else { | |
| 199 labels[[j]] <- tmppanel[[1]][[1]] | |
| 200 } | |
| 201 # assay type | |
| 202 assay_types[[j]] <- tmppanel[[1]][[2]] | |
| 203 | |
| 204 scaling_factors[[j]] <- 0 | |
| 205 if (as.numeric(tmppanel[[1]][[3]]) > 0) { | |
| 206 scaling_factors[[j]] <- 1 / as.numeric(tmppanel[[1]][[3]]) | |
| 207 } | |
| 208 | |
| 209 k <- 1 | |
| 210 for (m in 4:length(tmppanel[[1]])) { | |
| 211 if (!m %% 2) { | |
| 212 tmplist[[k]] <- tmppanel[[1]][[m]] | |
| 213 tmpnames[[k]] <- tmppanel[[1]][[m + 1]] | |
| 214 k <- k + 1 | |
| 215 } | |
| 216 } | |
| 217 filepaths[[tmppanel[[1]][1]]] <- tmplist | |
| 218 filenames[[tmppanel[[1]][1]]] <- tmpnames | |
| 219 j <- j + 1 | |
| 220 } | |
| 221 | |
| 222 check_fcs(sub_sampling, flag_default, to_exclude, args[2], args[3], labels, | |
| 223 assay_types, scaling_factors, filepaths, filenames) | |
| 224 | |
| 225 # check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude, | |
| 226 # outdir = "", outfile = "", labels, assays, factors, | |
| 227 # fcspaths, fcsnames) |
