notos: KDEanalysis.r comparison

comparison KDEanalysis.r @ 0:1535ffddeff4 draft

planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty

author	cristian
date	Thu, 07 Sep 2017 08:51:57 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1535ffddeff4
+# Carry out analysis of CpGo/E data for Galaxy module
+# Ingo Bulla
+# 27 Jan 16
+# load packages
+pckg <- c("methods", "optparse")
+for (p in pckg) {
+if (!(p %in% rownames(installed.packages()))) {
+stop( paste("R package", p , "is not installed"), call. = FALSE)
+}
+}
+require(methods, quietly = TRUE)
+require(optparse, quietly = TRUE)
+# determine directory where functions are located
+cmdArgs <- commandArgs(trailingOnly = FALSE)
+str <- "--file="
+match <- grep(str, cmdArgs)
+if (length(match) == 0) {
+stop("notos.r not set up to be called from R console")
+}
+path <- normalizePath( sub(str, "", cmdArgs[match]) )
+FCTN.DIR <- file.path(dirname(path), "Functions")
+source( file.path( FCTN.DIR, "Kernel_function_form.R") )
+MAX.CPGOE <- 10   # maximum value for CpGo/e ratios
+# process outliers and return quantities characterizing the distribution
+# obs: CpGo/e ratios
+proc.outliers <- function(obs, frac.outl) {
+ret <- list()
+# remove all zeros from sample
+no.obs.raw <- length(obs)
+ret[["prop.zero"]] <- sum(obs == 0) / no.obs.raw
+obs <- obs[obs != 0]
+if (length(obs) < 3) {
+ret[["valid"]] <- FALSE
+return(ret)
+}
+ret[["obs.nz"]] <- obs
+# replace very large values by a maximum value
+obs <- sapply(obs, function(x) min(x, MAX.CPGOE))
+# defining variables
+# ... mean, median and standard deviation
+ret[["mu.obs"]] <- mu.obs <- mean(obs)
+ret[["me.obs"]] <- me.obs <- median(obs)
+sd.obs <- sd(obs)
+iqr.obs <- IQR(obs)
+# ... uppper and lower limits, based on mean +- k * sd, med. +- k * iqr, k = 2, ..., 4
+ul.mu <- mu.obs + (2 : 5) * sd.obs
+ll.mu <- mu.obs - (2 : 5) * sd.obs
+ul.me <- quantile(obs, 0.75) + (2 : 5) * iqr.obs
+ll.me <- quantile(obs, 0.25) - (2 : 5) * iqr.obs
+names(ul.mu) <- names(ll.mu) <- 2 : 5
+names(ul.me) <- names(ll.me) <- 2 : 5
+ret[["ul.mu"]] <- ul.mu
+ret[["ll.mu"]] <- ll.mu
+ret[["ul.me"]] <- ul.me
+ret[["ll.me"]] <- ll.me
+# summary statistics and data output
+# ... calculate proportion of data excluded when using different ranges
+ret[["prop2"]] <- prop2 <- length(obs[obs < ll.me["2"] | ul.me["2"] < obs]) / no.obs.raw
+ret[["prop3"]] <- prop3 <- length(obs[obs < ll.me["3"] | ul.me["3"] < obs]) / no.obs.raw
+ret[["prop4"]] <- prop4 <- length(obs[obs < ll.me["4"] | ul.me["4"] < obs]) / no.obs.raw
+ret[["prop5"]] <- prop5 <- length(obs[obs < ll.me["5"] | ul.me["5"] < obs]) / no.obs.raw
+# ... choose k in Q1 / Q3 +- k * IQR such that no more than 1% of the data are excluded
+v <- c(prop2, prop3, prop4, prop5) < frac.outl
+if (any(v)) {
+excl.crit <- min(which(v))
+ret[["obs.cl"]] <- obs[!(obs < ll.me[excl.crit] | ul.me[excl.crit] < obs)]
+ret[["used"]] <- paste(2 : 5, "iqr", sep = "")[excl.crit]
+} else {
+ret[["obs.cl"]] <- obs[!(obs < ll.me[4] | ul.me[4] < obs)]
+ret[["used"]] <- "limited to 5 * iqr"
+}
+ret[["valid"]] <- TRUE
+return(ret)
+}
+# Read CpGo/e ratios from file
+# warn: issue warning if necessary
+read.CpGoe <- function(fname, warn) {
+	# read input file line by line, split by whitespaces, assign last substring to CpGo/e ratios
+	# ... remove comments and trailing whitespaces
+	print(fname)
+	v <- read.table(fname,  fill = TRUE, col.names = c("seq", "val"))
+	obs <- v$val
+	obs <- obs[!is.na(obs)]
+	return(obs)
+}
+# process command line arguments
+# expected arguments:
+# - names of the species (each as a separate argument)
+# - names of CpGo/e files of the species (each as a separate argument)
+# ... parse arguments
+option_list <- list(make_option(c("-o", "--frac-outl"), type = "double", default = 0.01,
+help = "maximum fraction of CpGo/e ratios excluded as outliers [default %default]"),
+make_option(c("-d", "--min-dist"), type = "double", default = 0.2,
+help = "minimum distance between modes, modes that are closer are joined [default %default]"),
+make_option(c("-c", "--conf-level"), type = "double", default = 0.95,
+help = "level of the confidence intervals of the mode positions [default %default]"),
+make_option(c("-m", "--mode-mass"), type = "double", default = 0.05,
+help = "minimum probability mass of a mode [default %default]"),
+make_option(c("-b", "--band-width"), type = "double", default = 1.06,
+help = "bandwidth constant for kernels [default %default]"),
+make_option(c("-B", "--bootstrap"), action="store_true", default = FALSE,
+help = "calculate confidence intervals of mode positions using bootstrap [default %default]"),
+make_option(c("-r", "--bootstrap-reps"), type = "integer", default = 1500,
+help = "number of bootstrap repetitions [default %default]"),
+make_option(c("-H", "--outlier-hist-file"), type = "character", default = "outliers_hist.pdf",
+help = "name of the output file for the outlier histograms [default %default]"),
+make_option(c("-C", "--cutoff-file"), type = "character", default = "outliers_cutoff.csv",
+help = "name of the output file for the outlier cutoff [default %default]"),
+make_option(c("-k", "--kde-file"), type = "character", default = "KDE.pdf",
+help = "name of the output file for the KDE [default %default]"),
+make_option(c("-v", "--valley-file"), type = "character", default = "valleys.csv",
+help = "name of the output file with the values for valleys of the KDE [default %default]"),
+make_option(c("-p", "--peak-file"), type = "character", default = "modes_basic_stats.csv",
+help = "name of the output file describing the peaks of the KDE [default %default]"),
+make_option(c("-s", "--bootstrap-file"), type = "character", default = "modes_bootstrap.csv",
+help = "name of the output file for the bootstrap results [default %default]"),
+make_option(c("-u", "--summary-file"), type = "character", default = "summary.csv",
+help = "name of the summary file for the KDE results [default %default]"),
+make_option(c("-f", "--no-warning-few-seqs"), action = "store_true", default = FALSE,
+help = paste("suppress warning in case the input file only contains few values ",
+"[default %default]", sep = "")))
+op <- OptionParser(usage = "notos.r [options] spc_name_1 ... spc_name_N CpGoe_file_name_1 ... CpGoe_file_name_N",
+description = paste("\nDescription: Notos generates a histogram and a kernel density estimator from files containing CpGo/e ratios. ",
+"Moreover, it determines the number of modes of the CpGo/e ratio for each input file. The input files ",
+"can either be composed of \n",
+"1) CpGo/e ratios separated by linebreaks or\n",
+"2) sequence names and CpGo/e ratios with each sequence name put on a separate line together with its CpGo/e ratio ",
+"and sequence and CpGo/e being separated by whitespaces on each line.", sep = ""),
+option_list = option_list)
+args <- parse_args(op, positional_arguments = c(2, Inf))
+num.args <- length(args$args)
+use.bstrp <- args$options$`bootstrap`
+supp.warn.few <- args$options$`no-warning-few-seqs`
+# ... check number of arguments
+# ... ... check number of mandatory arguments
+if (num.args < 2) {
+stop("One species name and one file containing CpGo/e ratios have to be provided")
+}
+# ... ... check whether number of mandatory arguments is even
+if (num.args %% 2 != 0) {
+stop("Number of arguments has to be even")
+}
+# ... ... check maximum fraction of CpGo/e ratios excluded as outliers
+frac.outl <- args$options$`frac-outl`
+if ((frac.outl <= 0) || (frac.outl >= 1)) {
+stop("The maximum fraction of CpGo/e ratios excluded as outliers has to be greater than zero and less than one")
+}
+if (frac.outl >= 0.2) {
+warning("The maximum fraction of CpGo/e ratios excluded as outliers has been set to a rather large value, resulting in the removal of many CpGo/e ratios")
+}
+# ... check numerical arguments
+# ... ... check minimum distance between modes
+min.dist <- args$options$`min-dist`
+if (min.dist < 0) {
+stop("The minimum distance between modes has to be equal to or larger than zero")
+}
+if (min.dist >= 0.4) {
+warning("The minimum distance between modes has been set to a rather large value, resulting in a strong reduction of the number of modes")
+}
+# ... ... check confidence level
+conf.lev <- args$options$`conf-level`
+if ((conf.lev <= 0) || (conf.lev >= 1)) {
+stop("The level of the confidence intervals of the mode positions has to be larger than zero and smaller than one.")
+}
+if (conf.lev >= 0.995) {
+warning("The level of the confidence intervals of the mode positions has been set to a rather high value, resulting in very broad confidence intervals")
+}
+# ... ... check minimum probability mass of a mode
+mode.mass <- args$options$`mode-mass`
+if ((mode.mass < 0) || (mode.mass >= 1)) {
+stop("The minimum probability mass of a mode has to be larger than or equal to zero and smaller than one.")
+}
+if (mode.mass >= 0.3) {
+warning("The minimum probability mass of a mode has been set to a rather large value, resulting in the elemination of a high number of modes.")
+}
+# ... ... check bandwidth constant
+band.width <- args$options$`band-width`
+if (band.width <= 0) {
+stop("The bandwidth constant has to be positive")
+}
+if (band.width >= 5) {
+warning("The bandwidth constant has to been set to a rather large value, resulting in a strong smoothing")
+}
+# ... ... check number of boostrap repetitions
+bstrp.reps <- args$options$`bootstrap-reps`
+if (bstrp.reps != round(bstrp.reps)) {
+stop("The number of boostrap repetitions has to be a positive integer")
+}
+if (bstrp.reps <= 0) {
+stop("The number of boostrap repetitions has to be positive")
+}
+if (bstrp.reps >= 10000) {
+warning("The number of boostrap repetitions has been set to a rather large value, resulting in a long running time")
+}
+# ... check file name arguments
+# ... ... check histogram output file name
+outlier.hist.fname <- args$options$`outlier-hist-file`
+if ( file.exists(outlier.hist.fname) && (file.info(outlier.hist.fname)$isdir) ) {
+stop(paste("File name for the outlier histogram output refers to a directory:", outlier.hist.fname))
+}
+v <- strsplit(outlier.hist.fname, split = ".", fixed = TRUE)[[1]]
+if ((length(v) == 1) || (v[ length(v) ] != "pdf")) {
+warning(paste("File name for the outlier histogram output does not have a .pdf extension:", outlier.hist.fname))
+}
+g <- gregexpr(pattern ='/', outlier.hist.fname)[[1]]
+if (as.vector(g)[1] != -1) {
+v <- as.vector(g)
+d <- substr(outlier.hist.fname, 1, v[length(v)])
+if (!file.exists(d)) {
+stop(paste("Path to file for the outlier histogram output is not valid:", outlier.hist.fname))
+}
+}
+# ... ... check outlier cutoff output file name
+cutoff.fname <- args$options$`cutoff-file`
+if ( file.exists(cutoff.fname) && (file.info(cutoff.fname)$isdir) ) {
+stop(paste("File name for the outlier cutoff table output refers to a directory:", cutoff.fname))
+}
+v <- strsplit(cutoff.fname, split = ".", fixed = TRUE)[[1]]
+if (length(v) == 1) {
+stop(paste("File name for the outlier cutoff table output does not have a file extension:", cutoff.fname))
+}
+#if (v[ length(v) ] != "xlsx") {
+#  warning(paste("File name for the outlier cutoff table output does not have a .xlsx extension:", cutoff.fname))
+#}
+g <- gregexpr(pattern ='/', cutoff.fname)[[1]]
+if (as.vector(g)[1] != -1) {
+v <- as.vector(g)
+d <- substr(cutoff.fname, 1, v[length(v)])
+if (!file.exists(d)) {
+stop(paste("Path to file for the outlier cutoff is not valid:", cutoff.fname))
+}
+}
+# ... ... check KDE output file name
+kde.fname <- args$options$`kde-file`
+if ( file.exists(kde.fname) && (file.info(kde.fname)$isdir) ) {
+stop(paste("File name for the KDE output refers to a directory:", kde.fname))
+}
+v <- strsplit(kde.fname, split = ".", fixed = TRUE)[[1]]
+if ((length(v) == 1) || (v[ length(v) ] != "pdf")) {
+warning(paste("File name for the KDE output does not have a .pdf extension:", kde.fname))
+}
+g <- gregexpr(pattern ='/', kde.fname)[[1]]
+if (as.vector(g)[1] != -1) {
+v <- as.vector(g)
+d <- substr(kde.fname, 1, v[length(v)])
+if (!file.exists(d)) {
+stop(paste("Path to file for the KDE output is not valid:", kde.fname))
+}
+}
+# ... ... check peak descriptives output file name
+peak.fname <- args$options$`peak-file`
+if ( file.exists(peak.fname) && (file.info(peak.fname)$isdir) ) {
+stop(paste("File name for the peak descriptives refers to a directory:", peak.fname))
+}
+v <- strsplit(peak.fname, split = ".", fixed = TRUE)[[1]]
+if ((length(v) == 1) || (v[ length(v) ] != "csv")) {
+warning(paste("File name for the peak descriptives does not have a .csv extension:", peak.fname))
+}
+g <- gregexpr(pattern ='/', peak.fname)[[1]]
+if (as.vector(g)[1] != -1) {
+v <- as.vector(g)
+d <- substr(peak.fname, 1, v[length(v)])
+if (!file.exists(d)) {
+stop(paste("Path to file for the peak descriptives is not valid:", peak.fname))
+}
+}
+# ... ... check bootstrap results output file name
+bstrp.fname <- args$options$`bootstrap-file`
+if ( file.exists(bstrp.fname) && (file.info(bstrp.fname)$isdir) ) {
+stop(paste("File name for the bootstrap results refers to a directory:", bstrp.fname))
+}
+v <- strsplit(bstrp.fname, split = ".", fixed = TRUE)[[1]]
+if ((length(v) == 1) || (v[ length(v) ] != "csv")) {
+warning(paste("File name for the bootstrap results does not have a .csv extension:", bstrp.fname))
+}
+g <- gregexpr(pattern ='/', bstrp.fname)[[1]]
+if (as.vector(g)[1] != -1) {
+v <- as.vector(g)
+d <- substr(bstrp.fname, 1, v[length(v)])
+if (!file.exists(d)) {
+stop(paste("Path to file for the bootstrap results is not valid:", bstrp.fname))
+}
+}
+# ... ... check summary results output file name
+summ.fname <- args$options$`summary-file`
+if ( file.exists(summ.fname) && (file.info(summ.fname)$isdir) ) {
+stop(paste("File name for the bootstrap results refers to a directory:", summ.fname))
+}
+v <- strsplit(summ.fname, split = ".", fixed = TRUE)[[1]]
+if ((length(v) == 1) || (v[ length(v) ] != "csv")) {
+warning(paste("File name for the bootstrap results does not have a .csv extension:", summ.fname))
+}
+g <- gregexpr(pattern ='/', summ.fname)[[1]]
+if (as.vector(g)[1] != -1) {
+v <- as.vector(g)
+d <- substr(summ.fname, 1, v[length(v)])
+if (!file.exists(d)) {
+stop(paste("Path to file for the bootstrap results is not valid:", summ.fname))
+}
+}
+# ... ... check CpGo/e input file names
+num.spec <- num.args / 2
+spec.names <- args$args[1:num.spec]
+cpgoe.fnames <- args$args[(num.spec + 1):num.args]
+for (i in 1:length(cpgoe.fnames)) {
+if (!file.exists(cpgoe.fnames[i])) {
+stop(paste("CpGo/e file does not exist:", cpgoe.fnames[i]))
+}
+if (file.info(cpgoe.fnames[i])$isdir) {
+stop(paste("CpGo/e file name refers to a directory:", cpgoe.fnames[i]))
+}
+}
+valleys.fname <- args$options$`valley-file`
+# remove outliers and output histograms
+# ... set up table with cutoff quantities
+tab.des <- data.frame(matrix(NA, nrow = num.spec, ncol = 6))
+names(tab.des) <- c("prop.zero", "prop.out.2iqr", "prop.out.3iqr",
+"prop.out.4iqr", "prop.out.5iqr", "used")
+rownames(tab.des) <- spec.names
+# ... set up figure
+t.height <- 6
+t.width <- 20
+pdf(outlier.hist.fname, height = t.height,width = t.width, paper = "special")
+par(mfrow = c(1, 3), mgp = c(2, 0.5, 0), mar = c(4.0, 3.0, 1.5, 1))
+tmp.fnames <- c()
+# ... iterate through species
+for (i in 1:num.spec) {
+fname <- cpgoe.fnames[i]
+obs <- read.CpGoe(fname, TRUE)
+# check CpGo/e ratios
+for (j in 1:length(obs)) {
+# is format legal?
+val <- as.numeric( obs[j] )
+err.str <- paste("Observation", i, "in", fname)
+if (!is.finite(val)) {
+stop(paste(err.str, "could not be converted to a number:", obs[j]))
+}
+# is ratio too small / large?
+if (val < 0) {
+stop(paste(err.str, "is negative:", val))
+} else {
+if (val > MAX.CPGOE) {
+warning(paste(err.str   , "is suspiciously large:", val, "\nthis value is replaced by", MAX.CPGOE))
+}
+}
+}
+# process outliers and store the results
+obs.org <- obs
+l <- proc.outliers(obs, frac.outl)
+if (!l[["valid"]]) {
+stop( paste("Too few values in", fname, "(less than 3) after removal of zeros"), call. = FALSE )
+}
+tab.des[i, "prop.zero"] <- l[["prop.zero"]]
+mu.obs <- l[["mu.obs"]]
+me.obs <- l[["me.obs"]]
+ul.mu <- l[["ul.mu"]]
+ll.mu <- l[["ll.mu"]]
+ul.me <- l[["ul.me"]]
+ll.me <- l[["ll.me"]]
+tab.des[i, "prop.out.2iqr"] <- l[["prop2"]]
+tab.des[i, "prop.out.3iqr"] <- l[["prop3"]]
+tab.des[i, "prop.out.4iqr"] <- l[["prop4"]]
+tab.des[i, "prop.out.5iqr"] <- l[["prop5"]]
+obs.cl <- l[["obs.cl"]]
+obs.nz <- l[["obs.nz"]]
+tab.des[i, "used"] <- l[["used"]]
+tab.des[i, "no.obs.raw"] <- length(obs.org)
+tab.des[i, "no.obs.nozero"] <- length(obs.nz)
+tab.des[i, "no.obs.clean"] <- length(obs.cl)
+usedindex <- substr(l[["used"]],1,1)
+# Histograms
+# ... histogram 1: original data with zeros
+t.breaks <- seq(0, max(obs.org) + 1, by = 0.03)
+t.xlim <- c(0, ul.me["5"] + 0.1)
+hist(obs.org, breaks = t.breaks, xlim = t.xlim, xlab = "CpG o/e", main = "",
+sub = "Original data", prob = TRUE,
+	  col = grey(0.9), border = grey(0.6))
+mtext(paste(spec.names[i]), side = 3, adj = 0)
+# ... histogram 3: median / iqr based
+t.lty <- rep(3, 4)
+t.lty[usedindex] <- 1
+hist(obs.nz, breaks = t.breaks, xlim = t.xlim, xlab = "CpG o/e", main = "",
+sub = "Data without zeros, Q1/3 +- k*IQR, k=2,...,5", prob = TRUE,
+	  col = grey(0.9), border = grey(0.6))
+abline(v = me.obs, col = 'blue', lwd = 2)
+abline(v = c(ll.me, ul.me), col = "red", lty = rep(t.lty, 2))
+# ... histogram 4: cleaned data
+hist(obs.cl, breaks = t.breaks, xlim = t.xlim, xlab = "CpG o/e", main = "",
+sub = "Cleaned data", prob = TRUE,
+	  col = grey(0.9), border = grey(0.6))
+abline(v = me.obs, col = 'blue', lwd = 2)
+abline(v = c(ll.me[usedindex], ul.me[usedindex]), col = "red")
+}
+invisible(dev.off())
+# output cutoff quantities
+write.table(tab.des, file = cutoff.fname, sep = "\t", col.names=NA)
+# plot KDE and output quantities characterizing the peaks and the bootstrap results
+# ... table with quantities characterizing the peaks
+v <- col.names.peaks()
+tab1.m <- data.frame(matrix(NA, nrow = num.spec, ncol = length(v)))
+names(tab1.m) <- col.names.peaks()
+rownames(tab1.m) <- spec.names
+# ... table for the bootstrap
+tab2.m <- data.frame(matrix(NA, nrow = num.spec, ncol = 7))
+names(tab2.m) <- col.names.bs()
+rownames(tab2.m) <- spec.names
+# summary table
+sum1.m <-  data.frame(matrix(NA, nrow = num.spec, ncol = 13))
+names(sum1.m) <- c("Modes", "Skewness", "Variance", "Modes too close", "Peak1", "Peak2", "Peak3", "Peak4", "Peak5", "Peak6", "Peak7", "Peak8", "Peak9")
+rownames(sum1.m) <- spec.names
+# ... plotting
+t.height <- 6
+t.width <- 20
+pdf(kde.fname, height = t.height,width = t.width, paper = "special")
+for (i in 1:num.spec) {
+# read in GcGo/e ratios
+obs <- read.CpGoe(cpgoe.fnames[i], FALSE)
+l <- proc.outliers(obs, frac.outl)
+obs.cl <- l[["obs.cl"]]
+# check number of values
+fname <- cpgoe.fnames[i]
+if (length(obs.cl) < 3) {
+stop( paste("Too few values in", fname, "(less than 3) after removal of outliers and zeros"), call. = FALSE )
+}
+if (!supp.warn.few & length(obs.cl) < 250) {
+warning( paste(fname, " contains only few values (", length(obs.cl), ") after removal of outliers and zeros, which may lead to unreliable results", sep = ""), call. = FALSE )
+}
+# plotting
+l <- plot.KDE(obs.cl, t.name = spec.names[i], bs.cis = use.bstrp, bstrp.reps = bstrp.reps, conf.lev = conf.lev,
+min.dist = min.dist, mode.mass = mode.mass, band.width = band.width)
+tab1.m[i, ] <- l$tab.des
+sum1.m[i, ] <- l$tab.des[c(1, 4, 33, 30, 10+(2*0:8))]
+if (use.bstrp) {
+tab2.m[i, ] <- l$tab.bs
+}
+valleys = l$valleys
+}
+invisible(dev.off())
+#sessionInfo()
+# ... output quantities in tables
+write.table(sum1.m, file = summ.fname, sep = "\t", col.names = NA)
+write.table(tab1.m, file = peak.fname, sep = "\t", col.names=NA)
+write.table(valleys, file = valleys.fname, sep = "\t", col.names=NA)
+if (use.bstrp) {
+write.table(tab2.m, file = bstrp.fname, sep = "\t", col.names=NA)
+}

Mercurial > repos > cristian > notos

comparison KDEanalysis.r @ 0:1535ffddeff4 draft