view KDEanalysis.r @ 0:1535ffddeff4 draft

planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
author cristian
date Thu, 07 Sep 2017 08:51:57 -0400
parents
children
line wrap: on
line source

# Carry out analysis of CpGo/E data for Galaxy module
# Ingo Bulla
# 27 Jan 16

# load packages
pckg <- c("methods", "optparse")
for (p in pckg) {
  if (!(p %in% rownames(installed.packages()))) {
    stop( paste("R package", p , "is not installed"), call. = FALSE)
  }
}
require(methods, quietly = TRUE)
require(optparse, quietly = TRUE)

# determine directory where functions are located
cmdArgs <- commandArgs(trailingOnly = FALSE)
str <- "--file="
match <- grep(str, cmdArgs)
if (length(match) == 0) {
  stop("notos.r not set up to be called from R console")
}
path <- normalizePath( sub(str, "", cmdArgs[match]) )
FCTN.DIR <- file.path(dirname(path), "Functions")

source( file.path( FCTN.DIR, "Kernel_function_form.R") )


MAX.CPGOE <- 10   # maximum value for CpGo/e ratios


# process outliers and return quantities characterizing the distribution
# obs: CpGo/e ratios
proc.outliers <- function(obs, frac.outl) {
  ret <- list()

  # remove all zeros from sample
  no.obs.raw <- length(obs)
  ret[["prop.zero"]] <- sum(obs == 0) / no.obs.raw
  obs <- obs[obs != 0]
  if (length(obs) < 3) {
    ret[["valid"]] <- FALSE
    return(ret)
  }
  ret[["obs.nz"]] <- obs

  # replace very large values by a maximum value
  obs <- sapply(obs, function(x) min(x, MAX.CPGOE))

  # defining variables
  # ... mean, median and standard deviation
  ret[["mu.obs"]] <- mu.obs <- mean(obs)
  ret[["me.obs"]] <- me.obs <- median(obs)
  sd.obs <- sd(obs)
  iqr.obs <- IQR(obs)

  # ... uppper and lower limits, based on mean +- k * sd, med. +- k * iqr, k = 2, ..., 4
  ul.mu <- mu.obs + (2 : 5) * sd.obs
  ll.mu <- mu.obs - (2 : 5) * sd.obs
  ul.me <- quantile(obs, 0.75) + (2 : 5) * iqr.obs
  ll.me <- quantile(obs, 0.25) - (2 : 5) * iqr.obs
  names(ul.mu) <- names(ll.mu) <- 2 : 5
  names(ul.me) <- names(ll.me) <- 2 : 5
  ret[["ul.mu"]] <- ul.mu
  ret[["ll.mu"]] <- ll.mu
  ret[["ul.me"]] <- ul.me
  ret[["ll.me"]] <- ll.me

  # summary statistics and data output
  # ... calculate proportion of data excluded when using different ranges
  ret[["prop2"]] <- prop2 <- length(obs[obs < ll.me["2"] | ul.me["2"] < obs]) / no.obs.raw
  ret[["prop3"]] <- prop3 <- length(obs[obs < ll.me["3"] | ul.me["3"] < obs]) / no.obs.raw
  ret[["prop4"]] <- prop4 <- length(obs[obs < ll.me["4"] | ul.me["4"] < obs]) / no.obs.raw
  ret[["prop5"]] <- prop5 <- length(obs[obs < ll.me["5"] | ul.me["5"] < obs]) / no.obs.raw
  # ... choose k in Q1 / Q3 +- k * IQR such that no more than 1% of the data are excluded
  v <- c(prop2, prop3, prop4, prop5) < frac.outl

  if (any(v)) {
    excl.crit <- min(which(v))
    ret[["obs.cl"]] <- obs[!(obs < ll.me[excl.crit] | ul.me[excl.crit] < obs)]
    ret[["used"]] <- paste(2 : 5, "iqr", sep = "")[excl.crit]
  } else {
    ret[["obs.cl"]] <- obs[!(obs < ll.me[4] | ul.me[4] < obs)]
    ret[["used"]] <- "limited to 5 * iqr"
  }
  ret[["valid"]] <- TRUE
  return(ret)
}


# Read CpGo/e ratios from file
# warn: issue warning if necessary
read.CpGoe <- function(fname, warn) {
	# read input file line by line, split by whitespaces, assign last substring to CpGo/e ratios
	# ... remove comments and trailing whitespaces
	print(fname)
	v <- read.table(fname,  fill = TRUE, col.names = c("seq", "val"))
	obs <- v$val

	obs <- obs[!is.na(obs)]
	return(obs)
}


# process command line arguments
# expected arguments:
# - names of the species (each as a separate argument)
# - names of CpGo/e files of the species (each as a separate argument)
# ... parse arguments
option_list <- list(make_option(c("-o", "--frac-outl"), type = "double", default = 0.01,
                                help = "maximum fraction of CpGo/e ratios excluded as outliers [default %default]"),
                    make_option(c("-d", "--min-dist"), type = "double", default = 0.2,
                                help = "minimum distance between modes, modes that are closer are joined [default %default]"),
                    make_option(c("-c", "--conf-level"), type = "double", default = 0.95,
                                help = "level of the confidence intervals of the mode positions [default %default]"),
                    make_option(c("-m", "--mode-mass"), type = "double", default = 0.05,
                                help = "minimum probability mass of a mode [default %default]"),
                    make_option(c("-b", "--band-width"), type = "double", default = 1.06,
                                help = "bandwidth constant for kernels [default %default]"),
                    make_option(c("-B", "--bootstrap"), action="store_true", default = FALSE,
                                help = "calculate confidence intervals of mode positions using bootstrap [default %default]"),
                    make_option(c("-r", "--bootstrap-reps"), type = "integer", default = 1500,
                                help = "number of bootstrap repetitions [default %default]"),
                    make_option(c("-H", "--outlier-hist-file"), type = "character", default = "outliers_hist.pdf",
                                help = "name of the output file for the outlier histograms [default %default]"),
                    make_option(c("-C", "--cutoff-file"), type = "character", default = "outliers_cutoff.csv",
                                help = "name of the output file for the outlier cutoff [default %default]"),
                    make_option(c("-k", "--kde-file"), type = "character", default = "KDE.pdf",
                                help = "name of the output file for the KDE [default %default]"),
                    make_option(c("-v", "--valley-file"), type = "character", default = "valleys.csv",
                                help = "name of the output file with the values for valleys of the KDE [default %default]"),
                    make_option(c("-p", "--peak-file"), type = "character", default = "modes_basic_stats.csv",
                                help = "name of the output file describing the peaks of the KDE [default %default]"),
                    make_option(c("-s", "--bootstrap-file"), type = "character", default = "modes_bootstrap.csv",
                                help = "name of the output file for the bootstrap results [default %default]"),
                    make_option(c("-u", "--summary-file"), type = "character", default = "summary.csv",
                                help = "name of the summary file for the KDE results [default %default]"),
                    make_option(c("-f", "--no-warning-few-seqs"), action = "store_true", default = FALSE,
                                help = paste("suppress warning in case the input file only contains few values ",
                                             "[default %default]", sep = "")))

op <- OptionParser(usage = "notos.r [options] spc_name_1 ... spc_name_N CpGoe_file_name_1 ... CpGoe_file_name_N",
                   description = paste("\nDescription: Notos generates a histogram and a kernel density estimator from files containing CpGo/e ratios. ",
                                       "Moreover, it determines the number of modes of the CpGo/e ratio for each input file. The input files ",
                                       "can either be composed of \n",
                                       "1) CpGo/e ratios separated by linebreaks or\n",
                                       "2) sequence names and CpGo/e ratios with each sequence name put on a separate line together with its CpGo/e ratio ",
                                       "and sequence and CpGo/e being separated by whitespaces on each line.", sep = ""),
                   option_list = option_list)
args <- parse_args(op, positional_arguments = c(2, Inf))
num.args <- length(args$args)
use.bstrp <- args$options$`bootstrap`
supp.warn.few <- args$options$`no-warning-few-seqs`


# ... check number of arguments
# ... ... check number of mandatory arguments
if (num.args < 2) {
   stop("One species name and one file containing CpGo/e ratios have to be provided")
}

# ... ... check whether number of mandatory arguments is even
if (num.args %% 2 != 0) {
   stop("Number of arguments has to be even")
}

# ... ... check maximum fraction of CpGo/e ratios excluded as outliers
frac.outl <- args$options$`frac-outl`
if ((frac.outl <= 0) || (frac.outl >= 1)) {
   stop("The maximum fraction of CpGo/e ratios excluded as outliers has to be greater than zero and less than one")
}
if (frac.outl >= 0.2) {
   warning("The maximum fraction of CpGo/e ratios excluded as outliers has been set to a rather large value, resulting in the removal of many CpGo/e ratios")
}


# ... check numerical arguments
# ... ... check minimum distance between modes
min.dist <- args$options$`min-dist`
if (min.dist < 0) {
   stop("The minimum distance between modes has to be equal to or larger than zero")
}
if (min.dist >= 0.4) {
   warning("The minimum distance between modes has been set to a rather large value, resulting in a strong reduction of the number of modes")
}

# ... ... check confidence level
conf.lev <- args$options$`conf-level`
if ((conf.lev <= 0) || (conf.lev >= 1)) {
   stop("The level of the confidence intervals of the mode positions has to be larger than zero and smaller than one.")
}
if (conf.lev >= 0.995) {
   warning("The level of the confidence intervals of the mode positions has been set to a rather high value, resulting in very broad confidence intervals")
}

# ... ... check minimum probability mass of a mode
mode.mass <- args$options$`mode-mass`
if ((mode.mass < 0) || (mode.mass >= 1)) {
   stop("The minimum probability mass of a mode has to be larger than or equal to zero and smaller than one.")
}
if (mode.mass >= 0.3) {
   warning("The minimum probability mass of a mode has been set to a rather large value, resulting in the elemination of a high number of modes.")
}

# ... ... check bandwidth constant
band.width <- args$options$`band-width`
if (band.width <= 0) {
   stop("The bandwidth constant has to be positive")
}
if (band.width >= 5) {
   warning("The bandwidth constant has to been set to a rather large value, resulting in a strong smoothing")
}

# ... ... check number of boostrap repetitions
bstrp.reps <- args$options$`bootstrap-reps`
if (bstrp.reps != round(bstrp.reps)) {
  stop("The number of boostrap repetitions has to be a positive integer")
}
if (bstrp.reps <= 0) {
   stop("The number of boostrap repetitions has to be positive")
}
if (bstrp.reps >= 10000) {
   warning("The number of boostrap repetitions has been set to a rather large value, resulting in a long running time")
}

# ... check file name arguments
# ... ... check histogram output file name
outlier.hist.fname <- args$options$`outlier-hist-file`
if ( file.exists(outlier.hist.fname) && (file.info(outlier.hist.fname)$isdir) ) {
  stop(paste("File name for the outlier histogram output refers to a directory:", outlier.hist.fname))
}
v <- strsplit(outlier.hist.fname, split = ".", fixed = TRUE)[[1]]
if ((length(v) == 1) || (v[ length(v) ] != "pdf")) {
  warning(paste("File name for the outlier histogram output does not have a .pdf extension:", outlier.hist.fname))
}
g <- gregexpr(pattern ='/', outlier.hist.fname)[[1]]
if (as.vector(g)[1] != -1) {
  v <- as.vector(g)
  d <- substr(outlier.hist.fname, 1, v[length(v)])
  if (!file.exists(d)) {
    stop(paste("Path to file for the outlier histogram output is not valid:", outlier.hist.fname))
  }
}

# ... ... check outlier cutoff output file name
cutoff.fname <- args$options$`cutoff-file`
if ( file.exists(cutoff.fname) && (file.info(cutoff.fname)$isdir) ) {
  stop(paste("File name for the outlier cutoff table output refers to a directory:", cutoff.fname))
}
v <- strsplit(cutoff.fname, split = ".", fixed = TRUE)[[1]]
if (length(v) == 1) {
  stop(paste("File name for the outlier cutoff table output does not have a file extension:", cutoff.fname))
}
#if (v[ length(v) ] != "xlsx") {
#  warning(paste("File name for the outlier cutoff table output does not have a .xlsx extension:", cutoff.fname))
#}
g <- gregexpr(pattern ='/', cutoff.fname)[[1]]
if (as.vector(g)[1] != -1) {
  v <- as.vector(g)
  d <- substr(cutoff.fname, 1, v[length(v)])
  if (!file.exists(d)) {
    stop(paste("Path to file for the outlier cutoff is not valid:", cutoff.fname))
  }
}

# ... ... check KDE output file name
kde.fname <- args$options$`kde-file`
if ( file.exists(kde.fname) && (file.info(kde.fname)$isdir) ) {
  stop(paste("File name for the KDE output refers to a directory:", kde.fname))
}
v <- strsplit(kde.fname, split = ".", fixed = TRUE)[[1]]
if ((length(v) == 1) || (v[ length(v) ] != "pdf")) {
  warning(paste("File name for the KDE output does not have a .pdf extension:", kde.fname))
}
g <- gregexpr(pattern ='/', kde.fname)[[1]]
if (as.vector(g)[1] != -1) {
  v <- as.vector(g)
  d <- substr(kde.fname, 1, v[length(v)])
  if (!file.exists(d)) {
    stop(paste("Path to file for the KDE output is not valid:", kde.fname))
  }
}


# ... ... check peak descriptives output file name
peak.fname <- args$options$`peak-file`
if ( file.exists(peak.fname) && (file.info(peak.fname)$isdir) ) {
  stop(paste("File name for the peak descriptives refers to a directory:", peak.fname))
}
v <- strsplit(peak.fname, split = ".", fixed = TRUE)[[1]]
if ((length(v) == 1) || (v[ length(v) ] != "csv")) {
  warning(paste("File name for the peak descriptives does not have a .csv extension:", peak.fname))
}
g <- gregexpr(pattern ='/', peak.fname)[[1]]
if (as.vector(g)[1] != -1) {
  v <- as.vector(g)
  d <- substr(peak.fname, 1, v[length(v)])
  if (!file.exists(d)) {
    stop(paste("Path to file for the peak descriptives is not valid:", peak.fname))
  }
}

# ... ... check bootstrap results output file name
bstrp.fname <- args$options$`bootstrap-file`
if ( file.exists(bstrp.fname) && (file.info(bstrp.fname)$isdir) ) {
  stop(paste("File name for the bootstrap results refers to a directory:", bstrp.fname))
}
v <- strsplit(bstrp.fname, split = ".", fixed = TRUE)[[1]]
if ((length(v) == 1) || (v[ length(v) ] != "csv")) {
  warning(paste("File name for the bootstrap results does not have a .csv extension:", bstrp.fname))
}
g <- gregexpr(pattern ='/', bstrp.fname)[[1]]
if (as.vector(g)[1] != -1) {
  v <- as.vector(g)
  d <- substr(bstrp.fname, 1, v[length(v)])
  if (!file.exists(d)) {
    stop(paste("Path to file for the bootstrap results is not valid:", bstrp.fname))
  }
}

# ... ... check summary results output file name
summ.fname <- args$options$`summary-file`
if ( file.exists(summ.fname) && (file.info(summ.fname)$isdir) ) {
  stop(paste("File name for the bootstrap results refers to a directory:", summ.fname))
}
v <- strsplit(summ.fname, split = ".", fixed = TRUE)[[1]]
if ((length(v) == 1) || (v[ length(v) ] != "csv")) {
  warning(paste("File name for the bootstrap results does not have a .csv extension:", summ.fname))
}
g <- gregexpr(pattern ='/', summ.fname)[[1]]
if (as.vector(g)[1] != -1) {
  v <- as.vector(g)
  d <- substr(summ.fname, 1, v[length(v)])
  if (!file.exists(d)) {
    stop(paste("Path to file for the bootstrap results is not valid:", summ.fname))
  }
}


# ... ... check CpGo/e input file names
num.spec <- num.args / 2
spec.names <- args$args[1:num.spec]
cpgoe.fnames <- args$args[(num.spec + 1):num.args]
for (i in 1:length(cpgoe.fnames)) {
  if (!file.exists(cpgoe.fnames[i])) {
    stop(paste("CpGo/e file does not exist:", cpgoe.fnames[i]))
  }
  if (file.info(cpgoe.fnames[i])$isdir) {
    stop(paste("CpGo/e file name refers to a directory:", cpgoe.fnames[i]))
  }
}

valleys.fname <- args$options$`valley-file`	

# remove outliers and output histograms
# ... set up table with cutoff quantities
tab.des <- data.frame(matrix(NA, nrow = num.spec, ncol = 6))
names(tab.des) <- c("prop.zero", "prop.out.2iqr", "prop.out.3iqr",
                    "prop.out.4iqr", "prop.out.5iqr", "used")
rownames(tab.des) <- spec.names

# ... set up figure
t.height <- 6
t.width <- 20
pdf(outlier.hist.fname, height = t.height,width = t.width, paper = "special")
par(mfrow = c(1, 3), mgp = c(2, 0.5, 0), mar = c(4.0, 3.0, 1.5, 1))
tmp.fnames <- c()

# ... iterate through species
for (i in 1:num.spec) {
  fname <- cpgoe.fnames[i]
  obs <- read.CpGoe(fname, TRUE)


  # check CpGo/e ratios
  for (j in 1:length(obs)) {
    # is format legal?
    val <- as.numeric( obs[j] )
    err.str <- paste("Observation", i, "in", fname)
    if (!is.finite(val)) {
      stop(paste(err.str, "could not be converted to a number:", obs[j]))
    }

    # is ratio too small / large?
    if (val < 0) {
      stop(paste(err.str, "is negative:", val))
    } else {
      if (val > MAX.CPGOE) {
        warning(paste(err.str   , "is suspiciously large:", val, "\nthis value is replaced by", MAX.CPGOE))
      }
    }
  }

  # process outliers and store the results
  obs.org <- obs
  l <- proc.outliers(obs, frac.outl)
  if (!l[["valid"]]) {
    stop( paste("Too few values in", fname, "(less than 3) after removal of zeros"), call. = FALSE )
  }
  tab.des[i, "prop.zero"] <- l[["prop.zero"]]
  mu.obs <- l[["mu.obs"]]
  me.obs <- l[["me.obs"]]
  ul.mu <- l[["ul.mu"]]
  ll.mu <- l[["ll.mu"]]
  ul.me <- l[["ul.me"]]
  ll.me <- l[["ll.me"]]
  tab.des[i, "prop.out.2iqr"] <- l[["prop2"]]
  tab.des[i, "prop.out.3iqr"] <- l[["prop3"]]
  tab.des[i, "prop.out.4iqr"] <- l[["prop4"]]
  tab.des[i, "prop.out.5iqr"] <- l[["prop5"]]
  obs.cl <- l[["obs.cl"]]
  obs.nz <- l[["obs.nz"]]
  tab.des[i, "used"] <- l[["used"]]
  tab.des[i, "no.obs.raw"] <- length(obs.org)
  tab.des[i, "no.obs.nozero"] <- length(obs.nz)
  tab.des[i, "no.obs.clean"] <- length(obs.cl)
  usedindex <- substr(l[["used"]],1,1)
  # Histograms
  # ... histogram 1: original data with zeros
  t.breaks <- seq(0, max(obs.org) + 1, by = 0.03)
  t.xlim <- c(0, ul.me["5"] + 0.1)
  hist(obs.org, breaks = t.breaks, xlim = t.xlim, xlab = "CpG o/e", main = "",
      sub = "Original data", prob = TRUE,
	  col = grey(0.9), border = grey(0.6))
  mtext(paste(spec.names[i]), side = 3, adj = 0)


  # ... histogram 3: median / iqr based
  t.lty <- rep(3, 4)
  t.lty[usedindex] <- 1

  hist(obs.nz, breaks = t.breaks, xlim = t.xlim, xlab = "CpG o/e", main = "",
      sub = "Data without zeros, Q1/3 +- k*IQR, k=2,...,5", prob = TRUE,
	  col = grey(0.9), border = grey(0.6))
  abline(v = me.obs, col = 'blue', lwd = 2)
  abline(v = c(ll.me, ul.me), col = "red", lty = rep(t.lty, 2))

  # ... histogram 4: cleaned data
  hist(obs.cl, breaks = t.breaks, xlim = t.xlim, xlab = "CpG o/e", main = "",
      sub = "Cleaned data", prob = TRUE,
	  col = grey(0.9), border = grey(0.6))
  abline(v = me.obs, col = 'blue', lwd = 2)
  abline(v = c(ll.me[usedindex], ul.me[usedindex]), col = "red")
}
invisible(dev.off())

# output cutoff quantities
write.table(tab.des, file = cutoff.fname, sep = "\t", col.names=NA)

# plot KDE and output quantities characterizing the peaks and the bootstrap results
# ... table with quantities characterizing the peaks
v <- col.names.peaks()
tab1.m <- data.frame(matrix(NA, nrow = num.spec, ncol = length(v)))
names(tab1.m) <- col.names.peaks()
rownames(tab1.m) <- spec.names

# ... table for the bootstrap
tab2.m <- data.frame(matrix(NA, nrow = num.spec, ncol = 7))
names(tab2.m) <- col.names.bs()
rownames(tab2.m) <- spec.names

# summary table
sum1.m <-  data.frame(matrix(NA, nrow = num.spec, ncol = 13)) 
names(sum1.m) <- c("Modes", "Skewness", "Variance", "Modes too close", "Peak1", "Peak2", "Peak3", "Peak4", "Peak5", "Peak6", "Peak7", "Peak8", "Peak9")
rownames(sum1.m) <- spec.names

# ... plotting
t.height <- 6
t.width <- 20
pdf(kde.fname, height = t.height,width = t.width, paper = "special")
for (i in 1:num.spec) {
  # read in GcGo/e ratios
  obs <- read.CpGoe(cpgoe.fnames[i], FALSE)
  l <- proc.outliers(obs, frac.outl)
  obs.cl <- l[["obs.cl"]]

  # check number of values
  fname <- cpgoe.fnames[i]
  if (length(obs.cl) < 3) {
    stop( paste("Too few values in", fname, "(less than 3) after removal of outliers and zeros"), call. = FALSE )
  }
  if (!supp.warn.few & length(obs.cl) < 250) {
    warning( paste(fname, " contains only few values (", length(obs.cl), ") after removal of outliers and zeros, which may lead to unreliable results", sep = ""), call. = FALSE )
  }

  # plotting
  l <- plot.KDE(obs.cl, t.name = spec.names[i], bs.cis = use.bstrp, bstrp.reps = bstrp.reps, conf.lev = conf.lev,
                min.dist = min.dist, mode.mass = mode.mass, band.width = band.width)
  tab1.m[i, ] <- l$tab.des
  sum1.m[i, ] <- l$tab.des[c(1, 4, 33, 30, 10+(2*0:8))]
  if (use.bstrp) {
    tab2.m[i, ] <- l$tab.bs
  }
  valleys = l$valleys
}
invisible(dev.off())
#sessionInfo()

# ... output quantities in tables
write.table(sum1.m, file = summ.fname, sep = "\t", col.names = NA)
write.table(tab1.m, file = peak.fname, sep = "\t", col.names=NA)
write.table(valleys, file = valleys.fname, sep = "\t", col.names=NA)
if (use.bstrp) {
    write.table(tab2.m, file = bstrp.fname, sep = "\t", col.names=NA)
}