changeset 0:185ba61836ab draft
planemo upload for repository https://github.com/igg-molecular-biology-lab/pipe-t.git commit 04049039da97e1c9a8048e732afca48f2741cadf
author   | davidecangelosi
date     | Thu, 02 May 2019 04:49:04 -0400
parents  |
children | ecd0a79e8130
files    | LICENSE README.md Symlink/.data pipe-t.R pipe-t.xml tool_dependencies.xml
diffstat | 5 files changed, 959 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE	Thu May 02 04:49:04 2019 -0400
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 molecuar-biology-lab-igg
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Thu May 02 04:49:04 2019 -0400
@@ -0,0 +1,2 @@
+# PIPE-T
+A Galaxy Workflow for processing and analyzing miR expression profiles by RTqPCR
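For orientation before the full script in the next hunk: pipe-t.R chains together a handful of HTqPCR steps (read the Ct files, categorize unreliable values, filter them out, normalize, filter transcripts, impute, test for differential expression). A minimal sketch of that chain, using the HTqPCR functions the script builds on but with hypothetical file names, thresholds and group labels, could look like this:

library(HTqPCR)

# Hypothetical inputs: one Ct file per sample plus a Treatment label per sample
ct_files <- c("patient1.txt", "patient2.txt", "patient3.txt", "patient4.txt")
groups   <- factor(c("Case", "Case", "Control", "Control"))

raw    <- readCtData(files = ct_files, path = "Symlink", format = "plain", n.features = 384)
marked <- setCategory(raw, Ct.min = 14, Ct.max = 32, replicates = FALSE,
                      flag = TRUE, quantile = NULL)             # mark Unreliable/Undetermined values
flt    <- filterCategory(marked)                                # turn the marked values into NA
nrm    <- normalizeCtData(flt, norm = "deltaCt",
                          deltaCt.genes = "U6 snRNA-001973")    # or "quantile", "norm.rankinvariant", ...
kept   <- filterCtData(nrm, remove.category = c("Undetermined", "Unreliable"),
                       n.category = 2)                          # drop transcripts with too many bad values
# (pipe-t.R additionally imputes remaining NAs, e.g. with impute::impute.knn, before testing)
res    <- ttestCtData(kept, groups = groups, p.adjust = "BH")   # per-transcript t-test + fold change
head(res)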
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pipe-t.R Thu May 02 04:49:04 2019 -0400 @@ -0,0 +1,699 @@ + +cat("\n Started! \n") +#sessionInfo() + + +# Send R errors to stderr +options(show.error.messages = F, error = function(){cat(geterrmessage(), file = stderr()); q("no", 1, F)}) + +# Avoid crashing Galaxy with an UTF8 error on German LC settings +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +suppressPackageStartupMessages({ +library(HTqPCR) +library(base) +library(Biobase) +library(utils) +library(stats) +library(graphics) +library(grDevices) +library(RColorBrewer) +library(limma) +library(RankProd) +library(methods) +library(impute) +library(BBmisc) +library(affy) +library(psych) +#library(gmp) +library(zoo) +library(nondetects) +library(Hmisc) +#library(missForest) +#library(mice) +}) + +cat("\n R libraries...loaded!\n") + +args = commandArgs(trailingOnly=TRUE) +dpfiles<-basename(args[1]) +path000<-dirname(args[1]) +format<-args[2] +nfeatures<-args[3] +rawout<-args[4] +path <- args[5] +dcCtmin<-args[6] +dcCtmax<-args[7] +dcflag<-args[8] +x<-args[9] +normalizationMethod<-args[10] +if (normalizationMethod=="deltaCt") { + normalizers<-args[11] + outputNorm<-args[12] + outputECDF<-args[13] + percentofnastoremove<-args[14] + outputRemaining<-args[15] + imputeMethod<-args[16] + if (imputeMethod=="knn") { + kappa<- args[17] + macsp<-args[18] + outputIMP<-args[19] + + DEAMethod<-args[20] + if (DEAMethod=="ttest") { + alternative<- args[21] + paired<-args[22] + replicates<- args[23] + sort<-args[24] + stringent<- args[25] + padjust<-args[26] + outputDEA<-args[27] + filtnames<-args[28] + } else { + outputDEA<-args[21] + filtnames<-args[22] + } + } else { + #mean, median, nondetects, cubic + outputIMP<-args[17] + DEAMethod<-args[18] + if (DEAMethod=="ttest") { + alternative<- args[19] + paired<-args[20] + replicates<- args[21] + sort<-args[22] + stringent<- args[23] + padjust<-args[24] + outputDEA<-args[25] + filtnames<-args[26] + } else { + outputDEA<-args[19] + filtnames<-args[20] + } + } + }else { + outputNorm<-args[11] + outputECDF<-args[12] + percentofnastoremove<-args[13] + outputRemaining<-args[14] + imputeMethod<-args[15] + + if (imputeMethod=="knn") { + kappa<- args[16] + macsp<-args[17] + outputIMP<-args[18] + + DEAMethod<-args[19] + + if (DEAMethod=="ttest") { + alternative<- args[20] + paired<-args[21] + replicates<- args[22] + sort<-args[23] + stringent<- args[24] + padjust<-args[25] + outputDEA<-args[26] + filtnames<-args[27] + } else { + outputDEA<-args[20] + filtnames<-args[21] + } + } else { + #mean, median, nondetects, cubic + outputIMP<-args[16] + DEAMethod<-args[17] + if (DEAMethod=="ttest") { + alternative<- args[18] + paired<-args[19] + replicates<- args[20] + sort<-args[21] + stringent<- args[22] + padjust<-args[23] + outputDEA<-args[24] + filtnames<-args[25] + + } else { + outputDEA<-args[18] + filtnames<-args[19] + + } + } + } +cat("\n Initialization completed! \n") + +readCtDataDav<- +function (files, path = NULL, n.features = 384, format = "plain", + column.info, flag, feature, type, position, Ct, header = FALSE, + SDS = FALSE, n.data = 1, samples, na.value = 40, sample.info, + ...) 
+{ + if (missing(files)) + stop("No input files specified") + if (length(files) != length(n.data) & length(n.data) != 1) + stop("n.data must either be a single integer, or same length as number of files") + if (length(n.data) == 1) + n.data <- rep(n.data, length(files)) + nsamples <- sum(n.data) + ncum <- cumsum(n.data) + s.names <- NULL + nspots <- n.features + if (SDS) { + warning("Please use format='SDS'. The SDS' parameter is retained for backward compatibility only.") + format <- "SDS" + } + if (!missing(flag) | !missing(feature) | !missing(type) | + !missing(position) | !missing(Ct)) { + warning("Please use 'column.info' for providing a list of column numbers containing particular information. The use of 'flag', 'feature', 'type', 'position' and 'Ct' is deprecated and will be removed in future versions.") + } + if (missing(column.info)) { + column.info <- switch(format, EDS = list(flag="EXPFAIL", feature="Target.Name", position="Well.Position", Ct="CT"), + plain = list(flag = 4, + feature = 6, type = 7, position = 3, Ct = 8), SDS = list(flag = 4, + feature = 6, type = 7, position = 3, Ct = 8), LightCycler = list(feature = "Name", + position = "Pos", Ct = "Cp"), CFX = list(feature = "Content", + position = "Well", Ct = "Cq.Mean"), OpenArray = list(flag = "ThroughHole.Outlier", + feature = "Assay.Assay.ID", type = "Assay.Assay.Type", + position = "ThroughHole.Address", Ct = "ThroughHole.Ct"), + BioMark = list(flag = "Call", feature = "Name.1", + position = "ID", Ct = "Value")) + } + X <- matrix(0, nspots, nsamples) + X.flags <- as.data.frame(X) + X.cat <- data.frame(matrix("OK", ncol = nsamples, nrow = nspots), + stringsAsFactors = FALSE) + for (i in seq_along(files)) { + if (i == 1) { + cols <- 1:ncum[i] + } + else { + cols <- (ncum[i - 1] + 1):ncum[i] + } + readfile <- ifelse(is.null(path), files[i], file.path(path, + files[i])) + sample <- switch(format, EDS =.readCtEDS(readfile = readfile, + n.data = n.data, i = i, nspots = nspots, ...), plain = .readCtPlain(readfile = readfile, + header = header, n.features = n.features, n.data = n.data, + i = i, ...), SDS = .readCtSDS(readfile = readfile, + n.data = n.data, i = i, nspots = nspots, ...), LightCycler = .readCtLightCycler(readfile = readfile, + n.data = n.data, i = i, nspots = nspots, ...), CFX = .readCtCFX(readfile = readfile, + n.data = n.data, i = i, nspots = nspots, ...), OpenArray = .readCtOpenArray(readfile = readfile, + n.data = n.data, i = i, nspots = nspots, ...), BioMark = .readCtBioMark(readfile = readfile, + n.data = n.data, i = i, nspots = nspots, ...)) + + data <- matrix(sample[, column.info[["Ct"]]], ncol = n.data[i]) + undeter <- apply(data, 2, function(x) x %in% c("Undetermined", + "No Ct")) + X.cat[, cols][undeter] <- "Undetermined" + nas <- c("Undetermined", "No Ct", "999", "N/A") + if (is.null(na.value)) { + data[data %in% nas | data == 0] <- NA + } + else { + data[data %in% nas | is.na(data) | data == 0] <- na.value + } + X[, cols] <- apply(data, 2, function(x) as.numeric(as.character(x))) + if ("flag" %in% names(column.info)) { + flags <- matrix(sample[, column.info[["flag"]]], + ncol = n.data[i]) + flags[flags == "-"] <- "Failed" + flags[flags == "+"] <- "Passed" + X.flags[, cols] <- flags + } + else { + X.flags[, cols] <- "Passed" + } + if (format == "OpenArray") { + s.names <- c(s.names, unique(sample$SampleInfo.SampleID)) + } + else if (format %in% c("EDS","plain", "SDS")) { + s.names <- c(s.names, unique(sample[, 2])) + } + else { + s.names <- s.names + } + if (i == 1) { + featPos <- paste("feature", 
1:nspots, sep = "") + if ("position" %in% names(column.info)) + featPos <- as.character(sample[1:nspots, column.info[["position"]]]) + featType <- factor(rep("Target", nspots)) + if ("type" %in% names(column.info)) + featType <- sample[1:nspots, column.info[["type"]]] + featName <- paste("feature", 1:nspots, sep = "") + if ("feature" %in% names(column.info)) + featName <- as.character(sample[1:nspots, column.info[["feature"]]]) + df <- data.frame(featureNames = featName, featureType = as.factor(featType), + featurePos = featPos) + metaData <- data.frame(labelDescription = c("Name of the qPCR feature (gene)", + "Type pf feature", "Position on assay")) + featData <- AnnotatedDataFrame(data = df, varMetadata = metaData) + } + } + if (!missing(samples)) { + if (length(samples) < nsamples) { + warning("Not enough sample names provided; using Sample1, Sample2, ... instead\n") + samples <- paste("Sample", 1:nsamples, sep = "") + } + else if (length(samples) == nsamples) { + samples <- samples + } + } + else if (missing(samples)) { + if (length(files) == nsamples) { + samples <- gsub("(.+)\\..+", "\\1", files) + } + else if (length(s.names) == nsamples) { + samples <- s.names + } + else { + samples <- paste("Sample", 1:nsamples, sep = "") + } + } + samples <- make.unique(samples) + if (any(is.na(X))) + warning("One or more samples contain NAs. Consider replacing these with e.g. Ct=40 now.") + if (missing(sample.info)) { + pdata <- data.frame(sample = 1:length(samples), row.names = samples) + sample.info <- new("AnnotatedDataFrame", data = pdata, + varMetadata = data.frame(labelDescription = "Sample numbering", + row.names = "Sample names")) + } + X.hist <- data.frame(history = capture.output(match.call(readCtData)), + stringsAsFactors = FALSE) + out <- new("qPCRset", exprs = X, phenoData = sample.info, + featureData = featData, featureCategory = X.cat, flag = X.flags, + CtHistory = X.hist) + out +} +.readCtEDS <- +function(readfile=readfile, n.data=n.data, i=i, nspots=nspots, ...) +{ + # Scan through beginning of file, max 100 lines + file.header <- readLines(con=readfile, n=100) + n.header <- grep("^Well", file.header) + if (length(n.header)==0) + n.header <- 0 + # Read data, skip the required lines + out <- read.delim(file=readfile, header=TRUE, colClasses="character", nrows=nspots*n.data[i], skip=n.header-1, strip.white=TRUE, ...) 
+ out +} # .readCtEDS + +head(read.delim(file.path(path000, dpfiles), sep="\t")) +files <- read.delim(file.path(path000, dpfiles), sep="\t") +switch(format, + "EDS"={ + columns<- list(flag="EXPFAIL", feature="Target.Name", position="Well.Position", Ct="CT") + metadata <- data.frame(labelDescription = c("sampleName", "Treatment"), row.names = c("sampleName", "Treatment")) + phenoData <- new("AnnotatedDataFrame", data = files, varMetadata = metadata) + rownames(phenoData)=as.vector(files$sampleName) + raw<- readCtDataDav(files = as.vector(files$sampleName), header=TRUE, format="EDS", column.info=columns, path = path,sample.info=phenoData,n.features = as.numeric(nfeatures)) + }, + "plain"={ + metadata <- data.frame(labelDescription = c("sampleName", "Treatment"), row.names = c("sampleName", "Treatment")) + phenoData <- new("AnnotatedDataFrame", data = files, varMetadata = metadata) + rownames(phenoData)=as.vector(files$sampleName) + raw<- readCtDataDav(files = as.vector(files$sampleName), header=FALSE, format="plain", path = path, sample.info=phenoData,n.features = as.numeric(nfeatures)) + }, + "SDS"={ + columns<- list(feature=3, Ct=6, flag=11) + metadata <- data.frame(labelDescription = c("sampleName", "Treatment"), row.names = c("sampleName", "Treatment")) + phenoData <- new("AnnotatedDataFrame", data = files, varMetadata = metadata) + rownames(phenoData)=as.vector(files$sampleName) + raw<- readCtDataDav(files = files$sampleName, format="SDS",column.info=columns, path = path, sample.info=phenoData, n.features=as.numeric(nfeatures)) + }, + "LightCycler"={ + metadata <- data.frame(labelDescription = c("sampleName", "Treatment"), row.names = c("sampleName", "Treatment")) + phenoData <- new("AnnotatedDataFrame", data = files, varMetadata = metadata) + rownames(phenoData)=as.vector(files$sampleName) + raw <- readCtDataDav(files = files$sampleName, path = path, format = "LightCycler", sample.info=phenoData,n.features = as.numeric(nfeatures)) + }, + "CFX"={ + metadata <- data.frame(labelDescription = c("sampleName", "Treatment"), row.names = c("sampleName", "Treatment")) + phenoData <- new("AnnotatedDataFrame", data = files, varMetadata = metadata) + rownames(phenoData)=as.vector(files$sampleName) + raw <- readCtDataDav(files = files$sampleName, path = path, format = "CFX", sample.info=phenoData,n.features = as.numeric(nfeatures)) + }, + "OpenArray"={ + metadata <- data.frame(labelDescription = c("sampleName", "Treatment"), row.names = c("sampleName", "Treatment")) + phenoData <- new("AnnotatedDataFrame", data = files, varMetadata = metadata) + rownames(phenoData)=as.vector(files$sampleName) + raw <- readCtDataDav(files = files$sampleName, path = path, format = "OpenArray", sample.info=phenoData,n.features = as.numeric(nfeatures)) + }, + "BioMark"={ + metadata <- data.frame(labelDescription = c("sampleName", "Treatment"), row.names = c("sampleName", "Treatment")) + phenoData <- new("AnnotatedDataFrame", data = files, varMetadata = metadata) + rownames(phenoData)=as.vector(files$sampleName) + raw <- readCtDataDav(files = files$sampleName, path = path, format = "BioMark", sample.info=phenoData, n.features = as.numeric(nfeatures)) + }, + stop("Enter something that switches me!") +) +cat("\n Files read correctly! 
") + +write.table(exprs(raw), file=rawout, quote=FALSE, row.names=TRUE, col.names=TRUE,sep = "\t") +#################################################################################################################### +#Set a new categories for the values meeting two criterions +switch(format, + "EDS"={ + unreliable<-setCategory(raw, Ct.max=dcCtmax, Ct.min=dcCtmin,replicates=FALSE, flag=dcflag, flag.out="Y", quantile=NULL) + }, + "plain"={ + unreliable<-setCategory(raw, Ct.max=dcCtmax, Ct.min=dcCtmin,replicates=FALSE, flag=dcflag, flag.out="Flagged", quantile=NULL) + }, + "SDS"={ + unreliable<-setCategory(raw, Ct.max=dcCtmax, Ct.min=dcCtmin,replicates=FALSE, flag=dcflag, flag.out="TRUE", quantile=NULL) + }, + "LightCycler"={ + unreliable<-setCategory(raw, Ct.max=dcCtmax, Ct.min=dcCtmin,replicates=FALSE, flag=dcflag, flag.out="Y", quantile=NULL) + }, + "CFX"={ + unreliable<-setCategory(raw, Ct.max=dcCtmax, Ct.min=dcCtmin,replicates=FALSE, flag=dcflag, flag.out="Y", quantile=NULL) + }, + "OpenArray"={ + unreliable<-setCategory(raw, Ct.max=dcCtmax, Ct.min=dcCtmin,replicates=FALSE, flag=dcflag, flag.out="TRUE ", quantile=NULL) + }, + "BioMark"={ + unreliable<-setCategory(raw, Ct.max=dcCtmax, Ct.min=dcCtmin,replicates=FALSE, flag=dcflag, flag.out="Fail", quantile=NULL) + }, + stop("Enter something that switches me!") +) +#unreliable<-setCategory(raw, Ct.max=dcCtmax, Ct.min=dcCtmin,replicates=FALSE, flag=dcflag, flag.out="Y", quantile=NULL) + +#################################################################################################################### +#Filter out the values of the new category +xFilter <- filterCategory(unreliable) + +cat("\n Categorization completed! ") + +png(x, # create PNG for the heat map + width = 10*300, # 5 x 300 pixels + height = 10*300, + res = 300, # 300 pixels per inch + pointsize = 8) + plotCtBoxes(xFilter, stratify=NULL, xlab = "Samples", ylab="Ct", names=as.character(seq(1, ncol(xFilter), 1))) # smaller font size +dev.off() + +#write.table(exprs(xFilter), file=x, quote=FALSE, row.names=TRUE, col.names=TRUE,sep = "\t") + +#################################################################################################################### +#NORMALIZATION +#method version 3.5.1 + Global mean +normalizeCtDataDav <- +function(q, + norm = "deltaCt", + deltaCt.genes = NULL, + scale.rank.samples, + rank.type = "pseudo.median", + Ct.max = dcCtmax, + geo.mean.ref, + verbose = TRUE) +{ + # Extract the data + data <- exprs(q) + data.norm <- data + # Get the normalisation method + method <- match.arg(norm, c("quantile", "scale.rankinvariant", "norm.rankinvariant", "deltaCt", "geometric.mean", "globalmean")) + # Some general stuff that will be used by both rank.invariant methods + if (method %in% c("scale.rankinvariant", "norm.rankinvariant")) { + # Index to use for too high Ct values + #Ct.index <- data>Ct.max + data.Ctmax <- data + Ct.index <- is.na(data) + #data.Ctmax[Ct.index] <- NA + data<-na.spline(data) + # Define what to rank against + if (rank.type=="pseudo.median") { + ref.data <- apply(data.Ctmax, 1, median, na.rm=TRUE) + } else if (rank.type=="pseudo.mean") { + ref.data <- apply(data.Ctmax, 1, mean, na.rm=TRUE) + } + # Mark + replace NA values with something temporary + na.index <- is.na(ref.data) + ref.data[na.index] <- 30 + # Run the rank.invariant function + data.rankinvar <- apply(data, 2, normalize.invariantset, ref=ref.data) + } + # The actual normalisation + switch(method, + quantile = { + # Use an internal limma function + data.norm <- 
normalizeQuantiles(data) + }, + scale.rankinvariant = { + # Get all the rank invariant genes + ri.genes <- sapply(data.rankinvar, "[[", "i.set") + # Remove those with too high Ct values + ri.genes[Ct.index] <- FALSE + # Remove those that were all NA for potentially other reasons + ri.genes[na.index,] <- FALSE + # Select those to use here + ri.count <- rowSums(ri.genes) + if (missing(scale.rank.samples)) + scale.rank.samples <- ncol(data)-1 + ri.index <- ri.count >= scale.rank.samples + if (sum(ri.index)==0) + stop(paste("No rank invariant genes were found across", scale.rank.samples, "samples")) + # Extract the corresponding Ct values; average + ri.mean <- colMeans(data[ri.index,,drop=FALSE]) + ri.scale <- ri.mean/ri.mean[1] + # Correct the data + data.norm <- t(t(data)*ri.scale) + # Print info + if (verbose) { + cat(c("Scaling Ct values\n\tUsing rank invariant genes:", paste(featureNames(q)[ri.index], collapse=" "), "\n")) + cat(c("\tScaling factors:", format(ri.scale, digits=3), "\n")) + } + }, + norm.rankinvariant = { + # Print info + if (verbose) + cat("Normalizing Ct values\n\tUsing rank invariant genes:\n") + # Correct the data based on the calculations above + for (i in 1:ncol(data)) { + # Check if there are any rank invariant genes + ri.sub <- data.rankinvar[[i]] + ri.genes <- ri.sub[["i.set"]] + # Remove those that don't pass the Ct.max criteria + ri.genes[Ct.index[,i]] <- FALSE + # Remove those that are NA for other reasons + ri.genes[na.index] <- FALSE + if (sum(ri.genes)==0) { + warning(paste("\tNo rank invariant genes were found for sample ", sampleNames(q)[i], "; sample not normalized\n", sep="")) + next + } + # If verbose, print some info + if (verbose) + cat(paste("\t", sampleNames(q)[i], ": ", sum(ri.genes), " rank invariant genes\n", sep="")) + # The actual correction + data.norm[,i] <- as.numeric(approx(ri.sub$n.curve$y, ri.sub$n.curve$x, xout=data[,i], rule=2)$y) + } + }, + deltaCt = { + # Which are the reference genes (endogenous controls) + if (is.null(deltaCt.genes)) + deltaCt.genes <- unique(featureNames(q)[featureType(q)=="Endogenous Control"]) + c.index <- featureNames(q) %in% deltaCt.genes + if (verbose) { + cat(c("Calculating deltaCt values\n\tUsing control gene(s):", paste(deltaCt.genes, collapse=" "), "\n")) + } + # Run though all cards; perform internal normalisation + for (c in 1:ncol(data)) { + # Calculate the control genes + refCt <- mean(data[c.index,c], na.rm=TRUE) + refsd <- sd(data[c.index,c], na.rm=TRUE) + # Difference for target and controls + data.norm[,c] <- data[,c]-refCt + # Print results + if (verbose) + cat(paste("\tCard ", c, ":\tMean=", format(refCt, dig=4), "\tStdev=", format(refsd, dig=3), "\n", sep="")) + } + }, + geometric.mean = { + # For each column, calculate the geometric mean of Ct values<Ct.max + #geo.mean <- apply(data, 2, function(x) { + # xx <- log2(subset(x, x<Ct.max)) + # 2^mean(xx)}) + geo.mean <- apply(data, 2, function(x) { + xx <- subset(x, x<=Ct.max) + geometric.mean(xx)}) + # Which sample to scale to + #if (missing(geo.mean.ref)) + # geo.mean.ref <- 1 + # Calculate the scaling factor + #geo.scale <- geo.mean/geo.mean[geo.mean.ref] + # Adjust the data accordingly + data.norm <- t(t(data) - geo.mean) + if (verbose) { + cat(c("Scaling Ct values\n\tUsing geometric mean within each sample\n")) + #cat(c("\tScaling factors:", format(geo.scale, digits=3), "\n")) + } + } # switch + ,globalmean = { + glo <- apply(data, 2, function(x) { + xx <- subset(x, x <= Ct.max) + mean(xx) + }) + data.norm <- t(t(data) - glo) + } + ) + # 
Replace with the normalised Ct exprs + exprs(q) <- data.norm + # Add to the history of the object + if (nrow(getCtHistory(q))==0) + setCtHistory(q) <- data.frame(history="Manually created qPCRset object.", stringsAsFactors=FALSE) + setCtHistory(q) <- rbind(getCtHistory(q), capture.output(match.call(normalizeCtData))) + # Return the normalised object + q +} + + +switch(normalizationMethod, + "deltaCt"={ + normalizedDataset <- normalizeCtDataDav(xFilter, norm="deltaCt", deltaCt.genes =explode(normalizers, sep = ",")) + }, + "quantile"={ + normalizedDataset <- normalizeCtDataDav(xFilter, norm=normalizationMethod) + }, + "scale.rankinvariant"={ + normalizedDataset <- normalizeCtDataDav(xFilter, norm=normalizationMethod) + }, + "norm.rankinvariant"={ + normalizedDataset <- normalizeCtDataDav(xFilter, norm=normalizationMethod) + }, + "geometric.mean"={ + normalizedDataset <- normalizeCtDataDav(xFilter, norm=normalizationMethod) + }, + "globalmean"={ + normalizedDataset <- normalizeCtDataDav(xFilter, norm=normalizationMethod) + }, + stop("Enter something that switches me!") +) + + #if (normalizationMethod=="deltaCt") { +#normalize CT data + +#normalizedDataset <- normalizeCtDataDav(xFilter, norm="deltaCt", deltaCt.genes =explode(normalizers, sep = ",")) +#} else { +#normalizedDataset <- normalizeCtDataDav(xFilter, norm=normalizationMethod) + +#} +cat("\n Data normalized correctly! \n") +write.table(exprs(normalizedDataset), file=outputNorm, quote=FALSE, row.names=TRUE, col.names=TRUE,sep = "\t") + + +#normalizedDataset +#################################################################################################################### +#Check noise reduction by empirical cumulative distribution + +#X = rnorm(100) # X is a sample of 100 normally distributed random variables +# P = ecdf(X) # P is a function giving the empirical CDF of X +#Y = rnorm(1000) # X is a sample of 100 normally distributed random variables +# PY = ecdf(Y) +#plotâ„— + +#lines(PY) +png(outputECDF, # create PNG for the heat map + width = 10*300, # 5 x 300 pixels + height = 10*300, + res = 300, # 300 pixels per inch + pointsize = 8) # smaller font size +vec=c() +for (i in 1:nrow(exprs(xFilter))){ + CVX<-(sd(2^-exprs(xFilter)[i,], na.rm = TRUE)/mean(2^-exprs(xFilter)[i,], na.rm = TRUE))*100 +vec=c(vec, c(CVX)) +} +vec<-na.omit(vec) +P = ecdf(vec) +gm=c() +for (i in 1:nrow(exprs(normalizedDataset))){ + CVGM<-(sd(2^-exprs(normalizedDataset)[i,], na.rm = TRUE)/mean(2^-exprs(normalizedDataset)[i,], na.rm = TRUE))*100 +gm=c(gm, c(CVGM)) +} +gm<-na.omit(gm) + +PY = ecdf(gm) + +plot_colors <- c(rgb(r=0.0,g=0.0,b=0.9), "red", "forestgreen",rgb(r=0.0,g=0.0,b=0.0),rgb(r=0.5,g=0.0,b=0.3),rgb(r=0.0,g=0.4,b=0.4)) + +plot(P,col=plot_colors[1],xlim=c(0.0,600), ylim=c(0.0,1),xaxp = c(0.0, 600, 6),yaxp = c(0.0, 1, 10), cex=1.3, lwd=5, main=NULL,xlab="CV(%)",ylab="Empirical Cumulative Distribution") +lines(PY, lwd=5, col=plot_colors[6],cex=1.3) +legend("bottomright", c("not normalized", "normalized"), cex=1.3, col=c(plot_colors[1],plot_colors[6]), lwd=c(5,5)); +dev.off() + +#Two-sample Kolmogorov-Smirnov +ks.test(vec,gm) + + +#write.table(exprs(qFiltNAs), file=outputIMP, quote=FALSE, row.names=TRUE, col.names=TRUE,sep = "\t") +png(outputIMP, # create PNG for the heat map + width = 10*300, # 5 x 300 pixels + height = 10*300, + res = 300, # 300 pixels per inch + pointsize = 8) + plotCtBoxes(normalizedDataset, stratify=NULL, xlab = "Samples", ylab="DeltaCt", names=as.character(seq(1, ncol(normalizedDataset), 1))) # smaller font size +dev.off() + 
+################################################## Filtering based on number of NAs##################################################
+
+#FILTERING on the basis of NAs
+#qPCRset.R
+setMethod("exprs", signature(object="qPCRset"), definition =
+  function (object) {x <- assayDataElement(object, "exprs"); rownames(x) <- featureNames(object); colnames(x) <- sampleNames(phenoData(object));x}
+)
+ncatn <- as.integer(n.samples(normalizedDataset))*as.integer(percentofnastoremove)/100
+
+qFiltNAs <- filterCtData(normalizedDataset, remove.category=c("Undetermined","Unreliable"), n.category=as.integer(ncatn),remove.name=explode(filtnames, sep = ","))
+
+cat("\n Data filtered correctly! \n")
+
+if (any(is.na(exprs(qFiltNAs)))){
+switch(imputeMethod,
+       "knn"={
+         imp<-impute.knn(exprs(qFiltNAs), k = as.integer(kappa), maxp = as.integer(macsp), rng.seed=362436069)
+         exprs(qFiltNAs)=imp$data
+       },
+       "mestdagh"={
+         #Mestdagh
+         #replace each NA with -1000
+         for (i in 1:nrow(exprs(qFiltNAs))){
+           for(j in 1:ncol(exprs(qFiltNAs))){
+             if(is.na(exprs(qFiltNAs)[i,j])>0)exprs(qFiltNAs)[i,j]<- -1000
+           }
+         }
+         temp<-exprs(qFiltNAs)
+         for (i in 1:nrow(exprs(qFiltNAs))){
+           for(j in 1:ncol(exprs(qFiltNAs))){
+             if(exprs(qFiltNAs)[i,j]<(-100))exprs(qFiltNAs)[i,j]<- max(temp[i,])+1
+           }
+         }
+       },
+       "cubic"={
+         exprs(qFiltNAs) <- na.spline(exprs(qFiltNAs))
+       },
+       "mean"={
+         exprs(qFiltNAs)<-impute(exprs(qFiltNAs),mean)
+       },
+       "median"={
+         exprs(qFiltNAs)<-impute(exprs(qFiltNAs),median)
+       },
+       "nondetects"={
+         qFiltNAs <- qpcrImpute(qFiltNAs, outform=c("Single"),linkglm = c("logit"))
+       },
+       stop("Enter something that switches me!")
+)
+
+cat("\n Imputation completed! \n")
+}else{
+  cat("\n Nothing to impute! \n")
+}
+
+write.table(exprs(qFiltNAs), file=outputRemaining, quote=FALSE, row.names=TRUE, col.names=TRUE, sep = "\t")
+
+if (DEAMethod=="ttest") {
+  #Differential expression analysis (paired t test+BH). Returns fold change in linear scale.
+  DEG<-ttestCtData(qFiltNAs, groups = files$Treatment, alternative = alternative, paired = ifelse(paired=="TRUE", TRUE, FALSE), replicates =ifelse(replicates=="TRUE", TRUE, FALSE), sort=ifelse(sort=="TRUE", TRUE, FALSE), stringent=ifelse(stringent=="TRUE", TRUE, FALSE), p.adjust=padjust)
+  write.table(DEG, file=outputDEA, quote=FALSE, row.names=TRUE, col.names=TRUE,sep = "\t")
+}
+if (DEAMethod=="rp") {
+  DEG<-RP(exprs(qFiltNAs), as.numeric(pData(qFiltNAs)$Treatment)-1, num.perm = 1000,logged = TRUE, gene.names = featureNames(qFiltNAs), huge=TRUE, plot = FALSE, rand = 123)
+  write.table(DEG[1:5], file=outputDEA, quote=FALSE, row.names=TRUE, col.names=TRUE,sep = "\t")
+}
+cat("\n Differential expression analysis completed correctly! \n")
+cat("\n Workflow ended correctly! \n")
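The script reads its whole configuration as one long positional argument vector, and the meaning of each position depends on the normalization, imputation and DEA methods chosen (see the nested if/else blocks near the top of pipe-t.R). As an illustration only, here is how the vector returned by commandArgs(trailingOnly = TRUE) would line up for the deltaCt + knn + ttest branch; every file name and value below is hypothetical:

# Hypothetical argument vector for the deltaCt + knn + ttest branch of the
# positional parsing above (other method combinations shift the indices).
args_example <- c(
  "samplesheet.txt",           #  1 dpfiles: table with sampleName and Treatment columns
  "EDS",                       #  2 format
  "384",                       #  3 nfeatures
  "1_Ct_Raw.txt",              #  4 rawout
  "Symlink",                   #  5 path: directory holding the per-sample Ct files
  "14", "32", "TRUE",          #  6-8 dcCtmin, dcCtmax, dcflag
  "2_boxplot_categorized.png", #  9 x
  "deltaCt",                   # 10 normalizationMethod
  "U6 snRNA-001973",           # 11 normalizers (comma separated)
  "3_normalized.txt",          # 12 outputNorm
  "4_ecdf.png",                # 13 outputECDF
  "20",                        # 14 percentofnastoremove
  "6_imputed.txt",             # 15 outputRemaining (table written after imputation)
  "knn",                       # 16 imputeMethod
  "10", "1500",                # 17-18 kappa (k) and macsp (maxp) for impute.knn
  "5_boxplot_normalized.png",  # 19 outputIMP (boxplot after normalization)
  "ttest",                     # 20 DEAMethod
  "two.sided", "FALSE",        # 21-22 alternative, paired
  "TRUE", "TRUE", "TRUE",      # 23-25 replicates, sort, stringent
  "BH",                        # 26 padjust
  "7_DEA.txt",                 # 27 outputDEA
  "U6 snRNA-001973"            # 28 filtnames (transcripts removed before imputation)
)
# A call such as: Rscript pipe-t.R samplesheet.txt EDS 384 ...
# delivers this vector to the script through commandArgs(trailingOnly = TRUE).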
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pipe-t.xml Thu May 02 04:49:04 2019 -0400 @@ -0,0 +1,234 @@ +<?xml version="1.0"?> +<tool id="pipe-t" name="PIPE-T" version="1.0" hidden="false"> +<description>A tool for analyzing RTqPCR expression data </description> +<requirements> + <requirement type="package" version="3.5.0">r-base</requirement> + <requirement type="package" version="7.2.0">libgcc</requirement> + <requirement type="package" version="1.36.0">bioconductor-htqpcr</requirement> + <requirement type="package" version="3.8.0">bioconductor-rankprod</requirement> + <requirement type="package" version="1.56.0">bioconductor-impute</requirement> +<requirement type="package" version="1.11.0">r-bbmisc</requirement> +<requirement type="package" version="1.8.4">r-psych</requirement> +<requirement type="package" version="1.8_1">r-zoo</requirement> +<requirement type="package" version="2.10.0">bioconductor-nondetects</requirement> +<requirement type="package" version="4.1_1">r-hmisc</requirement> +</requirements> +<stdio> + <exit_code range="1:" /> +</stdio> +<command> + <![CDATA[ + #for $input in $dp.list_files + ln -sf '$input' '$__tool_directory__/Symlink/${input.element_identifier}'; + #end for + + #if str( $dn.condNorm.normMethod ) == "deltaCt": + #if str( $df.condImpute.format ) == "mestdagh" or str( $df.condImpute.format ) == "mean" or str( $df.condImpute.format ) == "median" or str( $df.condImpute.format ) == "nondetects" or str( $df.condImpute.format ) == "cubic": + #if str( $de.condDEA.method ) == "ttest": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod "$dn.condNorm.normalizers" $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $galaxy_output6 $de.condDEA.method $de.condDEA.alternative $de.condDEA.paired $de.condDEA.replicates $de.condDEA.sort $de.condDEA.stringent $de.condDEA.padjust $galaxy_output7 "$df.filtname"; + #end if + #if str( $de.condDEA.method ) == "rp": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod "$dn.condNorm.normalizers" $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $galaxy_output6 $de.condDEA.method $galaxy_output7 "$df.filtname"; + #end if + #if str( $de.condDEA.method ) == "none": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod "$dn.condNorm.normalizers" $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $galaxy_output6 $de.condDEA.method $galaxy_output7 "$df.filtname"; + #end if + #end if + + #if str( $df.condImpute.format ) == "knn": + #if str( $de.condDEA.method ) == "ttest": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod "$dn.condNorm.normalizers" $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $df.condImpute.k $df.condImpute.maxp $galaxy_output6 $de.condDEA.method $de.condDEA.alternative $de.condDEA.paired $de.condDEA.replicates $de.condDEA.sort $de.condDEA.stringent $de.condDEA.padjust $galaxy_output7 "$df.filtname"; + #end if + #if str( $de.condDEA.method ) == "rp": + 
Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod "$dn.condNorm.normalizers" $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $df.condImpute.k $df.condImpute.maxp $galaxy_output6 $de.condDEA.method $galaxy_output7 "$df.filtname"; + #end if + #if str( $de.condDEA.method ) == "none": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod "$dn.condNorm.normalizers" $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $df.condImpute.k $df.condImpute.maxp $galaxy_output6 $de.condDEA.method $galaxy_output7 "$df.filtname"; + #end if + #end if + #else + #if str( $df.condImpute.format ) == "mestdagh" or str( $df.condImpute.format ) == "mean" or str( $df.condImpute.format ) == "median" or str( $df.condImpute.format ) == "nondetects" or str( $df.condImpute.format ) == "cubic": + #if str( $de.condDEA.method ) == "ttest": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $galaxy_output6 $de.condDEA.method $de.condDEA.alternative $de.condDEA.paired $de.condDEA.replicates $de.condDEA.sort $de.condDEA.stringent $de.condDEA.padjust $galaxy_output7 "$df.filtname"; + #end if + #if str( $de.condDEA.method ) == "rp": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $galaxy_output6 $de.condDEA.method $galaxy_output7 "$df.filtname"; + #end if + #if str( $de.condDEA.method ) == "none": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $galaxy_output6 $de.condDEA.method $galaxy_output7 "$df.filtname"; + #end if + #end if + + #if str( $df.condImpute.format ) == "knn": + #if str( $de.condDEA.method ) == "ttest": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $df.condImpute.k $df.condImpute.maxp $galaxy_output6 $de.condDEA.method $de.condDEA.alternative $de.condDEA.paired $de.condDEA.replicates $de.condDEA.sort $de.condDEA.stringent $de.condDEA.padjust $galaxy_output7 "$df.filtname"; + #end if + #if str( $de.condDEA.method ) == "rp": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n $galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $df.condImpute.k $df.condImpute.maxp $galaxy_output6 $de.condDEA.method $galaxy_output7 "$df.filtname"; + #end if + #if str( $de.condDEA.method ) == "none": + Rscript $__tool_directory__/pipe-t.R $dp.files "$dp.formatFile" $dp.n 
$galaxy_output1 "$__tool_directory__/Symlink" $dc.Ctmin $dc.Ctmax $dc.flag $galaxy_output2 $dn.condNorm.normMethod $galaxy_output3 $galaxy_output4 $df.percent $galaxy_output5 $df.condImpute.format $df.condImpute.k $df.condImpute.maxp $galaxy_output6 $de.condDEA.method $galaxy_output7 "$df.filtname"; + #end if + #end if + #end if +]]> +</command> +<inputs> +<section name="dp" title="File uploading and parsing" expanded="true"> + <param name="list_files" type="data_collection" collection_type="list" value="" label="Select a collection list of TaqMan + Low Density Array files in the history tab" help="Collection should be of category List. "/> + <param name="files" type="data" format="txt" label="Select one of the files in the history tab" help="File should contains only the fields: sampleName and Treatment." /> + <param name="formatFile" type="select" label="Select one of the file formats from the list below." help=""> + <option value="EDS" selected="true">EDS</option> + <option value="plain">Plain</option> + <option value="SDS" >SDS</option> + <option value="LightCycler" >LightCycler</option> + <option value="CFX">CFX</option> + <option value="OpenArray" >OpenArray</option> + <option value="BioMark" >BioMark</option> + </param> + <param name="n" type="integer" min="1" max="1000" value="384" label="Type the number of transcripts in your file" /> +</section> +<section name="dc" title="Ct filtering and categorization" expanded="true"> + <param name="Ctmin" type="integer" min="0" max="40" value="14" label="Set up a minimum Ct value" + help="Any Ct below your selected value will be labelled as Unreliable." /> + <param name="Ctmax" type="integer" min="0" max="40" value="32" label="Set up a maximum Ct value" + help="Any Ct above your selected value will be labelled as Unreliable."/> + <param name="flag" type="select" label="Select TRUE if you want that PIPE-T assigns category 'Unreliable' on the basis of FAILURE flag" help="Data in qPCRset objects will have feature categories (Unreliable, Undetermined) assigned to them based on different Ct criteria."> + <option value="TRUE">TRUE</option> + <option value="FALSE">FALSE</option> + </param> +</section> +<section name="dn" title="Normalization" expanded="true"> + <conditional name="condNorm"> + <param name="normMethod" type="select" label="Select one of the normalization methods from the list below." help="Normalization is important to reduce technical variability from the data."> + <option value="globalmean" selected="true">Global mean</option> + <option value="deltaCt">DeltaCt method (housekeeping genes)</option> + <option value="geometric.mean">Modified global mean</option> + <option value="quantile">Quantile</option> + <option value="norm.rankinvariant">Rank invariant</option> + <option value="scale.rankinvariant">Scale rank invariant</option> + </param> + <when value="deltaCt"> + <param name="normalizers" type="text" label="Type a comma separated list of housekeeping transcript identifiers that will be used as normalizers." value="U6 snRNA-001973"> + </param> + </when> + </conditional> +</section> +<section name="df" title="Transcript filtering and imputation" expanded="true"> + <param name="percent" type="integer" min="0" max="100" value="0" label="Set up a percentage of NAs." + help="miRs\genes with more than the specified percentage of NAs across samples will be removed." 
/> +<param name="filtname" type="text" label="Type a comma separeted list of transcript identifiers to filter out" help="transcript identifiers specified in the List will be removed." value="U6 snRNA-001973,hsa-miR-520a"> + </param> + <conditional name="condImpute"> + <param name="format" type="select" label="Select one of the imputation methods from the list below." help=""> + <option value="mestdagh" selected="true">Mestdagh</option> + <option value="knn">K-Nearest Neighbour</option> + <option value="mean">Mean</option> + <option value="median">Median</option> + <option value="cubic">Cubic Spline</option> + <option value="nondetects">Non-detects</option> + </param> + <when value="knn"> + <param name="k" type="integer" min="1" max="100" value="10" label="Type a number of neighbors to use by the KNN imputation method" /> + <param name="maxp" type="integer" min="1" max="2000" value="1500" label="Type the maximum number of imputed transcript by KNN method." help="Larger numbers are divided by two-means clustering (recursively) + prior to imputation. "/> + </when> + </conditional> + </section> + <section name="de" title="Differential expression analysis" expanded="true"> + <conditional name="condDEA"> + <param name="method" type="select" label="Select one of the methods from the list below or NONE."> + <option value="ttest" selected="true">T-test and fold change</option> + <option value="rp">Rank Product (Only for unpaired data)</option> + <option value="none">NONE</option> + </param> + <when value="ttest"> + <param name="alternative" type="select" label="Select one of the types of alternative hypothesis to assess significance."> + <option value="two.sided" selected="true">Two sided</option> + <option value="greater">Greater</option> + <option value="less">Lower</option> + </param> + <param name="paired" type="select" label="Select TRUE if you want a paired analysis?" help="Pairing of samples will follow the order of the sampleNames in the input file"> + <option value="TRUE" >TRUE</option> + <option value="FALSE" selected="true">FALSE</option> + </param> + <param name="replicates" type="select" label="Select TRUE if you have replicated miR\gene in your data" help="If replicated miRs\genes are present in the data, the statistics will be calculated once for each replicated miR\gene, rather than the separately."> + <option value="TRUE" selected="true">TRUE</option> + <option value="FALSE" >FALSE</option> + </param> + <param name="sort" type="select" label="Select TRUE if you want the output to be sorted by increasing order of p-value?" help=""> + <option value="TRUE" selected="true">TRUE</option> + <option value="FALSE" >FALSE</option> + </param> + <param name="stringent" type="select" label="Select TRUE to admit more stringent analysis." help=" If stringent is TRUE any unreliable or undetermined measurements among technical and +biological replicates will result in the final result being Undetermined. 
If stringent is FALSE result will be OK unless at least half of the Ct values for a given gene are unreliable/undetermined."> + <option value="TRUE" selected="true">TRUE</option> + <option value="FALSE" >FALSE</option> + </param> + <param name="padjust" type="select" label="Select one of the methods to adjust pvalues for multiple hypothesis testing"> + <option value="BH" selected="true">Benjamini-Hochberg</option> + <option value="bonferroni">Bonferroni</option> + </param> + </when> + <when value="rp"> + </when> + </conditional> + </section> +</inputs> +<outputs> + <data format="txt" name="galaxy_output1" label="1_Ct_Raw"/> + <data format="png" name="galaxy_output2" label="2_Boxplot after data categorization"/> + <data format="txt" name="galaxy_output3" label="3_Normalized data"/> + <data format="png" name="galaxy_output4" label="4_ECDF"/> + <data format="png" name="galaxy_output6" label="5_Boxplot after data normalization"/> + <data format="txt" name="galaxy_output5" label="6_Imputed data"/> + <data format="txt" name="galaxy_output7" label="7_Differentially Expressed transcripts"/> +</outputs> + +<help> +<![CDATA[ +**What it does** +INPUTS: This tool parses a list of RTqPCR file and a file with the groups +OUTPUTS: and returs +1) A txt file with the raw Ct data +2) A PNG file with a boxplot of Ct data after data categorization for each sample +3) A txt file with deltaCt data after data normalization +4) A PNG file with the Empirical cumulative distribution before and after data normalization +5) A PNG file with boxplot of deltaCt data after data normalization +6) A txt file after imputation of missing values +7) A txt file with a number statistics about the significance of each miR\gene +]]> +</help> +<citations> + <citation type="bibtex"> + @Manual{HTqPCR, + title = {HTqPCR: Automated analysis of high-throughput qPCR data.}, + author = {Heidi Dvinge, Paul Bertone}, + year = {2009}, + note = {R package version 1.36.0}, + url = {http://bioconductor.org/packages/HTqPCR/}, + } + </citation> + <citation type="bibtex"> + @Manual{impute, + title = {Impute: Imputation for microarray data.}, + author = {Trevor Hastie, Robert Tibshirani, Balasubramanian Narasimhan, Gilbert Chu}, + year = {2018}, + note = {R package version 1.56.0}, + url = {http://bioconductor.org/packages/impute/}, + } + </citation> + <citation type="bibtex"> + @Manual{RankProd, + title = {RankProd: Rank Product method for identifying differentially expressed +genes with application in meta-analysis.}, + author = {Francesco Del Carratore}, + year = {2018}, + note = {R package version 3.8.0}, + url = {http://bioconductor.org/packages/RankProd/}, + } + </citation> + </citations> + +</tool>
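Finally, the groups file referenced throughout (the "files" input, read in pipe-t.R with read.delim(..., sep = "\t") and later used via files$Treatment) is a plain tab-separated table with exactly the two columns sampleName and Treatment, where each sampleName matches the element identifier of a file in the Galaxy collection. A minimal, hypothetical example could be produced like this; all names and labels are made up:

# Hypothetical sample sheet: one row per uploaded Ct file, two columns only.
sheet <- data.frame(
  sampleName = c("patient1.txt", "patient2.txt", "patient3.txt", "patient4.txt"),
  Treatment  = c("Case", "Case", "Control", "Control")
)
write.table(sheet, file = "samplesheet.txt", sep = "\t", quote = FALSE, row.names = FALSE)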