Mercurial > repos > iuc > ruvseq
changeset 2:fed9d0350d72 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ruvseq commit 4daa375d022673d2437d609b1865b78c64b04415"
author | iuc |
---|---|
date | Fri, 15 Jan 2021 17:53:15 +0000 (2021-01-15) |
parents | c24765926774 |
children | d1f7fa5bb3cb |
files | get_deseq_dataset.R ruvseq.R ruvseq.xml |
diffstat | 3 files changed, 123 insertions(+), 120 deletions(-) [+] |
line wrap: on
line diff
--- a/get_deseq_dataset.R Tue Mar 26 06:25:38 2019 -0400 +++ b/get_deseq_dataset.R Fri Jan 15 17:53:15 2021 +0000 @@ -1,76 +1,80 @@ -get_deseq_dataset <- function(sampleTable, header, designFormula, tximport, txtype, tx2gene) { +get_deseq_dataset <- function(sample_table, header, design_formula, tximport, txtype, tx2gene) { dir <- "" - if (!is.null(header)) { - hasHeader <- TRUE - } else { - hasHeader <- FALSE - } - - if (!is.null(tximport)) { + has_header <- !is.null(header) + use_txi <- !is.null(tximport) + if (use_txi) { if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF/GFF3 file is required for tximport") if (tolower(file_ext(tx2gene)) == "gff") { - gffFile <-tx2gene + gff_file <- tx2gene } else { - gffFile <- NULL - tx2gene <- read.table(tx2gene, header=hasHeader) + gff_file <- NULL + tx2gene <- read.table(tx2gene, header = has_header) } - useTXI <- TRUE - } else { - useTXI <- FALSE } - if (!useTXI & hasHeader) { - countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)}) + if (!use_txi & has_header) { + countfiles <- lapply(as.character(sample_table$filename), read.delim, row.names = 1) tbl <- do.call("cbind", countfiles) - colnames(tbl) <- rownames(sampleTable) # take sample ids from header + colnames(tbl) <- rownames(sample_table) # take sample ids from header # check for htseq report lines (from DESeqDataSetFromHTSeqCount function) - oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual", - "not_aligned", "alignment_not_unique") - specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames - tbl <- tbl[!specialRows, , drop = FALSE] + old_special_names <- c( + "no_feature", + "ambiguous", + "too_low_aQual", + "not_aligned", + "alignment_not_unique" + ) + special_rows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% old_special_names + tbl <- tbl[!special_rows, , drop = FALSE] - dds <- DESeqDataSetFromMatrix(countData = tbl, - colData = subset(sampleTable, select=-(filename)), - design = designFormula) - } else if (!useTXI & !hasHeader) { + dds <- DESeqDataSetFromMatrix( + countData = tbl, + colData = subset(sample_table, select = -filename), + design = design_formula + ) + } else if (!use_txi & !has_header) { # construct the object from HTSeq files - dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, - directory = dir, - design = designFormula) - colnames(dds) <- row.names(sampleTable) + dds <- DESeqDataSetFromHTSeqCount( + sampleTable = sample_table, + directory = dir, + design = design_formula + ) + colnames(dds) <- row.names(sample_table) } else { # construct the object using tximport library("tximport") - txiFiles <- as.character(sampleTable$filename) - labs <- row.names(sampleTable) - names(txiFiles) <- labs - if (!is.null(gffFile)) { + txi_files <- as.character(sample_table$filename) + labs <- row.names(sample_table) + names(txi_files) <- labs + if (!is.null(gff_file)) { # first need to make the tx2gene table # this takes ~2-3 minutes using Bioconductor functions suppressPackageStartupMessages({ library("GenomicFeatures") }) - txdb <- makeTxDbFromGFF(gffFile) + txdb <- makeTxDbFromGFF(gff_file) k <- keys(txdb, keytype = "TXNAME") - tx2gene <- select(txdb, keys=k, columns="GENEID", keytype="TXNAME") - # Remove 'transcript:' from transcript IDs (when gffFile is a GFF3 from Ensembl and the transcript does not have a Name) - tx2gene$TXNAME <- sub('^transcript:', '', tx2gene$TXNAME) + tx2gene <- select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME") + # Remove 'transcript:' from transcript IDs (when gff_file is a GFF3 from Ensembl and the transcript does not have a Name) + tx2gene$TXNAME <- sub("^transcript:", "", tx2gene$TXNAME) # nolint } - try(txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene)) + try(txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene)) if (!exists("txi")) { # Remove version from transcript IDs in tx2gene... - tx2gene$TXNAME <- sub('\\.[0-9]+$', '', tx2gene$TXNAME) - # ...and in txiFiles - txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene, ignoreTxVersion=TRUE) + tx2gene$TXNAME <- sub("\\.[0-9]+$", "", tx2gene$TXNAME) # nolint + # ...and in txi_files + txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene, ignoreTxVersion = TRUE) } - dds <- DESeqDataSetFromTximport(txi, - subset(sampleTable, select=-c(filename)), - designFormula) + dds <- DESeqDataSetFromTximport( + txi, + subset(sample_table, select = -c(filename)), + design_formula + ) } return(dds) }
--- a/ruvseq.R Tue Mar 26 06:25:38 2019 -0400 +++ b/ruvseq.R Fri Jan 15 17:53:15 2021 +0000 @@ -1,6 +1,8 @@ # setup R error handling to go to stderr library("getopt") -options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +options(show.error.messages = F, error = function() { + cat(geterrmessage(), file = stderr()); q("no", 1, F) +}) options(stringAsFactors = FALSE, useFancyQuotes = FALSE) setup_cmdline_options <- function() { @@ -12,18 +14,18 @@ "min_k", "min_k", 1, "double", "max_k", "max_k", 1, "double", "sample_json", "s", 1, "character", - "plots" , "p", 1, "character", + "plots", "p", 1, "character", "header", "H", 0, "logical", "txtype", "y", 1, "character", "tx2gene", "x", 1, "character"), # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF) - byrow=TRUE, ncol=4) + byrow = TRUE, ncol = 4) opt <- getopt(spec) # if help was asked for print a friendly message # and exit with a non-zero error code if (!is.null(opt$help)) { - cat(getopt(spec, usage=TRUE)) - q(status=1) + cat(getopt(spec, usage = TRUE)) + q(status = 1) } else { load_libraries() } @@ -42,117 +44,112 @@ library("ggrepel") } -source_local <- function(fname){ +source_local <- function(fname) { argv <- commandArgs(trailingOnly = FALSE) base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) - source(paste(base_dir, fname, sep="/")) + source(paste(base_dir, fname, sep = "/")) } # Source get_deseq_dataset.R for getting deseq dataset from htseq/featurecounts/tximport -source_local('get_deseq_dataset.R') +source_local("get_deseq_dataset.R") # RUVseq function definitions -plot_pca_rle <- function (set, title) { - x <- pData(set)[,1] +plot_pca_rle <- function(set, title) { + x <- pData(set)[, 1] colors <- brewer.pal(3, "Set2") - label <- paste0(' for ', title) - plotRLE(set, outline=FALSE, ylim=c(-4, 4), col=colors[x]) - title(main=paste0("RLE", label)) - plotPCA(set, col=colors[x], cex=1.2) - title(main=paste0("PCA", label)) + label <- paste0(" for ", title) + plotRLE(set, outline = FALSE, ylim = c(-4, 4), col = colors[x]) + title(main = paste0("RLE", label)) + plotPCA(set, col = colors[x], cex = 1.2) + title(main = paste0("PCA", label)) } -plot_factors_of_unwanted_variation <- function(set, method, k){ +plot_factors_of_unwanted_var <- function(set, method, k) { pd <- pData(set) - pd['sample'] <- row.names(pd) - colnames(pd)[1] <- 'condition' - d = melt(pd, id.vars = c('sample', 'condition')) - d['x'] <- 1 # There is no information on the X, so we just fake it to be able to do a scatterplot - print(ggplot(d, aes(x=x, y=value, color=condition, label=sample)) + + pd["sample"] <- row.names(pd) + colnames(pd)[1] <- "condition" + d <- melt(pd, id.vars = c("sample", "condition")) + d["x"] <- 1 # There is no information on the X, so we just fake it to be able to do a scatterplot + print(ggplot(d, aes(x = x, y = value, color = condition, label = sample)) + geom_point() + - ggtitle(paste0('Factors of unwanted variation for method: ', method, ", k=", k)) + - facet_wrap( ~ variable, scales = "free_x") + + ggtitle(paste0("Factors of unwanted variation for method: ", method, ", k=", k)) + + facet_wrap(~ variable, scales = "free_x") + geom_text_repel() + - theme(axis.title.x=element_blank(), - axis.text.x=element_blank(), - axis.ticks.x=element_blank(), + theme(axis.title.x = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), plot.title = element_text(hjust = 0.5)) ) } -create_seq_expression_set <- function (dds, min_mean_count) { +create_seq_expression_set <- function(dds, min_mean_count) { count_values <- counts(dds) - print(paste0("feature count before filtering :",nrow(count_values),"\n")) + print(paste0("feature count before filtering :", nrow(count_values), "\n")) print(paste0("Filtering features which mean expression is less or eq. than ", min_mean_count, " counts\n")) filter <- apply(count_values, 1, function(x) mean(x) > min_mean_count) - filtered <- count_values[filter,] - print(paste0("feature count after filtering :",nrow(filtered),"\n")) - set = newSeqExpressionSet(as.matrix(filtered), - phenoData = data.frame(colData(dds)$condition, row.names=colnames(filtered))) - plot_pca_rle(set = set, title = 'raw data') - set <- betweenLaneNormalization(set, which="upper") - plot_pca_rle(set = set, title = 'upper quartile normalized') + filtered <- count_values[filter, ] + print(paste0("feature count after filtering :", nrow(filtered), "\n")) + set <- newSeqExpressionSet(as.matrix(filtered), + phenoData = data.frame(colData(dds)$condition, row.names = colnames(filtered))) + plot_pca_rle(set = set, title = "raw data") + set <- betweenLaneNormalization(set, which = "upper") + plot_pca_rle(set = set, title = "upper quartile normalized") return(set) } get_empirical_control_genes <- function(set, cutoff_p) { - x <- pData(set)[,1] - design <- model.matrix(~x, data=pData(set)) - y <- DGEList(counts=counts(set), group=x) - y <- calcNormFactors(y, method="upperquartile") + x <- pData(set)[, 1] + design <- model.matrix(~x, data = pData(set)) + y <- DGEList(counts = counts(set), group = x) + y <- calcNormFactors(y, method = "upperquartile") y <- estimateGLMCommonDisp(y, design) y <- estimateGLMTagwiseDisp(y, design) fit <- glmFit(y, design) - lrt <- glmLRT(fit, coef=2) - top <- topTags(lrt, n=nrow(set))$table + lrt <- glmLRT(fit, coef = 2) + top <- topTags(lrt, n = nrow(set))$table top_rows <- rownames(top)[which(top$PValue < cutoff_p)] empirical <- rownames(set)[which(!(rownames(set) %in% top_rows))] return(empirical) } -ruv_control_gene_method <- function(set, k, control_genes='empirical', cutoff_p=0.2) { - if (control_genes == 'empirical') { - control_genes = get_empirical_control_genes(set, cutoff_p=cutoff_p) +ruv_control_gene_method <- function(set, k, control_genes = "empirical", cutoff_p = 0.2) { + if (control_genes == "empirical") { + control_genes <- get_empirical_control_genes(set, cutoff_p = cutoff_p) } - set <- RUVg(set, control_genes, k=k) + set <- RUVg(set, control_genes, k = k) plot_pca_rle(set, paste0("RUVg with empirical control genes, k=", k)) - plot_factors_of_unwanted_variation(set, method="RUVg with empirical control genes", k=k) + plot_factors_of_unwanted_var(set, method = "RUVg with empirical control genes", k = k) return(set) } ruv_residual_method <- function(set, k) { genes <- rownames(counts(set)) - x <- pData(set)[,1] + x <- pData(set)[, 1] # Initial edger residuals - design <- model.matrix(~x, data=pData(set)) - y <- DGEList(counts=counts(set), group=x) - y <- calcNormFactors(y, method="upperquartile") + design <- model.matrix(~x, data = pData(set)) + y <- DGEList(counts = counts(set), group = x) + y <- calcNormFactors(y, method = "upperquartile") y <- estimateGLMCommonDisp(y, design) y <- estimateGLMTagwiseDisp(y, design) fit <- glmFit(y, design) - res <- residuals(fit, type="deviance") - set <- RUVr(set, genes, k=k, res) - plot_pca_rle(set = set, title = paste0('RUVr using residuals, k=', k)) - plot_factors_of_unwanted_variation(set, method="RUVr using residuals", k=k) + res <- residuals(fit, type = "deviance") + set <- RUVr(set, genes, k = k, res) + plot_pca_rle(set = set, title = paste0("RUVr using residuals, k=", k)) + plot_factors_of_unwanted_var(set, method = "RUVr using residuals", k = k) return(set) } -ruv_replicate_method <- function (set, k) { +ruv_replicate_method <- function(set, k) { genes <- rownames(counts(set)) - x <- pData(set)[,1] + x <- pData(set)[, 1] differences <- makeGroups(x) - set <- RUVs(set, genes, k=k, differences) - plot_pca_rle(set, paste0('RUVs with replicate samples, k=', k)) - plot_factors_of_unwanted_variation(set, method="RUVs using replicates", k=k) + set <- RUVs(set, genes, k = k, differences) + plot_pca_rle(set, paste0("RUVs with replicate samples, k=", k)) + plot_factors_of_unwanted_var(set, method = "RUVs using replicates", k = k) return(set) } -get_differentially_expressed_genes <- function(dds, contrast, alpha=0.01) { - r <- results(dds, contrast=contrast, alpha=alpha) - return(rownames(r[which(r$padj < alpha),])) -} - opt <- setup_cmdline_options() alpha <- opt$alpha min_k <- opt$min_k @@ -162,12 +159,10 @@ sample_paths <- sample_json$path sample_names <- sample_json$label condition <- as.factor(sample_json$condition) -sampleTable <- data.frame(samplename=sample_names, - filename = sample_paths, - condition=condition) -rownames(sampleTable) <- sample_names +sample_table <- data.frame(samplename = sample_names, filename = sample_paths, condition = condition) +rownames(sample_table) <- sample_names -dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula= ~ condition, tximport=opt$txtype, txtype=opt$txtype, tx2gene=opt$tx2gene) +dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = ~ condition, tximport = opt$txtype, txtype = opt$txtype, tx2gene = opt$tx2gene) if (!is.null(opt$plots)) { pdf(opt$plots) } @@ -176,9 +171,9 @@ set <- create_seq_expression_set(dds, min_mean_count = min_c) result <- list(no_correction = set) for (k in seq(min_k, max_k)) { - result[[paste0('residual_method_k', k)]] <- ruv_residual_method(set, k=k) - result[[paste0('replicate_method_k', k)]] <- ruv_replicate_method(set, k=k) - result[[paste0('control_method_k', k)]] <- ruv_control_gene_method(set, k=k, cutoff_p=0.5) + result[[paste0("residual_method_k", k)]] <- ruv_residual_method(set, k = k) + result[[paste0("replicate_method_k", k)]] <- ruv_replicate_method(set, k = k) + result[[paste0("control_method_k", k)]] <- ruv_control_gene_method(set, k = k, cutoff_p = 0.5) } for (name in names(result)) { @@ -187,8 +182,8 @@ unwanted_variation <- pData(set) df <- data.frame(identifier = rownames(unwanted_variation)) df <- cbind(df, unwanted_variation) - colnames(df)[2] <- 'condition' - write.table(df, file=paste0("batch_effects_", name, ".tabular"), sep="\t", quote=F, row.names=F) + colnames(df)[2] <- "condition" + write.table(df, file = paste0("batch_effects_", name, ".tabular"), sep = "\t", quote = F, row.names = F) } }
--- a/ruvseq.xml Tue Mar 26 06:25:38 2019 -0400 +++ b/ruvseq.xml Fri Jan 15 17:53:15 2021 +0000 @@ -1,7 +1,11 @@ -<tool id="ruvseq" name="Remove Unwanted Variation" version="1.16.0"> +<tool id="ruvseq" name="Remove Unwanted Variation" version="@TOOL_VERSION@+galaxy@WRAPPER_VERSION@"> <description>from RNA-seq data</description> + <macros> + <token name="@TOOL_VERSION@">1.16.0</token> + <token name="@WRAPPER_VERSION@">1</token> + </macros> <requirements> - <requirement type="package" version="1.16.0">bioconductor-ruvseq</requirement> + <requirement type="package" version="@TOOL_VERSION@">bioconductor-ruvseq</requirement> <requirement type="package" version="1.22.1">bioconductor-deseq2</requirement> <requirement type="package" version="1.10.0">bioconductor-tximport</requirement> <requirement type="package" version="1.34.1">bioconductor-genomicfeatures</requirement>