Mercurial > repos > iuc > deseq2
changeset 25:de44f8eff84a draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 9eb6d07510ccf27d6499172d62c81661078ec57b"
author | iuc |
---|---|
date | Wed, 25 Nov 2020 18:36:55 +0000 |
parents | 71bacea10eee |
children | 6a3a025714d3 |
files | deseq2.R deseq2.xml get_deseq_dataset.R |
diffstat | 3 files changed, 204 insertions(+), 183 deletions(-) [+] |
line wrap: on
line diff
--- a/deseq2.R Sat Feb 29 17:08:08 2020 -0500 +++ b/deseq2.R Wed Nov 25 18:36:55 2020 +0000 @@ -31,7 +31,9 @@ # 3 "mean" # setup R error handling to go to stderr -options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +options(show.error.messages = F, error = function() { + cat(geterrmessage(), file = stderr()); q("no", 1, F) +}) # we need that to not crash galaxy with an UTF8 error on German LC settings. loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") @@ -55,50 +57,46 @@ "header", "H", 0, "logical", "factors", "f", 1, "character", "files_to_labels", "l", 1, "character", - "plots" , "p", 1, "character", + "plots", "p", 1, "character", "tximport", "i", 0, "logical", "txtype", "y", 1, "character", "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file "esf", "e", 1, "character", "fit_type", "t", 1, "integer", "many_contrasts", "m", 0, "logical", - "outlier_replace_off" , "a", 0, "logical", - "outlier_filter_off" , "b", 0, "logical", + "outlier_replace_off", "a", 0, "logical", + "outlier_filter_off", "b", 0, "logical", "auto_mean_filter_off", "c", 0, "logical", - "beta_prior_off", "d", 0, "logical"), - byrow=TRUE, ncol=4) + "beta_prior_off", "d", 0, "logical" +), byrow = TRUE, ncol = 4) opt <- getopt(spec) # if help was asked for print a friendly message # and exit with a non-zero error code if (!is.null(opt$help)) { - cat(getopt(spec, usage=TRUE)) - q(status=1) + cat(getopt(spec, usage = TRUE)) + q(status = 1) } # enforce the following required arguments if (is.null(opt$outfile)) { cat("'outfile' is required\n") - q(status=1) + q(status = 1) } if (is.null(opt$factors)) { cat("'factors' is required\n") - q(status=1) + q(status = 1) } -verbose <- if (is.null(opt$quiet)) { - TRUE -} else { - FALSE +verbose <- is.null(opt$quiet) + +source_local <- function(fname) { + argv <- commandArgs(trailingOnly = FALSE) + base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) + source(paste(base_dir, fname, sep = "/")) } -source_local <- function(fname){ - argv <- commandArgs(trailingOnly = FALSE) - base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) - source(paste(base_dir, fname, sep="/")) -} - -source_local('get_deseq_dataset.R') +source_local("get_deseq_dataset.R") suppressPackageStartupMessages({ library("DESeq2") @@ -109,102 +107,116 @@ if (opt$cores > 1) { library("BiocParallel") register(MulticoreParam(opt$cores)) - parallel = TRUE + parallel <- TRUE } else { - parallel = FALSE + parallel <- FALSE } # build or read sample table -trim <- function (x) gsub("^\\s+|\\s+$", "", x) +trim <- function(x) gsub("^\\s+|\\s+$", "", x) # switch on if 'factors' was provided: library("rjson") parser <- newJSONParser() parser$addData(opt$factors) -factorList <- parser$getObject() +factor_list <- parser$getObject() filenames_to_labels <- fromJSON(opt$files_to_labels) -factors <- sapply(factorList, function(x) x[[1]]) -primaryFactor <- factors[1] -filenamesIn <- unname(unlist(factorList[[1]][[2]])) -labs = unname(unlist(filenames_to_labels[basename(filenamesIn)])) -sampleTable <- data.frame(sample=basename(filenamesIn), - filename=filenamesIn, - row.names=filenamesIn, - stringsAsFactors=FALSE) -for (factor in factorList) { - factorName <- trim(factor[[1]]) - sampleTable[[factorName]] <- character(nrow(sampleTable)) +factors <- sapply(factor_list, function(x) x[[1]]) +primary_factor <- factors[1] +filenames_in <- unname(unlist(factor_list[[1]][[2]])) +labs <- unname(unlist(filenames_to_labels[basename(filenames_in)])) +sample_table <- data.frame( + sample = basename(filenames_in), + filename = filenames_in, + row.names = filenames_in, + stringsAsFactors = FALSE +) +for (factor in factor_list) { + factor_name <- trim(factor[[1]]) + sample_table[[factor_name]] <- character(nrow(sample_table)) lvls <- sapply(factor[[2]], function(x) names(x)) for (i in seq_along(factor[[2]])) { files <- factor[[2]][[i]][[1]] - sampleTable[files,factorName] <- trim(lvls[i]) + sample_table[files, factor_name] <- trim(lvls[i]) } - sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls) + sample_table[[factor_name]] <- factor(sample_table[[factor_name]], levels = lvls) } -rownames(sampleTable) <- labs +rownames(sample_table) <- labs -primaryFactor <- factors[1] -designFormula <- as.formula(paste("~", paste(rev(factors), collapse=" + "))) +design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + "))) # these are plots which are made once for each analysis -generateGenericPlots <- function(dds, factors) { +generate_generic_plots <- function(dds, factors) { library("ggplot2") library("ggrepel") library("pheatmap") rld <- rlog(dds) - p <- plotPCA(rld, intgroup=rev(factors)) - print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size=3) + geom_point()) + p <- plotPCA(rld, intgroup = rev(factors)) + print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size = 3) + geom_point()) dat <- assay(rld) - distsRL <- dist(t(dat)) - mat <- as.matrix(distsRL) - colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) - pheatmap(mat, - clustering_distance_rows=distsRL, - clustering_distance_cols=distsRL, - col=colors, - main="Sample-to-sample distances") - plotDispEsts(dds, main="Dispersion estimates") + dists_rl <- dist(t(dat)) + mat <- as.matrix(dists_rl) + colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255) + pheatmap( + mat, + clustering_distance_rows = dists_rl, + clustering_distance_cols = dists_rl, + col = colors, + main = "Sample-to-sample distances" + ) + plotDispEsts(dds, main = "Dispersion estimates") } # these are plots which can be made for each comparison, e.g. # once for C vs A and once for B vs A -generateSpecificPlots <- function(res, threshold, title_suffix) { +generate_specific_plots <- function(res, threshold, title_suffix) { use <- res$baseMean > threshold if (sum(!use) == 0) { - h <- hist(res$pvalue, breaks=0:50/50, plot=FALSE) - barplot(height = h$counts, - col = "powderblue", space = 0, xlab="p-values", ylab="frequency", - main=paste("Histogram of p-values for",title_suffix)) - text(x = c(0, length(h$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA) + h <- hist(res$pvalue, breaks = 0:50 / 50, plot = FALSE) + barplot( + height = h$counts, + col = "powderblue", + space = 0, + xlab = "p-values", + ylab = "frequency", + main = paste("Histogram of p-values for", title_suffix) + ) + text(x = c(0, length(h$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA) } else { - h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE) - h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE) - colori <- c("filtered (low count)"="khaki", "not filtered"="powderblue") - barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, - col = colori, space = 0, xlab="p-values", ylab="frequency", - main=paste("Histogram of p-values for",title_suffix)) - text(x = c(0, length(h1$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA) - legend("topright", fill=rev(colori), legend=rev(names(colori)), bg="white") + h1 <- hist(res$pvalue[!use], breaks = 0:50 / 50, plot = FALSE) + h2 <- hist(res$pvalue[use], breaks = 0:50 / 50, plot = FALSE) + colori <- c("filtered (low count)" = "khaki", "not filtered" = "powderblue") + barplot( + height = rbind(h1$counts, h2$counts), + beside = FALSE, + col = colori, + space = 0, + xlab = "p-values", + ylab = "frequency", + main = paste("Histogram of p-values for", title_suffix) + ) + text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA) + legend("topright", fill = rev(colori), legend = rev(names(colori)), bg = "white") } - plotMA(res, main= paste("MA-plot for",title_suffix), ylim=range(res$log2FoldChange, na.rm=TRUE)) + plotMA(res, main = paste("MA-plot for", title_suffix), ylim = range(res$log2FoldChange, na.rm = TRUE)) } if (verbose) { - cat(paste("primary factor:",primaryFactor,"\n")) + cat(paste("primary factor:", primary_factor, "\n")) if (length(factors) > 1) { - cat(paste("other factors in design:",paste(factors[-length(factors)],collapse=","),"\n")) + cat(paste("other factors in design:", paste(factors[-length(factors)], collapse = ","), "\n")) } cat("\n---------------------\n") } -dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula=designFormula, tximport=opt$tximport, txtype=opt$txtype, tx2gene=opt$tx2gene) +dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = design_formula, tximport = opt$tximport, txtype = opt$txtype, tx2gene = opt$tx2gene) # estimate size factors for the chosen method -if(!is.null(opt$esf)){ - dds <- estimateSizeFactors(dds, type=opt$esf) +if (!is.null(opt$esf)) { + dds <- estimateSizeFactors(dds, type = opt$esf) } -apply_batch_factors <- function (dds, batch_factors) { +apply_batch_factors <- function(dds, batch_factors) { rownames(batch_factors) <- batch_factors$identifier batch_factors <- subset(batch_factors, select = -c(identifier, condition)) dds_samples <- colnames(dds) @@ -215,155 +227,161 @@ dds_data <- colData(dds) # Merge dds_data with batch_factors using indexes, which are sample names # Set sort to False, which maintains the order in dds_data - reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort=F) + reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = F) batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)] for (factor in colnames(batch_factors)) { dds[[factor]] <- batch_factors[[factor]] } - colnames(dds) <- reordered_batch[,1] + colnames(dds) <- reordered_batch[, 1] return(dds) } if (!is.null(opt$batch_factors)) { - batch_factors <- read.table(opt$batch_factors, sep="\t", header=T) + batch_factors <- read.table(opt$batch_factors, sep = "\t", header = T) dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors) - batch_design <- colnames(batch_factors)[-c(1,2)] - designFormula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse=" + "))) - design(dds) <- designFormula + batch_design <- colnames(batch_factors)[-c(1, 2)] + design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + "))) + design(dds) <- design_formula } if (verbose) { cat("DESeq2 run information\n\n") cat("sample table:\n") - print(sampleTable[,-c(1:2),drop=FALSE]) + print(sample_table[, -c(1:2), drop = FALSE]) cat("\ndesign formula:\n") - print(designFormula) + print(design_formula) cat("\n\n") cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n")) } # optional outlier behavior if (is.null(opt$outlier_replace_off)) { - minRep <- 7 + min_rep <- 7 } else { - minRep <- Inf + min_rep <- Inf if (verbose) cat("outlier replacement off\n") } if (is.null(opt$outlier_filter_off)) { - cooksCutoff <- TRUE + cooks_cutoff <- TRUE } else { - cooksCutoff <- FALSE + cooks_cutoff <- FALSE if (verbose) cat("outlier filtering off\n") } # optional automatic mean filtering if (is.null(opt$auto_mean_filter_off)) { - independentFiltering <- TRUE + independent_filtering <- TRUE } else { - independentFiltering <- FALSE + independent_filtering <- FALSE if (verbose) cat("automatic filtering on the mean off\n") } # shrinkage of LFCs if (is.null(opt$beta_prior_off)) { - betaPrior <- TRUE + beta_prior <- TRUE } else { - betaPrior <- FALSE + beta_prior <- FALSE if (verbose) cat("beta prior off\n") } # dispersion fit type if (is.null(opt$fit_type)) { - fitType <- "parametric" + fit_type <- "parametric" } else { - fitType <- c("parametric","local","mean")[opt$fit_type] + fit_type <- c("parametric", "local", "mean")[opt$fit_type] } -if (verbose) cat(paste("using disperion fit type:",fitType,"\n")) +if (verbose) cat(paste("using disperion fit type:", fit_type, "\n")) # run the analysis -dds <- DESeq(dds, fitType=fitType, betaPrior=betaPrior, minReplicatesForReplace=minRep, parallel=parallel) +dds <- DESeq(dds, fitType = fit_type, betaPrior = beta_prior, minReplicatesForReplace = min_rep, parallel = parallel) # create the generic plots and leave the device open if (!is.null(opt$plots)) { if (verbose) cat("creating plots\n") pdf(opt$plots) - generateGenericPlots(dds, factors) + generate_generic_plots(dds, factors) } -n <- nlevels(colData(dds)[[primaryFactor]]) -allLevels <- levels(colData(dds)[[primaryFactor]]) +n <- nlevels(colData(dds)[[primary_factor]]) +all_levels <- levels(colData(dds)[[primary_factor]]) if (!is.null(opt$countsfile)) { - normalizedCounts<-counts(dds,normalized=TRUE) - write.table(normalizedCounts, file=opt$countsfile, sep="\t", col.names=NA, quote=FALSE) + normalized_counts <- counts(dds, normalized = TRUE) + write.table(normalized_counts, file = opt$countsfile, sep = "\t", col.names = NA, quote = FALSE) } if (!is.null(opt$rlogfile)) { - rLogNormalized <-rlogTransformation(dds) - rLogNormalizedMat <- assay(rLogNormalized) - write.table(rLogNormalizedMat, file=opt$rlogfile, sep="\t", col.names=NA, quote=FALSE) + rlog_normalized <- rlogTransformation(dds) + rlog_normalized_mat <- assay(rlog_normalized) + write.table(rlog_normalized_mat, file = opt$rlogfile, sep = "\t", col.names = NA, quote = FALSE) } if (!is.null(opt$vstfile)) { - vstNormalized<-varianceStabilizingTransformation(dds) - vstNormalizedMat <- assay(vstNormalized) - write.table(vstNormalizedMat, file=opt$vstfile, sep="\t", col.names=NA, quote=FALSE) + vst_normalized <- varianceStabilizingTransformation(dds) + vst_normalized_mat <- assay(vst_normalized) + write.table(vst_normalized_mat, file = opt$vstfile, sep = "\t", col.names = NA, quote = FALSE) } if (is.null(opt$many_contrasts)) { # only contrast the first and second level of the primary factor - ref <- allLevels[1] - lvl <- allLevels[2] - res <- results(dds, contrast=c(primaryFactor, lvl, ref), - cooksCutoff=cooksCutoff, - independentFiltering=independentFiltering) + ref <- all_levels[1] + lvl <- all_levels[2] + res <- results( + dds, + contrast = c(primary_factor, lvl, ref), + cooksCutoff = cooks_cutoff, + independentFiltering = independent_filtering + ) if (verbose) { cat("summary of results\n") - cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\n")) + cat(paste0(primary_factor, ": ", lvl, " vs ", ref, "\n")) print(summary(res)) } - resSorted <- res[order(res$padj),] - outDF <- as.data.frame(resSorted) - outDF$geneID <- rownames(outDF) - outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] + res_sorted <- res[order(res$padj), ] + out_df <- as.data.frame(res_sorted) + out_df$geneID <- rownames(out_df) # nolint + out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] filename <- opt$outfile - write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) - if (independentFiltering) { + write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + if (independent_filtering) { threshold <- unname(attr(res, "filterThreshold")) } else { threshold <- 0 } - title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref) + title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref) if (!is.null(opt$plots)) { - generateSpecificPlots(res, threshold, title_suffix) + generate_specific_plots(res, threshold, title_suffix) } } else { # rotate through the possible contrasts of the primary factor # write out a sorted table of results with the contrast as a suffix # add contrast specific plots to the device - for (i in seq_len(n-1)) { - ref <- allLevels[i] - contrastLevels <- allLevels[(i+1):n] - for (lvl in contrastLevels) { - res <- results(dds, contrast=c(primaryFactor, lvl, ref), - cooksCutoff=cooksCutoff, - independentFiltering=independentFiltering) - resSorted <- res[order(res$padj),] - outDF <- as.data.frame(resSorted) - outDF$geneID <- rownames(outDF) - outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] - filename <- paste0(primaryFactor,"_",lvl,"_vs_",ref) - write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) - if (independentFiltering) { + for (i in seq_len(n - 1)) { + ref <- all_levels[i] + contrast_levels <- all_levels[(i + 1):n] + for (lvl in contrast_levels) { + res <- results( + dds, + contrast = c(primary_factor, lvl, ref), + cooksCutoff = cooks_cutoff, + independentFiltering = independent_filtering + ) + res_sorted <- res[order(res$padj), ] + out_df <- as.data.frame(res_sorted) + out_df$geneID <- rownames(out_df) # nolint + out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] + filename <- paste0(primary_factor, "_", lvl, "_vs_", ref) + write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + if (independent_filtering) { threshold <- unname(attr(res, "filterThreshold")) } else { threshold <- 0 } - title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref) + title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref) if (!is.null(opt$plots)) { - generateSpecificPlots(res, threshold, title_suffix) + generate_specific_plots(res, threshold, title_suffix) } } } @@ -378,4 +396,3 @@ cat("Session information:\n\n") sessionInfo() -
--- a/deseq2.xml Sat Feb 29 17:08:08 2020 -0500 +++ b/deseq2.xml Wed Nov 25 18:36:55 2020 +0000 @@ -176,7 +176,7 @@ label="Output VST normalized table" /> <param name="many_contrasts" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Output all levels vs all levels of primary factor (use when you have >2 levels for primary factor)" - help=" DESeq2 performs independent filtering by default using the mean of normalized counts as a filter statistic" /> + help=" DESeq2 performs independent filtering by default using the mean of normalized counts as a filter statistic" /> <param name="esf" type="select" label="(Optional) Method for estimateSizeFactors" help="Method for estimation: either 'ratio', 'poscounts', or 'iterate'. 'ratio' uses the standard median ratio method introduced in DESeq. The size factor is the median ratio of the sample over a 'pseudosample': for each gene, the geometric mean of all samples. @@ -203,10 +203,10 @@ <param name="outlier_filter_off" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Turn off outliers filtering (only affects with >2 replicates)" help="When there are more than 2 replicates for a given sample, the DESeq2 will automatically - filter genes which contain a Cook’s distance above a cutoff" /> + filter genes which contain a Cook’s distance above a cutoff" /> <param name="auto_mean_filter_off" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Turn off independent filtering" - help=" DESeq2 performs independent filtering by default using the mean of normalized counts as a filter statistic" /> + help=" DESeq2 performs independent filtering by default using the mean of normalized counts as a filter statistic" /> </inputs> <outputs> <data name="deseq_out" format="tabular" label="DESeq2 result file on ${on_string}">
--- a/get_deseq_dataset.R Sat Feb 29 17:08:08 2020 -0500 +++ b/get_deseq_dataset.R Wed Nov 25 18:36:55 2020 +0000 @@ -1,76 +1,80 @@ -get_deseq_dataset <- function(sampleTable, header, designFormula, tximport, txtype, tx2gene) { +get_deseq_dataset <- function(sample_table, header, design_formula, tximport, txtype, tx2gene) { dir <- "" - if (!is.null(header)) { - hasHeader <- TRUE - } else { - hasHeader <- FALSE - } - - if (!is.null(tximport)) { + has_header <- !is.null(header) + use_txi <- !is.null(tximport) + if (use_txi) { if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF/GFF3 file is required for tximport") if (tolower(file_ext(tx2gene)) == "gff") { - gffFile <-tx2gene + gff_file <- tx2gene } else { - gffFile <- NULL - tx2gene <- read.table(tx2gene, header=hasHeader) + gff_file <- NULL + tx2gene <- read.table(tx2gene, header = has_header) } - useTXI <- TRUE - } else { - useTXI <- FALSE } - if (!useTXI & hasHeader) { - countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)}) + if (!use_txi & has_header) { + countfiles <- lapply(as.character(sample_table$filename), read.delim, row.names = 1) tbl <- do.call("cbind", countfiles) - colnames(tbl) <- rownames(sampleTable) # take sample ids from header + colnames(tbl) <- rownames(sample_table) # take sample ids from header # check for htseq report lines (from DESeqDataSetFromHTSeqCount function) - oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual", - "not_aligned", "alignment_not_unique") - specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames - tbl <- tbl[!specialRows, , drop = FALSE] + old_special_names <- c( + "no_feature", + "ambiguous", + "too_low_aQual", + "not_aligned", + "alignment_not_unique" + ) + special_rows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% old_special_names + tbl <- tbl[!special_rows, , drop = FALSE] - dds <- DESeqDataSetFromMatrix(countData = tbl, - colData = subset(sampleTable, select=-(filename)), - design = designFormula) - } else if (!useTXI & !hasHeader) { + dds <- DESeqDataSetFromMatrix( + countData = tbl, + colData = subset(sample_table, select = -filename), + design = design_formula + ) + } else if (!use_txi & !has_header) { # construct the object from HTSeq files - dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, - directory = dir, - design = designFormula) - colnames(dds) <- row.names(sampleTable) + dds <- DESeqDataSetFromHTSeqCount( + sampleTable = sample_table, + directory = dir, + design = design_formula + ) + colnames(dds) <- row.names(sample_table) } else { # construct the object using tximport library("tximport") - txiFiles <- as.character(sampleTable$filename) - labs <- row.names(sampleTable) - names(txiFiles) <- labs - if (!is.null(gffFile)) { + txi_files <- as.character(sample_table$filename) + labs <- row.names(sample_table) + names(txi_files) <- labs + if (!is.null(gff_file)) { # first need to make the tx2gene table # this takes ~2-3 minutes using Bioconductor functions suppressPackageStartupMessages({ library("GenomicFeatures") }) - txdb <- makeTxDbFromGFF(gffFile) + txdb <- makeTxDbFromGFF(gff_file) k <- keys(txdb, keytype = "TXNAME") - tx2gene <- select(txdb, keys=k, columns="GENEID", keytype="TXNAME") - # Remove 'transcript:' from transcript IDs (when gffFile is a GFF3 from Ensembl and the transcript does not have a Name) - tx2gene$TXNAME <- sub('^transcript:', '', tx2gene$TXNAME) + tx2gene <- select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME") + # Remove 'transcript:' from transcript IDs (when gff_file is a GFF3 from Ensembl and the transcript does not have a Name) + tx2gene$TXNAME <- sub("^transcript:", "", tx2gene$TXNAME) # nolint } - try(txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene)) + try(txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene)) if (!exists("txi")) { # Remove version from transcript IDs in tx2gene... - tx2gene$TXNAME <- sub('\\.[0-9]+$', '', tx2gene$TXNAME) - # ...and in txiFiles - txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene, ignoreTxVersion=TRUE) + tx2gene$TXNAME <- sub("\\.[0-9]+$", "", tx2gene$TXNAME) # nolint + # ...and in txi_files + txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene, ignoreTxVersion = TRUE) } - dds <- DESeqDataSetFromTximport(txi, - subset(sampleTable, select=-c(filename)), - designFormula) + dds <- DESeqDataSetFromTximport( + txi, + subset(sample_table, select = -c(filename)), + design_formula + ) } return(dds) }