Mercurial > repos > shians > shrnaseq
diff hairpinTool.R @ 2:076ca575208f
First commit
author | shian_su <registertonysu@gmail.com> |
---|---|
date | Fri, 21 Feb 2014 12:52:56 +1100 |
parents | |
children | f8af57d6f60b |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hairpinTool.R Fri Feb 21 12:52:56 2014 +1100 @@ -0,0 +1,631 @@ +# Record starting time +timeStart <- as.character(Sys.time()) + +# Loading and checking required packages +library(methods, quietly=TRUE, warn.conflicts=FALSE) +library(statmod, quietly=TRUE, warn.conflicts=FALSE) +library(splines, quietly=TRUE, warn.conflicts=FALSE) +library(edgeR, quietly=TRUE, warn.conflicts=FALSE) +library(limma, quietly=TRUE, warn.conflicts=FALSE) + +if (packageVersion("edgeR") < "3.5.23") { + message("Please update 'edgeR' to version >= 3.5.23 to run this script") +} + +################################################################################ +### Function declarations +################################################################################ + +# Function to sanitise contrast equations so there are no whitespaces +# surrounding the arithmetic operators, leading or trailing whitespace +sanitiseEquation <- function(equation) { + equation <- gsub(" *[+] *", "+", equation) + equation <- gsub(" *[-] *", "-", equation) + equation <- gsub(" *[/] *", "/", equation) + equation <- gsub(" *[*] *", "*", equation) + equation <- gsub("^\\s+|\\s+$", "", equation) + return(equation) +} + +# Function to sanitise group information +sanitiseGroups <- function(string) { + string <- gsub(" *[,] *", ",", string) + string <- gsub("^\\s+|\\s+$", "", string) + return(string) +} + +# Function to change periods to whitespace in a string +unmake.names <- function(string) { + string <- gsub(".", " ", string, fixed=TRUE) + return(string) +} + +# Function has string input and generates an output path string +makeOut <- function(filename) { + return(paste0(folderPath, "/", filename)) +} + +# Function has string input and generates both a pdf and png output strings +imgOut <- function(filename) { + assign(paste0(filename, "Png"), makeOut(paste0(filename,".png")), + envir = .GlobalEnv) + assign(paste0(filename, "Pdf"), makeOut(paste0(filename,".pdf")), + envir = .GlobalEnv) +} + +# Create cat function default path set, default seperator empty and appending +# true by default (Ripped straight from the cat function with altered argument +# defaults) +cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL, + append = TRUE) { + if (is.character(file)) + if (file == "") + file <- stdout() + else if (substring(file, 1L, 1L) == "|") { + file <- pipe(substring(file, 2L), "w") + on.exit(close(file)) + } + else { + file <- file(file, ifelse(append, "a", "w")) + on.exit(close(file)) + } + .Internal(cat(list(...), file, sep, fill, labels, append)) +} + +# Function to write code for html head and title +HtmlHead <- function(title) { + cata("<head>\n") + cata("<title>", title, "</title>\n") + cata("</head>\n") +} + +# Function to write code for html links +HtmlLink <- function(address, label=address) { + cata("<a href=\"", address, "\" target=\"_blank\">", label, "</a><br />\n") +} + +# Function to write code for html images +HtmlImage <- function(source, label=source, height=600, width=600) { + cata("<img src=\"", source, "\" alt=\"", label, "\" height=\"", height) + cata("\" width=\"", width, "\"/>\n") +} + +# Function to write code for html list items +ListItem <- function(...) { + cata("<li>", ..., "</li>\n") +} + +TableItem <- function(...) { + cata("<td>", ..., "</td>\n") +} + +TableHeadItem <- function(...) { + cata("<th>", ..., "</th>\n") +} +################################################################################ +### Input Processing +################################################################################ + +# Grabbing arguments from command line +argv <- commandArgs(TRUE) + +# Remove fastq file paths after collecting from argument vector +inputType <- as.character(argv[1]) +if (inputType=="fastq") { + fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], + fixed=TRUE)) + argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + hairpinPath <- as.character(argv[2]) + samplePath <- as.character(argv[3]) + barStart <- as.numeric(argv[4]) + barEnd <- as.numeric(argv[5]) + hpStart <- as.numeric(argv[6]) + hpEnd <- as.numeric(argv[7]) +} else if (inputType=="counts") { + countPath <- as.character(argv[2]) + annoPath <- as.character(argv[3]) + samplePath <- as.character(argv[4]) +} + +cpmReq <- as.numeric(argv[8]) +sampleReq <- as.numeric(argv[9]) +fdrThresh <- as.numeric(argv[10]) +lfcThresh <- as.numeric(argv[11]) +workMode <- as.character(argv[12]) +htmlPath <- as.character(argv[13]) +folderPath <- as.character(argv[14]) +if (workMode=="classic") { + pairData <- character() + pairData[2] <- as.character(argv[15]) + pairData[1] <- as.character(argv[16]) +} else if (workMode=="glm") { + contrastData <- as.character(argv[15]) + roastOpt <- as.character(argv[16]) + hairpinReq <- as.numeric(argv[17]) + selectOpt <- as.character(argv[18]) + selectVals <- as.character(argv[19]) +} + +# Read in inputs +if (inputType=="fastq") { + samples <- read.table(samplePath, header=TRUE, sep="\t") + hairpins <- read.table(hairpinPath, header=TRUE, sep="\t") +} else if (inputType=="counts") { + samples <- read.table(samplePath, header=TRUE, sep="\t") + counts <- read.table(countPath, header=TRUE, sep="\t") + anno <- read.table(annoPath, header=TRUE, sep="\t") +} +###################### Check inputs for correctness ############################ +samples$ID <- make.names(samples$ID) + +if (!any(grepl("group", names(samples)))) { + stop("'group' column not specified in sample annotation file") +} # Check if grouping variable has been specified + +if (any(table(samples$ID)>1)){ + tab <- table(samples$ID) + offenders <- paste(names(tab[tab>1]), collapse=", ") + offenders <- unmake.names(offenders) + stop("ID column of sample annotation must have unique values, values ", + offenders, " are repeated") +} # Check that IDs in sample annotation are unique + +if (any(is.na(match(samples$ID, colnames(counts))))) { + stop("not all samples have groups specified") +} # Check that a group has be specifed for each sample + +if (inputType=="fastq") { + + if (any(table(hairpins$ID)>1)){ + tab <- table(hairpins$ID) + offenders <- paste(names(tab[tab>1]), collapse=", ") + stop("ID column of hairpin annotation must have unique values, values ", + offenders, " are repeated") + } # Check that IDs in hairpin annotation are unique + +} else if (inputType=="counts") { + + if (any(table(counts$ID)>1)){ + tab <- table(counts$ID) + offenders <- paste(names(tab[tab>1]), collapse=", ") + stop("ID column of count table must have unique values, values ", + offenders, " are repeated") + } # Check that IDs in count table are unique +} +################################################################################ + +# Process arguments +if (workMode=="glm") { + if (roastOpt=="yes") { + wantRoast <- TRUE + } else { + wantRoast <- FALSE + } +} + +# Split up contrasts seperated by comma into a vector and replace spaces with +# periods +if (exists("contrastData")) { + contrastData <- unlist(strsplit(contrastData, split=",")) + contrastData <- sanitiseEquation(contrastData) + contrastData <- gsub(" ", ".", contrastData, fixed=TRUE) +} + +# Replace spaces with periods in pair data +if (exists("pairData")) { + pairData <- make.names(pairData) +} + +# Generate output folder and paths +dir.create(folderPath) + +# Generate links for outputs +imgOut("barHairpin") +imgOut("barIndex") +imgOut("mds") +imgOut("bcv") +if (workMode == "classic") { + smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png")) + smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf")) + topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv")) +} else if (workMode=="glm") { + smearPng <- character() + smearPdf <- character() + topOut <- character() + roastOut <- character() + barcodePng <- character() + barcodePdf <- character() + for (i in 1:length(contrastData)) { + smearPng[i] <- makeOut(paste0("smear(", contrastData[i], ").png")) + smearPdf[i] <- makeOut(paste0("smear(", contrastData[i], ").pdf")) + topOut[i] <- makeOut(paste0("toptag(", contrastData[i], ").tsv")) + roastOut[i] <- makeOut(paste0("roast(", contrastData[i], ").tsv")) + barcodePng[i] <- makeOut(paste0("barcode(", contrastData[i], ").png")) + barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf")) + } +} +# Initialise data for html links and images, table with the link label and +# link address +linkData <- data.frame(Label=character(), Link=character(), + stringsAsFactors=FALSE) +imageData <- data.frame(Label=character(), Link=character(), + stringsAsFactors=FALSE) +################################################################################ +### Data Processing +################################################################################ + +# Transform gene selection from string into index values for mroast +if (workMode=="glm") { + if (selectOpt=="rank") { + selectVals <- gsub(" ", "", selectVals, fixed=TRUE) + selectVals <- unlist(strsplit(selectVals, ",")) + + for (i in 1:length(selectVals)) { + if (grepl(":", selectVals[i], fixed=TRUE)) { + temp <- unlist(strsplit(selectVals[i], ":")) + selectVals <- selectVals[-i] + a <- as.numeric(temp[1]) + b <- as.numeric(temp[2]) + selectVals <- c(selectVals, a:b) + } + } + selectVals <- as.numeric(unique(selectVals)) + } else { + selectVals <- gsub(" ", "", selectVals, fixed=TRUE) + selectVals <- unlist(strsplit(selectVals, " ")) + } +} + +if (inputType=="fastq") { +# Use EdgeR hairpin process and capture outputs +hpReadout <- capture.output( + data <- processHairpinReads(fastqPath, samplePath, hairpinPath, + hairpinStart=hpStart, hairpinEnd=hpEnd, + verbose=TRUE) +) + +# Remove function output entries that show processing data or is empty +hpReadout <- hpReadout[hpReadout!=""] +hpReadout <- hpReadout[!grepl("Processing", hpReadout)] +hpReadout <- hpReadout[!grepl("in file", hpReadout)] +hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) + + +# Make the names of groups syntactically valid (replace spaces with periods) +data$samples$group <- make.names(data$samples$group) +} else { + # Process counts information, set ID column to be row names + rownames(counts) <- counts$ID + counts <- counts[ , !(colnames(counts)=="ID")] + countsRows <- nrow(counts) + + # Process group information + factors <- samples$group[match(samples$ID, colnames(counts))] + annoRows <- nrow(anno) + anno <- anno[match(rownames(counts), anno$ID), ] + annoMatched <- sum(!is.na(anno$ID)) + + if (any(is.na(anno$ID))) { + warningStr <- paste("count table contained more hairpins than", + "specified in hairpin annotation file") + warning(warningStr) + } + + # Filter out rows with zero counts + sel <- rowSums(counts)!=0 + counts <- counts[sel, ] + anno <- anno[sel, ] + + # Create DGEList + data <- DGEList(counts=counts, lib.size=colSums(counts), + norm.factors=rep(1,ncol(counts)), genes=anno, group=factors) + + # Make the names of groups syntactically valid (replace spaces with periods) + data$samples$group <- make.names(data$samples$group) +} + +# Filter hairpins with low counts +sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq +data <- data[sel, ] + +# Estimate dispersions +data <- estimateDisp(data) +commonBCV <- sqrt(data$common.dispersion) + +################################################################################ +### Output Processing +################################################################################ + +# Plot number of hairpins that could be matched per sample +png(barIndexPng, width=600, height=600) +barplot(height<-colSums(data$counts), las=2, main="Counts per index", + cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +imageData[1, ] <- c("Counts per Index", "barIndex.png") +invisible(dev.off()) + +pdf(barIndexPdf) +barplot(height<-colSums(data$counts), las=2, main="Counts per index", + cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +linkData[1, ] <- c("Counts per Index Barplot (.pdf)", "barIndex.pdf") +invisible(dev.off()) + +# Plot per hairpin totals across all samples +png(barHairpinPng, width=600, height=600) +if (nrow(data$counts)<50) { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +} else { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2), + names.arg=FALSE) +} +imageData <- rbind(imageData, c("Counts per Hairpin", "barHairpin.png")) +invisible(dev.off()) + +pdf(barHairpinPdf) +if (nrow(data$counts)<50) { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +} else { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2), + names.arg=FALSE) +} +newEntry <- c("Counts per Hairpin Barplot (.pdf)", "barHairpin.pdf") +linkData <- rbind(linkData, newEntry) +invisible(dev.off()) + +# Make an MDS plot to visualise relationships between replicate samples +png(mdsPng, width=600, height=600) +plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group), + main="MDS Plot") +imageData <- rbind(imageData, c("MDS Plot", "mds.png")) +invisible(dev.off()) + +pdf(mdsPdf) +plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group), + main="MDS Plot") +newEntry <- c("MDS Plot (.pdf)", "mds.pdf") +linkData <- rbind(linkData, newEntry) +invisible(dev.off()) + +if (workMode=="classic") { + # Assess differential representation using classic exact testing methodology + # in edgeR + testData <- exactTest(data, pair=pairData) + + top <- topTags(testData, n=Inf) + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (abs(top$table$logFC) > lfcThresh), 1] + write.table(top, file=topOut, row.names=FALSE, sep="\t") + linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], + ") (.tsv)") + linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv") + linkData <- rbind(linkData, c(linkName, linkAddr)) + + # Select hairpins with FDR < 0.05 to highlight on plot + png(smearPng, width=600, height=600) + plotTitle <- gsub(".", " ", + paste0("Smear Plot: ", pairData[2], "-", pairData[1]), + fixed = TRUE) + plotSmear(testData, de.tags=topIDs, + pch=20, cex=1.0, main=plotTitle) + abline(h = c(-1, 0, 1), col = c("dodgerblue", "yellow", "dodgerblue"), lty=2) + imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ")") + imgAddr <- paste0("smear(", pairData[2], "-", pairData[1],").png") + imageData <- rbind(imageData, c(imgName, imgAddr)) + invisible(dev.off()) + + pdf(smearPdf) + plotTitle <- gsub(".", " ", + paste0("Smear Plot: ", pairData[2], "-", pairData[1]), + fixed = TRUE) + plotSmear(testData, de.tags=topIDs, + pch=20, cex=1.0, main=plotTitle) + abline(h = c(-1, 0, 1), col = c("dodgerblue", "yellow", "dodgerblue"), lty=2) + imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)") + imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf") + linkData <- rbind(linkData, c(imgName, imgAddr)) + invisible(dev.off()) +} else if (workMode=="glm") { + # Generating design information + factors <- factor(data$sample$group) + design <- model.matrix(~0+factors) + + colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) + + # Split up contrasts seperated by comma into a vector + contrastData <- unlist(strsplit(contrastData, split=",")) + for (i in 1:length(contrastData)) { + # Generate contrasts information + contrasts <- makeContrasts(contrasts=contrastData[i], levels=design) + + # Fit negative bionomial GLM + fit = glmFit(data, design) + # Carry out Likelihood ratio test + testData = glmLRT(fit, contrast=contrasts) + + # Select hairpins with FDR < 0.05 to highlight on plot + top <- topTags(testData, n=Inf) + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (abs(top$table$logFC) > lfcThresh), 1] + write.table(top, file=topOut[i], row.names=FALSE, sep="\t") + + linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)") + linkAddr <- paste0("toptag(", contrastData[i], ").tsv") + linkData <- rbind(linkData, c(linkName, linkAddr)) + + # Make a plot of logFC versus logCPM + png(smearPng[i], height=600, width=600) + plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], + fixed=TRUE)) + plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle) + abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) + + imgName <- paste0("Smear Plot(", contrastData[i], ")") + imgAddr <- paste0("smear(", contrastData[i], ").png") + imageData <- rbind(imageData, c(imgName, imgAddr)) + invisible(dev.off()) + + pdf(smearPdf[i]) + plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], + fixed=TRUE)) + plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle) + abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) + + linkName <- paste0("Smear Plot(", contrastData[i], ") (.pdf)") + linkAddr <- paste0("smear(", contrastData[i], ").pdf") + linkData <- rbind(linkData, c(linkName, linkAddr)) + invisible(dev.off()) + + genes <- as.character(data$genes$Gene) + unq <- unique(genes) + unq <- unq[!is.na(unq)] + geneList <- list() + for (gene in unq) { + if (length(which(genes==gene)) >= hairpinReq) { + geneList[[gene]] <- which(genes==gene) + } + } + + if (wantRoast) { + # Input preparaton for roast + nrot = 9999 + set.seed(602214129) + roastData <- mroast(data, index=geneList, design=design, + contrast=contrasts, nrot=nrot) + roastData <- cbind(GeneID=rownames(roastData), roastData) + write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t") + linkName <- paste0("Gene Level Analysis Table(", contrastData[i], + ") (.tsv)") + linkAddr <- paste0("roast(", contrastData[i], ").tsv") + linkData <- rbind(linkData, c(linkName, linkAddr)) + if (selectOpt=="rank") { + selectedGenes <- rownames(roastData)[selectVals] + } else { + selectedGenes <- selectVals + } + + if (packageVersion("limma")<"3.19.19") { + png(barcodePng[i], width=600, height=length(selectedGenes)*150) + } else { + png(barcodePng[i], width=600, height=length(selectedGenes)*300) + } + par(mfrow=c(length(selectedGenes), 1)) + for (gene in selectedGenes) { + barcodeplot(testData$table$logFC, index=geneList[[gene]], + main=paste("Barcode Plot for", gene, "(logFCs)", + gsub(".", " ", contrastData[i])), + labels=c("Positive logFC", "Negative logFC")) + } + imgName <- paste0("Barcode Plot(", contrastData[i], ")") + imgAddr <- paste0("barcode(", contrastData[i], ").png") + imageData <- rbind(imageData, c(imgName, imgAddr)) + dev.off() + if (packageVersion("limma")<"3.19.19") { + pdf(barcodePdf[i], width=8, height=2) + } else { + pdf(barcodePdf[i], width=8, height=4) + } + for (gene in selectedGenes) { + barcodeplot(testData$table$logFC, index=geneList[[gene]], + main=paste("Barcode Plot for", gene, "(logFCs)", + gsub(".", " ", contrastData[i])), + labels=c("Positive logFC", "Negative logFC")) + } + linkName <- paste0("Barcode Plot(", contrastData[i], ") (.pdf)") + linkAddr <- paste0("barcode(", contrastData[i], ").pdf") + linkData <- rbind(linkData, c(linkName, linkAddr)) + dev.off() + } + } +} + +# Record ending time +timeEnd <- as.character(Sys.time()) +################################################################################ +### HTML Generation +################################################################################ +# Clear file +cat("", file=htmlPath) + +cata("<html>\n") +HtmlHead("EdgeR Output") + +cata("<body>\n") +cata("<h3>EdgeR Analysis Output:</h3>\n") +cata("<h4>Input Summary:</h4>\n") +if (inputType=="fastq") { + cata("<ul>\n") + ListItem(hpReadout[1]) + ListItem(hpReadout[2]) + cata("</ul>\n") + cata(hpReadout[3], "<br/>\n") + cata("<ul>\n") + ListItem(hpReadout[4]) + ListItem(hpReadout[7]) + cata("</ul>\n") + cata(hpReadout[8:11], sep="<br/>\n") + cata("<br />\n") + cata("<b>Please check that read percentages are consistent with ") + cata("expectations.</b><br >\n") +} else if (inputType=="counts") { + cata("<ul>\n") + ListItem("Number of Samples: ", ncol(data$counts)) + ListItem("Number of Hairpins: ", countsRows) + ListItem("Number of annotations provided: ", annoRows) + ListItem("Number of annotations matched to hairpin: ", annoMatched) + cata("</ul>\n") +} + +cata("The estimated common biological coefficient of variation (BCV) is: ", + commonBCV, "<br />\n") + +cata("<h4>Output:</h4>\n") +cata("All images displayed have PDF copy at the bottom of the page, these can ") +cata("exported in a pdf viewer to high resolution image format. <br/>\n") +for (i in 1:nrow(imageData)) { + if (grepl("barcode", imageData$Link[i])) { + if (packageVersion("limma")<"3.19.19") { + HtmlImage(imageData$Link[i], imageData$Label[i], + height=length(selectedGenes)*150) + } else { + HtmlImage(imageData$Link[i], imageData$Label[i], + height=length(selectedGenes)*300) + } + } else { + HtmlImage(imageData$Link[i], imageData$Label[i]) + } +} +cata("<br/>\n") + +cata("<h4>Plots:</h4>\n") +for (i in 1:nrow(linkData)) { + if (!grepl(".tsv", linkData$Link[i])) { + HtmlLink(linkData$Link[i], linkData$Label[i]) + } +} + +cata("<h4>Tables:</h4>\n") +for (i in 1:nrow(linkData)) { + if (grepl(".tsv", linkData$Link[i])) { + HtmlLink(linkData$Link[i], linkData$Label[i]) + } +} + +cata("<p>alt-click any of the links to download the file, or click the name ") +cata("of this task in the galaxy history panel and click on the floppy ") +cata("disk icon to download all files in a zip archive.</p>\n") +cata("<p>.tsv files are tab seperated files that can be viewed using Excel ") +cata("or other spreadsheet programs</p>\n") +cata("<table border=\"0\">\n") + +cata("<tr>\n") +TableItem("Task started at:"); TableItem(timeStart) +cata("</tr>\n") +cata("<tr>\n") +TableItem("Task ended at:"); TableItem(timeEnd) +cata("</tr>\n") + +cata("</body>\n") +cata("</html>")