Mercurial > repos > shians > shrnaseq
changeset 2:076ca575208f
First commit
author | shian_su <registertonysu@gmail.com> |
---|---|
date | Fri, 21 Feb 2014 12:52:56 +1100 |
parents | aa02cf19e1b3 |
children | 17fee0726221 f8af57d6f60b |
files | hairpinTool.R hairpinTool.xml |
diffstat | 2 files changed, 1040 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hairpinTool.R Fri Feb 21 12:52:56 2014 +1100 @@ -0,0 +1,631 @@ +# Record starting time +timeStart <- as.character(Sys.time()) + +# Loading and checking required packages +library(methods, quietly=TRUE, warn.conflicts=FALSE) +library(statmod, quietly=TRUE, warn.conflicts=FALSE) +library(splines, quietly=TRUE, warn.conflicts=FALSE) +library(edgeR, quietly=TRUE, warn.conflicts=FALSE) +library(limma, quietly=TRUE, warn.conflicts=FALSE) + +if (packageVersion("edgeR") < "3.5.23") { + message("Please update 'edgeR' to version >= 3.5.23 to run this script") +} + +################################################################################ +### Function declarations +################################################################################ + +# Function to sanitise contrast equations so there are no whitespaces +# surrounding the arithmetic operators, leading or trailing whitespace +sanitiseEquation <- function(equation) { + equation <- gsub(" *[+] *", "+", equation) + equation <- gsub(" *[-] *", "-", equation) + equation <- gsub(" *[/] *", "/", equation) + equation <- gsub(" *[*] *", "*", equation) + equation <- gsub("^\\s+|\\s+$", "", equation) + return(equation) +} + +# Function to sanitise group information +sanitiseGroups <- function(string) { + string <- gsub(" *[,] *", ",", string) + string <- gsub("^\\s+|\\s+$", "", string) + return(string) +} + +# Function to change periods to whitespace in a string +unmake.names <- function(string) { + string <- gsub(".", " ", string, fixed=TRUE) + return(string) +} + +# Function has string input and generates an output path string +makeOut <- function(filename) { + return(paste0(folderPath, "/", filename)) +} + +# Function has string input and generates both a pdf and png output strings +imgOut <- function(filename) { + assign(paste0(filename, "Png"), makeOut(paste0(filename,".png")), + envir = .GlobalEnv) + assign(paste0(filename, "Pdf"), makeOut(paste0(filename,".pdf")), + envir = .GlobalEnv) +} + +# Create cat function default path set, default seperator empty and appending +# true by default (Ripped straight from the cat function with altered argument +# defaults) +cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL, + append = TRUE) { + if (is.character(file)) + if (file == "") + file <- stdout() + else if (substring(file, 1L, 1L) == "|") { + file <- pipe(substring(file, 2L), "w") + on.exit(close(file)) + } + else { + file <- file(file, ifelse(append, "a", "w")) + on.exit(close(file)) + } + .Internal(cat(list(...), file, sep, fill, labels, append)) +} + +# Function to write code for html head and title +HtmlHead <- function(title) { + cata("<head>\n") + cata("<title>", title, "</title>\n") + cata("</head>\n") +} + +# Function to write code for html links +HtmlLink <- function(address, label=address) { + cata("<a href=\"", address, "\" target=\"_blank\">", label, "</a><br />\n") +} + +# Function to write code for html images +HtmlImage <- function(source, label=source, height=600, width=600) { + cata("<img src=\"", source, "\" alt=\"", label, "\" height=\"", height) + cata("\" width=\"", width, "\"/>\n") +} + +# Function to write code for html list items +ListItem <- function(...) { + cata("<li>", ..., "</li>\n") +} + +TableItem <- function(...) { + cata("<td>", ..., "</td>\n") +} + +TableHeadItem <- function(...) { + cata("<th>", ..., "</th>\n") +} +################################################################################ +### Input Processing +################################################################################ + +# Grabbing arguments from command line +argv <- commandArgs(TRUE) + +# Remove fastq file paths after collecting from argument vector +inputType <- as.character(argv[1]) +if (inputType=="fastq") { + fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], + fixed=TRUE)) + argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + hairpinPath <- as.character(argv[2]) + samplePath <- as.character(argv[3]) + barStart <- as.numeric(argv[4]) + barEnd <- as.numeric(argv[5]) + hpStart <- as.numeric(argv[6]) + hpEnd <- as.numeric(argv[7]) +} else if (inputType=="counts") { + countPath <- as.character(argv[2]) + annoPath <- as.character(argv[3]) + samplePath <- as.character(argv[4]) +} + +cpmReq <- as.numeric(argv[8]) +sampleReq <- as.numeric(argv[9]) +fdrThresh <- as.numeric(argv[10]) +lfcThresh <- as.numeric(argv[11]) +workMode <- as.character(argv[12]) +htmlPath <- as.character(argv[13]) +folderPath <- as.character(argv[14]) +if (workMode=="classic") { + pairData <- character() + pairData[2] <- as.character(argv[15]) + pairData[1] <- as.character(argv[16]) +} else if (workMode=="glm") { + contrastData <- as.character(argv[15]) + roastOpt <- as.character(argv[16]) + hairpinReq <- as.numeric(argv[17]) + selectOpt <- as.character(argv[18]) + selectVals <- as.character(argv[19]) +} + +# Read in inputs +if (inputType=="fastq") { + samples <- read.table(samplePath, header=TRUE, sep="\t") + hairpins <- read.table(hairpinPath, header=TRUE, sep="\t") +} else if (inputType=="counts") { + samples <- read.table(samplePath, header=TRUE, sep="\t") + counts <- read.table(countPath, header=TRUE, sep="\t") + anno <- read.table(annoPath, header=TRUE, sep="\t") +} +###################### Check inputs for correctness ############################ +samples$ID <- make.names(samples$ID) + +if (!any(grepl("group", names(samples)))) { + stop("'group' column not specified in sample annotation file") +} # Check if grouping variable has been specified + +if (any(table(samples$ID)>1)){ + tab <- table(samples$ID) + offenders <- paste(names(tab[tab>1]), collapse=", ") + offenders <- unmake.names(offenders) + stop("ID column of sample annotation must have unique values, values ", + offenders, " are repeated") +} # Check that IDs in sample annotation are unique + +if (any(is.na(match(samples$ID, colnames(counts))))) { + stop("not all samples have groups specified") +} # Check that a group has be specifed for each sample + +if (inputType=="fastq") { + + if (any(table(hairpins$ID)>1)){ + tab <- table(hairpins$ID) + offenders <- paste(names(tab[tab>1]), collapse=", ") + stop("ID column of hairpin annotation must have unique values, values ", + offenders, " are repeated") + } # Check that IDs in hairpin annotation are unique + +} else if (inputType=="counts") { + + if (any(table(counts$ID)>1)){ + tab <- table(counts$ID) + offenders <- paste(names(tab[tab>1]), collapse=", ") + stop("ID column of count table must have unique values, values ", + offenders, " are repeated") + } # Check that IDs in count table are unique +} +################################################################################ + +# Process arguments +if (workMode=="glm") { + if (roastOpt=="yes") { + wantRoast <- TRUE + } else { + wantRoast <- FALSE + } +} + +# Split up contrasts seperated by comma into a vector and replace spaces with +# periods +if (exists("contrastData")) { + contrastData <- unlist(strsplit(contrastData, split=",")) + contrastData <- sanitiseEquation(contrastData) + contrastData <- gsub(" ", ".", contrastData, fixed=TRUE) +} + +# Replace spaces with periods in pair data +if (exists("pairData")) { + pairData <- make.names(pairData) +} + +# Generate output folder and paths +dir.create(folderPath) + +# Generate links for outputs +imgOut("barHairpin") +imgOut("barIndex") +imgOut("mds") +imgOut("bcv") +if (workMode == "classic") { + smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png")) + smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf")) + topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv")) +} else if (workMode=="glm") { + smearPng <- character() + smearPdf <- character() + topOut <- character() + roastOut <- character() + barcodePng <- character() + barcodePdf <- character() + for (i in 1:length(contrastData)) { + smearPng[i] <- makeOut(paste0("smear(", contrastData[i], ").png")) + smearPdf[i] <- makeOut(paste0("smear(", contrastData[i], ").pdf")) + topOut[i] <- makeOut(paste0("toptag(", contrastData[i], ").tsv")) + roastOut[i] <- makeOut(paste0("roast(", contrastData[i], ").tsv")) + barcodePng[i] <- makeOut(paste0("barcode(", contrastData[i], ").png")) + barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf")) + } +} +# Initialise data for html links and images, table with the link label and +# link address +linkData <- data.frame(Label=character(), Link=character(), + stringsAsFactors=FALSE) +imageData <- data.frame(Label=character(), Link=character(), + stringsAsFactors=FALSE) +################################################################################ +### Data Processing +################################################################################ + +# Transform gene selection from string into index values for mroast +if (workMode=="glm") { + if (selectOpt=="rank") { + selectVals <- gsub(" ", "", selectVals, fixed=TRUE) + selectVals <- unlist(strsplit(selectVals, ",")) + + for (i in 1:length(selectVals)) { + if (grepl(":", selectVals[i], fixed=TRUE)) { + temp <- unlist(strsplit(selectVals[i], ":")) + selectVals <- selectVals[-i] + a <- as.numeric(temp[1]) + b <- as.numeric(temp[2]) + selectVals <- c(selectVals, a:b) + } + } + selectVals <- as.numeric(unique(selectVals)) + } else { + selectVals <- gsub(" ", "", selectVals, fixed=TRUE) + selectVals <- unlist(strsplit(selectVals, " ")) + } +} + +if (inputType=="fastq") { +# Use EdgeR hairpin process and capture outputs +hpReadout <- capture.output( + data <- processHairpinReads(fastqPath, samplePath, hairpinPath, + hairpinStart=hpStart, hairpinEnd=hpEnd, + verbose=TRUE) +) + +# Remove function output entries that show processing data or is empty +hpReadout <- hpReadout[hpReadout!=""] +hpReadout <- hpReadout[!grepl("Processing", hpReadout)] +hpReadout <- hpReadout[!grepl("in file", hpReadout)] +hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) + + +# Make the names of groups syntactically valid (replace spaces with periods) +data$samples$group <- make.names(data$samples$group) +} else { + # Process counts information, set ID column to be row names + rownames(counts) <- counts$ID + counts <- counts[ , !(colnames(counts)=="ID")] + countsRows <- nrow(counts) + + # Process group information + factors <- samples$group[match(samples$ID, colnames(counts))] + annoRows <- nrow(anno) + anno <- anno[match(rownames(counts), anno$ID), ] + annoMatched <- sum(!is.na(anno$ID)) + + if (any(is.na(anno$ID))) { + warningStr <- paste("count table contained more hairpins than", + "specified in hairpin annotation file") + warning(warningStr) + } + + # Filter out rows with zero counts + sel <- rowSums(counts)!=0 + counts <- counts[sel, ] + anno <- anno[sel, ] + + # Create DGEList + data <- DGEList(counts=counts, lib.size=colSums(counts), + norm.factors=rep(1,ncol(counts)), genes=anno, group=factors) + + # Make the names of groups syntactically valid (replace spaces with periods) + data$samples$group <- make.names(data$samples$group) +} + +# Filter hairpins with low counts +sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq +data <- data[sel, ] + +# Estimate dispersions +data <- estimateDisp(data) +commonBCV <- sqrt(data$common.dispersion) + +################################################################################ +### Output Processing +################################################################################ + +# Plot number of hairpins that could be matched per sample +png(barIndexPng, width=600, height=600) +barplot(height<-colSums(data$counts), las=2, main="Counts per index", + cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +imageData[1, ] <- c("Counts per Index", "barIndex.png") +invisible(dev.off()) + +pdf(barIndexPdf) +barplot(height<-colSums(data$counts), las=2, main="Counts per index", + cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +linkData[1, ] <- c("Counts per Index Barplot (.pdf)", "barIndex.pdf") +invisible(dev.off()) + +# Plot per hairpin totals across all samples +png(barHairpinPng, width=600, height=600) +if (nrow(data$counts)<50) { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +} else { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2), + names.arg=FALSE) +} +imageData <- rbind(imageData, c("Counts per Hairpin", "barHairpin.png")) +invisible(dev.off()) + +pdf(barHairpinPdf) +if (nrow(data$counts)<50) { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +} else { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2), + names.arg=FALSE) +} +newEntry <- c("Counts per Hairpin Barplot (.pdf)", "barHairpin.pdf") +linkData <- rbind(linkData, newEntry) +invisible(dev.off()) + +# Make an MDS plot to visualise relationships between replicate samples +png(mdsPng, width=600, height=600) +plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group), + main="MDS Plot") +imageData <- rbind(imageData, c("MDS Plot", "mds.png")) +invisible(dev.off()) + +pdf(mdsPdf) +plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group), + main="MDS Plot") +newEntry <- c("MDS Plot (.pdf)", "mds.pdf") +linkData <- rbind(linkData, newEntry) +invisible(dev.off()) + +if (workMode=="classic") { + # Assess differential representation using classic exact testing methodology + # in edgeR + testData <- exactTest(data, pair=pairData) + + top <- topTags(testData, n=Inf) + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (abs(top$table$logFC) > lfcThresh), 1] + write.table(top, file=topOut, row.names=FALSE, sep="\t") + linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], + ") (.tsv)") + linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv") + linkData <- rbind(linkData, c(linkName, linkAddr)) + + # Select hairpins with FDR < 0.05 to highlight on plot + png(smearPng, width=600, height=600) + plotTitle <- gsub(".", " ", + paste0("Smear Plot: ", pairData[2], "-", pairData[1]), + fixed = TRUE) + plotSmear(testData, de.tags=topIDs, + pch=20, cex=1.0, main=plotTitle) + abline(h = c(-1, 0, 1), col = c("dodgerblue", "yellow", "dodgerblue"), lty=2) + imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ")") + imgAddr <- paste0("smear(", pairData[2], "-", pairData[1],").png") + imageData <- rbind(imageData, c(imgName, imgAddr)) + invisible(dev.off()) + + pdf(smearPdf) + plotTitle <- gsub(".", " ", + paste0("Smear Plot: ", pairData[2], "-", pairData[1]), + fixed = TRUE) + plotSmear(testData, de.tags=topIDs, + pch=20, cex=1.0, main=plotTitle) + abline(h = c(-1, 0, 1), col = c("dodgerblue", "yellow", "dodgerblue"), lty=2) + imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)") + imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf") + linkData <- rbind(linkData, c(imgName, imgAddr)) + invisible(dev.off()) +} else if (workMode=="glm") { + # Generating design information + factors <- factor(data$sample$group) + design <- model.matrix(~0+factors) + + colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) + + # Split up contrasts seperated by comma into a vector + contrastData <- unlist(strsplit(contrastData, split=",")) + for (i in 1:length(contrastData)) { + # Generate contrasts information + contrasts <- makeContrasts(contrasts=contrastData[i], levels=design) + + # Fit negative bionomial GLM + fit = glmFit(data, design) + # Carry out Likelihood ratio test + testData = glmLRT(fit, contrast=contrasts) + + # Select hairpins with FDR < 0.05 to highlight on plot + top <- topTags(testData, n=Inf) + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (abs(top$table$logFC) > lfcThresh), 1] + write.table(top, file=topOut[i], row.names=FALSE, sep="\t") + + linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)") + linkAddr <- paste0("toptag(", contrastData[i], ").tsv") + linkData <- rbind(linkData, c(linkName, linkAddr)) + + # Make a plot of logFC versus logCPM + png(smearPng[i], height=600, width=600) + plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], + fixed=TRUE)) + plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle) + abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) + + imgName <- paste0("Smear Plot(", contrastData[i], ")") + imgAddr <- paste0("smear(", contrastData[i], ").png") + imageData <- rbind(imageData, c(imgName, imgAddr)) + invisible(dev.off()) + + pdf(smearPdf[i]) + plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], + fixed=TRUE)) + plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle) + abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) + + linkName <- paste0("Smear Plot(", contrastData[i], ") (.pdf)") + linkAddr <- paste0("smear(", contrastData[i], ").pdf") + linkData <- rbind(linkData, c(linkName, linkAddr)) + invisible(dev.off()) + + genes <- as.character(data$genes$Gene) + unq <- unique(genes) + unq <- unq[!is.na(unq)] + geneList <- list() + for (gene in unq) { + if (length(which(genes==gene)) >= hairpinReq) { + geneList[[gene]] <- which(genes==gene) + } + } + + if (wantRoast) { + # Input preparaton for roast + nrot = 9999 + set.seed(602214129) + roastData <- mroast(data, index=geneList, design=design, + contrast=contrasts, nrot=nrot) + roastData <- cbind(GeneID=rownames(roastData), roastData) + write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t") + linkName <- paste0("Gene Level Analysis Table(", contrastData[i], + ") (.tsv)") + linkAddr <- paste0("roast(", contrastData[i], ").tsv") + linkData <- rbind(linkData, c(linkName, linkAddr)) + if (selectOpt=="rank") { + selectedGenes <- rownames(roastData)[selectVals] + } else { + selectedGenes <- selectVals + } + + if (packageVersion("limma")<"3.19.19") { + png(barcodePng[i], width=600, height=length(selectedGenes)*150) + } else { + png(barcodePng[i], width=600, height=length(selectedGenes)*300) + } + par(mfrow=c(length(selectedGenes), 1)) + for (gene in selectedGenes) { + barcodeplot(testData$table$logFC, index=geneList[[gene]], + main=paste("Barcode Plot for", gene, "(logFCs)", + gsub(".", " ", contrastData[i])), + labels=c("Positive logFC", "Negative logFC")) + } + imgName <- paste0("Barcode Plot(", contrastData[i], ")") + imgAddr <- paste0("barcode(", contrastData[i], ").png") + imageData <- rbind(imageData, c(imgName, imgAddr)) + dev.off() + if (packageVersion("limma")<"3.19.19") { + pdf(barcodePdf[i], width=8, height=2) + } else { + pdf(barcodePdf[i], width=8, height=4) + } + for (gene in selectedGenes) { + barcodeplot(testData$table$logFC, index=geneList[[gene]], + main=paste("Barcode Plot for", gene, "(logFCs)", + gsub(".", " ", contrastData[i])), + labels=c("Positive logFC", "Negative logFC")) + } + linkName <- paste0("Barcode Plot(", contrastData[i], ") (.pdf)") + linkAddr <- paste0("barcode(", contrastData[i], ").pdf") + linkData <- rbind(linkData, c(linkName, linkAddr)) + dev.off() + } + } +} + +# Record ending time +timeEnd <- as.character(Sys.time()) +################################################################################ +### HTML Generation +################################################################################ +# Clear file +cat("", file=htmlPath) + +cata("<html>\n") +HtmlHead("EdgeR Output") + +cata("<body>\n") +cata("<h3>EdgeR Analysis Output:</h3>\n") +cata("<h4>Input Summary:</h4>\n") +if (inputType=="fastq") { + cata("<ul>\n") + ListItem(hpReadout[1]) + ListItem(hpReadout[2]) + cata("</ul>\n") + cata(hpReadout[3], "<br/>\n") + cata("<ul>\n") + ListItem(hpReadout[4]) + ListItem(hpReadout[7]) + cata("</ul>\n") + cata(hpReadout[8:11], sep="<br/>\n") + cata("<br />\n") + cata("<b>Please check that read percentages are consistent with ") + cata("expectations.</b><br >\n") +} else if (inputType=="counts") { + cata("<ul>\n") + ListItem("Number of Samples: ", ncol(data$counts)) + ListItem("Number of Hairpins: ", countsRows) + ListItem("Number of annotations provided: ", annoRows) + ListItem("Number of annotations matched to hairpin: ", annoMatched) + cata("</ul>\n") +} + +cata("The estimated common biological coefficient of variation (BCV) is: ", + commonBCV, "<br />\n") + +cata("<h4>Output:</h4>\n") +cata("All images displayed have PDF copy at the bottom of the page, these can ") +cata("exported in a pdf viewer to high resolution image format. <br/>\n") +for (i in 1:nrow(imageData)) { + if (grepl("barcode", imageData$Link[i])) { + if (packageVersion("limma")<"3.19.19") { + HtmlImage(imageData$Link[i], imageData$Label[i], + height=length(selectedGenes)*150) + } else { + HtmlImage(imageData$Link[i], imageData$Label[i], + height=length(selectedGenes)*300) + } + } else { + HtmlImage(imageData$Link[i], imageData$Label[i]) + } +} +cata("<br/>\n") + +cata("<h4>Plots:</h4>\n") +for (i in 1:nrow(linkData)) { + if (!grepl(".tsv", linkData$Link[i])) { + HtmlLink(linkData$Link[i], linkData$Label[i]) + } +} + +cata("<h4>Tables:</h4>\n") +for (i in 1:nrow(linkData)) { + if (grepl(".tsv", linkData$Link[i])) { + HtmlLink(linkData$Link[i], linkData$Label[i]) + } +} + +cata("<p>alt-click any of the links to download the file, or click the name ") +cata("of this task in the galaxy history panel and click on the floppy ") +cata("disk icon to download all files in a zip archive.</p>\n") +cata("<p>.tsv files are tab seperated files that can be viewed using Excel ") +cata("or other spreadsheet programs</p>\n") +cata("<table border=\"0\">\n") + +cata("<tr>\n") +TableItem("Task started at:"); TableItem(timeStart) +cata("</tr>\n") +cata("<tr>\n") +TableItem("Task ended at:"); TableItem(timeEnd) +cata("</tr>\n") + +cata("</body>\n") +cata("</html>")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hairpinTool.xml Fri Feb 21 12:52:56 2014 +1100 @@ -0,0 +1,409 @@ +<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.5"> + <description> + Analyse hairpin differential representation using edgeR + </description> + + <requirements> + <requirement type="R-module">edgeR</requirement> + <requirement type="R-module">limma</requirement> + </requirements> + + <stdio> + <exit_code range="1:" level="fatal" description="Tool exception" /> + </stdio> + + <command interpreter="Rscript"> + hairpinTool.R $inputOpt.type + #if $inputOpt.type=="fastq": + #for $i, $fas in enumerate($inputOpt.fastq): + fastq::$fas.file + #end for + + $inputOpt.hairpin + $inputOpt.samples + + #if $inputOpt.positions.option=="yes": + $inputOpt.positions.barstart + $inputOpt.positions.barend + $inputOpt.positions.hpstart + $inputOpt.positions.hpend + #else: + 1 + 5 + 37 + 57 + #end if + #else: + $inputOpt.counts + $inputOpt.anno + "$inputOpt.factors" + 0 0 0 + #end if + + #if $filterCPM.option=="yes": + $filterCPM.cpmReq + $filterCPM.sampleReq + #else: + -Inf + -Inf + #end if + + $fdr + $lfc + $workMode.mode + $outFile + $outFile.files_path + + #if $workMode.mode=="classic": + "$workMode.pair1" + "$workMode.pair2" + #else: + "$workMode.contrast" + $workMode.roast.option + #if $workMode.roast.option=="yes": + $workMode.roast.hairpinReq + $workMode.roast.select.option + "$workMode.roast.select.selection" + #else: + 0 + 0 + 0 + #end if + #end if + </command> + + <inputs> + <conditional name="inputOpt"> + <param name="type" type="select" label="Input File Type"> + <option value="fastq">FastQ File</option> + <option value="counts">Table of Counts</option> + </param> + + <when value="fastq"> + <param name="hairpin" type="data" format="tabular" + label="Hairpin Annotation"/> + + + <param name="samples" type="data" format="tabular" + label="Sample Annotation"/> + + <repeat name="fastq" title="FastQ Files"> + <param name="file" type="data" format="fastq"/> + </repeat> + + <conditional name="positions"> + <param name="option" type="select" + label="Specify Barcode and Hairpin Locations?" + help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57."> + <option value="no" selected="True">No</option> + <option value="yes">Yes</option> + </param> + + <when value="yes"> + <param name="barstart" type="integer" value="1" + label="Barcode Starting Position"/> + <param name="barend" type="integer" value="5" + label="Barcode Ending Position"/> + + <param name="hpstart" type="integer" value="37" + label="Hairpin Starting Position"/> + + <param name="hpend" type="integer" value="57" + label="Hairpin Ending Position"/> + </when> + + <when value="no"/> + </conditional> + </when> + + <when value="counts"> + <param name="counts" type="data" format="tabular" label="Counts Table"/> + <param name="anno" type="data" format="tabular" + label="Hairpin Annotation"/> + <param name="factors" type="data" format="tabular" + label="Sample Annotation"/> + </when> + </conditional> + + <conditional name="filterCPM"> + <param name="option" type="select" label="Filter Low CPM?" + help="Ignore hairpins with very low representation when performing + analysis."> + <option value="yes">Yes</option> + <option value="no">No</option> + </param> + + <when value="yes"> + <param name="cpmReq" type="float" value="0.5" min="0" max="1" + label="Minimum CPM"/> + + <param name="sampleReq" type="integer" value="1" min="0" + label="Minimum Samples" + help="Filter out all the genes that do not meet the minimum + CPM in at least this many samples."/> + </when> + + <when value="no"/> + + </conditional> + + <conditional name="workMode"> + <param name="mode" type="select" label="Analysis Type" + help="Classic Exact Tests are useful for simple comparisons across + two sampling groups. Generalised linear models allow for more + complex contrasts and gene level analysis to be made."> + <option value="classic">Classic Exact Test</option> + <option value="glm">Generalised Linear Model</option> + </param> + + <when value="classic"> + <param name="pair1" type="text" label="Compare" size="40"/> + <param name="pair2" type="text" label="To" size="40" + help="The analysis will subtract values of this group from those + in the group above to establish the difference."/> + </when> + + <when value="glm"> + <param name="contrast" type="text" size="60" + label="Contrasts of interest" + help="Specify equations defining contrasts to be made. Eg. + KD-Control will result in positive fold change if KD has + greater expression and negative if Control has greater + expression."/> + + <conditional name="roast"> + <param name="option" type="select" + label="Perform Gene Level Analysis?" + help="Analyse LogFC tendencies for hairpins belonging + to the same gene."> + <option value="no">No</option> + <option value="yes">Yes</option> + </param> + + <when value="yes"> + <param name="hairpinReq" type="integer" value="2" min="2" + label="Minimum Hairpins" + help="Only genes with at least this many hairpins will + be analysed."/> + + <conditional name="select"> + <param name="option" type="select" + label="Gene Selection Method"> + <option value="rank">By p-value Rank</option> + <option value="geneID">By Gene Identifier</option> + </param> + <when value="rank"> + <param name="selection" type="text" size="40" value="1:5" + label="Ranks of Top Genes to Plot" + help="Genes are ranked in ascending p-value for + differential representation, individual ranks can + be entered seperated by comma or a range seperated + by colon."/> + </when> + <when value="geneID"> + <param name="selection" type="text" size="80" value="" + label="Symbols of Genes to Plot" + help="Select genes based on their identifier in the + 'Gene' column of the sample information file. + Please ensure exact match with the values in input + file and separate selections with commas."/> + </when> + </conditional> + + + </when> + + <when value="no"/> + </conditional> + </when> + </conditional> + + <param name="fdr" type="float" value="0.05" min="0" max="1" + label="FDR Threshold" + help="All observations below this threshold will be highlighted + in the smear plot."/> + <param name="lfc" type="float" value="0" min="0" + label="Absolute LogFC Threshold" + help="In additional to meeting the FDR requirement, the absolute + value of the log-fold-change of the observation must be above + this threshold to be highlighted."/> + </inputs> + + <outputs> + <data format="html" name="outFile" label="shRNAseq Analysis"/> + </outputs> + + <help> +.. class:: infomark + +**What it does** + +Given tables containing information about the hairpins and their associated +barcodes, information about the samples and fastq file containing the hairpin +reads. This tool will generate plots and tables for the analysis of differential +representation. + +----- + +.. class:: infomark + +**INPUTS** + +**Input File Type:** + +This tool is able to either generate counts from a raw FastQ file given the +information regarding the samples and hairpins. Alternatively if a table of +counts has already been generated it can also be used. + +**Counts Table (Counts Input):** + +A tab delimited text table of information regarding the counts of hairpins. +Should have a column 'ID' to denote the hairpins that counts correspond to. Each +additional column should have titles corresponding to the label for the sample. + +Example:: + + ID Sample1 Sample2 Sample3 + Control1 49802 48014 40148 + Control2 12441 16352 14232 + Control3 9842 9148 9111 + Hairpin1 3300 3418 2914 + Hairpin2 91418 95812 93174 + Hairpin3 32985 31975 35104 + Hairpin4 12082 14081 14981 + Hairpin5 2491 2769 2691 + Hairpin6 1294 1486 1642 + Hairpin7 49501 49076 47611 + ... + +**Hairpin Annotation:** + +A tab delimited text table of information regarding the hairpins. Should have +columns 'ID', 'Sequences' and 'Gene' to uniquely identify the hairpin, align it +with the reads to produce counts and identify which gene the hairpin acts on. + +NOTE: the column names are case sensitive and should be input exactly as they +are shown here. + +Example:: + + ID Sequences Gene + Control1 TCTCGCTTGGGCGAGAGTAAG 2 + Control2 CCGCCTGAAGTCTCTGATTAA 2 + Control3 AGGAATTATAATGCTTATCTA 2 + Hairpin1 AAGGCAGAGACTGACCACCTA 4 + Hairpin2 GAGCGACCTGGTGTTACTCTA 4 + Hairpin3 ATGGTGTAAATAGAGCTGTTA 4 + Hairpin4 CAGCTCATCTTCTGTGAAGAA 4 + Hairpin5 CAGCTCTGTGGGTCAGAAGAA 4 + Hairpin6 CCAGGCACAGATCTCAAGATA 4 + Hairpin7 ATGACAAGAAAGACATCTCAA 7 + ... + +**Sample Annotation (FastQ Input):** + +A tab delimited text table of information regarding the samples. Should have +columns 'ID', 'Sequences' and 'group' to uniquely identify each sample, identify +the sample in the reads by its barcode sequence and correctly group replicates +for analysis. Additional columns may inserted for annotation purposes and will +not interfere with analysis as long as the necessary columns are present. + +NOTE: the column names are case sensitive and should be input exactly as they +are shown here. + +Example:: + + ID Sequences group Replicate + 3 GAAAG Day 2 1 + 6 GAACC Day 10 1 + 9 GAAGA Day 5 GFP neg 1 + 16 GAATT Day 5 GFP pos 1 + 18 GACAC Day 2 2 + 21 GACCA Day 10 2 + 28 GACGT Day 5 GFP neg 2 + 31 GACTG Day 5 GFP pos 2 + 33 GAGAA Day 2 3 + 40 GAGCT Day 10 3 + ... + +**Specify Barcode and Hairpin Locations (FastQ Input):** + +It is assumed that in the sequencing reads that the first 5 bases are the +barcodes and that bases 37-57 are the hairpins. If this is not the case then the +values of the positions can be changed, however it still requires the barcodes +and hairpins to be in a consistent location an in a continuous sequence. + +**Filter Low CPM?:** + +Often in a large screen there may members with very low counts which are of no +interest in the experiment, these may be filtered out to speed up computations. +Filtering will be based on counts per million in a required number of samples. + +**Analysis Type:** + + * **Classic Exact Test:** This allows two experimental groups to be compared and + p-values for differential representation derivec for each hairpin. Simple and + fast for straightforward comparisons. In this option you will have the option of + "*Compare* x *To* y" which implicitly subtracts the data from y from that of x + to produce the comparison. + + * **Generalised Linear Model:** This allow for complex contrasts to be specified + and also gene level analysis to be performed. If this option is chosen then + contrasts must be explicitly stated in equations and multiple contrasts can be + made. In addition there will be the option to analyse hairpins on a per-gene + basis to see if hairpins belonging to a particular gene have any overall + tendencies for the direction of their log-fold-change. + +**FDR Threshold:** +The smear plot in the output will have hairpins highlighted to signify +significant differential representation. The significance is determined by +contorlling the false discovery rate, only those with a FDR lower than the +threshold will be highlighted in the plot. + +----- + +**Citations:** + +.. class:: infomark + +limma + +Please cite the paper below for the limma software itself. Please also try +to cite the appropriate methodology articles that describe the statistical +methods implemented in limma, depending on which limma functions you are +using. The methodology articles are listed in Section 2.1 of the limma +User's Guide. + + * Smyth, GK (2005). Limma: linear models for microarray data. In: + 'Bioinformatics and Computational Biology Solutions using R and + Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, + W. Huber (eds), Springer, New York, pages 397-420. + +.. class:: infomark + +edgeR + +Please cite the first paper for the software itself and the other papers for +the various original statistical methods implemented in edgeR. See +Section 1.2 in the User's Guide for more detail. + + * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor + package for differential expression analysis of digital gene expression + data. Bioinformatics 26, 139-140 + + * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing + differences in tag abundance. Bioinformatics 23, 2881-2887 + + * Robinson MD and Smyth GK (2008). Small-sample estimation of negative + binomial dispersion, with applications to SAGE data. + Biostatistics, 9, 321-332 + + * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis + of multifactor RNA-Seq experiments with respect to biological variation. + Nucleic Acids Research 40, 4288-4297 + +.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html +.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html + </help> +</tool> +