changeset 2:076ca575208f

First commit
author shian_su <registertonysu@gmail.com>
date Fri, 21 Feb 2014 12:52:56 +1100
parents aa02cf19e1b3
children 17fee0726221 f8af57d6f60b
files hairpinTool.R hairpinTool.xml
diffstat 2 files changed, 1040 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hairpinTool.R	Fri Feb 21 12:52:56 2014 +1100
@@ -0,0 +1,631 @@
+# Record starting time
+timeStart <- as.character(Sys.time())
+
+# Loading and checking required packages
+library(methods, quietly=TRUE, warn.conflicts=FALSE)
+library(statmod, quietly=TRUE, warn.conflicts=FALSE)
+library(splines, quietly=TRUE, warn.conflicts=FALSE)
+library(edgeR, quietly=TRUE, warn.conflicts=FALSE)
+library(limma, quietly=TRUE, warn.conflicts=FALSE)
+
+if (packageVersion("edgeR") < "3.5.23") {
+  message("Please update 'edgeR' to version >= 3.5.23 to run this script")
+}
+
+################################################################################
+### Function declarations
+################################################################################
+
+# Function to sanitise contrast equations so there are no whitespaces
+# surrounding the arithmetic operators, leading or trailing whitespace
+sanitiseEquation <- function(equation) {
+  equation <- gsub(" *[+] *", "+", equation)
+  equation <- gsub(" *[-] *", "-", equation)
+  equation <- gsub(" *[/] *", "/", equation)
+  equation <- gsub(" *[*] *", "*", equation)
+  equation <- gsub("^\\s+|\\s+$", "", equation)
+  return(equation)
+}
+
+# Function to sanitise group information
+sanitiseGroups <- function(string) {
+  string <- gsub(" *[,] *", ",", string)
+  string <- gsub("^\\s+|\\s+$", "", string)
+  return(string)
+}
+
+# Function to change periods to whitespace in a string
+unmake.names <- function(string) {
+  string <- gsub(".", " ", string, fixed=TRUE)
+  return(string)
+}
+
+# Function has string input and generates an output path string
+makeOut <- function(filename) {
+  return(paste0(folderPath, "/", filename))
+}
+
+# Function has string input and generates both a pdf and png output strings
+imgOut <- function(filename) {
+  assign(paste0(filename, "Png"), makeOut(paste0(filename,".png")), 
+         envir = .GlobalEnv)
+  assign(paste0(filename, "Pdf"), makeOut(paste0(filename,".pdf")),
+         envir = .GlobalEnv)
+}
+
+# Create cat function default path set, default seperator empty and appending
+# true by default (Ripped straight from the cat function with altered argument
+# defaults)
+cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL, 
+                 append = TRUE) {
+  if (is.character(file)) 
+    if (file == "") 
+      file <- stdout()
+  else if (substring(file, 1L, 1L) == "|") {
+    file <- pipe(substring(file, 2L), "w")
+    on.exit(close(file))
+  }
+  else {
+    file <- file(file, ifelse(append, "a", "w"))
+    on.exit(close(file))
+  }
+  .Internal(cat(list(...), file, sep, fill, labels, append))
+}
+
+# Function to write code for html head and title
+HtmlHead <- function(title) {
+  cata("<head>\n")
+  cata("<title>", title, "</title>\n")
+  cata("</head>\n")
+}
+
+# Function to write code for html links
+HtmlLink <- function(address, label=address) {
+  cata("<a href=\"", address, "\" target=\"_blank\">", label, "</a><br />\n")
+}
+
+# Function to write code for html images
+HtmlImage <- function(source, label=source, height=600, width=600) {
+  cata("<img src=\"", source, "\" alt=\"", label, "\" height=\"", height)
+  cata("\" width=\"", width, "\"/>\n")
+}
+
+# Function to write code for html list items
+ListItem <- function(...) {
+  cata("<li>", ..., "</li>\n")
+}
+
+TableItem <- function(...) {
+  cata("<td>", ..., "</td>\n")
+}
+
+TableHeadItem <- function(...) {
+  cata("<th>", ..., "</th>\n")
+}
+################################################################################
+### Input Processing
+################################################################################
+
+# Grabbing arguments from command line
+argv <- commandArgs(TRUE)
+
+# Remove fastq file paths after collecting from argument vector
+inputType <- as.character(argv[1])
+if (inputType=="fastq") {
+  fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], 
+                                 fixed=TRUE))
+  argv <- argv[!grepl("fastq::", argv, fixed=TRUE)]
+  hairpinPath <- as.character(argv[2])
+  samplePath <- as.character(argv[3])
+  barStart <- as.numeric(argv[4])
+  barEnd <- as.numeric(argv[5])
+  hpStart <- as.numeric(argv[6])
+  hpEnd <- as.numeric(argv[7])
+} else if (inputType=="counts") {
+  countPath <- as.character(argv[2])
+  annoPath <- as.character(argv[3])
+  samplePath <- as.character(argv[4])
+}
+  
+cpmReq <- as.numeric(argv[8])
+sampleReq <- as.numeric(argv[9])
+fdrThresh <- as.numeric(argv[10])
+lfcThresh <- as.numeric(argv[11])
+workMode <- as.character(argv[12])
+htmlPath <- as.character(argv[13])
+folderPath <- as.character(argv[14])
+if (workMode=="classic") {
+  pairData <- character()
+  pairData[2] <- as.character(argv[15])
+  pairData[1] <- as.character(argv[16])
+} else if (workMode=="glm") {
+  contrastData <- as.character(argv[15])
+  roastOpt <- as.character(argv[16])
+  hairpinReq <- as.numeric(argv[17])
+  selectOpt <- as.character(argv[18])
+  selectVals <- as.character(argv[19])
+}
+
+# Read in inputs
+if (inputType=="fastq") {
+  samples <- read.table(samplePath, header=TRUE, sep="\t")
+  hairpins <- read.table(hairpinPath, header=TRUE, sep="\t")
+} else if (inputType=="counts") {
+  samples <- read.table(samplePath, header=TRUE, sep="\t")
+  counts <- read.table(countPath, header=TRUE, sep="\t")
+  anno <- read.table(annoPath, header=TRUE, sep="\t")
+}
+###################### Check inputs for correctness ############################
+samples$ID <- make.names(samples$ID)
+
+if (!any(grepl("group", names(samples)))) {
+  stop("'group' column not specified in sample annotation file")
+} # Check if grouping variable has been specified
+
+if (any(table(samples$ID)>1)){
+  tab <- table(samples$ID)
+  offenders <- paste(names(tab[tab>1]), collapse=", ")
+  offenders <- unmake.names(offenders)
+  stop("ID column of sample annotation must have unique values, values ",
+       offenders, " are repeated")
+} # Check that IDs in sample annotation are unique
+
+if (any(is.na(match(samples$ID, colnames(counts))))) {
+  stop("not all samples have groups specified")
+} # Check that a group has be specifed for each sample
+
+if (inputType=="fastq") {
+
+  if (any(table(hairpins$ID)>1)){
+    tab <- table(hairpins$ID)
+    offenders <- paste(names(tab[tab>1]), collapse=", ")
+    stop("ID column of hairpin annotation must have unique values, values ",
+    offenders, " are repeated")
+  } # Check that IDs in hairpin annotation are unique
+
+} else if (inputType=="counts") {
+  
+  if (any(table(counts$ID)>1)){
+    tab <- table(counts$ID)
+    offenders <- paste(names(tab[tab>1]), collapse=", ")
+    stop("ID column of count table must have unique values, values ",
+    offenders, " are repeated")
+  } # Check that IDs in count table are unique
+}
+################################################################################
+
+# Process arguments
+if (workMode=="glm") {
+  if (roastOpt=="yes") {
+    wantRoast <- TRUE
+  } else {
+    wantRoast <- FALSE
+  }
+}
+
+# Split up contrasts seperated by comma into a vector and replace spaces with
+# periods
+if (exists("contrastData")) {
+  contrastData <- unlist(strsplit(contrastData, split=","))
+  contrastData <- sanitiseEquation(contrastData)
+  contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)
+}
+
+# Replace spaces with periods in pair data
+if (exists("pairData")) {
+  pairData <- make.names(pairData)
+}
+
+# Generate output folder and paths
+dir.create(folderPath)
+
+# Generate links for outputs
+imgOut("barHairpin")
+imgOut("barIndex")
+imgOut("mds")
+imgOut("bcv")
+if (workMode == "classic") {
+  smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png"))
+  smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf"))
+  topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv"))
+} else if (workMode=="glm") {
+  smearPng <- character()
+  smearPdf <- character()
+  topOut <- character()
+  roastOut <- character()
+  barcodePng <- character()
+  barcodePdf <- character()
+  for (i in 1:length(contrastData)) {
+    smearPng[i] <- makeOut(paste0("smear(", contrastData[i], ").png"))
+    smearPdf[i] <- makeOut(paste0("smear(", contrastData[i], ").pdf"))
+    topOut[i] <- makeOut(paste0("toptag(", contrastData[i], ").tsv"))
+    roastOut[i] <- makeOut(paste0("roast(", contrastData[i], ").tsv"))
+    barcodePng[i] <- makeOut(paste0("barcode(", contrastData[i], ").png"))
+    barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf"))
+  }
+}
+# Initialise data for html links and images, table with the link label and
+# link address
+linkData <- data.frame(Label=character(), Link=character(),
+                       stringsAsFactors=FALSE)
+imageData <- data.frame(Label=character(), Link=character(),
+                        stringsAsFactors=FALSE)
+################################################################################
+### Data Processing
+################################################################################
+
+# Transform gene selection from string into index values for mroast
+if (workMode=="glm") {
+  if (selectOpt=="rank") {
+    selectVals <- gsub(" ", "", selectVals, fixed=TRUE)
+    selectVals <- unlist(strsplit(selectVals, ","))
+    
+    for (i in 1:length(selectVals)) {
+      if (grepl(":", selectVals[i], fixed=TRUE)) {
+        temp <- unlist(strsplit(selectVals[i], ":"))
+        selectVals <- selectVals[-i]
+        a <- as.numeric(temp[1])
+        b <- as.numeric(temp[2])
+        selectVals <- c(selectVals, a:b)         
+      }
+    }
+    selectVals <- as.numeric(unique(selectVals))
+  } else {
+    selectVals <- gsub(" ", "", selectVals, fixed=TRUE)
+    selectVals <- unlist(strsplit(selectVals, " "))
+  }                                                           
+}
+                                                  
+if (inputType=="fastq") {
+# Use EdgeR hairpin process and capture outputs
+hpReadout <- capture.output(
+  data <- processHairpinReads(fastqPath, samplePath, hairpinPath,
+                              hairpinStart=hpStart, hairpinEnd=hpEnd, 
+                              verbose=TRUE)
+)
+
+# Remove function output entries that show processing data or is empty
+hpReadout <- hpReadout[hpReadout!=""]
+hpReadout <- hpReadout[!grepl("Processing", hpReadout)]
+hpReadout <- hpReadout[!grepl("in file", hpReadout)]
+hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE)
+
+
+# Make the names of groups syntactically valid (replace spaces with periods)
+data$samples$group <- make.names(data$samples$group)
+} else {
+  # Process counts information, set ID column to be row names
+  rownames(counts) <- counts$ID
+  counts <- counts[ , !(colnames(counts)=="ID")]
+  countsRows <- nrow(counts)
+  
+  # Process group information
+  factors <- samples$group[match(samples$ID, colnames(counts))]
+  annoRows <- nrow(anno)
+  anno <- anno[match(rownames(counts), anno$ID), ]
+  annoMatched <- sum(!is.na(anno$ID))
+  
+  if (any(is.na(anno$ID))) {
+    warningStr <- paste("count table contained more hairpins than",
+                        "specified in hairpin annotation file")
+    warning(warningStr)
+  }
+  
+  # Filter out rows with zero counts
+  sel <- rowSums(counts)!=0
+  counts <- counts[sel, ]
+  anno <- anno[sel, ]
+  
+  # Create DGEList
+  data <- DGEList(counts=counts, lib.size=colSums(counts), 
+                  norm.factors=rep(1,ncol(counts)), genes=anno, group=factors)
+                  
+  # Make the names of groups syntactically valid (replace spaces with periods)
+  data$samples$group <- make.names(data$samples$group)
+}
+
+# Filter hairpins with low counts
+sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
+data <- data[sel, ]
+
+# Estimate dispersions
+data <- estimateDisp(data)
+commonBCV <- sqrt(data$common.dispersion)
+
+################################################################################
+### Output Processing
+################################################################################
+
+# Plot number of hairpins that could be matched per sample
+png(barIndexPng, width=600, height=600)
+barplot(height<-colSums(data$counts), las=2, main="Counts per index", 
+        cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2))
+imageData[1, ] <- c("Counts per Index", "barIndex.png")
+invisible(dev.off())
+
+pdf(barIndexPdf)
+barplot(height<-colSums(data$counts), las=2, main="Counts per index", 
+        cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2))
+linkData[1, ] <- c("Counts per Index Barplot (.pdf)", "barIndex.pdf")
+invisible(dev.off())
+
+# Plot per hairpin totals across all samples
+png(barHairpinPng, width=600, height=600)
+if (nrow(data$counts)<50) {
+  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
+          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2))
+} else {
+  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
+          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2),
+          names.arg=FALSE)
+}
+imageData <- rbind(imageData, c("Counts per Hairpin", "barHairpin.png"))
+invisible(dev.off())
+
+pdf(barHairpinPdf)
+if (nrow(data$counts)<50) {
+  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
+          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2))
+} else {
+  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
+          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2),
+          names.arg=FALSE)
+}
+newEntry <- c("Counts per Hairpin Barplot (.pdf)", "barHairpin.pdf")
+linkData <- rbind(linkData, newEntry)
+invisible(dev.off())
+
+# Make an MDS plot to visualise relationships between replicate samples
+png(mdsPng, width=600, height=600)
+plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group),
+        main="MDS Plot")
+imageData <- rbind(imageData, c("MDS Plot", "mds.png"))
+invisible(dev.off())
+
+pdf(mdsPdf)
+plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group),
+        main="MDS Plot")
+newEntry <- c("MDS Plot (.pdf)", "mds.pdf")
+linkData <- rbind(linkData, newEntry)
+invisible(dev.off())
+
+if (workMode=="classic") {
+  # Assess differential representation using classic exact testing methodology 
+  # in edgeR
+  testData <- exactTest(data, pair=pairData)
+  
+  top <- topTags(testData, n=Inf)
+  topIDs <- top$table[(top$table$FDR < fdrThresh) &
+                      (abs(top$table$logFC) > lfcThresh), 1]
+  write.table(top, file=topOut, row.names=FALSE, sep="\t")
+  linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], 
+                     ") (.tsv)")
+  linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv")
+  linkData <- rbind(linkData, c(linkName, linkAddr))
+  
+  # Select hairpins with FDR < 0.05 to highlight on plot
+  png(smearPng, width=600, height=600)
+  plotTitle <- gsub(".", " ", 
+                    paste0("Smear Plot: ", pairData[2], "-", pairData[1]),
+                    fixed = TRUE)
+  plotSmear(testData, de.tags=topIDs, 
+            pch=20, cex=1.0, main=plotTitle)
+  abline(h = c(-1, 0, 1), col = c("dodgerblue", "yellow", "dodgerblue"), lty=2)
+  imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ")")
+  imgAddr <- paste0("smear(", pairData[2], "-", pairData[1],").png")
+  imageData <- rbind(imageData, c(imgName, imgAddr))
+  invisible(dev.off())
+  
+  pdf(smearPdf)
+  plotTitle <- gsub(".", " ", 
+                    paste0("Smear Plot: ", pairData[2], "-", pairData[1]),
+                    fixed = TRUE)
+  plotSmear(testData, de.tags=topIDs, 
+            pch=20, cex=1.0, main=plotTitle)
+  abline(h = c(-1, 0, 1), col = c("dodgerblue", "yellow", "dodgerblue"), lty=2)
+  imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)")
+  imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf")
+  linkData <- rbind(linkData, c(imgName, imgAddr))
+  invisible(dev.off())
+} else if (workMode=="glm") {
+  # Generating design information
+  factors <- factor(data$sample$group)
+  design <- model.matrix(~0+factors)
+  
+  colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE)
+  
+  # Split up contrasts seperated by comma into a vector
+  contrastData <- unlist(strsplit(contrastData, split=","))
+  for (i in 1:length(contrastData)) {
+    # Generate contrasts information
+    contrasts <- makeContrasts(contrasts=contrastData[i], levels=design)
+    
+    # Fit negative bionomial GLM
+    fit = glmFit(data, design)
+    # Carry out Likelihood ratio test
+    testData = glmLRT(fit, contrast=contrasts)
+    
+    # Select hairpins with FDR < 0.05 to highlight on plot
+    top <- topTags(testData, n=Inf)
+    topIDs <- top$table[(top$table$FDR < fdrThresh) &
+                        (abs(top$table$logFC) > lfcThresh), 1]
+    write.table(top, file=topOut[i], row.names=FALSE, sep="\t")
+    
+    linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)")
+    linkAddr <- paste0("toptag(", contrastData[i], ").tsv")
+    linkData <- rbind(linkData, c(linkName, linkAddr))
+    
+    # Make a plot of logFC versus logCPM
+    png(smearPng[i], height=600, width=600)
+    plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], 
+                       fixed=TRUE))
+    plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle)
+    abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2)
+    
+    imgName <- paste0("Smear Plot(", contrastData[i], ")")
+    imgAddr <- paste0("smear(", contrastData[i], ").png")
+    imageData <- rbind(imageData, c(imgName, imgAddr))
+    invisible(dev.off())
+    
+    pdf(smearPdf[i])
+    plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], 
+                       fixed=TRUE))
+    plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle)
+    abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2)
+    
+    linkName <- paste0("Smear Plot(", contrastData[i], ") (.pdf)")
+    linkAddr <- paste0("smear(", contrastData[i], ").pdf")
+    linkData <- rbind(linkData, c(linkName, linkAddr))
+    invisible(dev.off())
+    
+    genes <- as.character(data$genes$Gene)
+    unq <- unique(genes)
+    unq <- unq[!is.na(unq)]
+    geneList <- list()
+    for (gene in unq) {
+      if (length(which(genes==gene)) >= hairpinReq) {
+        geneList[[gene]] <- which(genes==gene)
+      }
+    }
+    
+    if (wantRoast) {
+      # Input preparaton for roast
+      nrot = 9999
+      set.seed(602214129)
+      roastData <- mroast(data, index=geneList, design=design,
+                         contrast=contrasts, nrot=nrot)
+      roastData <- cbind(GeneID=rownames(roastData), roastData)
+      write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t")
+      linkName <- paste0("Gene Level Analysis Table(", contrastData[i], 
+                         ") (.tsv)")
+      linkAddr <- paste0("roast(", contrastData[i], ").tsv")
+      linkData <- rbind(linkData, c(linkName, linkAddr))
+      if (selectOpt=="rank") {
+        selectedGenes <- rownames(roastData)[selectVals]
+      } else {
+        selectedGenes <- selectVals
+      }
+      
+      if (packageVersion("limma")<"3.19.19") {
+        png(barcodePng[i], width=600, height=length(selectedGenes)*150)
+      } else {
+        png(barcodePng[i], width=600, height=length(selectedGenes)*300)
+      }
+      par(mfrow=c(length(selectedGenes), 1))
+      for (gene in selectedGenes) {
+        barcodeplot(testData$table$logFC, index=geneList[[gene]],
+                    main=paste("Barcode Plot for", gene, "(logFCs)", 
+                               gsub(".", " ", contrastData[i])),
+                    labels=c("Positive logFC", "Negative logFC"))
+      }
+      imgName <- paste0("Barcode Plot(", contrastData[i], ")")
+      imgAddr <- paste0("barcode(", contrastData[i], ").png")
+      imageData <- rbind(imageData, c(imgName, imgAddr))
+      dev.off()
+      if (packageVersion("limma")<"3.19.19") {
+        pdf(barcodePdf[i], width=8, height=2)
+      } else {
+        pdf(barcodePdf[i], width=8, height=4)
+      }
+      for (gene in selectedGenes) {
+        barcodeplot(testData$table$logFC, index=geneList[[gene]],
+                    main=paste("Barcode Plot for", gene, "(logFCs)", 
+                               gsub(".", " ", contrastData[i])),
+                    labels=c("Positive logFC", "Negative logFC"))
+      }
+      linkName <- paste0("Barcode Plot(", contrastData[i], ") (.pdf)")
+      linkAddr <- paste0("barcode(", contrastData[i], ").pdf")
+      linkData <- rbind(linkData, c(linkName, linkAddr))
+      dev.off()
+    }
+  }
+}
+
+# Record ending time
+timeEnd <- as.character(Sys.time())
+################################################################################
+### HTML Generation
+################################################################################
+# Clear file
+cat("", file=htmlPath)
+
+cata("<html>\n")
+HtmlHead("EdgeR Output")
+
+cata("<body>\n")
+cata("<h3>EdgeR Analysis Output:</h3>\n")
+cata("<h4>Input Summary:</h4>\n")
+if (inputType=="fastq") {
+  cata("<ul>\n")
+  ListItem(hpReadout[1])
+  ListItem(hpReadout[2])
+  cata("</ul>\n")
+  cata(hpReadout[3], "<br/>\n")
+  cata("<ul>\n")
+  ListItem(hpReadout[4])
+  ListItem(hpReadout[7])
+  cata("</ul>\n")
+  cata(hpReadout[8:11], sep="<br/>\n")
+  cata("<br />\n")
+  cata("<b>Please check that read percentages are consistent with ")
+  cata("expectations.</b><br >\n")
+} else if (inputType=="counts") {
+  cata("<ul>\n")
+  ListItem("Number of Samples: ", ncol(data$counts))
+  ListItem("Number of Hairpins: ", countsRows)
+  ListItem("Number of annotations provided: ", annoRows)
+  ListItem("Number of annotations matched to hairpin: ", annoMatched)
+  cata("</ul>\n")
+}
+
+cata("The estimated common biological coefficient of variation (BCV) is: ", 
+     commonBCV, "<br />\n")
+
+cata("<h4>Output:</h4>\n")
+cata("All images displayed have PDF copy at the bottom of the page, these can ")
+cata("exported in a pdf viewer to high resolution image format. <br/>\n")
+for (i in 1:nrow(imageData)) {
+  if (grepl("barcode", imageData$Link[i])) {
+    if (packageVersion("limma")<"3.19.19") {
+      HtmlImage(imageData$Link[i], imageData$Label[i], 
+                height=length(selectedGenes)*150)
+    } else {
+      HtmlImage(imageData$Link[i], imageData$Label[i], 
+                height=length(selectedGenes)*300)
+    }
+  } else {
+    HtmlImage(imageData$Link[i], imageData$Label[i])
+  }
+}
+cata("<br/>\n")
+
+cata("<h4>Plots:</h4>\n")
+for (i in 1:nrow(linkData)) {
+  if (!grepl(".tsv", linkData$Link[i])) {
+    HtmlLink(linkData$Link[i], linkData$Label[i])
+  }
+}
+
+cata("<h4>Tables:</h4>\n")
+for (i in 1:nrow(linkData)) {
+  if (grepl(".tsv", linkData$Link[i])) {
+    HtmlLink(linkData$Link[i], linkData$Label[i])
+  }
+}
+
+cata("<p>alt-click any of the links to download the file, or click the name ")
+cata("of this task in the galaxy history panel and click on the floppy ")
+cata("disk icon to download all files in a zip archive.</p>\n")
+cata("<p>.tsv files are tab seperated files that can be viewed using Excel ")
+cata("or other spreadsheet programs</p>\n")
+cata("<table border=\"0\">\n")
+
+cata("<tr>\n")
+TableItem("Task started at:"); TableItem(timeStart)
+cata("</tr>\n")
+cata("<tr>\n")
+TableItem("Task ended at:"); TableItem(timeEnd)
+cata("</tr>\n")
+
+cata("</body>\n")
+cata("</html>")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hairpinTool.xml	Fri Feb 21 12:52:56 2014 +1100
@@ -0,0 +1,409 @@
+<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.5">
+  <description>
+    Analyse hairpin differential representation using edgeR
+  </description>
+    
+  <requirements>
+    <requirement type="R-module">edgeR</requirement>
+    <requirement type="R-module">limma</requirement>
+  </requirements>
+  
+  <stdio>
+    <exit_code range="1:" level="fatal" description="Tool exception" />
+  </stdio>
+  
+  <command interpreter="Rscript">
+  hairpinTool.R $inputOpt.type
+                #if $inputOpt.type=="fastq":
+                  #for $i, $fas in enumerate($inputOpt.fastq):
+                    fastq::$fas.file
+                  #end for
+    
+                  $inputOpt.hairpin
+                  $inputOpt.samples
+                    
+                  #if $inputOpt.positions.option=="yes":
+                    $inputOpt.positions.barstart
+                    $inputOpt.positions.barend
+                    $inputOpt.positions.hpstart
+                    $inputOpt.positions.hpend
+                  #else:
+                    1
+                    5
+                    37
+                    57
+                  #end if
+                #else:
+                  $inputOpt.counts
+                  $inputOpt.anno
+                  "$inputOpt.factors"
+                  0 0 0
+                #end if
+          
+                #if $filterCPM.option=="yes":
+                  $filterCPM.cpmReq
+                  $filterCPM.sampleReq
+                #else:
+                  -Inf
+                  -Inf
+                #end if
+          
+                $fdr
+                $lfc
+                $workMode.mode
+                $outFile
+                $outFile.files_path
+          
+                #if $workMode.mode=="classic":
+                  "$workMode.pair1"
+                  "$workMode.pair2"
+                #else:
+                  "$workMode.contrast"
+                  $workMode.roast.option
+                  #if $workMode.roast.option=="yes":
+                    $workMode.roast.hairpinReq
+                    $workMode.roast.select.option
+                    "$workMode.roast.select.selection"
+                  #else:
+                    0
+                    0
+                    0
+                  #end if
+                #end if
+  </command>
+  
+  <inputs>
+    <conditional name="inputOpt">
+      <param name="type" type="select" label="Input File Type">
+        <option value="fastq">FastQ File</option>
+        <option value="counts">Table of Counts</option>
+      </param>
+    
+      <when value="fastq">
+        <param name="hairpin" type="data" format="tabular" 
+               label="Hairpin Annotation"/>
+          
+        
+        <param name="samples" type="data" format="tabular" 
+               label="Sample Annotation"/>
+               
+        <repeat name="fastq" title="FastQ Files">       
+          <param name="file" type="data" format="fastq"/>
+        </repeat>
+          
+        <conditional name="positions">
+          <param name="option" type="select" 
+                 label="Specify Barcode and Hairpin Locations?"
+                 help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57.">
+            <option value="no" selected="True">No</option>
+            <option value="yes">Yes</option>
+          </param>
+          
+          <when value="yes">
+            <param name="barstart" type="integer" value="1"
+                   label="Barcode Starting Position"/>
+            <param name="barend" type="integer" value="5"
+                   label="Barcode Ending Position"/>
+            
+            <param name="hpstart" type="integer" value="37"
+                   label="Hairpin Starting Position"/>
+               
+            <param name="hpend" type="integer" value="57"
+                   label="Hairpin Ending Position"/>
+          </when>
+          
+          <when value="no"/>
+        </conditional>
+      </when>
+      
+      <when value="counts">
+        <param name="counts" type="data" format="tabular" label="Counts Table"/>
+        <param name="anno" type="data" format="tabular" 
+               label="Hairpin Annotation"/>
+        <param name="factors" type="data" format="tabular"
+               label="Sample Annotation"/> 
+      </when>
+    </conditional>
+    
+    <conditional name="filterCPM">
+      <param name="option" type="select" label="Filter Low CPM?"
+       help="Ignore hairpins with very low representation when performing 
+             analysis.">
+        <option value="yes">Yes</option>
+       	<option value="no">No</option>
+      </param>
+      
+        <when value="yes">
+          <param name="cpmReq" type="float" value="0.5" min="0" max="1"
+                 label="Minimum CPM"/>
+                 
+          <param name="sampleReq" type="integer" value="1" min="0"
+                 label="Minimum Samples" 
+                 help="Filter out all the genes that do not meet the minimum 
+                       CPM in at least this many samples."/>
+        </when>
+        
+        <when value="no"/>
+        
+    </conditional>
+    
+    <conditional name="workMode">
+      <param name="mode" type="select" label="Analysis Type"
+       help="Classic Exact Tests are useful for simple comparisons across
+             two sampling groups. Generalised linear models allow for more 
+             complex contrasts and gene level analysis to be made.">
+        <option value="classic">Classic Exact Test</option>
+        <option value="glm">Generalised Linear Model</option>
+      </param>
+      
+      <when value="classic">
+        <param name="pair1" type="text" label="Compare" size="40"/>
+        <param name="pair2" type="text" label="To" size="40"
+               help="The analysis will subtract values of this group from those
+                     in the group above to establish the difference."/>
+      </when>
+      
+      <when value="glm">
+        <param name="contrast" type="text" size="60"
+               label="Contrasts of interest"
+               help="Specify equations defining contrasts to be made. Eg. 
+                     KD-Control will result in positive fold change if KD has
+                     greater expression and negative if Control has greater
+                     expression."/>
+               
+        <conditional name="roast">
+          <param name="option" type="select" 
+                 label="Perform Gene Level Analysis?"
+                 help="Analyse LogFC tendencies for hairpins belonging
+                       to the same gene.">
+            <option value="no">No</option>
+            <option value="yes">Yes</option>
+          </param>
+          
+          <when value="yes">
+            <param name="hairpinReq" type="integer" value="2" min="2"
+                   label="Minimum Hairpins"
+                   help="Only genes with at least this many hairpins will
+                         be analysed."/>
+                         
+            <conditional name="select">
+              <param name="option" type="select"
+                     label="Gene Selection Method">
+                <option value="rank">By p-value Rank</option>
+                <option value="geneID">By Gene Identifier</option>
+              </param>
+              <when value="rank">
+                <param name="selection" type="text" size="40" value="1:5"
+                       label="Ranks of Top Genes to Plot"
+                       help="Genes are ranked in ascending p-value for
+                             differential representation, individual ranks can
+                             be entered seperated by comma or a range seperated
+                             by colon."/>
+              </when>
+              <when value="geneID">
+                <param name="selection" type="text" size="80" value=""
+                       label="Symbols of Genes to Plot"
+                       help="Select genes based on their identifier in the
+                             'Gene' column of the sample information file.
+                             Please ensure exact match with the values in input
+                             file and separate selections with commas."/>
+              </when>
+            </conditional>
+
+            
+          </when>
+          
+          <when value="no"/>
+        </conditional>
+      </when>
+    </conditional>
+    
+    <param name="fdr" type="float" value="0.05" min="0" max="1"
+           label="FDR Threshold"
+           help="All observations below this threshold will be highlighted
+                 in the smear plot."/>
+    <param name="lfc" type="float" value="0" min="0" 
+           label="Absolute LogFC Threshold"
+           help="In additional to meeting the FDR requirement, the absolute 
+                 value of the log-fold-change of the observation must be above
+                 this threshold to be highlighted."/>
+  </inputs>
+
+  <outputs>
+    <data format="html" name="outFile" label="shRNAseq Analysis"/>
+  </outputs>
+  
+  <help>
+.. class:: infomark
+
+**What it does**
+
+Given tables containing information about the hairpins and their associated
+barcodes, information about the samples and fastq file containing the hairpin
+reads. This tool will generate plots and tables for the analysis of differential
+representation.
+
+-----
+
+.. class:: infomark
+
+**INPUTS**
+
+**Input File Type:**
+
+This tool is able to either generate counts from a raw FastQ file given the
+information regarding the samples and hairpins. Alternatively if a table of
+counts has already been generated it can also be used.
+
+**Counts Table (Counts Input):**
+
+A tab delimited text table of information regarding the counts of hairpins.
+Should have a column 'ID' to denote the hairpins that counts correspond to. Each
+additional column should have titles corresponding to the label for the sample.
+
+Example::
+
+  ID  Sample1 Sample2 Sample3
+  Control1 49802 48014 40148
+  Control2 12441 16352 14232
+  Control3 9842  9148  9111
+  Hairpin1 3300  3418  2914
+  Hairpin2 91418 95812 93174
+  Hairpin3 32985 31975 35104
+  Hairpin4 12082 14081 14981
+  Hairpin5 2491  2769  2691
+  Hairpin6 1294  1486  1642
+  Hairpin7 49501 49076 47611
+  ...
+  
+**Hairpin Annotation:**
+
+A tab delimited text table of information regarding the hairpins. Should have
+columns 'ID', 'Sequences' and 'Gene' to uniquely identify the hairpin, align it
+with the reads to produce counts and identify which gene the hairpin acts on.
+
+NOTE: the column names are case sensitive and should be input exactly as they
+are shown here.
+
+Example::
+
+  ID	Sequences	Gene
+  Control1	TCTCGCTTGGGCGAGAGTAAG	2
+  Control2	CCGCCTGAAGTCTCTGATTAA	2
+  Control3	AGGAATTATAATGCTTATCTA	2
+  Hairpin1	AAGGCAGAGACTGACCACCTA	4
+  Hairpin2	GAGCGACCTGGTGTTACTCTA	4
+  Hairpin3	ATGGTGTAAATAGAGCTGTTA	4
+  Hairpin4	CAGCTCATCTTCTGTGAAGAA	4
+  Hairpin5	CAGCTCTGTGGGTCAGAAGAA	4
+  Hairpin6	CCAGGCACAGATCTCAAGATA	4
+  Hairpin7	ATGACAAGAAAGACATCTCAA	7
+  ...
+  
+**Sample Annotation (FastQ Input):**
+
+A tab delimited text table of information regarding the samples. Should have
+columns 'ID', 'Sequences' and 'group' to uniquely identify each sample, identify
+the sample in the reads by its barcode sequence and correctly group replicates
+for analysis. Additional columns may inserted for annotation purposes and will
+not interfere with analysis as long as the necessary columns are present.
+
+NOTE: the column names are case sensitive and should be input exactly as they
+are shown here.
+
+Example::
+
+  ID	Sequences	group	Replicate
+  3	GAAAG	Day 2	1
+  6	GAACC	Day 10	1
+  9	GAAGA	Day 5 GFP neg	1
+  16	GAATT	Day 5 GFP pos	1
+  18	GACAC	Day 2	2
+  21	GACCA	Day 10	2
+  28	GACGT	Day 5 GFP neg	2
+  31	GACTG	Day 5 GFP pos	2
+  33	GAGAA	Day 2	3
+  40	GAGCT	Day 10	3
+  ...
+  
+**Specify Barcode and Hairpin Locations (FastQ Input):**
+
+It is assumed that in the sequencing reads that the first 5 bases are the
+barcodes and that bases 37-57 are the hairpins. If this is not the case then the
+values of the positions can be changed, however it still requires the barcodes
+and hairpins to be in a consistent location an in a continuous sequence.
+
+**Filter Low CPM?:**
+
+Often in a large screen there may members with very low counts which are of no
+interest in the experiment, these may be filtered out to speed up computations.
+Filtering will be based on counts per million in a required number of samples.
+
+**Analysis Type:**
+
+ * **Classic Exact Test:** This allows two experimental groups to be compared and
+   p-values for differential representation derivec for each hairpin. Simple and
+   fast for straightforward comparisons. In this option you will have the option of
+   "*Compare* x *To* y" which implicitly subtracts the data from y from that of x
+   to produce the comparison.
+
+ * **Generalised Linear Model:** This allow for complex contrasts to be specified
+   and also gene level analysis to be performed. If this option is chosen then
+   contrasts must be explicitly stated in equations and multiple contrasts can be
+   made. In addition there will be the option to analyse hairpins on a per-gene
+   basis to see if hairpins belonging to a particular gene have any overall
+   tendencies for the direction of their log-fold-change.
+
+**FDR Threshold:**
+The smear plot in the output will have hairpins highlighted to signify
+significant differential representation. The significance is determined by
+contorlling the false discovery rate, only those with a FDR lower than the
+threshold will be highlighted in the plot.
+
+-----
+
+**Citations:**
+
+.. class:: infomark
+
+limma
+
+Please cite the paper below for the limma software itself.  Please also try
+to cite the appropriate methodology articles that describe the statistical
+methods implemented in limma, depending on which limma functions you are
+using.  The methodology articles are listed in Section 2.1 of the limma 
+User's Guide.
+
+	* Smyth, GK (2005). Limma: linear models for microarray data. In: 
+	  'Bioinformatics and Computational Biology Solutions using R and 
+	  Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, 
+	  W. Huber (eds), Springer, New York, pages 397-420.
+
+.. class:: infomark
+
+edgeR
+
+Please cite the first paper for the software itself and the other papers for
+the various original statistical methods implemented in edgeR.  See 
+Section 1.2 in the User's Guide for more detail.
+
+	* Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor 
+	  package for differential expression analysis of digital gene expression 
+	  data. Bioinformatics 26, 139-140
+	  
+	* Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing 
+	  differences in tag abundance. Bioinformatics 23, 2881-2887
+	  
+	* Robinson MD and Smyth GK (2008). Small-sample estimation of negative 
+	  binomial dispersion, with applications to SAGE data.
+	  Biostatistics, 9, 321-332
+	  
+	* McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis 
+	  of multifactor RNA-Seq experiments with respect to biological variation. 
+	  Nucleic Acids Research 40, 4288-4297
+
+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+  </help>
+</tool>
+