view hairpinTool.R @ 2:076ca575208f

First commit
author shian_su <registertonysu@gmail.com>
date Fri, 21 Feb 2014 12:52:56 +1100
parents
children f8af57d6f60b
line wrap: on
line source

# Record starting time
timeStart <- as.character(Sys.time())

# Loading and checking required packages
library(methods, quietly=TRUE, warn.conflicts=FALSE)
library(statmod, quietly=TRUE, warn.conflicts=FALSE)
library(splines, quietly=TRUE, warn.conflicts=FALSE)
library(edgeR, quietly=TRUE, warn.conflicts=FALSE)
library(limma, quietly=TRUE, warn.conflicts=FALSE)

if (packageVersion("edgeR") < "3.5.23") {
  message("Please update 'edgeR' to version >= 3.5.23 to run this script")
}

################################################################################
### Function declarations
################################################################################

# Function to sanitise contrast equations so there are no whitespaces
# surrounding the arithmetic operators, leading or trailing whitespace
sanitiseEquation <- function(equation) {
  equation <- gsub(" *[+] *", "+", equation)
  equation <- gsub(" *[-] *", "-", equation)
  equation <- gsub(" *[/] *", "/", equation)
  equation <- gsub(" *[*] *", "*", equation)
  equation <- gsub("^\\s+|\\s+$", "", equation)
  return(equation)
}

# Function to sanitise group information
sanitiseGroups <- function(string) {
  string <- gsub(" *[,] *", ",", string)
  string <- gsub("^\\s+|\\s+$", "", string)
  return(string)
}

# Function to change periods to whitespace in a string
unmake.names <- function(string) {
  string <- gsub(".", " ", string, fixed=TRUE)
  return(string)
}

# Function has string input and generates an output path string
makeOut <- function(filename) {
  return(paste0(folderPath, "/", filename))
}

# Function has string input and generates both a pdf and png output strings
imgOut <- function(filename) {
  assign(paste0(filename, "Png"), makeOut(paste0(filename,".png")), 
         envir = .GlobalEnv)
  assign(paste0(filename, "Pdf"), makeOut(paste0(filename,".pdf")),
         envir = .GlobalEnv)
}

# Create cat function default path set, default seperator empty and appending
# true by default (Ripped straight from the cat function with altered argument
# defaults)
cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL, 
                 append = TRUE) {
  if (is.character(file)) 
    if (file == "") 
      file <- stdout()
  else if (substring(file, 1L, 1L) == "|") {
    file <- pipe(substring(file, 2L), "w")
    on.exit(close(file))
  }
  else {
    file <- file(file, ifelse(append, "a", "w"))
    on.exit(close(file))
  }
  .Internal(cat(list(...), file, sep, fill, labels, append))
}

# Function to write code for html head and title
HtmlHead <- function(title) {
  cata("<head>\n")
  cata("<title>", title, "</title>\n")
  cata("</head>\n")
}

# Function to write code for html links
HtmlLink <- function(address, label=address) {
  cata("<a href=\"", address, "\" target=\"_blank\">", label, "</a><br />\n")
}

# Function to write code for html images
HtmlImage <- function(source, label=source, height=600, width=600) {
  cata("<img src=\"", source, "\" alt=\"", label, "\" height=\"", height)
  cata("\" width=\"", width, "\"/>\n")
}

# Function to write code for html list items
ListItem <- function(...) {
  cata("<li>", ..., "</li>\n")
}

TableItem <- function(...) {
  cata("<td>", ..., "</td>\n")
}

TableHeadItem <- function(...) {
  cata("<th>", ..., "</th>\n")
}
################################################################################
### Input Processing
################################################################################

# Grabbing arguments from command line
argv <- commandArgs(TRUE)

# Remove fastq file paths after collecting from argument vector
inputType <- as.character(argv[1])
if (inputType=="fastq") {
  fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], 
                                 fixed=TRUE))
  argv <- argv[!grepl("fastq::", argv, fixed=TRUE)]
  hairpinPath <- as.character(argv[2])
  samplePath <- as.character(argv[3])
  barStart <- as.numeric(argv[4])
  barEnd <- as.numeric(argv[5])
  hpStart <- as.numeric(argv[6])
  hpEnd <- as.numeric(argv[7])
} else if (inputType=="counts") {
  countPath <- as.character(argv[2])
  annoPath <- as.character(argv[3])
  samplePath <- as.character(argv[4])
}
  
cpmReq <- as.numeric(argv[8])
sampleReq <- as.numeric(argv[9])
fdrThresh <- as.numeric(argv[10])
lfcThresh <- as.numeric(argv[11])
workMode <- as.character(argv[12])
htmlPath <- as.character(argv[13])
folderPath <- as.character(argv[14])
if (workMode=="classic") {
  pairData <- character()
  pairData[2] <- as.character(argv[15])
  pairData[1] <- as.character(argv[16])
} else if (workMode=="glm") {
  contrastData <- as.character(argv[15])
  roastOpt <- as.character(argv[16])
  hairpinReq <- as.numeric(argv[17])
  selectOpt <- as.character(argv[18])
  selectVals <- as.character(argv[19])
}

# Read in inputs
if (inputType=="fastq") {
  samples <- read.table(samplePath, header=TRUE, sep="\t")
  hairpins <- read.table(hairpinPath, header=TRUE, sep="\t")
} else if (inputType=="counts") {
  samples <- read.table(samplePath, header=TRUE, sep="\t")
  counts <- read.table(countPath, header=TRUE, sep="\t")
  anno <- read.table(annoPath, header=TRUE, sep="\t")
}
###################### Check inputs for correctness ############################
samples$ID <- make.names(samples$ID)

if (!any(grepl("group", names(samples)))) {
  stop("'group' column not specified in sample annotation file")
} # Check if grouping variable has been specified

if (any(table(samples$ID)>1)){
  tab <- table(samples$ID)
  offenders <- paste(names(tab[tab>1]), collapse=", ")
  offenders <- unmake.names(offenders)
  stop("ID column of sample annotation must have unique values, values ",
       offenders, " are repeated")
} # Check that IDs in sample annotation are unique

if (any(is.na(match(samples$ID, colnames(counts))))) {
  stop("not all samples have groups specified")
} # Check that a group has be specifed for each sample

if (inputType=="fastq") {

  if (any(table(hairpins$ID)>1)){
    tab <- table(hairpins$ID)
    offenders <- paste(names(tab[tab>1]), collapse=", ")
    stop("ID column of hairpin annotation must have unique values, values ",
    offenders, " are repeated")
  } # Check that IDs in hairpin annotation are unique

} else if (inputType=="counts") {
  
  if (any(table(counts$ID)>1)){
    tab <- table(counts$ID)
    offenders <- paste(names(tab[tab>1]), collapse=", ")
    stop("ID column of count table must have unique values, values ",
    offenders, " are repeated")
  } # Check that IDs in count table are unique
}
################################################################################

# Process arguments
if (workMode=="glm") {
  if (roastOpt=="yes") {
    wantRoast <- TRUE
  } else {
    wantRoast <- FALSE
  }
}

# Split up contrasts seperated by comma into a vector and replace spaces with
# periods
if (exists("contrastData")) {
  contrastData <- unlist(strsplit(contrastData, split=","))
  contrastData <- sanitiseEquation(contrastData)
  contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)
}

# Replace spaces with periods in pair data
if (exists("pairData")) {
  pairData <- make.names(pairData)
}

# Generate output folder and paths
dir.create(folderPath)

# Generate links for outputs
imgOut("barHairpin")
imgOut("barIndex")
imgOut("mds")
imgOut("bcv")
if (workMode == "classic") {
  smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png"))
  smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf"))
  topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv"))
} else if (workMode=="glm") {
  smearPng <- character()
  smearPdf <- character()
  topOut <- character()
  roastOut <- character()
  barcodePng <- character()
  barcodePdf <- character()
  for (i in 1:length(contrastData)) {
    smearPng[i] <- makeOut(paste0("smear(", contrastData[i], ").png"))
    smearPdf[i] <- makeOut(paste0("smear(", contrastData[i], ").pdf"))
    topOut[i] <- makeOut(paste0("toptag(", contrastData[i], ").tsv"))
    roastOut[i] <- makeOut(paste0("roast(", contrastData[i], ").tsv"))
    barcodePng[i] <- makeOut(paste0("barcode(", contrastData[i], ").png"))
    barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf"))
  }
}
# Initialise data for html links and images, table with the link label and
# link address
linkData <- data.frame(Label=character(), Link=character(),
                       stringsAsFactors=FALSE)
imageData <- data.frame(Label=character(), Link=character(),
                        stringsAsFactors=FALSE)
################################################################################
### Data Processing
################################################################################

# Transform gene selection from string into index values for mroast
if (workMode=="glm") {
  if (selectOpt=="rank") {
    selectVals <- gsub(" ", "", selectVals, fixed=TRUE)
    selectVals <- unlist(strsplit(selectVals, ","))
    
    for (i in 1:length(selectVals)) {
      if (grepl(":", selectVals[i], fixed=TRUE)) {
        temp <- unlist(strsplit(selectVals[i], ":"))
        selectVals <- selectVals[-i]
        a <- as.numeric(temp[1])
        b <- as.numeric(temp[2])
        selectVals <- c(selectVals, a:b)         
      }
    }
    selectVals <- as.numeric(unique(selectVals))
  } else {
    selectVals <- gsub(" ", "", selectVals, fixed=TRUE)
    selectVals <- unlist(strsplit(selectVals, " "))
  }                                                           
}
                                                  
if (inputType=="fastq") {
# Use EdgeR hairpin process and capture outputs
hpReadout <- capture.output(
  data <- processHairpinReads(fastqPath, samplePath, hairpinPath,
                              hairpinStart=hpStart, hairpinEnd=hpEnd, 
                              verbose=TRUE)
)

# Remove function output entries that show processing data or is empty
hpReadout <- hpReadout[hpReadout!=""]
hpReadout <- hpReadout[!grepl("Processing", hpReadout)]
hpReadout <- hpReadout[!grepl("in file", hpReadout)]
hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE)


# Make the names of groups syntactically valid (replace spaces with periods)
data$samples$group <- make.names(data$samples$group)
} else {
  # Process counts information, set ID column to be row names
  rownames(counts) <- counts$ID
  counts <- counts[ , !(colnames(counts)=="ID")]
  countsRows <- nrow(counts)
  
  # Process group information
  factors <- samples$group[match(samples$ID, colnames(counts))]
  annoRows <- nrow(anno)
  anno <- anno[match(rownames(counts), anno$ID), ]
  annoMatched <- sum(!is.na(anno$ID))
  
  if (any(is.na(anno$ID))) {
    warningStr <- paste("count table contained more hairpins than",
                        "specified in hairpin annotation file")
    warning(warningStr)
  }
  
  # Filter out rows with zero counts
  sel <- rowSums(counts)!=0
  counts <- counts[sel, ]
  anno <- anno[sel, ]
  
  # Create DGEList
  data <- DGEList(counts=counts, lib.size=colSums(counts), 
                  norm.factors=rep(1,ncol(counts)), genes=anno, group=factors)
                  
  # Make the names of groups syntactically valid (replace spaces with periods)
  data$samples$group <- make.names(data$samples$group)
}

# Filter hairpins with low counts
sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
data <- data[sel, ]

# Estimate dispersions
data <- estimateDisp(data)
commonBCV <- sqrt(data$common.dispersion)

################################################################################
### Output Processing
################################################################################

# Plot number of hairpins that could be matched per sample
png(barIndexPng, width=600, height=600)
barplot(height<-colSums(data$counts), las=2, main="Counts per index", 
        cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2))
imageData[1, ] <- c("Counts per Index", "barIndex.png")
invisible(dev.off())

pdf(barIndexPdf)
barplot(height<-colSums(data$counts), las=2, main="Counts per index", 
        cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2))
linkData[1, ] <- c("Counts per Index Barplot (.pdf)", "barIndex.pdf")
invisible(dev.off())

# Plot per hairpin totals across all samples
png(barHairpinPng, width=600, height=600)
if (nrow(data$counts)<50) {
  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2))
} else {
  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2),
          names.arg=FALSE)
}
imageData <- rbind(imageData, c("Counts per Hairpin", "barHairpin.png"))
invisible(dev.off())

pdf(barHairpinPdf)
if (nrow(data$counts)<50) {
  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2))
} else {
  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2),
          names.arg=FALSE)
}
newEntry <- c("Counts per Hairpin Barplot (.pdf)", "barHairpin.pdf")
linkData <- rbind(linkData, newEntry)
invisible(dev.off())

# Make an MDS plot to visualise relationships between replicate samples
png(mdsPng, width=600, height=600)
plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group),
        main="MDS Plot")
imageData <- rbind(imageData, c("MDS Plot", "mds.png"))
invisible(dev.off())

pdf(mdsPdf)
plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group),
        main="MDS Plot")
newEntry <- c("MDS Plot (.pdf)", "mds.pdf")
linkData <- rbind(linkData, newEntry)
invisible(dev.off())

if (workMode=="classic") {
  # Assess differential representation using classic exact testing methodology 
  # in edgeR
  testData <- exactTest(data, pair=pairData)
  
  top <- topTags(testData, n=Inf)
  topIDs <- top$table[(top$table$FDR < fdrThresh) &
                      (abs(top$table$logFC) > lfcThresh), 1]
  write.table(top, file=topOut, row.names=FALSE, sep="\t")
  linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], 
                     ") (.tsv)")
  linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv")
  linkData <- rbind(linkData, c(linkName, linkAddr))
  
  # Select hairpins with FDR < 0.05 to highlight on plot
  png(smearPng, width=600, height=600)
  plotTitle <- gsub(".", " ", 
                    paste0("Smear Plot: ", pairData[2], "-", pairData[1]),
                    fixed = TRUE)
  plotSmear(testData, de.tags=topIDs, 
            pch=20, cex=1.0, main=plotTitle)
  abline(h = c(-1, 0, 1), col = c("dodgerblue", "yellow", "dodgerblue"), lty=2)
  imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ")")
  imgAddr <- paste0("smear(", pairData[2], "-", pairData[1],").png")
  imageData <- rbind(imageData, c(imgName, imgAddr))
  invisible(dev.off())
  
  pdf(smearPdf)
  plotTitle <- gsub(".", " ", 
                    paste0("Smear Plot: ", pairData[2], "-", pairData[1]),
                    fixed = TRUE)
  plotSmear(testData, de.tags=topIDs, 
            pch=20, cex=1.0, main=plotTitle)
  abline(h = c(-1, 0, 1), col = c("dodgerblue", "yellow", "dodgerblue"), lty=2)
  imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)")
  imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf")
  linkData <- rbind(linkData, c(imgName, imgAddr))
  invisible(dev.off())
} else if (workMode=="glm") {
  # Generating design information
  factors <- factor(data$sample$group)
  design <- model.matrix(~0+factors)
  
  colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE)
  
  # Split up contrasts seperated by comma into a vector
  contrastData <- unlist(strsplit(contrastData, split=","))
  for (i in 1:length(contrastData)) {
    # Generate contrasts information
    contrasts <- makeContrasts(contrasts=contrastData[i], levels=design)
    
    # Fit negative bionomial GLM
    fit = glmFit(data, design)
    # Carry out Likelihood ratio test
    testData = glmLRT(fit, contrast=contrasts)
    
    # Select hairpins with FDR < 0.05 to highlight on plot
    top <- topTags(testData, n=Inf)
    topIDs <- top$table[(top$table$FDR < fdrThresh) &
                        (abs(top$table$logFC) > lfcThresh), 1]
    write.table(top, file=topOut[i], row.names=FALSE, sep="\t")
    
    linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)")
    linkAddr <- paste0("toptag(", contrastData[i], ").tsv")
    linkData <- rbind(linkData, c(linkName, linkAddr))
    
    # Make a plot of logFC versus logCPM
    png(smearPng[i], height=600, width=600)
    plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], 
                       fixed=TRUE))
    plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle)
    abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2)
    
    imgName <- paste0("Smear Plot(", contrastData[i], ")")
    imgAddr <- paste0("smear(", contrastData[i], ").png")
    imageData <- rbind(imageData, c(imgName, imgAddr))
    invisible(dev.off())
    
    pdf(smearPdf[i])
    plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], 
                       fixed=TRUE))
    plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle)
    abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2)
    
    linkName <- paste0("Smear Plot(", contrastData[i], ") (.pdf)")
    linkAddr <- paste0("smear(", contrastData[i], ").pdf")
    linkData <- rbind(linkData, c(linkName, linkAddr))
    invisible(dev.off())
    
    genes <- as.character(data$genes$Gene)
    unq <- unique(genes)
    unq <- unq[!is.na(unq)]
    geneList <- list()
    for (gene in unq) {
      if (length(which(genes==gene)) >= hairpinReq) {
        geneList[[gene]] <- which(genes==gene)
      }
    }
    
    if (wantRoast) {
      # Input preparaton for roast
      nrot = 9999
      set.seed(602214129)
      roastData <- mroast(data, index=geneList, design=design,
                         contrast=contrasts, nrot=nrot)
      roastData <- cbind(GeneID=rownames(roastData), roastData)
      write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t")
      linkName <- paste0("Gene Level Analysis Table(", contrastData[i], 
                         ") (.tsv)")
      linkAddr <- paste0("roast(", contrastData[i], ").tsv")
      linkData <- rbind(linkData, c(linkName, linkAddr))
      if (selectOpt=="rank") {
        selectedGenes <- rownames(roastData)[selectVals]
      } else {
        selectedGenes <- selectVals
      }
      
      if (packageVersion("limma")<"3.19.19") {
        png(barcodePng[i], width=600, height=length(selectedGenes)*150)
      } else {
        png(barcodePng[i], width=600, height=length(selectedGenes)*300)
      }
      par(mfrow=c(length(selectedGenes), 1))
      for (gene in selectedGenes) {
        barcodeplot(testData$table$logFC, index=geneList[[gene]],
                    main=paste("Barcode Plot for", gene, "(logFCs)", 
                               gsub(".", " ", contrastData[i])),
                    labels=c("Positive logFC", "Negative logFC"))
      }
      imgName <- paste0("Barcode Plot(", contrastData[i], ")")
      imgAddr <- paste0("barcode(", contrastData[i], ").png")
      imageData <- rbind(imageData, c(imgName, imgAddr))
      dev.off()
      if (packageVersion("limma")<"3.19.19") {
        pdf(barcodePdf[i], width=8, height=2)
      } else {
        pdf(barcodePdf[i], width=8, height=4)
      }
      for (gene in selectedGenes) {
        barcodeplot(testData$table$logFC, index=geneList[[gene]],
                    main=paste("Barcode Plot for", gene, "(logFCs)", 
                               gsub(".", " ", contrastData[i])),
                    labels=c("Positive logFC", "Negative logFC"))
      }
      linkName <- paste0("Barcode Plot(", contrastData[i], ") (.pdf)")
      linkAddr <- paste0("barcode(", contrastData[i], ").pdf")
      linkData <- rbind(linkData, c(linkName, linkAddr))
      dev.off()
    }
  }
}

# Record ending time
timeEnd <- as.character(Sys.time())
################################################################################
### HTML Generation
################################################################################
# Clear file
cat("", file=htmlPath)

cata("<html>\n")
HtmlHead("EdgeR Output")

cata("<body>\n")
cata("<h3>EdgeR Analysis Output:</h3>\n")
cata("<h4>Input Summary:</h4>\n")
if (inputType=="fastq") {
  cata("<ul>\n")
  ListItem(hpReadout[1])
  ListItem(hpReadout[2])
  cata("</ul>\n")
  cata(hpReadout[3], "<br/>\n")
  cata("<ul>\n")
  ListItem(hpReadout[4])
  ListItem(hpReadout[7])
  cata("</ul>\n")
  cata(hpReadout[8:11], sep="<br/>\n")
  cata("<br />\n")
  cata("<b>Please check that read percentages are consistent with ")
  cata("expectations.</b><br >\n")
} else if (inputType=="counts") {
  cata("<ul>\n")
  ListItem("Number of Samples: ", ncol(data$counts))
  ListItem("Number of Hairpins: ", countsRows)
  ListItem("Number of annotations provided: ", annoRows)
  ListItem("Number of annotations matched to hairpin: ", annoMatched)
  cata("</ul>\n")
}

cata("The estimated common biological coefficient of variation (BCV) is: ", 
     commonBCV, "<br />\n")

cata("<h4>Output:</h4>\n")
cata("All images displayed have PDF copy at the bottom of the page, these can ")
cata("exported in a pdf viewer to high resolution image format. <br/>\n")
for (i in 1:nrow(imageData)) {
  if (grepl("barcode", imageData$Link[i])) {
    if (packageVersion("limma")<"3.19.19") {
      HtmlImage(imageData$Link[i], imageData$Label[i], 
                height=length(selectedGenes)*150)
    } else {
      HtmlImage(imageData$Link[i], imageData$Label[i], 
                height=length(selectedGenes)*300)
    }
  } else {
    HtmlImage(imageData$Link[i], imageData$Label[i])
  }
}
cata("<br/>\n")

cata("<h4>Plots:</h4>\n")
for (i in 1:nrow(linkData)) {
  if (!grepl(".tsv", linkData$Link[i])) {
    HtmlLink(linkData$Link[i], linkData$Label[i])
  }
}

cata("<h4>Tables:</h4>\n")
for (i in 1:nrow(linkData)) {
  if (grepl(".tsv", linkData$Link[i])) {
    HtmlLink(linkData$Link[i], linkData$Label[i])
  }
}

cata("<p>alt-click any of the links to download the file, or click the name ")
cata("of this task in the galaxy history panel and click on the floppy ")
cata("disk icon to download all files in a zip archive.</p>\n")
cata("<p>.tsv files are tab seperated files that can be viewed using Excel ")
cata("or other spreadsheet programs</p>\n")
cata("<table border=\"0\">\n")

cata("<tr>\n")
TableItem("Task started at:"); TableItem(timeStart)
cata("</tr>\n")
cata("<tr>\n")
TableItem("Task ended at:"); TableItem(timeEnd)
cata("</tr>\n")

cata("</body>\n")
cata("</html>")