# HG changeset patch
# User shian_su
# Date 1397200635 -36000
# Node ID 3d04308a99f9d09a6df68e559a9d81862605d586
# Parent 17befe9f8b033af5026bff9223547ee236e788e1
- Added differentially expressed hairpin count output
- Added running time output
- Added counts table output
diff -r 17befe9f8b03 -r 3d04308a99f9 hairpinTool.R
--- a/hairpinTool.R Mon Feb 24 14:50:08 2014 +1100
+++ b/hairpinTool.R Fri Apr 11 17:17:15 2014 +1000
@@ -1,3 +1,51 @@
+# ARGS: 1.inputType -String specifying format of input (fastq or table)
+# IF inputType is "fastQ":
+# 2*.fastqPath -One or more strings specifying path to fastq files
+# 2.annoPath -String specifying path to hairpin annotation table
+# 3.samplePath -String specifying path to sample annotation table
+# 4.barStart -Integer specifying starting position of barcode
+# 5.barEnd -Integer specifying ending position of barcode
+# 6.hpStart -Integer specifying startins position of hairpin
+# unique region
+# 7.hpEnd -Integer specifying ending position of hairpin
+# unique region
+# ###
+# IF inputType is "counts":
+# 2.countPath -String specifying path to count table
+# 3.annoPath -String specifying path to hairpin annotation table
+# 4.samplePath -String specifying path to sample annotation table
+# ###
+# 8.cpmReq -Float specifying cpm requirement
+# 9.sampleReq -Integer specifying cpm requirement
+# 10.fdrThresh -Float specifying the FDR requirement
+# 11.lfcThresh -Float specifying the log-fold-change requirement
+# 12.workMode -String specifying exact test or GLM usage
+# 13.htmlPath -String specifying path to HTML file
+# 14.folderPath -STring specifying path to folder for output
+# IF workMode is "classic" (exact test)
+# 15.pairData[2] -String specifying first group for exact test
+# 16.pairData[1] -String specifying second group for exact test
+# ###
+# IF workMode is "glm"
+# 15.contrastData -String specifying contrasts to be made
+# 16.roastOpt -String specifying usage of gene-wise tests
+# 17.hairpinReq -String specifying hairpin requirement for gene-
+# wise test
+# 18.selectOpt -String specifying type of selection for barcode
+# plots
+# 19.selectVals -String specifying members selected for barcode
+# plots
+#
+# OUT: Bar Plot of Counts Per Index
+# Bar Plot of Counts Per Hairpin
+# MDS Plot
+# Smear Plot
+# Barcode Plots (If Genewise testing was selected)
+# Top Expression Table
+# HTML file linking to the ouputs
+#
+# Author: Shian Su - registertonysu@gmail.com - Jan 2014
+
# Record starting time
timeStart <- as.character(Sys.time())
@@ -115,7 +163,7 @@
fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)],
fixed=TRUE))
argv <- argv[!grepl("fastq::", argv, fixed=TRUE)]
- hairpinPath <- as.character(argv[2])
+ annoPath <- as.character(argv[2])
samplePath <- as.character(argv[3])
barStart <- as.numeric(argv[4])
barEnd <- as.numeric(argv[5])
@@ -134,6 +182,7 @@
workMode <- as.character(argv[12])
htmlPath <- as.character(argv[13])
folderPath <- as.character(argv[14])
+
if (workMode=="classic") {
pairData <- character()
pairData[2] <- as.character(argv[15])
@@ -147,14 +196,13 @@
}
# Read in inputs
-if (inputType=="fastq") {
- samples <- read.table(samplePath, header=TRUE, sep="\t")
- hairpins <- read.table(hairpinPath, header=TRUE, sep="\t")
-} else if (inputType=="counts") {
- samples <- read.table(samplePath, header=TRUE, sep="\t")
+
+samples <- read.table(samplePath, header=TRUE, sep="\t")
+anno <- read.table(annoPath, header=TRUE, sep="\t")
+if (inputType=="counts") {
counts <- read.table(countPath, header=TRUE, sep="\t")
- anno <- read.table(annoPath, header=TRUE, sep="\t")
}
+
###################### Check inputs for correctness ############################
samples$ID <- make.names(samples$ID)
@@ -166,16 +214,16 @@
tab <- table(samples$ID)
offenders <- paste(names(tab[tab>1]), collapse=", ")
offenders <- unmake.names(offenders)
- stop("ID column of sample annotation must have unique values, values ",
+ stop("'ID' column of sample annotation must have unique values, values ",
offenders, " are repeated")
} # Check that IDs in sample annotation are unique
if (inputType=="fastq") {
-
- if (any(table(hairpins$ID)>1)){
- tab <- table(hairpins$ID)
+
+ if (any(table(anno$ID)>1)){
+ tab <- table(anno$ID)
offenders <- paste(names(tab[tab>1]), collapse=", ")
- stop("ID column of hairpin annotation must have unique values, values ",
+ stop("'ID' column of hairpin annotation must have unique values, values ",
offenders, " are repeated")
} # Check that IDs in hairpin annotation are unique
@@ -187,10 +235,20 @@
if (any(table(counts$ID)>1)){
tab <- table(counts$ID)
offenders <- paste(names(tab[tab>1]), collapse=", ")
- stop("ID column of count table must have unique values, values ",
+ stop("'ID' column of count table must have unique values, values ",
offenders, " are repeated")
} # Check that IDs in count table are unique
}
+if (workMode=="glm") {
+ if (roastOpt == "yes") {
+ if (is.na(match("Gene", colnames(anno)))) {
+ tempStr <- paste("Gene-wise tests selected but'Gene' column not",
+ "specified in hairpin annotation file")
+ stop(tempStr)
+ }
+ }
+}
+
################################################################################
# Process arguments
@@ -216,7 +274,7 @@
}
# Generate output folder and paths
-dir.create(folderPath)
+dir.create(folderPath, showWarnings=FALSE)
# Generate links for outputs
imgOut("barHairpin")
@@ -243,6 +301,8 @@
barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf"))
}
}
+countsOut <- makeOut("counts.tsv")
+
# Initialise data for html links and images, table with the link label and
# link address
linkData <- data.frame(Label=character(), Link=character(),
@@ -276,22 +336,21 @@
}
if (inputType=="fastq") {
-# Use EdgeR hairpin process and capture outputs
-hpReadout <- capture.output(
- data <- processHairpinReads(fastqPath, samplePath, hairpinPath,
+ # Use EdgeR hairpin process and capture outputs
+ hpReadout <- capture.output(
+ data <- processHairpinReads(fastqPath, samplePath, annoPath,
hairpinStart=hpStart, hairpinEnd=hpEnd,
verbose=TRUE)
-)
-
-# Remove function output entries that show processing data or is empty
-hpReadout <- hpReadout[hpReadout!=""]
-hpReadout <- hpReadout[!grepl("Processing", hpReadout)]
-hpReadout <- hpReadout[!grepl("in file", hpReadout)]
-hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE)
-
-
-# Make the names of groups syntactically valid (replace spaces with periods)
-data$samples$group <- make.names(data$samples$group)
+ )
+
+ # Remove function output entries that show processing data or is empty
+ hpReadout <- hpReadout[hpReadout!=""]
+ hpReadout <- hpReadout[!grepl("Processing", hpReadout)]
+ hpReadout <- hpReadout[!grepl("in file", hpReadout)]
+ hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE)
+
+ # Make the names of groups syntactically valid (replace spaces with periods)
+ data$samples$group <- make.names(data$samples$group)
} else if (inputType=="counts") {
# Process counts information, set ID column to be row names
rownames(counts) <- counts$ID
@@ -318,14 +377,17 @@
# Create DGEList
data <- DGEList(counts=counts, lib.size=colSums(counts),
norm.factors=rep(1,ncol(counts)), genes=anno, group=factors)
-
+
# Make the names of groups syntactically valid (replace spaces with periods)
data$samples$group <- make.names(data$samples$group)
}
# Filter hairpins with low counts
+preFilterCount <- nrow(data)
sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
data <- data[sel, ]
+postFilterCount <- nrow(data)
+filteredCount <- preFilterCount-postFilterCount
# Estimate dispersions
data <- estimateDisp(data)
@@ -540,8 +602,17 @@
}
}
-# Record ending time
+ID <- rownames(data$counts)
+outputCounts <- cbind(ID, data$counts)
+write.table(outputCounts, file=countsOut, row.names=FALSE, sep="\t")
+linkName <- "Counts table (.tsv)"
+linkAddr <- "counts.tsv"
+linkData <- rbind(linkData, c(linkName, linkAddr))
+
+# Record ending time and calculate total run time
timeEnd <- as.character(Sys.time())
+timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3))
+timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE)
################################################################################
### HTML Generation
################################################################################
@@ -559,12 +630,12 @@
ListItem(hpReadout[1])
ListItem(hpReadout[2])
cata("\n")
- cata(hpReadout[3], "
\n")
+ cata(hpReadout[3], "
\n")
cata("\n")
ListItem(hpReadout[4])
ListItem(hpReadout[7])
cata("
\n")
- cata(hpReadout[8:11], sep="
\n")
+ cata(hpReadout[8:11], sep="
\n")
cata("
\n")
cata("Please check that read percentages are consistent with ")
cata("expectations.
\n")
@@ -582,7 +653,7 @@
cata("Output:
\n")
cata("All images displayed have PDF copy at the bottom of the page, these can ")
-cata("exported in a pdf viewer to high resolution image format.
\n")
+cata("exported in a pdf viewer to high resolution image format.
\n")
for (i in 1:nrow(imageData)) {
if (grepl("barcode", imageData$Link[i])) {
if (packageVersion("limma")<"3.19.19") {
@@ -596,7 +667,7 @@
HtmlImage(imageData$Link[i], imageData$Label[i])
}
}
-cata("
\n")
+cata("
\n")
cata("Plots:
\n")
for (i in 1:nrow(linkData)) {
@@ -617,14 +688,80 @@
cata("disk icon to download all files in a zip archive.
\n")
cata(".tsv files are tab seperated files that can be viewed using Excel ")
cata("or other spreadsheet programs
\n")
+
+cata("Additional Information:
\n")
+
+if (inputType == "fastq") {
+ ListItem("Data was gathered from fastq raw read file(s).")
+} else if (inputType == "counts") {
+ ListItem("Data was gathered from a table of counts.")
+}
+
+if (cpmReq!=0 && sampleReq!=0) {
+ tempStr <- paste("Hairpins that do not have more than", cpmReq,
+ "CPM in at least", sampleReq, "samples are considered",
+ "insignificant and filtered out.")
+ ListItem(tempStr)
+ filterProp <- round(filteredCount/preFilterCount*100, digits=2)
+ tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp,
+ "%) hairpins were filtered out for low count-per-million.")
+ ListItem(tempStr)
+}
+
+if (workMode == "classic") {
+ ListItem("An exact test was performed on each hairpin.")
+} else if (workMode == "glm") {
+ ListItem("A generalised linear model was fitted to each hairpin.")
+}
+
+
+
+cit <- character()
+link <-character()
+link[1] <- paste0("", "limma User's Guide", ".")
+link[2] <- paste0("", "edgeR User's Guide", "")
+
+cit[1] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010).",
+ "edgeR: a Bioconductor package for differential",
+ "expression analysis of digital gene expression",
+ "data. Bioinformatics 26, 139-140")
+cit[2] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests",
+ "for assessing differences in tag abundance. Bioinformatics",
+ "23, 2881-2887")
+cit[3] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of",
+ "negative binomial dispersion, with applications to SAGE data.",
+ "Biostatistics, 9, 321-332")
+
+cit[4] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential",
+ "expression analysis of multifactor RNA-Seq experiments with",
+ "respect to biological variation. Nucleic Acids Research 40,",
+ "4288-4297")
+
+cata("Citations
")
+cata("\n")
+ListItem(cit[1])
+ListItem(cit[2])
+ListItem(cit[3])
+ListItem(cit[4])
+cata("
\n")
+
cata("\n")
-
cata("\n")
TableItem("Task started at:"); TableItem(timeStart)
cata("
\n")
cata("\n")
TableItem("Task ended at:"); TableItem(timeEnd)
cata("
\n")
+cata("\n")
+TableItem("Task run time:"); TableItem(timeTaken)
+cata("
\n")
+cata("
\n")
cata("