Galaxy |

Changeset 2:076ca575208f (2014-02-21)

Previous changeset 1:aa02cf19e1b3 (2014-02-20) Next changeset 3:17fee0726221 (2014-02-20)

Commit message:
First commit

added:
hairpinTool.R
hairpinTool.xml

diff -r aa02cf19e1b3 -r 076ca575208f hairpinTool.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hairpinTool.R Fri Feb 21 12:52:56 2014 +1100

[

b'@@ -0,0 +1,631 @@\n+# Record starting time\n+timeStart <- as.character(Sys.time())\n+\n+# Loading and checking required packages\n+library(methods, quietly=TRUE, warn.conflicts=FALSE)\n+library(statmod, quietly=TRUE, warn.conflicts=FALSE)\n+library(splines, quietly=TRUE, warn.conflicts=FALSE)\n+library(edgeR, quietly=TRUE, warn.conflicts=FALSE)\n+library(limma, quietly=TRUE, warn.conflicts=FALSE)\n+\n+if (packageVersion("edgeR") < "3.5.23") {\n+ message("Please update \'edgeR\' to version >= 3.5.23 to run this script")\n+}\n+\n+################################################################################\n+### Function declarations\n+################################################################################\n+\n+# Function to sanitise contrast equations so there are no whitespaces\n+# surrounding the arithmetic operators, leading or trailing whitespace\n+sanitiseEquation <- function(equation) {\n+ equation <- gsub(" *[+] *", "+", equation)\n+ equation <- gsub(" *[-] *", "-", equation)\n+ equation <- gsub(" *[/] *", "/", equation)\n+ equation <- gsub(" *[*] *", "*", equation)\n+ equation <- gsub("^\\\\s+|\\\\s+$", "", equation)\n+ return(equation)\n+}\n+\n+# Function to sanitise group information\n+sanitiseGroups <- function(string) {\n+ string <- gsub(" *[,] *", ",", string)\n+ string <- gsub("^\\\\s+|\\\\s+$", "", string)\n+ return(string)\n+}\n+\n+# Function to change periods to whitespace in a string\n+unmake.names <- function(string) {\n+ string <- gsub(".", " ", string, fixed=TRUE)\n+ return(string)\n+}\n+\n+# Function has string input and generates an output path string\n+makeOut <- function(filename) {\n+ return(paste0(folderPath, "/", filename))\n+}\n+\n+# Function has string input and generates both a pdf and png output strings\n+imgOut <- function(filename) {\n+ assign(paste0(filename, "Png"), makeOut(paste0(filename,".png")), \n+ envir = .GlobalEnv)\n+ assign(paste0(filename, "Pdf"), makeOut(paste0(filename,".pdf")),\n+ envir = 
.GlobalEnv)\n+}\n+\n+# Create cat function default path set, default seperator empty and appending\n+# true by default (Ripped straight from the cat function with altered argument\n+# defaults)\n+cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL, \n+ append = TRUE) {\n+ if (is.character(file)) \n+ if (file == "") \n+ file <- stdout()\n+ else if (substring(file, 1L, 1L) == "|") {\n+ file <- pipe(substring(file, 2L), "w")\n+ on.exit(close(file))\n+ }\n+ else {\n+ file <- file(file, ifelse(append, "a", "w"))\n+ on.exit(close(file))\n+ }\n+ .Internal(cat(list(...), file, sep, fill, labels, append))\n+}\n+\n+# Function to write code for html head and title\n+HtmlHead <- function(title) {\n+ cata("<head>\\n")\n+ cata("<title>", title, "</title>\\n")\n+ cata("</head>\\n")\n+}\n+\n+# Function to write code for html links\n+HtmlLink <- function(address, label=address) {\n+ cata("<a href=\\"", address, "\\" target=\\"_blank\\">", label, "</a><br />\\n")\n+}\n+\n+# Function to write code for html images\n+HtmlImage <- function(source, label=source, height=600, width=600) {\n+ cata("<img src=\\"", source, "\\" alt=\\"", label, "\\" height=\\"", height)\n+ cata("\\" width=\\"", width, "\\"/>\\n")\n+}\n+\n+# Function to write code for html list items\n+ListItem <- function(...) {\n+ cata("<li>", ..., "</li>\\n")\n+}\n+\n+TableItem <- function(...) {\n+ cata("<td>", ..., "</td>\\n")\n+}\n+\n+TableHeadItem <- function(...) 
{\n+ cata("<th>", ..., "</th>\\n")\n+}\n+################################################################################\n+### Input Processing\n+################################################################################\n+\n+# Grabbing arguments from command line\n+argv <- commandArgs(TRUE)\n+\n+# Remove fastq file paths after collecting from argument vector\n+inputType <- as.character(argv[1])\n+if (inputType=="fastq") {\n+ fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], \n+ fixed=TRUE))\n+ argv <- argv[!grepl("fastq::", argv, fixed=TRUE)]\n+ hair'..b'edGenes) {\n+ barcodeplot(testData$table$logFC, index=geneList[[gene]],\n+ main=paste("Barcode Plot for", gene, "(logFCs)", \n+ gsub(".", " ", contrastData[i])),\n+ labels=c("Positive logFC", "Negative logFC"))\n+ }\n+ imgName <- paste0("Barcode Plot(", contrastData[i], ")")\n+ imgAddr <- paste0("barcode(", contrastData[i], ").png")\n+ imageData <- rbind(imageData, c(imgName, imgAddr))\n+ dev.off()\n+ if (packageVersion("limma")<"3.19.19") {\n+ pdf(barcodePdf[i], width=8, height=2)\n+ } else {\n+ pdf(barcodePdf[i], width=8, height=4)\n+ }\n+ for (gene in selectedGenes) {\n+ barcodeplot(testData$table$logFC, index=geneList[[gene]],\n+ main=paste("Barcode Plot for", gene, "(logFCs)", \n+ gsub(".", " ", contrastData[i])),\n+ labels=c("Positive logFC", "Negative logFC"))\n+ }\n+ linkName <- paste0("Barcode Plot(", contrastData[i], ") (.pdf)")\n+ linkAddr <- paste0("barcode(", contrastData[i], ").pdf")\n+ linkData <- rbind(linkData, c(linkName, linkAddr))\n+ dev.off()\n+ }\n+ }\n+}\n+\n+# Record ending time\n+timeEnd <- as.character(Sys.time())\n+################################################################################\n+### HTML Generation\n+################################################################################\n+# Clear file\n+cat("", file=htmlPath)\n+\n+cata("<html>\\n")\n+HtmlHead("EdgeR Output")\n+\n+cata("<body>\\n")\n+cata("<h3>EdgeR Analysis 
Output:</h3>\\n")\n+cata("<h4>Input Summary:</h4>\\n")\n+if (inputType=="fastq") {\n+ cata("<ul>\\n")\n+ ListItem(hpReadout[1])\n+ ListItem(hpReadout[2])\n+ cata("</ul>\\n")\n+ cata(hpReadout[3], "<br/>\\n")\n+ cata("<ul>\\n")\n+ ListItem(hpReadout[4])\n+ ListItem(hpReadout[7])\n+ cata("</ul>\\n")\n+ cata(hpReadout[8:11], sep="<br/>\\n")\n+ cata("<br />\\n")\n+ cata("<b>Please check that read percentages are consistent with ")\n+ cata("expectations.</b><br >\\n")\n+} else if (inputType=="counts") {\n+ cata("<ul>\\n")\n+ ListItem("Number of Samples: ", ncol(data$counts))\n+ ListItem("Number of Hairpins: ", countsRows)\n+ ListItem("Number of annotations provided: ", annoRows)\n+ ListItem("Number of annotations matched to hairpin: ", annoMatched)\n+ cata("</ul>\\n")\n+}\n+\n+cata("The estimated common biological coefficient of variation (BCV) is: ", \n+ commonBCV, "<br />\\n")\n+\n+cata("<h4>Output:</h4>\\n")\n+cata("All images displayed have PDF copy at the bottom of the page, these can ")\n+cata("exported in a pdf viewer to high resolution image format. 
<br/>\\n")\n+for (i in 1:nrow(imageData)) {\n+ if (grepl("barcode", imageData$Link[i])) {\n+ if (packageVersion("limma")<"3.19.19") {\n+ HtmlImage(imageData$Link[i], imageData$Label[i], \n+ height=length(selectedGenes)*150)\n+ } else {\n+ HtmlImage(imageData$Link[i], imageData$Label[i], \n+ height=length(selectedGenes)*300)\n+ }\n+ } else {\n+ HtmlImage(imageData$Link[i], imageData$Label[i])\n+ }\n+}\n+cata("<br/>\\n")\n+\n+cata("<h4>Plots:</h4>\\n")\n+for (i in 1:nrow(linkData)) {\n+ if (!grepl(".tsv", linkData$Link[i])) {\n+ HtmlLink(linkData$Link[i], linkData$Label[i])\n+ }\n+}\n+\n+cata("<h4>Tables:</h4>\\n")\n+for (i in 1:nrow(linkData)) {\n+ if (grepl(".tsv", linkData$Link[i])) {\n+ HtmlLink(linkData$Link[i], linkData$Label[i])\n+ }\n+}\n+\n+cata("<p>alt-click any of the links to download the file, or click the name ")\n+cata("of this task in the galaxy history panel and click on the floppy ")\n+cata("disk icon to download all files in a zip archive.</p>\\n")\n+cata("<p>.tsv files are tab seperated files that can be viewed using Excel ")\n+cata("or other spreadsheet programs</p>\\n")\n+cata("<table border=\\"0\\">\\n")\n+\n+cata("<tr>\\n")\n+TableItem("Task started at:"); TableItem(timeStart)\n+cata("</tr>\\n")\n+cata("<tr>\\n")\n+TableItem("Task ended at:"); TableItem(timeEnd)\n+cata("</tr>\\n")\n+\n+cata("</body>\\n")\n+cata("</html>")\n'

diff -r aa02cf19e1b3 -r 076ca575208f hairpinTool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hairpinTool.xml Fri Feb 21 12:52:56 2014 +1100

b'@@ -0,0 +1,409 @@\n+<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.5">\n+ <description>\n+ Analyse hairpin differential representation using edgeR\n+ </description>\n+ \n+ <requirements>\n+ <requirement type="R-module">edgeR</requirement>\n+ <requirement type="R-module">limma</requirement>\n+ </requirements>\n+ \n+ <stdio>\n+ <exit_code range="1:" level="fatal" description="Tool exception" />\n+ </stdio>\n+ \n+ <command interpreter="Rscript">\n+ hairpinTool.R $inputOpt.type\n+ #if $inputOpt.type=="fastq":\n+ #for $i, $fas in enumerate($inputOpt.fastq):\n+ fastq::$fas.file\n+ #end for\n+ \n+ $inputOpt.hairpin\n+ $inputOpt.samples\n+ \n+ #if $inputOpt.positions.option=="yes":\n+ $inputOpt.positions.barstart\n+ $inputOpt.positions.barend\n+ $inputOpt.positions.hpstart\n+ $inputOpt.positions.hpend\n+ #else:\n+ 1\n+ 5\n+ 37\n+ 57\n+ #end if\n+ #else:\n+ $inputOpt.counts\n+ $inputOpt.anno\n+ "$inputOpt.factors"\n+ 0 0 0\n+ #end if\n+ \n+ #if $filterCPM.option=="yes":\n+ $filterCPM.cpmReq\n+ $filterCPM.sampleReq\n+ #else:\n+ -Inf\n+ -Inf\n+ #end if\n+ \n+ $fdr\n+ $lfc\n+ $workMode.mode\n+ $outFile\n+ $outFile.files_path\n+ \n+ #if $workMode.mode=="classic":\n+ "$workMode.pair1"\n+ "$workMode.pair2"\n+ #else:\n+ "$workMode.contrast"\n+ $workMode.roast.option\n+ #if $workMode.roast.option=="yes":\n+ $workMode.roast.hairpinReq\n+ $workMode.roast.select.option\n+ "$workMode.roast.select.selection"\n+ #else:\n+ 0\n+ 0\n+ 0\n+ #end if\n+ #end if\n+ </command>\n+ \n+ <inputs>\n+ <conditional name="inputOpt">\n+ <param name="type" type="select" label="Input File Type">\n+ <option value="fastq">FastQ File</option>\n+ <option value="counts">Table of Counts</option>\n+ </param>\n+ \n+ <when value="fastq">\n+ <param name="hairpin" type="data" format="tabular" \n+ label="Hairpin Annotation"/>\n+ \n+ \n+ <param name="samples" type="data" format="tabular" \n+ label="Sample Annotation"/>\n+ \n+ <repeat name="fastq" title="FastQ Files"> \n+ <param name="file" type="data" 
format="fastq"/>\n+ </repeat>\n+ \n+ <conditional name="positions">\n+ <param name="option" type="select" \n+ label="Specify Barcode and Hairpin Locations?"\n+ help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57.">\n+ <option value="no" selected="True">No</option>\n+ <option value="yes">Yes</option>\n+ </param>\n+ \n+ <when value="yes">\n+ <param name="barstart" type="integer" value="1"\n+ label="Barcode Starting Position"/>\n+ <param name="barend" type="integer" value="5"\n+ label="Barcode Ending Position"/>\n+ \n+ <param name="hpstart" type="integer" value="37"\n+ label="Hairpin Starting Position"/>\n+ \n+ <param name="hpend" type="integer" value="57"\n+ label="Hairpin Ending Position"/>\n+ </when>\n+ \n+ <when value="no"/>\n+ </conditional>\n+'..b'interfere with analysis as long as the necessary columns are present.\n+\n+NOTE: the column names are case sensitive and should be input exactly as they\n+are shown here.\n+\n+Example::\n+\n+ ID\tSequences\tgroup\tReplicate\n+ 3\tGAAAG\tDay 2\t1\n+ 6\tGAACC\tDay 10\t1\n+ 9\tGAAGA\tDay 5 GFP neg\t1\n+ 16\tGAATT\tDay 5 GFP pos\t1\n+ 18\tGACAC\tDay 2\t2\n+ 21\tGACCA\tDay 10\t2\n+ 28\tGACGT\tDay 5 GFP neg\t2\n+ 31\tGACTG\tDay 5 GFP pos\t2\n+ 33\tGAGAA\tDay 2\t3\n+ 40\tGAGCT\tDay 10\t3\n+ ...\n+ \n+**Specify Barcode and Hairpin Locations (FastQ Input):**\n+\n+It is assumed that in the sequencing reads that the first 5 bases are the\n+barcodes and that bases 37-57 are the hairpins. 
If this is not the case then the\n+values of the positions can be changed, however it still requires the barcodes\n+and hairpins to be in a consistent location and in a continuous sequence.\n+\n+**Filter Low CPM?:**\n+\n+Often in a large screen there may be members with very low counts which are of no\n+interest in the experiment, these may be filtered out to speed up computations.\n+Filtering will be based on counts per million in a required number of samples.\n+\n+**Analysis Type:**\n+\n+ * **Classic Exact Test:** This allows two experimental groups to be compared and\n+ p-values for differential representation derived for each hairpin. Simple and\n+ fast for straightforward comparisons. In this option you will have the option of\n+ "*Compare* x *To* y" which implicitly subtracts the data from y from that of x\n+ to produce the comparison.\n+\n+ * **Generalised Linear Model:** This allows for complex contrasts to be specified\n+ and also gene level analysis to be performed. If this option is chosen then\n+ contrasts must be explicitly stated in equations and multiple contrasts can be\n+ made. In addition there will be the option to analyse hairpins on a per-gene\n+ basis to see if hairpins belonging to a particular gene have any overall\n+ tendencies for the direction of their log-fold-change.\n+\n+**FDR Threshold:**\n+The smear plot in the output will have hairpins highlighted to signify\n+significant differential representation. The significance is determined by\n+controlling the false discovery rate, only those with a FDR lower than the\n+threshold will be highlighted in the plot.\n+\n+-----\n+\n+**Citations:**\n+\n+.. class:: infomark\n+\n+limma\n+\n+Please cite the paper below for the limma software itself. Please also try\n+to cite the appropriate methodology articles that describe the statistical\n+methods implemented in limma, depending on which limma functions you are\n+using. 
The methodology articles are listed in Section 2.1 of the limma \n+User\'s Guide.\n+\n+\t* Smyth, GK (2005). Limma: linear models for microarray data. In: \n+\t \'Bioinformatics and Computational Biology Solutions using R and \n+\t Bioconductor\'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, \n+\t W. Huber (eds), Springer, New York, pages 397-420.\n+\n+.. class:: infomark\n+\n+edgeR\n+\n+Please cite the first paper for the software itself and the other papers for\n+the various original statistical methods implemented in edgeR. See \n+Section 1.2 in the User\'s Guide for more detail.\n+\n+\t* Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor \n+\t package for differential expression analysis of digital gene expression \n+\t data. Bioinformatics 26, 139-140\n+\t \n+\t* Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing \n+\t differences in tag abundance. Bioinformatics 23, 2881-2887\n+\t \n+\t* Robinson MD and Smyth GK (2008). Small-sample estimation of negative \n+\t binomial dispersion, with applications to SAGE data.\n+\t Biostatistics, 9, 321-332\n+\t \n+\t* McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis \n+\t of multifactor RNA-Seq experiments with respect to biological variation. \n+\t Nucleic Acids Research 40, 4288-4297\n+\n+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html\n+.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html\n+ </help>\n+</tool>\n+ \n'