Next changeset 1:b2fe55fd0651 (2014-12-16) |
Commit message:
- Initial commit |
added:
diffexp.R diffexp.xml |
b |
diff -r 000000000000 -r 7a80e9ec63cb diffexp.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/diffexp.R Tue Dec 16 14:38:15 2014 +1100 |
[ |
b'@@ -0,0 +1,644 @@\n+# This tool takes in a matrix of feature counts as well as gene annotations and\n+# outputs a table of top expressions as well as various plots for differential\n+# expression analysis\n+#\n+# ARGS: 1.countPath -Path to RData input containing counts\n+# 2.annoPath -Path to RData input containing gene annotations\n+# 3.htmlPath -Path to html file linking to other outputs\n+# 4.outPath -Path to folder to write all output to\n+# 5.rdaOpt -String specifying if RData should be saved\n+# 6.normOpt -String specifying type of normalisation used\n+# 7.weightOpt -String specifying usage of weights\n+# 8.contrastData -String containing contrasts of interest\n+# 9.cpmReq -Float specifying cpm requirement\n+# 10.sampleReq -Integer specifying cpm requirement\n+# 11.pAdjOpt -String specifying the p-value adjustment method\n+# 12.pValReq -Float specifying the p-value requirement\n+# 13.lfcReq -Float specifying the log-fold-change requirement\n+# 14.factorData -String containing factor names and values\n+#\n+# OUT: Voom Plot\n+# BCV Plot\n+# MA Plot\n+# Top Expression Table\n+# HTML file linking to the ouputs\n+#\n+# Author: Shian Su - registertonysu@gmail.com - Jan 2014\n+\n+# Record starting time\n+timeStart <- as.character(Sys.time())\n+\n+# Load all required libraries\n+library(methods, quietly=TRUE, warn.conflicts=FALSE)\n+library(statmod, quietly=TRUE, warn.conflicts=FALSE)\n+library(splines, quietly=TRUE, warn.conflicts=FALSE)\n+library(edgeR, quietly=TRUE, warn.conflicts=FALSE)\n+library(limma, quietly=TRUE, warn.conflicts=FALSE)\n+library(scales, quietly=TRUE, warn.conflicts=FALSE)\n+\n+if (packageVersion("limma") < "3.20.1") {\n+ stop("Please update \'limma\' to version >= 3.20.1 to run this tool")\n+}\n+\n+################################################################################\n+### Function Delcaration\n+################################################################################\n+# Function to sanitise contrast equations so there are no whitespaces\n+# surrounding the arithmetic operators, leading or trailing whitespace\n+sanitiseEquation <- function(equation) {\n+ equation <- gsub(" *[+] *", "+", equation)\n+ equation <- gsub(" *[-] *", "-", equation)\n+ equation <- gsub(" *[/] *", "/", equation)\n+ equation <- gsub(" *[*] *", "*", equation)\n+ equation <- gsub("^\\\\s+|\\\\s+$", "", equation)\n+ return(equation)\n+}\n+\n+# Function to sanitise group information\n+sanitiseGroups <- function(string) {\n+ string <- gsub(" *[,] *", ",", string)\n+ string <- gsub("^\\\\s+|\\\\s+$", "", string)\n+ return(string)\n+}\n+\n+# Function to change periods to whitespace in a string\n+unmake.names <- function(string) {\n+ string <- gsub(".", " ", string, fixed=TRUE)\n+ return(string)\n+}\n+\n+# Generate output folder and paths\n+makeOut <- function(filename) {\n+ return(paste0(outPath, "/", filename))\n+}\n+\n+# Generating design information\n+pasteListName <- function(string) {\n+ return(paste0("factors$", string))\n+}\n+\n+# Create cata function: default path set, default seperator empty and appending\n+# true by default (Ripped straight from the cat function with altered argument\n+# defaults)\n+cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL, \n+ append = TRUE) {\n+ if (is.character(file)) \n+ if (file == "") \n+ file <- stdout()\n+ else if (substring(file, 1L, 1L) == "|") {\n+ file <- pipe(substring(file, 2L), "w")\n+ on.exit(close(file))\n+ }\n+ else {\n+ file <- file(file, ifelse(append, "a", "w"))\n+ on.exit(close(file))\n+ }\n+ .Internal(cat(list(...), file, sep, fill, labels, append))\n+}\n+\n+# Function to write code for html head and title\n+HtmlHead <- function(title) {\n+ cata("<head>\\n")\n+ cata("<title>", title, "</title>\\n")\n+ cata("</head>\\n")\n+}\n+\n+# Function to write code for html links\n+HtmlLink <- function(address, l'..b'es(factors)[i])\n+ for (j in ncol(factors)) {\n+ TableItem(as.character(unmake.names(factors[i, j])))\n+ }\n+ cata("</tr>\\n")\n+}\n+cata("</table>")\n+\n+cit <- character()\n+link <- character()\n+link[1] <- paste0("<a href=\\"",\n+ "http://www.bioconductor.org/packages/release/bioc/",\n+ "vignettes/limma/inst/doc/usersguide.pdf",\n+ "\\">", "limma User\'s Guide", "</a>.")\n+\n+link[2] <- paste0("<a href=\\"",\n+ "http://www.bioconductor.org/packages/release/bioc/",\n+ "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf",\n+ "\\">", "edgeR User\'s Guide", "</a>")\n+\n+cit[1] <- paste("Please cite the paper below for the limma software itself.",\n+ "Please also try to cite the appropriate methodology articles",\n+ "that describe the statistical methods implemented in limma,",\n+ "depending on which limma functions you are using. The",\n+ "methodology articles are listed in Section 2.1 of the",\n+ link[1],\n+ "Cite no. 3 only if sample weights were used.")\n+cit[2] <- paste("Smyth, GK (2005). Limma: linear models for microarray data.",\n+ "In: \'Bioinformatics and Computational Biology Solutions using",\n+ "R and Bioconductor\'. R. Gentleman, V. Carey, S. doit,.",\n+ "Irizarry, W. Huber (eds), Springer, New York, pages 397-420.")\n+cit[3] <- paste("Please cite the first paper for the software itself and the",\n+ "other papers for the various original statistical methods",\n+ "implemented in edgeR. See Section 1.2 in the", link[2],\n+ "for more detail.")\n+cit[4] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a",\n+ "Bioconductor package for differential expression analysis",\n+ "of digital gene expression data. Bioinformatics 26, 139-140")\n+cit[5] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests",\n+ "for assessing differences in tag abundance. Bioinformatics",\n+ "23, 2881-2887")\n+cit[6] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of",\n+ "negative binomial dispersion, with applications to SAGE data.",\n+ "Biostatistics, 9, 321-332")\n+cit[7] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential",\n+ "expression analysis of multifactor RNA-Seq experiments with",\n+ "respect to biological variation. Nucleic Acids Research 40,",\n+ "4288-4297")\n+cit[8] <- paste("Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom:",\n+ "precision weights unlock linear model analysis tools for",\n+ "RNA-seq read counts. Genome Biology 15, R29.")\n+cit[9] <- paste("Ritchie, M. E., Diyagama, D., Neilson, J., van Laar,", \n+ "R., Dobrovic, A., Holloway, A., and Smyth, G. K. (2006).",\n+ "Empirical array quality weights for microarray data.",\n+ "BMC Bioinformatics 7, Article 261.")\n+cata("<h3>Citations</h3>\\n")\n+\n+cata("<h4>limma</h4>\\n")\n+cata(cit[1], "\\n")\n+cata("<ol>\\n")\n+ListItem(cit[2])\n+ListItem(cit[8])\n+ListItem(cit[9])\n+cata("</ol>\\n")\n+\n+cata("<h4>edgeR</h4>\\n")\n+cata(cit[3], "\\n")\n+cata("<ol>\\n")\n+ListItem(cit[4])\n+ListItem(cit[5])\n+ListItem(cit[6])\n+ListItem(cit[7])\n+cata("</ol>\\n")\n+\n+cata("<p>Report problems to: su.s@wehi.edu.au</p>\\n")\n+\n+for (i in 1:nrow(linkData)) {\n+ if (grepl("session_info", linkData$Link[i])) {\n+ HtmlLink(linkData$Link[i], linkData$Label[i])\n+ }\n+}\n+\n+cata("<table border=\\"0\\">\\n")\n+cata("<tr>\\n")\n+TableItem("Task started at:"); TableItem(timeStart)\n+cata("</tr>\\n")\n+cata("<tr>\\n")\n+TableItem("Task ended at:"); TableItem(timeEnd)\n+cata("</tr>\\n")\n+cata("<tr>\\n")\n+TableItem("Task run time:"); TableItem(timeTaken)\n+cata("<tr>\\n")\n+cata("</table>\\n")\n+\n+cata("</body>\\n")\n+cata("</html>")\n\\ No newline at end of file\n' |
b |
diff -r 000000000000 -r 7a80e9ec63cb diffexp.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/diffexp.xml Tue Dec 16 14:38:15 2014 +1100 |
b |
b'@@ -0,0 +1,372 @@\n+<tool id="diffexp" name="Voom Rnaseq" version="1.1.0">\n+ <description>\n+ Perform differential expression analysis using pipeline based on the voom\n+ function of the limma bioconductor package. This tool takes a count matrix\n+ (tab separated) as input and produces a HTML report as output.\n+ </description>\n+ \n+ <requirements>\n+ <requirement type="R-module" version="3.5.27">edgeR</requirement>\n+ <requirement type="R-module" version="3.18.13">limma</requirement>\n+ </requirements>\n+\n+ <stdio>\n+ <exit_code range="1:" level="fatal" description="Tool exception" />\n+ </stdio>\n+ \n+ <command interpreter="Rscript">\n+ diffexp.R $counts\n+ \n+ #if $anno.annoOpt=="yes":\n+ $geneanno\n+ #else:\n+ None\n+ #end if\n+ \n+ $outFile\n+ $outFile.files_path\n+ "no" <!-- Disabled Rda option -->\n+ $normalisationOption\n+ $weightCond.weightOption\n+ "$contrast"\n+\n+ #if $filterCPM.filterLowCPM=="yes":\n+ $filterCPM.cpmReq\n+ $filterCPM.sampleReq\n+ #else:\n+ 0\n+ 0\n+ #end if\n+ \n+ #if $testOpt.wantOpt=="yes":\n+ \t"$testOpt.pAdjust"\n+ \t$testOpt.pVal\n+ \t$testOpt.lfc\n+ #else:\n+ "BH"\n+ 0.05\n+ 0\n+ #end if\n+ \n+ <!--*Code commented until solution for multiple factors is found*\n+ #for $i, $fct in enumerate($factors):\n+ $fct.factName::$fct.factLevel\n+ #end for\n+ -->\n+ "$factName::$factLevel"\n+ \n+ </command>\n+ \n+ <inputs>\n+ <param name="counts" type="data" format="tabular" label="Counts Data"/>\n+ \n+ <conditional name="anno">\n+ <param name="annoOpt" type="select" label="Use Gene Annotations?"\n+ help="Annotations will be added to table of top differential \n+ expressions to provide descriptions for each gene.">\n+ <option value="no">No</option>\n+ <option value="yes">Yes</option>\n+ </param>\n+ \n+ <when value="yes">\n+ <param name="geneanno" type="data" format="tabular"\n+ label="Gene Annotations"/>\n+ </when>\n+ </conditional>\n+\n+ <!--*Code commented until solution for multiple factors is found*\n+ <repeat name="factors" title="Factors" min="1" max="5" default="1">\n+ <param name="factName" type="text" label="Factor Name (No spaces)"\n+ help="Eg. Genotype"/>\n+ <param name="factLevel" type="text" size="100"\n+ label="Factor Levels (No spaces)"\n+ help="Eg. WT,WT,Mut,Mut,WT"/>\n+ </repeat>\n+ -->\n+ \n+ <param name="factName" type="text" label="Factor Name"\n+ help="Eg. Genotype."/>\n+ <param name="factLevel" type="text" size="100"\n+ label="Factor Values"\n+ help="Eg. WT,WT,Mut,Mut,WT... NOTE: Please ensure that the same\n+ \t\t levels are typed identically when repeated, with all cases\n+ \t\t matching."/>\n+ \n+ <param name="contrast" type="text" size="30"\n+ label="Contrasts of interest"\n+ help="Eg. Mut-WT,KD-Control."/>\n+ \n+ <conditional name="filterCPM">\n+ <param name="filterLowCPM" type="select" label="Filter Low CPM?"\n+ help="Treat genes with very low expression as unexpressed and \n+ \t\t\t filter out to speed up computation.">\n+ <option value="yes" selected="True">Yes</option>\n+ <option value="no">No</option>\n+ </param>\n+ \n+ <when value="yes">\n+ <param name="cpmReq" type="float" value="0.5" min="0"\n+ label="Minimum CPM"/>\n+ \n+ <param name="sampleReq" type="integer" value="1" min="0"\n+ '..b"ly genes that exhibit a CPM greater than the required amount in at least the\n+number of samples specified will be used for analysis. Care should be taken to\n+ensure that the sample requirement is appropriate. In the case of an experiment\n+with two experimental groups each with two members, if there is a change from\n+insignificant cpm to significant cpm but the sample requirement is set to 3,\n+then this will cause that gene to fail the criteria. When in doubt simply do not\n+filter.\n+\n+\n+**Normalisation Method:**\n+Option for using different methods to rescale the raw library\n+size. For more information, see calcNormFactor section in the edgeR_ user's\n+manual.\n+\n+**Apply Sample Weights:**\n+Option to downweight outlier samples such that their information is still\n+used in the statistical analysis but their impact is reduced. Use this\n+whenever significant outliers are present. The MDS plotting tool in this package\n+is useful for identifying outliers\n+\n+**Use Advanced Testing Options?:**\n+By default error rate for multiple testing is controlled using Benjamini and\n+Hochberg's false discovery rate control at a threshold value of 0.05. However\n+there are options to change this to custom values.\n+\n+ * **P-Value Adjustment Method:**\n+ Change the multiple testing control method, the options are BH(1995) and \n+ BY(2001) which are both false discovery rate controls. There is also\n+ Holm(1979) which is a method for family-wise error rate control.\n+ \n+ * **Adjusted Threshold:**\n+ Set the threshold for the resulting value of the multiple testing control\n+ method. Only observations whose statistic falls below this value is\n+ considered significant, thus highlighted in the MA plot.\n+ \n+ * **Minimum log2-fold-change Required:**\n+ In addition to meeting the requirement for the adjusted statistic for\n+ multiple testing, the observation must have an absolute log2-fold-change\n+ greater than this threshold to be considered significant, thus highlighted \n+ in the MA plot.\n+\n+-----\n+\n+**Citations:**\n+\n+.. class:: infomark\n+\n+limma\n+\n+Please cite the paper below for the limma software itself. Please also try\n+to cite the appropriate methodology articles that describe the statistical\n+methods implemented in limma, depending on which limma functions you are\n+using. The methodology articles are listed in Section 2.1 of the limma \n+User's Guide.\n+\n+ * Smyth, GK (2005). Limma: linear models for microarray data. In: \n+ 'Bioinformatics and Computational Biology Solutions using R and \n+ Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, \n+ W. Huber (eds), Springer, New York, pages 397-420.\n+\t \n+ * Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom:\n+ precision weights unlock linear model analysis tools for\n+ RNA-seq read counts. Genome Biology 15, R29.\n+\n+.. class:: infomark\n+\n+edgeR\n+\n+Please cite the first paper for the software itself and the other papers for\n+the various original statistical methods implemented in edgeR. See \n+Section 1.2 in the User's Guide for more detail.\n+\n+\t* Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor \n+\t package for differential expression analysis of digital gene expression \n+\t data. Bioinformatics 26, 139-140\n+\t \n+\t* Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing \n+\t differences in tag abundance. Bioinformatics 23, 2881-2887\n+\t \n+\t* Robinson MD and Smyth GK (2008). Small-sample estimation of negative \n+\t binomial dispersion, with applications to SAGE data.\n+\t Biostatistics, 9, 321-332\n+\t \n+\t* McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis \n+\t of multifactor RNA-Seq experiments with respect to biological variation. \n+\t Nucleic Acids Research 40, 4288-4297\n+\t \n+Report problems to: su.s@wehi.edu.au\n+\n+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html\n+.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html\n+\n+</help>\n+</tool>\n" |