Galaxy |

Changeset 0:bdebdea5f6a7 (2017-06-12)

Next changeset 1:76d01fe0ec36 (2017-07-05)

Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/limma_voom commit 2f34a215c35f08c3666f314a87d235437baa1d21

added:
limma_voom.R
limma_voom.xml
test-data/anno.txt
test-data/limma-voom_Mut-WT.tsv
test-data/limma-voom_Mut-WTanno.tsv
test-data/limma-voom_WT-Mut.tsv
test-data/matrix.txt

diff -r 000000000000 -r bdebdea5f6a7 limma_voom.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/limma_voom.R Mon Jun 12 07:41:02 2017 -0400

[

b'@@ -0,0 +1,654 @@\n+# This tool takes in a matrix of feature counts as well as gene annotations and\n+# outputs a table of top expressions as well as various plots for differential\n+# expression analysis\n+#\n+# ARGS: 1.countPath -Path to RData input containing counts\n+# 2.annoPath -Path to RData input containing gene annotations\n+# 3.htmlPath -Path to html file linking to other outputs\n+# 4.outPath -Path to folder to write all output to\n+# 5.rdaOpt -String specifying if RData should be saved\n+# 6.normOpt -String specifying type of normalisation used\n+# 7.weightOpt -String specifying usage of weights\n+# 8.contrastData -String containing contrasts of interest\n+# 9.cpmReq -Float specifying cpm requirement\n+# 10.sampleReq -Integer specifying cpm requirement\n+# 11.pAdjOpt -String specifying the p-value adjustment method\n+# 12.pValReq -Float specifying the p-value requirement\n+# 13.lfcReq -Float specifying the log-fold-change requirement\n+# 14.factorData -String containing factor names and values\n+#\n+# OUT: Voom Plot\n+# BCV Plot\n+# MA Plot\n+# Top Expression Table\n+# HTML file linking to the ouputs\n+#\n+# Author: Shian Su - registertonysu@gmail.com - Jan 2014\n+\n+# Record starting time\n+timeStart <- as.character(Sys.time())\n+\n+# Load all required libraries\n+library(methods, quietly=TRUE, warn.conflicts=FALSE)\n+library(statmod, quietly=TRUE, warn.conflicts=FALSE)\n+library(splines, quietly=TRUE, warn.conflicts=FALSE)\n+library(edgeR, quietly=TRUE, warn.conflicts=FALSE)\n+library(limma, quietly=TRUE, warn.conflicts=FALSE)\n+library(scales, quietly=TRUE, warn.conflicts=FALSE)\n+\n+if (packageVersion("limma") < "3.20.1") {\n+ stop("Please update \'limma\' to version >= 3.20.1 to run this tool")\n+}\n+\n+################################################################################\n+### Function Delcaration\n+################################################################################\n+# Function to sanitise contrast equations so there are no whitespaces\n+# surrounding the arithmetic operators, leading or trailing whitespace\n+sanitiseEquation <- function(equation) {\n+ equation <- gsub(" *[+] *", "+", equation)\n+ equation <- gsub(" *[-] *", "-", equation)\n+ equation <- gsub(" *[/] *", "/", equation)\n+ equation <- gsub(" *[*] *", "*", equation)\n+ equation <- gsub("^\\\\s+|\\\\s+$", "", equation)\n+ return(equation)\n+}\n+\n+# Function to sanitise group information\n+sanitiseGroups <- function(string) {\n+ string <- gsub(" *[,] *", ",", string)\n+ string <- gsub("^\\\\s+|\\\\s+$", "", string)\n+ return(string)\n+}\n+\n+# Function to change periods to whitespace in a string\n+unmake.names <- function(string) {\n+ string <- gsub(".", " ", string, fixed=TRUE)\n+ return(string)\n+}\n+\n+# Generate output folder and paths\n+makeOut <- function(filename) {\n+ return(paste0(outPath, "/", filename))\n+}\n+\n+# Generating design information\n+pasteListName <- function(string) {\n+ return(paste0("factors$", string))\n+}\n+\n+# Create cata function: default path set, default seperator empty and appending\n+# true by default (Ripped straight from the cat function with altered argument\n+# defaults)\n+cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL, \n+ append = TRUE) {\n+ if (is.character(file)) \n+ if (file == "") \n+ file <- stdout()\n+ else if (substring(file, 1L, 1L) == "|") {\n+ file <- pipe(substring(file, 2L), "w")\n+ on.exit(close(file))\n+ }\n+ else {\n+ file <- file(file, ifelse(append, "a", "w"))\n+ on.exit(close(file))\n+ }\n+ .Internal(cat(list(...), file, sep, fill, labels, append))\n+}\n+\n+# Function to write code for html head and title\n+HtmlHead <- function(title) {\n+ cata("<head>\\n")\n+ cata("<title>", title, "</title>\\n")\n+ cata("</head>\\n")\n+}\n+\n+# Function to write code for html links\n+HtmlLink <- function(address, l'..b' paste0("<a href=\\"",\n+ "http://www.bioconductor.org/packages/release/bioc/",\n+ "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf",\n+ "\\">", "edgeR User\'s Guide", "</a>")\n+\n+cit[1] <- paste("Please cite the following paper for this tool:")\n+\n+cit[2] <- paste("Liu R, Holik AZ, Su S, Jansz N, Chen K, Leong HS, Blewitt ME,",\n+ "Asselin-Labat ML, Smyth GK, Ritchie ME (2015). Why weight? ",\n+ "Modelling sample and observational level variability improves power ",\n+ "in RNA-seq analyses. Nucleic Acids Research, 43(15), e97.")\n+\n+cit[3] <- paste("Please cite the paper below for the limma software itself.",\n+ "Please also try to cite the appropriate methodology articles",\n+ "that describe the statistical methods implemented in limma,",\n+ "depending on which limma functions you are using. The",\n+ "methodology articles are listed in Section 2.1 of the",\n+ link[1],\n+ "Cite no. 3 only if sample weights were used.")\n+cit[4] <- paste("Smyth GK (2005). Limma: linear models for microarray data.",\n+ "In: \'Bioinformatics and Computational Biology Solutions using",\n+ "R and Bioconductor\'. R. Gentleman, V. Carey, S. doit,.",\n+ "Irizarry, W. Huber (eds), Springer, New York, pages 397-420.")\n+cit[5] <- paste("Please cite the first paper for the software itself and the",\n+ "other papers for the various original statistical methods",\n+ "implemented in edgeR. See Section 1.2 in the", link[2],\n+ "for more detail.")\n+cit[6] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a",\n+ "Bioconductor package for differential expression analysis",\n+ "of digital gene expression data. Bioinformatics 26, 139-140")\n+cit[7] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests",\n+ "for assessing differences in tag abundance. Bioinformatics",\n+ "23, 2881-2887")\n+cit[8] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of",\n+ "negative binomial dispersion, with applications to SAGE data.",\n+ "Biostatistics, 9, 321-332")\n+cit[9] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential",\n+ "expression analysis of multifactor RNA-Seq experiments with",\n+ "respect to biological variation. Nucleic Acids Research 40,",\n+ "4288-4297")\n+cit[10] <- paste("Law CW, Chen Y, Shi W, and Smyth GK (2014). Voom:",\n+ "precision weights unlock linear model analysis tools for",\n+ "RNA-seq read counts. Genome Biology 15, R29.")\n+cit[11] <- paste("Ritchie ME, Diyagama D, Neilson J, van Laar R,", \n+ "Dobrovic A, Holloway A and Smyth GK (2006).",\n+ "Empirical array quality weights for microarray data.",\n+ "BMC Bioinformatics 7, Article 261.")\n+cata("<h3>Citations</h3>\\n")\n+cata(cit[1], "\\n")\n+cata("<br>\\n")\n+cata(cit[2], "\\n")\n+\n+cata("<h4>limma</h4>\\n")\n+cata(cit[3], "\\n")\n+cata("<ol>\\n")\n+ListItem(cit[4])\n+ListItem(cit[10])\n+ListItem(cit[11])\n+cata("</ol>\\n")\n+\n+cata("<h4>edgeR</h4>\\n")\n+cata(cit[5], "\\n")\n+cata("<ol>\\n")\n+ListItem(cit[6])\n+ListItem(cit[7])\n+ListItem(cit[8])\n+ListItem(cit[9])\n+cata("</ol>\\n")\n+\n+cata("<p>Please report problems or suggestions to: su.s@wehi.edu.au</p>\\n")\n+\n+for (i in 1:nrow(linkData)) {\n+ if (grepl("session_info", linkData$Link[i])) {\n+ HtmlLink(linkData$Link[i], linkData$Label[i])\n+ }\n+}\n+\n+cata("<table border=\\"0\\">\\n")\n+cata("<tr>\\n")\n+TableItem("Task started at:"); TableItem(timeStart)\n+cata("</tr>\\n")\n+cata("<tr>\\n")\n+TableItem("Task ended at:"); TableItem(timeEnd)\n+cata("</tr>\\n")\n+cata("<tr>\\n")\n+TableItem("Task run time:"); TableItem(timeTaken)\n+cata("<tr>\\n")\n+cata("</table>\\n")\n+\n+cata("</body>\\n")\n+cata("</html>")\n'

diff -r 000000000000 -r bdebdea5f6a7 limma_voom.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/limma_voom.xml Mon Jun 12 07:41:02 2017 -0400

[

b'@@ -0,0 +1,388 @@\n+<tool id="limma_voom" name="limma-voom" version="1.1.1">\n+ <description>\n+ Differential expression with optional sample weights\n+ </description>\n+ \n+ <requirements>\n+ <requirement type="package" version="3.16.5">bioconductor-edger</requirement>\n+ <requirement type="package" version="3.30.13">bioconductor-limma</requirement>\n+ <requirement type="package" version="1.4.29">r-statmod</requirement>\n+ <requirement type="package" version="0.4.1">r-scales</requirement>\n+ </requirements>\n+\n+ <version_command>\n+ <![CDATA[\n+ echo $(R --version | grep version | grep -v GNU)", limma version" $(R --vanilla --slave -e "library(limma); cat(sessionInfo()\\$otherPkgs\\$limma\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", edgeR version" $(R --vanilla --slave -e "library(edgeR); cat(sessionInfo()\\$otherPkgs\\$edgeR\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")\n+ ]]>\n+ </version_command>\n+ \n+ <command detect_errors="exit_code">\n+ <![CDATA[\n+ Rscript \'$__tool_directory__/limma_voom.R\'\n+ \'$counts\'\n+ \n+ #if $anno.annoOpt==\'yes\':\n+ \'$geneanno\'\n+ #else:\n+ None\n+ #end if\n+ \n+ \'$outReport\'\n+ \'$outReport.files_path\'\n+ $rdaOption\n+ $normalisationOption\n+ $weightOption\n+ \'$contrast\'\n+\n+ #if $filterCPM.filterLowCPM==\'yes\':\n+ \'$filterCPM.cpmReq\'\n+ \'$filterCPM.sampleReq\'\n+ #else:\n+ 0\n+ 0\n+ #end if\n+ \n+ #if $testOpt.wantOpt==\'yes\':\n+ \'$testOpt.pAdjust\'\n+ \'$testOpt.pVal\'\n+ \'$testOpt.lfc\'\n+ #else:\n+ "BH"\n+ 0.05\n+ 0\n+ #end if\n+ \n+ \'$factName::$factLevel\'\n+\n+ &&\n+ mkdir ./output_dir\n+\n+ &&\n+ mv \'$outReport.files_path\'/*.tsv output_dir/\n+ \n+ ]]> \n+ </command>\n+ \n+ <inputs>\n+ <param name="counts" type="data" format="tabular" label="Counts Data"/>\n+ \n+ <conditional name="anno">\n+ <param name="annoOpt" type="select"\n+ label="Use Gene Annotations?" \n+ help="If an annotation file is provided, annotations will be added to the table of differential expression results to provide descriptions for each gene.">\n+ <option value="no">No</option>\n+ <option value="yes">Yes</option>\n+ </param>\n+ <when value="yes">\n+ <param name="geneanno" type="data" format="tabular" label="Gene Annotations"/>\n+ </when>\n+ <when value="no" />\n+ </conditional>\n+\n+ \n+ \n+ <param name="factName" type="text" label="Factor Name" help="Eg. Genotype."/>\n+ <param name="factLevel" type="text" label="Factor Values"\n+ help="Eg. WT,WT,WT,Mut,Mut,Mut\n+ NOTE: Please ensure that the same levels are typed identically with cases matching."/> \n+ <param name="contrast" type="text" label="Contrasts of interest" help="Eg. Mut-WT,KD-Control"/>\n+ \n+ <conditional name="filterCPM">\n+ <param name="filterLowCPM" type="select" label="Filter Low CPM?"\n+ help="Treat genes with very low expression as unexpressed and filter out to speed up computation.">\n+ <opt'..b'formation is still\n+used in the statistical analysis but their impact is reduced. Use this\n+whenever significant outliers are present. The MDS plotting tool in this package\n+is useful for identifying outliers. For more information on this option see Liu et al. (2015).\n+\n+**Use Advanced Testing Options?:**\n+By default error rate for multiple testing is controlled using Benjamini and\n+Hochberg\'s false discovery rate control at a threshold value of 0.05. However\n+there are options to change this to custom values.\n+\n+ * **P-Value Adjustment Method:**\n+ Change the multiple testing control method, the options are BH(1995) and \n+ BY(2001) which are both false discovery rate controls. There is also\n+ Holm(1979) which is a method for family-wise error rate control.\n+ \n+ * **Adjusted Threshold:**\n+ Set the threshold for the resulting value of the multiple testing control\n+ method. Only observations whose statistic falls below this value is\n+ considered significant, thus highlighted in the MA plot.\n+ \n+ * **Minimum log2-fold-change Required:**\n+ In addition to meeting the requirement for the adjusted statistic for\n+ multiple testing, the observation must have an absolute log2-fold-change\n+ greater than this threshold to be considered significant, thus highlighted \n+ in the MA plot.\n+\n+-----\n+\n+**Citations:**\n+\n+.. class:: infomark\n+\n+limma\n+\n+Please cite the paper below for the limma software itself. Please also try\n+to cite the appropriate methodology articles that describe the statistical\n+methods implemented in limma, depending on which limma functions you are\n+using. The methodology articles are listed in Section 2.1 of the limma \n+User\'s Guide.\n+\n+ * Smyth GK (2005). Limma: linear models for microarray data. In: \n+ \'Bioinformatics and Computational Biology Solutions using R and \n+ Bioconductor\'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, \n+ W. Huber (eds), Springer, New York, pages 397-420.\n+ \n+ * Law CW, Chen Y, Shi W, and Smyth GK (2014). Voom:\n+ precision weights unlock linear model analysis tools for\n+ RNA-seq read counts. Genome Biology 15, R29.\n+\n+ * Liu R, Holik AZ, Su S, Jansz N, Chen K, Leong HS, Blewitt ME, Asselin-Labat ML, Smyth GK, Ritchie ME (2015). Why weight? Modelling sample and observational level variability improves power in RNA-seq analyses. Nucleic Acids Research, 43(15), e97.\n+\n+ * Ritchie, M. E., Diyagama, D., Neilson, J., van Laar, R., Dobrovic, \n+ A., Holloway, A., and Smyth, G. K. (2006). Empirical array quality weights\n+ for microarray data. BMC Bioinformatics 7, Article 261.\n+\n+.. class:: infomark\n+\n+edgeR\n+\n+Please cite the first paper for the software itself and the other papers for\n+the various original statistical methods implemented in edgeR. See \n+Section 1.2 in the User\'s Guide for more detail.\n+\n+ * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor \n+ package for differential expression analysis of digital gene expression \n+ data. Bioinformatics 26, 139-140\n+ \n+ * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing \n+ differences in tag abundance. Bioinformatics 23, 2881-2887\n+ \n+ * Robinson MD and Smyth GK (2008). Small-sample estimation of negative \n+ binomial dispersion, with applications to SAGE data.\n+ Biostatistics, 9, 321-332\n+ \n+ * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis \n+ of multifactor RNA-Seq experiments with respect to biological variation. \n+ Nucleic Acids Research 40, 4288-4297\n+ \n+Please report problems or suggestions to: su.s@wehi.edu.au\n+\n+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html\n+.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html\n+]]>\n+ </help>\n+ <citations>\n+ <citation type="doi">10.1093/nar/gkv412</citation>\n+ </citations>\n+</tool>\n'

diff -r 000000000000 -r bdebdea5f6a7 test-data/anno.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/anno.txt Mon Jun 12 07:41:02 2017 -0400

@@ -0,0 +1,7 @@
+EntrezID Symbol GeneName Chr Length
+11287 Pzp pregnancy zone protein 6 4681
+11298 Aanat arylalkylamine N-acetyltransferase 11 1455
+11302 Aatk apoptosis-associated tyrosine kinase 11 5743
+11303 Abca1 ATP-binding cassette, sub-family A (ABC1), member 1 4 10260
+11304 Abca4 ATP-binding cassette, sub-family A (ABC1), member 4 3 7248
+11305 Abca2 ATP-binding cassette, sub-family A (ABC1), member 2 2 8061
\ No newline at end of file

diff -r 000000000000 -r bdebdea5f6a7 test-data/limma-voom_Mut-WT.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/limma-voom_Mut-WT.tsv Mon Jun 12 07:41:02 2017 -0400

@@ -0,0 +1,7 @@
+"ID" "logFC" "AveExpr" "t" "P.Value" "adj.P.Val" "B"
+"11304" 0.457332061341026 15.5254133001226 6.50459574633681 9.98720685006039e-07 5.99232411003624e-06 14.0741948485896
+"11287" 0.190749727701785 17.6546448244617 5.09535410066402 3.26518807654125e-05 9.79556422962375e-05 5.46773893802392
+"11298" -0.138014418336201 17.6747285193431 -3.33168485842331 0.00278753263633162 0.00557506527266324 -1.84301342041449
+"11303" -0.0558958943606989 17.886791401216 -1.30108531275576 0.205582481502297 0.254491025872973 -6.4924124057801
+"11305" -0.0606991650996633 18.1585474109909 -1.28203791127299 0.212075854894144 0.254491025872973 -6.42090197700503
+"11302" -0.0350239682204432 9.78883119065989 -0.236945963165269 0.814709535394087 0.814709535394087 -6.09497670655944

diff -r 000000000000 -r bdebdea5f6a7 test-data/limma-voom_Mut-WTanno.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/limma-voom_Mut-WTanno.tsv Mon Jun 12 07:41:02 2017 -0400

@@ -0,0 +1,7 @@
+"EntrezID" "Symbol" "GeneName" "Chr" "Length" "logFC" "AveExpr" "t" "P.Value" "adj.P.Val" "B"
+11304 "Abca4" "ATP-binding cassette, sub-family A (ABC1), member 4" 3 7248 0.457332061341026 15.5254133001226 6.50459574633681 9.98720685006039e-07 5.99232411003624e-06 14.0741948485896
+11287 "Pzp" "pregnancy zone protein" 6 4681 0.190749727701785 17.6546448244617 5.09535410066402 3.26518807654125e-05 9.79556422962375e-05 5.46773893802392
+11298 "Aanat" "arylalkylamine N-acetyltransferase" 11 1455 -0.138014418336201 17.6747285193431 -3.33168485842331 0.00278753263633162 0.00557506527266324 -1.84301342041449
+11303 "Abca1" "ATP-binding cassette, sub-family A (ABC1), member 1" 4 10260 -0.0558958943606989 17.886791401216 -1.30108531275576 0.205582481502297 0.254491025872973 -6.4924124057801
+11305 "Abca2" "ATP-binding cassette, sub-family A (ABC1), member 2" 2 8061 -0.0606991650996633 18.1585474109909 -1.28203791127299 0.212075854894144 0.254491025872973 -6.42090197700503
+11302 "Aatk" "apoptosis-associated tyrosine kinase" 11 5743 -0.0350239682204432 9.78883119065989 -0.236945963165269 0.814709535394087 0.814709535394087 -6.09497670655944

diff -r 000000000000 -r bdebdea5f6a7 test-data/limma-voom_WT-Mut.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/limma-voom_WT-Mut.tsv Mon Jun 12 07:41:02 2017 -0400

@@ -0,0 +1,7 @@
+"ID" "logFC" "AveExpr" "t" "P.Value" "adj.P.Val" "B"
+"11304" -0.457332061341026 15.5254133001226 -6.50459574633681 9.98720685006039e-07 5.99232411003624e-06 14.0741948485896
+"11287" -0.190749727701785 17.6546448244617 -5.09535410066402 3.26518807654125e-05 9.79556422962375e-05 5.46773893802392
+"11298" 0.138014418336201 17.6747285193431 3.33168485842331 0.00278753263633162 0.00557506527266324 -1.84301342041449
+"11303" 0.0558958943606989 17.886791401216 1.30108531275576 0.205582481502297 0.254491025872973 -6.4924124057801
+"11305" 0.0606991650996633 18.1585474109909 1.28203791127299 0.212075854894144 0.254491025872973 -6.42090197700503
+"11302" 0.0350239682204432 9.78883119065989 0.236945963165269 0.814709535394087 0.814709535394087 -6.09497670655944

diff -r 000000000000 -r bdebdea5f6a7 test-data/matrix.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/matrix.txt Mon Jun 12 07:41:02 2017 -0400

@@ -0,0 +1,7 @@
+GeneID WT1 WT2 WT3 Mut1 Mut2 Mut3
+11287 1699 1528 1601 1463 1441 1495
+11298 1905 1744 1834 1345 1291 1346
+11302 6 8 7 5 6 5
+11303 2099 1974 2100 1574 1519 1654
+11304 356 312 337 361 397 346
+11305 2528 2438 2493 1762 1942 2027