comparison egsea.R @ 0:a8a083193440 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/egsea commit 7d0c7d850cd56ea3e54d8c03266f719241b20b8b
author iuc
date Thu, 25 Jan 2018 02:23:23 -0500
parents
children 73281fbdf6c1
comparison
equal deleted inserted replaced
-1:000000000000 0:a8a083193440
1 # Code based on (and inspired by) the Galaxy limma-voom/edgeR/DESeq2 wrappers
2
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
4
5 # we need that to not crash galaxy with an UTF8 error on German LC settings.
6 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
7
8 suppressPackageStartupMessages({
9 library(EGSEA)
10 library(limma)
11 library(edgeR)
12 library(optparse)
13 })
14
15
16 ## Function Declaration
17
18 sanitiseEquation <- function(equation) {
19 equation <- gsub(" *[+] *", "+", equation)
20 equation <- gsub(" *[-] *", "-", equation)
21 equation <- gsub(" *[/] *", "/", equation)
22 equation <- gsub(" *[*] *", "*", equation)
23 equation <- gsub("^\\s+|\\s+$", "", equation)
24 return(equation)
25 }
26
27 # Function to sanitise group information
28 sanitiseGroups <- function(string) {
29 string <- gsub(" *[,] *", ",", string)
30 string <- gsub("^\\s+|\\s+$", "", string)
31 return(string)
32 }
33
34 # Generating design information
35 pasteListName <- function(string) {
36 return(paste0("factors$", string))
37 }
38
39 ## Input Processing
40
41 option_list <- list(
42 make_option(c("-threads", "--threads"), default=2, type="integer", help="Number of threads for egsea"),
43 make_option(c("-filesPath", "--filesPath"), type="character", help="JSON list object if multiple files input"),
44 make_option(c("-matrixPath", "--matrixPath"), type="character", help="Path to count matrix"),
45 make_option(c("-factFile", "--factFile"), type="character", help="Path to factor information file"),
46 make_option(c("-factInput", "--factInput"), type="character", help="String containing factors if manually input"),
47 make_option(c("-contrastData", "--contrastData"), type="character", help="Contrasts of Interest (Groups to compare)"),
48 make_option(c("-genes", "--genes"), type="character", help="Path to genes file"),
49 make_option(c("-species", "--species"), type="character"),
50 make_option(c("-base_methods", "--base_methods"), type="character", help="Gene set testing methods"),
51 make_option(c("-msigdb", "--msigdb"), type="character", help="MSigDB Gene Set Collections"),
52 make_option(c("-keggdb", "--keggdb"), type="character", help="KEGG Pathways"),
53 make_option(c("-gsdb", "--gsdb"), type="character", help = "GeneSetDB Gene Sets"),
54 make_option(c("-display_top", "--display_top"), type="integer", help = "Number of top Gene Sets to display"),
55 make_option(c("-min_size", "--min_size"), type="integer", help = "Minimum Size of Gene Set"),
56 make_option(c("-fdr_cutoff", "--fdr_cutoff"), type="double", help = "FDR cutoff"),
57 make_option(c("-combine_method", "--combine_method"), type="character", help="Method to use to combine the p-values"),
58 make_option(c("-sort_method", "--sort_method"), type="character", help="Method to sort the results"),
59 make_option(c("-rdata", "--rdaOpt"), type="character", help="Output RData file")
60 )
61
62 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
63 args = parse_args(parser)
64
65
66 ## Read in Files
67
68 if (!is.null(args$filesPath)) {
69 # Process the separate count files (adapted from DESeq2 wrapper)
70 library("rjson")
71 parser <- newJSONParser()
72 parser$addData(args$filesPath)
73 factorList <- parser$getObject()
74 factors <- sapply(factorList, function(x) x[[1]])
75 filenamesIn <- unname(unlist(factorList[[1]][[2]]))
76 sampleTable <- data.frame(sample=basename(filenamesIn),
77 filename=filenamesIn,
78 row.names=filenamesIn,
79 stringsAsFactors=FALSE)
80 for (factor in factorList) {
81 factorName <- factor[[1]]
82 sampleTable[[factorName]] <- character(nrow(sampleTable))
83 lvls <- sapply(factor[[2]], function(x) names(x))
84 for (i in seq_along(factor[[2]])) {
85 files <- factor[[2]][[i]][[1]]
86 sampleTable[files,factorName] <- lvls[i]
87 }
88 sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls)
89 }
90 rownames(sampleTable) <- sampleTable$sample
91 rem <- c("sample","filename")
92 factors <- sampleTable[, !(names(sampleTable) %in% rem), drop=FALSE]
93
94 #read in count files and create single table
95 countfiles <- lapply(sampleTable$filename, function(x){read.delim(x, row.names=1)})
96 counts <- do.call("cbind", countfiles)
97
98 } else {
99 # Process the single count matrix
100 counts <- read.table(args$matrixPath, header=TRUE, sep="\t", stringsAsFactors=FALSE)
101 row.names(counts) <- counts[, 1]
102 counts <- counts[ , -1]
103 countsRows <- nrow(counts)
104
105 # Process factors
106 if (is.null(args$factInput)) {
107 factorData <- read.table(args$factFile, header=TRUE, sep="\t")
108 factors <- factorData[, -1, drop=FALSE]
109 } else {
110 factors <- unlist(strsplit(args$factInput, "|", fixed=TRUE))
111 factorData <- list()
112 for (fact in factors) {
113 newFact <- unlist(strsplit(fact, split="::"))
114 factorData <- rbind(factorData, newFact)
115 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor.
116
117 # Set the row names to be the name of the factor and delete first row
118 row.names(factorData) <- factorData[, 1]
119 factorData <- factorData[, -1]
120 factorData <- sapply(factorData, sanitiseGroups)
121 factorData <- sapply(factorData, strsplit, split=",")
122 factorData <- sapply(factorData, make.names)
123 # Transform factor data into data frame of R factor objects
124 factors <- data.frame(factorData)
125 }
126 }
127
128 # Create a DGEList object
129 counts <- DGEList(counts)
130
131 # Set group to be the Primary Factor input
132 group <- factors[, 1, drop=FALSE]
133
134 # Split up contrasts separated by comma into a vector then sanitise
135 contrastData <- unlist(strsplit(args$contrastData, split=","))
136 contrastData <- sanitiseEquation(contrastData)
137 contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)
138
139 # Creating design
140 row.names(factors) <- colnames(counts)
141 factorList <- sapply(names(factors), pasteListName)
142
143 formula <- "~0"
144 for (i in 1:length(factorList)) {
145 formula <- paste(formula, factorList[i], sep="+")
146 }
147 formula <- formula(formula)
148
149 design <- model.matrix(formula)
150
151 for (i in 1:length(factorList)) {
152 colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE)
153 }
154
155 ## Generate Contrasts information
156 contrasts <- makeContrasts(contrasts=contrastData, levels=design)
157
158
159 ## Add Gene Symbol information
160
161 genes <- read.table(args$genes, sep='\t', header=TRUE)
162
163
164 ## Set Gene Set Testing Methods
165
166 base_methods <- unlist(strsplit(args$base_methods, ","))
167
168
169 ## Set Gene Sets
170
171 if (args$msigdb != "None") {
172 msigdb <- unlist(strsplit(args$msigdb, ","))
173 } else {
174 msigdb <- "none"
175 }
176
177 if (args$keggdb != "None") {
178 keggdb <- unlist(strsplit(args$keggdb, ","))
179 kegg_all <- c("Metabolism"="keggmet", "Signaling"="keggsig", "Disease"="keggdis")
180 kegg_exclude <- names(kegg_all[!(kegg_all %in% keggdb)])
181 } else {
182 kegg_exclude <- "all"
183 }
184
185 if (args$gsdb != "None") {
186 gsdb <- unlist(strsplit(args$gsdb, ","))
187 } else {
188 gsdb <- "none"
189 }
190
191
192 ## Index gene sets
193
194 gs.annots <- buildIdx(entrezIDs=rownames(counts), species=args$species, msigdb.gsets=msigdb, gsdb.gsets=gsdb, kegg.exclude=kegg_exclude)
195
196
197 ## Run egsea.cnt
198
199 gsa <- egsea.cnt(counts=counts, group=group, design=design, contrasts=contrasts, gs.annots=gs.annots, symbolsMap=genes, baseGSEAs=base_methods, minSize=args$min_size, display.top=args$display_top, combineMethod=args$combine_method, sort.by=args$sort_method, report.dir='./report_dir', fdr.cutoff=args$fdr_cutoff, num.threads=args$threads, report=TRUE)
200
201
202 ## Output RData file
203
204 if (!is.null(args$rdata)) {
205 save.image(file = "EGSEA_analysis.RData")
206 }