Mercurial > repos > iuc > egsea
comparison egsea.R @ 0:a8a083193440 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/egsea commit 7d0c7d850cd56ea3e54d8c03266f719241b20b8b
author | iuc |
---|---|
date | Thu, 25 Jan 2018 02:23:23 -0500 |
parents | |
children | 73281fbdf6c1 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a8a083193440 |
---|---|
1 # Code based on (and inspired by) the Galaxy limma-voom/edgeR/DESeq2 wrappers | |
2 | |
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | |
4 | |
5 # we need that to not crash galaxy with an UTF8 error on German LC settings. | |
6 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
7 | |
8 suppressPackageStartupMessages({ | |
9 library(EGSEA) | |
10 library(limma) | |
11 library(edgeR) | |
12 library(optparse) | |
13 }) | |
14 | |
15 | |
16 ## Function Declaration | |
17 | |
18 sanitiseEquation <- function(equation) { | |
19 equation <- gsub(" *[+] *", "+", equation) | |
20 equation <- gsub(" *[-] *", "-", equation) | |
21 equation <- gsub(" *[/] *", "/", equation) | |
22 equation <- gsub(" *[*] *", "*", equation) | |
23 equation <- gsub("^\\s+|\\s+$", "", equation) | |
24 return(equation) | |
25 } | |
26 | |
27 # Function to sanitise group information | |
28 sanitiseGroups <- function(string) { | |
29 string <- gsub(" *[,] *", ",", string) | |
30 string <- gsub("^\\s+|\\s+$", "", string) | |
31 return(string) | |
32 } | |
33 | |
34 # Generating design information | |
35 pasteListName <- function(string) { | |
36 return(paste0("factors$", string)) | |
37 } | |
38 | |
39 ## Input Processing | |
40 | |
41 option_list <- list( | |
42 make_option(c("-threads", "--threads"), default=2, type="integer", help="Number of threads for egsea"), | |
43 make_option(c("-filesPath", "--filesPath"), type="character", help="JSON list object if multiple files input"), | |
44 make_option(c("-matrixPath", "--matrixPath"), type="character", help="Path to count matrix"), | |
45 make_option(c("-factFile", "--factFile"), type="character", help="Path to factor information file"), | |
46 make_option(c("-factInput", "--factInput"), type="character", help="String containing factors if manually input"), | |
47 make_option(c("-contrastData", "--contrastData"), type="character", help="Contrasts of Interest (Groups to compare)"), | |
48 make_option(c("-genes", "--genes"), type="character", help="Path to genes file"), | |
49 make_option(c("-species", "--species"), type="character"), | |
50 make_option(c("-base_methods", "--base_methods"), type="character", help="Gene set testing methods"), | |
51 make_option(c("-msigdb", "--msigdb"), type="character", help="MSigDB Gene Set Collections"), | |
52 make_option(c("-keggdb", "--keggdb"), type="character", help="KEGG Pathways"), | |
53 make_option(c("-gsdb", "--gsdb"), type="character", help = "GeneSetDB Gene Sets"), | |
54 make_option(c("-display_top", "--display_top"), type="integer", help = "Number of top Gene Sets to display"), | |
55 make_option(c("-min_size", "--min_size"), type="integer", help = "Minimum Size of Gene Set"), | |
56 make_option(c("-fdr_cutoff", "--fdr_cutoff"), type="double", help = "FDR cutoff"), | |
57 make_option(c("-combine_method", "--combine_method"), type="character", help="Method to use to combine the p-values"), | |
58 make_option(c("-sort_method", "--sort_method"), type="character", help="Method to sort the results"), | |
59 make_option(c("-rdata", "--rdaOpt"), type="character", help="Output RData file") | |
60 ) | |
61 | |
62 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) | |
63 args = parse_args(parser) | |
64 | |
65 | |
66 ## Read in Files | |
67 | |
68 if (!is.null(args$filesPath)) { | |
69 # Process the separate count files (adapted from DESeq2 wrapper) | |
70 library("rjson") | |
71 parser <- newJSONParser() | |
72 parser$addData(args$filesPath) | |
73 factorList <- parser$getObject() | |
74 factors <- sapply(factorList, function(x) x[[1]]) | |
75 filenamesIn <- unname(unlist(factorList[[1]][[2]])) | |
76 sampleTable <- data.frame(sample=basename(filenamesIn), | |
77 filename=filenamesIn, | |
78 row.names=filenamesIn, | |
79 stringsAsFactors=FALSE) | |
80 for (factor in factorList) { | |
81 factorName <- factor[[1]] | |
82 sampleTable[[factorName]] <- character(nrow(sampleTable)) | |
83 lvls <- sapply(factor[[2]], function(x) names(x)) | |
84 for (i in seq_along(factor[[2]])) { | |
85 files <- factor[[2]][[i]][[1]] | |
86 sampleTable[files,factorName] <- lvls[i] | |
87 } | |
88 sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls) | |
89 } | |
90 rownames(sampleTable) <- sampleTable$sample | |
91 rem <- c("sample","filename") | |
92 factors <- sampleTable[, !(names(sampleTable) %in% rem), drop=FALSE] | |
93 | |
94 #read in count files and create single table | |
95 countfiles <- lapply(sampleTable$filename, function(x){read.delim(x, row.names=1)}) | |
96 counts <- do.call("cbind", countfiles) | |
97 | |
98 } else { | |
99 # Process the single count matrix | |
100 counts <- read.table(args$matrixPath, header=TRUE, sep="\t", stringsAsFactors=FALSE) | |
101 row.names(counts) <- counts[, 1] | |
102 counts <- counts[ , -1] | |
103 countsRows <- nrow(counts) | |
104 | |
105 # Process factors | |
106 if (is.null(args$factInput)) { | |
107 factorData <- read.table(args$factFile, header=TRUE, sep="\t") | |
108 factors <- factorData[, -1, drop=FALSE] | |
109 } else { | |
110 factors <- unlist(strsplit(args$factInput, "|", fixed=TRUE)) | |
111 factorData <- list() | |
112 for (fact in factors) { | |
113 newFact <- unlist(strsplit(fact, split="::")) | |
114 factorData <- rbind(factorData, newFact) | |
115 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor. | |
116 | |
117 # Set the row names to be the name of the factor and delete first row | |
118 row.names(factorData) <- factorData[, 1] | |
119 factorData <- factorData[, -1] | |
120 factorData <- sapply(factorData, sanitiseGroups) | |
121 factorData <- sapply(factorData, strsplit, split=",") | |
122 factorData <- sapply(factorData, make.names) | |
123 # Transform factor data into data frame of R factor objects | |
124 factors <- data.frame(factorData) | |
125 } | |
126 } | |
127 | |
128 # Create a DGEList object | |
129 counts <- DGEList(counts) | |
130 | |
131 # Set group to be the Primary Factor input | |
132 group <- factors[, 1, drop=FALSE] | |
133 | |
134 # Split up contrasts separated by comma into a vector then sanitise | |
135 contrastData <- unlist(strsplit(args$contrastData, split=",")) | |
136 contrastData <- sanitiseEquation(contrastData) | |
137 contrastData <- gsub(" ", ".", contrastData, fixed=TRUE) | |
138 | |
139 # Creating design | |
140 row.names(factors) <- colnames(counts) | |
141 factorList <- sapply(names(factors), pasteListName) | |
142 | |
143 formula <- "~0" | |
144 for (i in 1:length(factorList)) { | |
145 formula <- paste(formula, factorList[i], sep="+") | |
146 } | |
147 formula <- formula(formula) | |
148 | |
149 design <- model.matrix(formula) | |
150 | |
151 for (i in 1:length(factorList)) { | |
152 colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE) | |
153 } | |
154 | |
155 ## Generate Contrasts information | |
156 contrasts <- makeContrasts(contrasts=contrastData, levels=design) | |
157 | |
158 | |
159 ## Add Gene Symbol information | |
160 | |
161 genes <- read.table(args$genes, sep='\t', header=TRUE) | |
162 | |
163 | |
164 ## Set Gene Set Testing Methods | |
165 | |
166 base_methods <- unlist(strsplit(args$base_methods, ",")) | |
167 | |
168 | |
169 ## Set Gene Sets | |
170 | |
171 if (args$msigdb != "None") { | |
172 msigdb <- unlist(strsplit(args$msigdb, ",")) | |
173 } else { | |
174 msigdb <- "none" | |
175 } | |
176 | |
177 if (args$keggdb != "None") { | |
178 keggdb <- unlist(strsplit(args$keggdb, ",")) | |
179 kegg_all <- c("Metabolism"="keggmet", "Signaling"="keggsig", "Disease"="keggdis") | |
180 kegg_exclude <- names(kegg_all[!(kegg_all %in% keggdb)]) | |
181 } else { | |
182 kegg_exclude <- "all" | |
183 } | |
184 | |
185 if (args$gsdb != "None") { | |
186 gsdb <- unlist(strsplit(args$gsdb, ",")) | |
187 } else { | |
188 gsdb <- "none" | |
189 } | |
190 | |
191 | |
192 ## Index gene sets | |
193 | |
194 gs.annots <- buildIdx(entrezIDs=rownames(counts), species=args$species, msigdb.gsets=msigdb, gsdb.gsets=gsdb, kegg.exclude=kegg_exclude) | |
195 | |
196 | |
197 ## Run egsea.cnt | |
198 | |
199 gsa <- egsea.cnt(counts=counts, group=group, design=design, contrasts=contrasts, gs.annots=gs.annots, symbolsMap=genes, baseGSEAs=base_methods, minSize=args$min_size, display.top=args$display_top, combineMethod=args$combine_method, sort.by=args$sort_method, report.dir='./report_dir', fdr.cutoff=args$fdr_cutoff, num.threads=args$threads, report=TRUE) | |
200 | |
201 | |
202 ## Output RData file | |
203 | |
204 if (!is.null(args$rdata)) { | |
205 save.image(file = "EGSEA_analysis.RData") | |
206 } |