comparison egsea.R @ 4:fba1660fb717 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/egsea commit c2313b506b3b8ae860bb844b979397d87de4fb44"
author iuc
date Mon, 28 Jun 2021 09:45:14 +0000
parents ba2111ae6eb4
children
comparison
equal deleted inserted replaced
3:31ea4992b948 4:fba1660fb717
1 # Code based on (and inspired by) the Galaxy limma-voom/edgeR/DESeq2 wrappers 1 # Code based on (and inspired by) the Galaxy limma-voom/edgeR/DESeq2 wrappers
2 2
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) 3 options(show.error.messages = F, error = function() {
4 cat(geterrmessage(), file = stderr()); q("no", 1, F)
5 })
4 6
5 # we need that to not crash galaxy with an UTF8 error on German LC settings. 7 # we need that to not crash galaxy with an UTF8 error on German LC settings.
6 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 8 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
7 9
8 suppressPackageStartupMessages({ 10 suppressPackageStartupMessages({
13 }) 15 })
14 16
15 17
16 ## Function Declaration 18 ## Function Declaration
17 19
18 sanitiseEquation <- function(equation) { 20 sanitise_equation <- function(equation) {
19 equation <- gsub(" *[+] *", "+", equation) 21 equation <- gsub(" *[+] *", "+", equation)
20 equation <- gsub(" *[-] *", "-", equation) 22 equation <- gsub(" *[-] *", "-", equation)
21 equation <- gsub(" *[/] *", "/", equation) 23 equation <- gsub(" *[/] *", "/", equation)
22 equation <- gsub(" *[*] *", "*", equation) 24 equation <- gsub(" *[*] *", "*", equation)
23 equation <- gsub("^\\s+|\\s+$", "", equation) 25 equation <- gsub("^\\s+|\\s+$", "", equation)
24 return(equation) 26 return(equation)
25 } 27 }
26 28
27 # Function to sanitise group information 29 # Function to sanitise group information
28 sanitiseGroups <- function(string) { 30 sanitise_groups <- function(string) {
29 string <- gsub(" *[,] *", ",", string) 31 string <- gsub(" *[,] *", ",", string)
30 string <- gsub("^\\s+|\\s+$", "", string) 32 string <- gsub("^\\s+|\\s+$", "", string)
31 return(string) 33 return(string)
32 } 34 }
33 35
34 # Generating design information 36 # Generating design information
35 pasteListName <- function(string) { 37 paste_listname <- function(string) {
36 return(paste0("factors$", string)) 38 return(paste0("factors$", string))
37 } 39 }
38 40
39 ## Input Processing 41 ## Input Processing
40 42
41 option_list <- list( 43 option_list <- list(
42 make_option(c("-threads", "--threads"), default=2, type="integer", help="Number of threads for egsea"), 44 make_option("--threads", default = 2, type = "integer", help = "Number of threads for egsea"),
43 make_option(c("-filesPath", "--filesPath"), type="character", help="JSON list object if multiple files input"), 45 make_option("--filesPath", type = "character", help = "JSON list object if multiple files input"),
44 make_option(c("-matrixPath", "--matrixPath"), type="character", help="Path to count matrix"), 46 make_option("--matrixPath", type = "character", help = "Path to count matrix"),
45 make_option(c("-factFile", "--factFile"), type="character", help="Path to factor information file"), 47 make_option("--factFile", type = "character", help = "Path to factor information file"),
46 make_option(c("-factInput", "--factInput"), type="character", help="String containing factors if manually input"), 48 make_option("--factInput", type = "character", help = "String containing factors if manually input"),
47 make_option(c("-contrastData", "--contrastData"), type="character", help="Contrasts of Interest (Groups to compare)"), 49 make_option("--contrastData", type = "character", help = "Contrasts of Interest (Groups to compare)"),
48 make_option(c("-genes", "--genes"), type="character", help="Path to genes file"), 50 make_option("--genes", type = "character", help = "Path to genes file"),
49 make_option(c("-species", "--species"), type="character"), 51 make_option("--species", type = "character"),
50 make_option(c("-base_methods", "--base_methods"), type="character", help="Gene set testing methods"), 52 make_option("--base_methods", type = "character", help = "Gene set testing methods"),
51 make_option(c("-msigdb", "--msigdb"), type="character", help="MSigDB Gene Set Collections"), 53 make_option("--msigdb", type = "character", help = "MSigDB Gene Set Collections"),
52 make_option(c("-keggdb", "--keggdb"), type="character", help="KEGG Pathways"), 54 make_option("--keggdb", type = "character", help = "KEGG Pathways"),
53 make_option(c("-keggupdated", "--keggupdated"), type="logical", help="Use updated KEGG"), 55 make_option("--keggupdated", type = "logical", help = "Use updated KEGG"),
54 make_option(c("-gsdb", "--gsdb"), type="character", help = "GeneSetDB Gene Sets"), 56 make_option("--gsdb", type = "character", help = "GeneSetDB Gene Sets"),
55 make_option(c("-display_top", "--display_top"), type="integer", help = "Number of top Gene Sets to display"), 57 make_option("--display_top", type = "integer", help = "Number of top Gene Sets to display"),
56 make_option(c("-min_size", "--min_size"), type="integer", help = "Minimum Size of Gene Set"), 58 make_option("--min_size", type = "integer", help = "Minimum Size of Gene Set"),
57 make_option(c("-fdr_cutoff", "--fdr_cutoff"), type="double", help = "FDR cutoff"), 59 make_option("--fdr_cutoff", type = "double", help = "FDR cutoff"),
58 make_option(c("-combine_method", "--combine_method"), type="character", help="Method to use to combine the p-values"), 60 make_option("--combine_method", type = "character", help = "Method to use to combine the p-values"),
59 make_option(c("-sort_method", "--sort_method"), type="character", help="Method to sort the results"), 61 make_option("--sort_method", type = "character", help = "Method to sort the results"),
60 make_option(c("-rdaOpt", "--rdaOpt"), type="character", help="Output RData file") 62 make_option("--rdaOpt", type = "character", help = "Output RData file")
61 ) 63 )
62 64
63 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 65 parser <- OptionParser(usage = "%prog [options] file", option_list = option_list)
64 args = parse_args(parser) 66 args <- parse_args(parser)
65 67
66 68
67 ## Read in Files 69 ## Read in Files
68 70
69 if (!is.null(args$filesPath)) { 71 if (!is.null(args$filesPath)) {
70 # Process the separate count files (adapted from DESeq2 wrapper) 72 # Process the separate count files (adapted from DESeq2 wrapper)
71 library("rjson") 73 library("rjson")
72 parser <- newJSONParser() 74 parser <- newJSONParser()
73 parser$addData(args$filesPath) 75 parser$addData(args$filesPath)
74 factorList <- parser$getObject() 76 factor_list <- parser$getObject()
75 factors <- sapply(factorList, function(x) x[[1]]) 77 factors <- sapply(factor_list, function(x) x[[1]])
76 filenamesIn <- unname(unlist(factorList[[1]][[2]])) 78 filenames_in <- unname(unlist(factor_list[[1]][[2]]))
77 sampleTable <- data.frame(sample=basename(filenamesIn), 79 sampletable <- data.frame(sample = basename(filenames_in),
78 filename=filenamesIn, 80 filename = filenames_in,
79 row.names=filenamesIn, 81 row.names = filenames_in,
80 stringsAsFactors=FALSE) 82 stringsAsFactors = FALSE)
81 for (factor in factorList) { 83 for (factor in factor_list) {
82 factorName <- factor[[1]] 84 factorname <- factor[[1]]
83 sampleTable[[factorName]] <- character(nrow(sampleTable)) 85 sampletable[[factorname]] <- character(nrow(sampletable))
84 lvls <- sapply(factor[[2]], function(x) names(x)) 86 lvls <- sapply(factor[[2]], function(x) names(x))
85 for (i in seq_along(factor[[2]])) { 87 for (i in seq_along(factor[[2]])) {
86 files <- factor[[2]][[i]][[1]] 88 files <- factor[[2]][[i]][[1]]
87 sampleTable[files,factorName] <- lvls[i] 89 sampletable[files, factorname] <- lvls[i]
88 } 90 }
89 sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls) 91 sampletable[[factorname]] <- factor(sampletable[[factorname]], levels = lvls)
90 } 92 }
91 rownames(sampleTable) <- sampleTable$sample 93 rownames(sampletable) <- sampletable$sample
92 rem <- c("sample","filename") 94 rem <- c("sample", "filename")
93 factors <- sampleTable[, !(names(sampleTable) %in% rem), drop=FALSE] 95 factors <- sampletable[, !(names(sampletable) %in% rem), drop = FALSE]
94 96
95 #read in count files and create single table 97 #read in count files and create single table
96 countfiles <- lapply(sampleTable$filename, function(x){read.delim(x, row.names=1)}) 98 countfiles <- lapply(sampletable$filename, function(x) {
99 read.delim(x, row.names = 1)
100 })
97 counts <- do.call("cbind", countfiles) 101 counts <- do.call("cbind", countfiles)
98 102
99 } else { 103 } else {
100 # Process the single count matrix 104 # Process the single count matrix
101 counts <- read.table(args$matrixPath, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names=FALSE) 105 counts <- read.table(args$matrixPath, header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
102 row.names(counts) <- counts[, 1] 106 row.names(counts) <- counts[, 1]
103 counts <- counts[ , -1] 107 counts <- counts[, -1]
104 countsRows <- nrow(counts) 108 countsrows <- nrow(counts)
105 109
106 # Process factors 110 # Process factors
107 if (is.null(args$factInput)) { 111 if (is.null(args$factInput)) {
108 factorData <- read.table(args$factFile, header=TRUE, sep="\t", strip.white=TRUE) 112 factordata <- read.table(args$factFile, header = TRUE, sep = "\t", strip.white = TRUE, stringsAsFactors = TRUE)
109 # check samples names match 113 # check samples names match
110 if(!any(factorData[, 1] %in% colnames(counts))) 114 if (!any(factordata[, 1] %in% colnames(counts)))
111 stop("Sample IDs in factors file and count matrix don't match") 115 stop("Sample IDs in factors file and count matrix don't match")
112 # order samples as in counts matrix 116 # order samples as in counts matrix
113 factorData <- factorData[match(colnames(counts), factorData[, 1]), ] 117 factordata <- factordata[match(colnames(counts), factordata[, 1]), ]
114 factors <- factorData[, -1, drop=FALSE] 118 factors <- factordata[, -1, drop = FALSE]
115 } else { 119 } else {
116 factors <- unlist(strsplit(args$factInput, "|", fixed=TRUE)) 120 factors <- unlist(strsplit(args$factInput, "|", fixed = TRUE))
117 factorData <- list() 121 factordata <- list()
118 for (fact in factors) { 122 for (fact in factors) {
119 newFact <- unlist(strsplit(fact, split="::")) 123 newfact <- unlist(strsplit(fact, split = "::"))
120 factorData <- rbind(factorData, newFact) 124 factordata <- rbind(factordata, newfact)
121 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor. 125 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor.
122 126
123 # Set the row names to be the name of the factor and delete first row 127 # Set the row names to be the name of the factor and delete first row
124 row.names(factorData) <- factorData[, 1] 128 row.names(factordata) <- factordata[, 1]
125 factorData <- factorData[, -1] 129 factordata <- factordata[, -1]
126 factorData <- sapply(factorData, sanitiseGroups) 130 factordata <- sapply(factordata, sanitise_groups)
127 factorData <- sapply(factorData, strsplit, split=",") 131 factordata <- sapply(factordata, strsplit, split = ",")
128 factorData <- sapply(factorData, make.names) 132 factordata <- sapply(factordata, make.names)
129 # Transform factor data into data frame of R factor objects 133 # Transform factor data into data frame of R factor objects
130 factors <- data.frame(factorData) 134 factors <- data.frame(factordata, stringsAsFactors = TRUE)
131 } 135 }
132 } 136 }
133 137
134 # Create a DGEList object 138 # Create a DGEList object
135 counts <- DGEList(counts) 139 counts <- DGEList(counts)
136 140
137 # Set group to be the Primary Factor input 141 # Set group to be the Primary Factor input
138 group <- factors[, 1, drop=FALSE] 142 group <- factors[, 1, drop = FALSE]
139 143
140 # Split up contrasts separated by comma into a vector then sanitise 144 # Split up contrasts separated by comma into a vector then sanitise
141 contrastData <- unlist(strsplit(args$contrastData, split=",")) 145 contrast_data <- unlist(strsplit(args$contrastData, split = ","))
142 contrastData <- sanitiseEquation(contrastData) 146 contrast_data <- sanitise_equation(contrast_data)
143 contrastData <- gsub(" ", ".", contrastData, fixed=TRUE) 147 contrast_data <- gsub(" ", ".", contrast_data, fixed = TRUE)
144 148
145 # Creating design 149 # Creating design
146 row.names(factors) <- colnames(counts) 150 row.names(factors) <- colnames(counts)
147 factorList <- sapply(names(factors), pasteListName) 151 factor_list <- sapply(names(factors), paste_listname)
148 152
149 formula <- "~0" 153 formula <- "~0"
150 for (i in 1:length(factorList)) { 154 for (i in seq_along(factor_list)) {
151 formula <- paste(formula, factorList[i], sep="+") 155 formula <- paste(formula, factor_list[i], sep = "+")
152 } 156 }
153 formula <- formula(formula) 157 formula <- formula(formula)
154 158
155 design <- model.matrix(formula) 159 design <- model.matrix(formula)
156 160
157 for (i in 1:length(factorList)) { 161 for (i in seq_along(factor_list)) {
158 colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE) 162 colnames(design) <- gsub(factor_list[i], "", colnames(design), fixed = TRUE)
159 } 163 }
160 164
161 ## Generate Contrasts information 165 ## Generate Contrasts information
162 contrasts <- makeContrasts(contrasts=contrastData, levels=design) 166 contrasts <- makeContrasts(contrasts = contrast_data, levels = design)
163 167
164 168
165 ## Add Gene Symbol information 169 ## Add Gene Symbol information
166 170
167 genes <- read.table(args$genes, sep='\t', header=TRUE) 171 genes <- read.table(args$genes, sep = "\t", header = TRUE)
168 172
169 173
170 ## Set Gene Set Testing Methods 174 ## Set Gene Set Testing Methods
171 175
172 base_methods <- unlist(strsplit(args$base_methods, ",")) 176 base_methods <- unlist(strsplit(args$base_methods, ","))
180 msigdb <- "none" 184 msigdb <- "none"
181 } 185 }
182 186
183 if (args$keggdb != "None") { 187 if (args$keggdb != "None") {
184 keggdb <- unlist(strsplit(args$keggdb, ",")) 188 keggdb <- unlist(strsplit(args$keggdb, ","))
185 kegg_all <- c("Metabolism"="keggmet", "Signaling"="keggsig", "Disease"="keggdis") 189 kegg_all <- c("Metabolism" = "keggmet", "Signaling" = "keggsig", "Disease" = "keggdis")
186 kegg_exclude <- names(kegg_all[!(kegg_all %in% keggdb)]) 190 kegg_exclude <- names(kegg_all[!(kegg_all %in% keggdb)])
187 } else { 191 } else {
188 kegg_exclude <- "all" 192 kegg_exclude <- "all"
189 } 193 }
190 194
194 gsdb <- "none" 198 gsdb <- "none"
195 } 199 }
196 200
197 ## Index gene sets 201 ## Index gene sets
198 202
199 gs.annots <- buildIdx(entrezIDs=rownames(counts), species=args$species, msigdb.gsets=msigdb, gsdb.gsets=gsdb, kegg.exclude=kegg_exclude, kegg.updated=args$keggupdated) 203 gs_annots <- buildIdx(entrezIDs = rownames(counts), species = args$species, msigdb.gsets = msigdb, gsdb.gsets = gsdb, kegg.exclude = kegg_exclude, kegg.updated = args$keggupdated)
200 204
201 205
202 ## Run egsea.cnt 206 ## Run egsea.cnt
203 207
204 gsa <- egsea.cnt(counts=counts, group=group, design=design, contrasts=contrasts, gs.annots=gs.annots, symbolsMap=genes, baseGSEAs=base_methods, minSize=args$min_size, display.top=args$display_top, combineMethod=args$combine_method, sort.by=args$sort_method, report.dir='./report_dir', fdr.cutoff=args$fdr_cutoff, num.threads=args$threads, report=TRUE) 208 gsa <- egsea.cnt(counts = counts, group = group, design = design, contrasts = contrasts, gs.annots = gs_annots, symbolsMap = genes, baseGSEAs = base_methods, minSize = args$min_size, display.top = args$display_top, combineMethod = args$combine_method, sort.by = args$sort_method, report.dir = "./report_dir", fdr.cutoff = args$fdr_cutoff, num.threads = args$threads, report = TRUE)
205 209
206 210
207 ## Output RData file 211 ## Output RData file
208 212
209 if (!is.null(args$rdaOpt)) { 213 if (!is.null(args$rdaOpt)) {