Mercurial > repos > iuc > ruvseq
comparison ruvseq.R @ 2:fed9d0350d72 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ruvseq commit 4daa375d022673d2437d609b1865b78c64b04415"
author | iuc |
---|---|
date | Fri, 15 Jan 2021 17:53:15 +0000 |
parents | c24765926774 |
children | 3a083c78896e |
comparison
equal
deleted
inserted
replaced
1:c24765926774 | 2:fed9d0350d72 |
---|---|
1 # setup R error handling to go to stderr | 1 # setup R error handling to go to stderr |
2 library("getopt") | 2 library("getopt") |
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | 3 options(show.error.messages = F, error = function() { |
4 cat(geterrmessage(), file = stderr()); q("no", 1, F) | |
5 }) | |
4 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) | 6 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) |
5 | 7 |
6 setup_cmdline_options <- function() { | 8 setup_cmdline_options <- function() { |
7 args <- commandArgs(trailingOnly = TRUE) | 9 args <- commandArgs(trailingOnly = TRUE) |
8 spec <- matrix(c( | 10 spec <- matrix(c( |
10 "alpha", "a", 1, "double", | 12 "alpha", "a", 1, "double", |
11 "min_mean_count", "min_c", 1, "double", | 13 "min_mean_count", "min_c", 1, "double", |
12 "min_k", "min_k", 1, "double", | 14 "min_k", "min_k", 1, "double", |
13 "max_k", "max_k", 1, "double", | 15 "max_k", "max_k", 1, "double", |
14 "sample_json", "s", 1, "character", | 16 "sample_json", "s", 1, "character", |
15 "plots" , "p", 1, "character", | 17 "plots", "p", 1, "character", |
16 "header", "H", 0, "logical", | 18 "header", "H", 0, "logical", |
17 "txtype", "y", 1, "character", | 19 "txtype", "y", 1, "character", |
18 "tx2gene", "x", 1, "character"), # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF) | 20 "tx2gene", "x", 1, "character"), # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF) |
19 byrow=TRUE, ncol=4) | 21 byrow = TRUE, ncol = 4) |
20 | 22 |
21 opt <- getopt(spec) | 23 opt <- getopt(spec) |
22 # if help was asked for print a friendly message | 24 # if help was asked for print a friendly message |
23 # and exit with a non-zero error code | 25 # and exit with a non-zero error code |
24 if (!is.null(opt$help)) { | 26 if (!is.null(opt$help)) { |
25 cat(getopt(spec, usage=TRUE)) | 27 cat(getopt(spec, usage = TRUE)) |
26 q(status=1) | 28 q(status = 1) |
27 } else { | 29 } else { |
28 load_libraries() | 30 load_libraries() |
29 } | 31 } |
30 return(opt) | 32 return(opt) |
31 } | 33 } |
40 library("tximport") | 42 library("tximport") |
41 library("DESeq2") | 43 library("DESeq2") |
42 library("ggrepel") | 44 library("ggrepel") |
43 } | 45 } |
44 | 46 |
45 source_local <- function(fname){ | 47 source_local <- function(fname) { |
46 argv <- commandArgs(trailingOnly = FALSE) | 48 argv <- commandArgs(trailingOnly = FALSE) |
47 base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) | 49 base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) |
48 source(paste(base_dir, fname, sep="/")) | 50 source(paste(base_dir, fname, sep = "/")) |
49 } | 51 } |
50 | 52 |
51 # Source get_deseq_dataset.R for getting deseq dataset from htseq/featurecounts/tximport | 53 # Source get_deseq_dataset.R for getting deseq dataset from htseq/featurecounts/tximport |
52 source_local('get_deseq_dataset.R') | 54 source_local("get_deseq_dataset.R") |
53 | 55 |
54 # RUVseq function definitions | 56 # RUVseq function definitions |
55 | 57 |
56 plot_pca_rle <- function (set, title) { | 58 plot_pca_rle <- function(set, title) { |
57 x <- pData(set)[,1] | 59 x <- pData(set)[, 1] |
58 colors <- brewer.pal(3, "Set2") | 60 colors <- brewer.pal(3, "Set2") |
59 label <- paste0(' for ', title) | 61 label <- paste0(" for ", title) |
60 plotRLE(set, outline=FALSE, ylim=c(-4, 4), col=colors[x]) | 62 plotRLE(set, outline = FALSE, ylim = c(-4, 4), col = colors[x]) |
61 title(main=paste0("RLE", label)) | 63 title(main = paste0("RLE", label)) |
62 plotPCA(set, col=colors[x], cex=1.2) | 64 plotPCA(set, col = colors[x], cex = 1.2) |
63 title(main=paste0("PCA", label)) | 65 title(main = paste0("PCA", label)) |
64 } | 66 } |
65 | 67 |
66 plot_factors_of_unwanted_variation <- function(set, method, k){ | 68 plot_factors_of_unwanted_var <- function(set, method, k) { |
67 pd <- pData(set) | 69 pd <- pData(set) |
68 pd['sample'] <- row.names(pd) | 70 pd["sample"] <- row.names(pd) |
69 colnames(pd)[1] <- 'condition' | 71 colnames(pd)[1] <- "condition" |
70 d = melt(pd, id.vars = c('sample', 'condition')) | 72 d <- melt(pd, id.vars = c("sample", "condition")) |
71 d['x'] <- 1 # There is no information on the X, so we just fake it to be able to do a scatterplot | 73 d["x"] <- 1 # There is no information on the X, so we just fake it to be able to do a scatterplot |
72 print(ggplot(d, aes(x=x, y=value, color=condition, label=sample)) + | 74 print(ggplot(d, aes(x = x, y = value, color = condition, label = sample)) + |
73 geom_point() + | 75 geom_point() + |
74 ggtitle(paste0('Factors of unwanted variation for method: ', method, ", k=", k)) + | 76 ggtitle(paste0("Factors of unwanted variation for method: ", method, ", k=", k)) + |
75 facet_wrap( ~ variable, scales = "free_x") + | 77 facet_wrap(~ variable, scales = "free_x") + |
76 geom_text_repel() + | 78 geom_text_repel() + |
77 theme(axis.title.x=element_blank(), | 79 theme(axis.title.x = element_blank(), |
78 axis.text.x=element_blank(), | 80 axis.text.x = element_blank(), |
79 axis.ticks.x=element_blank(), | 81 axis.ticks.x = element_blank(), |
80 plot.title = element_text(hjust = 0.5)) | 82 plot.title = element_text(hjust = 0.5)) |
81 ) | 83 ) |
82 } | 84 } |
83 | 85 |
84 create_seq_expression_set <- function (dds, min_mean_count) { | 86 create_seq_expression_set <- function(dds, min_mean_count) { |
85 count_values <- counts(dds) | 87 count_values <- counts(dds) |
86 print(paste0("feature count before filtering :",nrow(count_values),"\n")) | 88 print(paste0("feature count before filtering :", nrow(count_values), "\n")) |
87 print(paste0("Filtering features which mean expression is less or eq. than ", min_mean_count, " counts\n")) | 89 print(paste0("Filtering features which mean expression is less or eq. than ", min_mean_count, " counts\n")) |
88 filter <- apply(count_values, 1, function(x) mean(x) > min_mean_count) | 90 filter <- apply(count_values, 1, function(x) mean(x) > min_mean_count) |
89 filtered <- count_values[filter,] | 91 filtered <- count_values[filter, ] |
90 print(paste0("feature count after filtering :",nrow(filtered),"\n")) | 92 print(paste0("feature count after filtering :", nrow(filtered), "\n")) |
91 set = newSeqExpressionSet(as.matrix(filtered), | 93 set <- newSeqExpressionSet(as.matrix(filtered), |
92 phenoData = data.frame(colData(dds)$condition, row.names=colnames(filtered))) | 94 phenoData = data.frame(colData(dds)$condition, row.names = colnames(filtered))) |
93 plot_pca_rle(set = set, title = 'raw data') | 95 plot_pca_rle(set = set, title = "raw data") |
94 set <- betweenLaneNormalization(set, which="upper") | 96 set <- betweenLaneNormalization(set, which = "upper") |
95 plot_pca_rle(set = set, title = 'upper quartile normalized') | 97 plot_pca_rle(set = set, title = "upper quartile normalized") |
96 return(set) | 98 return(set) |
97 } | 99 } |
98 | 100 |
99 get_empirical_control_genes <- function(set, cutoff_p) { | 101 get_empirical_control_genes <- function(set, cutoff_p) { |
100 x <- pData(set)[,1] | 102 x <- pData(set)[, 1] |
101 design <- model.matrix(~x, data=pData(set)) | 103 design <- model.matrix(~x, data = pData(set)) |
102 y <- DGEList(counts=counts(set), group=x) | 104 y <- DGEList(counts = counts(set), group = x) |
103 y <- calcNormFactors(y, method="upperquartile") | 105 y <- calcNormFactors(y, method = "upperquartile") |
104 y <- estimateGLMCommonDisp(y, design) | 106 y <- estimateGLMCommonDisp(y, design) |
105 y <- estimateGLMTagwiseDisp(y, design) | 107 y <- estimateGLMTagwiseDisp(y, design) |
106 fit <- glmFit(y, design) | 108 fit <- glmFit(y, design) |
107 lrt <- glmLRT(fit, coef=2) | 109 lrt <- glmLRT(fit, coef = 2) |
108 top <- topTags(lrt, n=nrow(set))$table | 110 top <- topTags(lrt, n = nrow(set))$table |
109 top_rows <- rownames(top)[which(top$PValue < cutoff_p)] | 111 top_rows <- rownames(top)[which(top$PValue < cutoff_p)] |
110 empirical <- rownames(set)[which(!(rownames(set) %in% top_rows))] | 112 empirical <- rownames(set)[which(!(rownames(set) %in% top_rows))] |
111 return(empirical) | 113 return(empirical) |
112 } | 114 } |
113 | 115 |
114 ruv_control_gene_method <- function(set, k, control_genes='empirical', cutoff_p=0.2) { | 116 ruv_control_gene_method <- function(set, k, control_genes = "empirical", cutoff_p = 0.2) { |
115 if (control_genes == 'empirical') { | 117 if (control_genes == "empirical") { |
116 control_genes = get_empirical_control_genes(set, cutoff_p=cutoff_p) | 118 control_genes <- get_empirical_control_genes(set, cutoff_p = cutoff_p) |
117 } | 119 } |
118 set <- RUVg(set, control_genes, k=k) | 120 set <- RUVg(set, control_genes, k = k) |
119 plot_pca_rle(set, paste0("RUVg with empirical control genes, k=", k)) | 121 plot_pca_rle(set, paste0("RUVg with empirical control genes, k=", k)) |
120 plot_factors_of_unwanted_variation(set, method="RUVg with empirical control genes", k=k) | 122 plot_factors_of_unwanted_var(set, method = "RUVg with empirical control genes", k = k) |
121 return(set) | 123 return(set) |
122 } | 124 } |
123 | 125 |
124 ruv_residual_method <- function(set, k) { | 126 ruv_residual_method <- function(set, k) { |
125 genes <- rownames(counts(set)) | 127 genes <- rownames(counts(set)) |
126 x <- pData(set)[,1] | 128 x <- pData(set)[, 1] |
127 # Initial edger residuals | 129 # Initial edger residuals |
128 design <- model.matrix(~x, data=pData(set)) | 130 design <- model.matrix(~x, data = pData(set)) |
129 y <- DGEList(counts=counts(set), group=x) | 131 y <- DGEList(counts = counts(set), group = x) |
130 y <- calcNormFactors(y, method="upperquartile") | 132 y <- calcNormFactors(y, method = "upperquartile") |
131 y <- estimateGLMCommonDisp(y, design) | 133 y <- estimateGLMCommonDisp(y, design) |
132 y <- estimateGLMTagwiseDisp(y, design) | 134 y <- estimateGLMTagwiseDisp(y, design) |
133 fit <- glmFit(y, design) | 135 fit <- glmFit(y, design) |
134 res <- residuals(fit, type="deviance") | 136 res <- residuals(fit, type = "deviance") |
135 set <- RUVr(set, genes, k=k, res) | 137 set <- RUVr(set, genes, k = k, res) |
136 plot_pca_rle(set = set, title = paste0('RUVr using residuals, k=', k)) | 138 plot_pca_rle(set = set, title = paste0("RUVr using residuals, k=", k)) |
137 plot_factors_of_unwanted_variation(set, method="RUVr using residuals", k=k) | 139 plot_factors_of_unwanted_var(set, method = "RUVr using residuals", k = k) |
138 return(set) | 140 return(set) |
139 } | 141 } |
140 | 142 |
141 ruv_replicate_method <- function (set, k) { | 143 ruv_replicate_method <- function(set, k) { |
142 genes <- rownames(counts(set)) | 144 genes <- rownames(counts(set)) |
143 x <- pData(set)[,1] | 145 x <- pData(set)[, 1] |
144 differences <- makeGroups(x) | 146 differences <- makeGroups(x) |
145 set <- RUVs(set, genes, k=k, differences) | 147 set <- RUVs(set, genes, k = k, differences) |
146 plot_pca_rle(set, paste0('RUVs with replicate samples, k=', k)) | 148 plot_pca_rle(set, paste0("RUVs with replicate samples, k=", k)) |
147 plot_factors_of_unwanted_variation(set, method="RUVs using replicates", k=k) | 149 plot_factors_of_unwanted_var(set, method = "RUVs using replicates", k = k) |
148 return(set) | 150 return(set) |
149 } | |
150 | |
151 get_differentially_expressed_genes <- function(dds, contrast, alpha=0.01) { | |
152 r <- results(dds, contrast=contrast, alpha=alpha) | |
153 return(rownames(r[which(r$padj < alpha),])) | |
154 } | 151 } |
155 | 152 |
156 opt <- setup_cmdline_options() | 153 opt <- setup_cmdline_options() |
157 alpha <- opt$alpha | 154 alpha <- opt$alpha |
158 min_k <- opt$min_k | 155 min_k <- opt$min_k |
160 min_c <- opt$min_mean_count | 157 min_c <- opt$min_mean_count |
161 sample_json <- fromJSON(opt$sample_json) | 158 sample_json <- fromJSON(opt$sample_json) |
162 sample_paths <- sample_json$path | 159 sample_paths <- sample_json$path |
163 sample_names <- sample_json$label | 160 sample_names <- sample_json$label |
164 condition <- as.factor(sample_json$condition) | 161 condition <- as.factor(sample_json$condition) |
165 sampleTable <- data.frame(samplename=sample_names, | 162 sample_table <- data.frame(samplename = sample_names, filename = sample_paths, condition = condition) |
166 filename = sample_paths, | 163 rownames(sample_table) <- sample_names |
167 condition=condition) | |
168 rownames(sampleTable) <- sample_names | |
169 | 164 |
170 dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula= ~ condition, tximport=opt$txtype, txtype=opt$txtype, tx2gene=opt$tx2gene) | 165 dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = ~ condition, tximport = opt$txtype, txtype = opt$txtype, tx2gene = opt$tx2gene) |
171 if (!is.null(opt$plots)) { | 166 if (!is.null(opt$plots)) { |
172 pdf(opt$plots) | 167 pdf(opt$plots) |
173 } | 168 } |
174 | 169 |
175 # Run through the ruvseq variants | 170 # Run through the ruvseq variants |
176 set <- create_seq_expression_set(dds, min_mean_count = min_c) | 171 set <- create_seq_expression_set(dds, min_mean_count = min_c) |
177 result <- list(no_correction = set) | 172 result <- list(no_correction = set) |
178 for (k in seq(min_k, max_k)) { | 173 for (k in seq(min_k, max_k)) { |
179 result[[paste0('residual_method_k', k)]] <- ruv_residual_method(set, k=k) | 174 result[[paste0("residual_method_k", k)]] <- ruv_residual_method(set, k = k) |
180 result[[paste0('replicate_method_k', k)]] <- ruv_replicate_method(set, k=k) | 175 result[[paste0("replicate_method_k", k)]] <- ruv_replicate_method(set, k = k) |
181 result[[paste0('control_method_k', k)]] <- ruv_control_gene_method(set, k=k, cutoff_p=0.5) | 176 result[[paste0("control_method_k", k)]] <- ruv_control_gene_method(set, k = k, cutoff_p = 0.5) |
182 } | 177 } |
183 | 178 |
184 for (name in names(result)) { | 179 for (name in names(result)) { |
185 if (!startsWith(name, "no_correction")) { | 180 if (!startsWith(name, "no_correction")) { |
186 set <- result[[name]] | 181 set <- result[[name]] |
187 unwanted_variation <- pData(set) | 182 unwanted_variation <- pData(set) |
188 df <- data.frame(identifier = rownames(unwanted_variation)) | 183 df <- data.frame(identifier = rownames(unwanted_variation)) |
189 df <- cbind(df, unwanted_variation) | 184 df <- cbind(df, unwanted_variation) |
190 colnames(df)[2] <- 'condition' | 185 colnames(df)[2] <- "condition" |
191 write.table(df, file=paste0("batch_effects_", name, ".tabular"), sep="\t", quote=F, row.names=F) | 186 write.table(df, file = paste0("batch_effects_", name, ".tabular"), sep = "\t", quote = F, row.names = F) |
192 } | 187 } |
193 } | 188 } |
194 | 189 |
195 # close the plot device | 190 # close the plot device |
196 if (!is.null(opt$plots)) { | 191 if (!is.null(opt$plots)) { |