Mercurial > repos > rnateam > dewseq
comparison analyseStudy.Rmd @ 0:e1cb2e012307 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/dewseq commit 71db0e65b3b306904ae2b17ce3de677244aea776"
| author | rnateam |
|---|---|
| date | Thu, 20 Oct 2022 08:18:30 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e1cb2e012307 |
|---|---|
| 1 --- | |
| 2 title: "DEWSeq Analysis" | |
| 3 author: "Thomas Schwarzl and Sudeep Sahadevan" | |
| 4 date: "07/10/2020" | |
| 5 output: | |
| 6 BiocStyle::html_document: | |
| 7 toc: true | |
| 8 toc_float: true | |
| 9 toc_depth: 5 | |
| 10 params: | |
| 11 protein: "" | |
| 12 sampleinfo_file: "" | |
| 13 countmatrix_file: "" | |
| 14 annotation_file: "" | |
| 15 output_windows_file: "" | |
| 16 output_regions_file: "" | |
| 17 output_bed_file: "" | |
| 18 output_rdata: "" | |
| 19 min_count: 2 | |
| 20 min_sample: 2 | |
| 21 LRT: FALSE | |
| 22 p_value_cutoff: 0.05 | |
| 23 lfc_cutoff: 1 | |
| 24 overlap_correction: FALSE | |
| 25 IHW: TRUE | |
| 26 decide_fit: TRUE | |
| 27 --- | |
| 28 ```{r setup, include=FALSE} | |
| 29 knitr::opts_chunk$set(echo = TRUE) | |
| 30 ``` | |
| 31 | |
| 32 | |
| 33 ```{r assignData, echo=FALSE, eval=TRUE} | |
| 34 protein <- params$protein | |
| 35 sampleinfo_file <- params$sampleinfo_file | |
| 36 countmatrix_file <- params$countmatrix_file | |
| 37 annotation_file <- params$annotation_file | |
| 38 output_windows_file <- params$output_windows_file | |
| 39 output_regions_file <- params$output_regions_file | |
| 40 output_bed_file <- params$output_bed_file | |
| 41 output_rdata <- params$output_rdata | |
| 42 min_count <- params$min_count | |
| 43 min_sample <- params$min_sample | |
| 44 p_value_cutoff <- params$p_value_cutoff | |
| 45 lfc_cutoff <- params$lfc_cutoff | |
| 46 ``` | |
| 47 | |
| 48 Sanity check input parameter values | |
| 49 | |
| 50 ```{r sanityCheck, echo=FALSE, eval=TRUE} | |
| 51 # round of min_count and min_sample values first | |
| 52 message("Any decimals given as values for min_count and min_sample parameters will be rounded off to the nearest integer.") | |
| 53 min_count <- round(min_count) | |
| 54 min_sample <- round(min_sample) | |
| 55 # receive and sanity check p_value_cutoff | |
| 56 if (p_value_cutoff <= 0 || p_value_cutoff >= 1) { | |
| 57 warning("p_value_cutoff must satisfy: 0<=p_value_cutoff<=1. Resetting to default value: 0.05") | |
| 58 p_value_cutoff <- 0.05 | |
| 59 } | |
| 60 # sanity check log2Foldchange cutoff | |
| 61 if (lfc_cutoff < 0) { | |
| 62 warning("lfc_cutoff must be a value >=0. Resetting to default value: 1.00") | |
| 63 lfc_cutoff <- 1.0 | |
| 64 } | |
| 65 # sanity check LRT vs Wald | |
| 66 if (is(params$LRT, "logical")) { | |
| 67 lrt <- params$LRT | |
| 68 }else { | |
| 69 warning("LRT must be TRUE or FALSE, setting this parameter to default: FALSE") | |
| 70 lrt <- FALSE | |
| 71 } | |
| 72 # sanity check overlap correction parameter | |
| 73 if (is(params$overlap_correction, "logical")) { | |
| 74 overlap_correction <- params$overlap_correction | |
| 75 }else { | |
| 76 warning("overlap_correction must be TRUE or FALSE, setting this parameter to default: TRUE") | |
| 77 overlap_correction <- TRUE | |
| 78 } | |
| 79 # sanity check IHW vs BH correction parameter | |
| 80 if (is(params$IHW, "logical")) { | |
| 81 ihw_filt <- params$IHW | |
| 82 }else { | |
| 83 warning("IHW must be TRUE or FALSE, setting this parameter to default: TRUE") | |
| 84 ihw_filt <- TRUE | |
| 85 } | |
| 86 # sanity check automated fit vs parametric fit paramter | |
| 87 if (is(params$decide_fit, "logical")) { | |
| 88 decide_fit <- params$decide_fit | |
| 89 }else { | |
| 90 warning("decide_fit must be TRUE or FALSE, setting this parameter to default: TRUE") | |
| 91 decide_fit <- TRUE | |
| 92 } | |
| 93 ``` | |
| 94 | |
| 95 # eCLIP analysis of `r protein` | |
| 96 | |
| 97 | |
| 98 ## Setup | |
| 99 | |
| 100 This is the analysis of __`r protein`__ with | |
| 101 sampleinfo file: ``r sampleinfo_file`` | |
| 102 countmatrix file: ``r countmatrix_file`` and | |
| 103 annotation file: ``r annotation_file`` | |
| 104 | |
| 105 with the following threshold: | |
| 106 | |
| 107 minimum read count per window per sample: ``r min_count`` | |
| 108 number of samples with minimum read count per window: ``r min_sample`` | |
| 109 | |
| 110 using the following parameters: | |
| 111 p-value cut-off: ``r p_value_cutoff`` | |
| 112 Log2FoldChange cut-off: ``r lfc_cutoff`` | |
| 113 use automated method for dispersion estmation: ``r decide_fit`` | |
| 114 use LRT test : ``r lrt`` | |
| 115 use overlap correction: ``r overlap_correction`` | |
| 116 use IHW for FDR correction: ``r ihw_filt`` | |
| 117 | |
| 118 | |
| 119 ``` {r check if files exists, echo=FALSE} | |
| 120 file_exists <- function(x) { | |
| 121 if (!file.exists(x)) | |
| 122 stop(paste0("'", x, "' - file does not exist.")) | |
| 123 } | |
| 124 | |
| 125 file_exists(countmatrix_file) | |
| 126 file_exists(annotation_file) | |
| 127 file_exists(sampleinfo_file) | |
| 128 ``` | |
| 129 | |
| 130 First, we load the libraries. | |
| 131 | |
| 132 ```{r load DEWSeq} | |
| 133 required_packages <- c("DEWSeq", "data.table", "IHW", "R.utils", "tidyverse", "ggrepel") | |
| 134 installed_packages <- installed.packages()[, 1] | |
| 135 missing_packages <- setdiff(required_packages, installed_packages) | |
| 136 if (length(missing_packages) != 0) { | |
| 137 stop("Found missing dependencies! Please install the following package(s): ", paste(missing_packages, collapse = ", ")) | |
| 138 } | |
| 139 suppressPackageStartupMessages({ | |
| 140 require(DEWSeq) | |
| 141 require(tidyverse) | |
| 142 require(data.table) | |
| 143 require(IHW) | |
| 144 require(R.utils) | |
| 145 require(ggrepel) | |
| 146 }) | |
| 147 ``` | |
| 148 | |
| 149 ## Read in data | |
| 150 | |
| 151 Here we read in the window counts | |
| 152 | |
| 153 ```{r read window counts} | |
| 154 window_counts <- fread(countmatrix_file, sep = "\t", stringsAsFactors = FALSE) %>% as.data.frame() | |
| 155 rownames(window_counts) <- window_counts[, 1] | |
| 156 window_counts <- window_counts[, -1] | |
| 157 ``` | |
| 158 | |
| 159 and the sample info file | |
| 160 | |
| 161 ```{r read sample info} | |
| 162 sample_info <- read.table(sampleinfo_file, sep = "\t", stringsAsFactors = FALSE) | |
| 163 if (ncol(sample_info) < 2) { | |
| 164 stop("sampleinfo_file ", sampleinfo_file, " MUST have atleast two columns: first column should be the sample names used in ", countmatrix_file, | |
| 165 " and second column must be the experiment type: IP or SMI") | |
| 166 }else if (ncol(sample_info) > 2) { | |
| 167 message("Found ", ncol(sample_info), " columns in ", sampleinfo_file, " using the first column as sample name and second column as experiment name") | |
| 168 sample_info <- sample_info[, c(1, 2)] | |
| 169 } | |
| 170 colnames(sample_info) <- c("samples", "type") | |
| 171 rownames(sample_info) <- sample_info[, 1] | |
| 172 ``` | |
| 173 | |
| 174 Now we make sure that the sampleinfo file contains the column "type" with values "SMI" and "IP" only. | |
| 175 | |
| 176 ```{r sampleSanity} | |
| 177 # make sure that sample_info rows and window_counts columns are in same order | |
| 178 common_samples <- sort(intersect(colnames(window_counts), rownames(sample_info))) | |
| 179 if (length(common_samples) != ncol(window_counts)) { | |
| 180 stop("The number of samples in ", countmatrix_file, " and ", sampleinfo_file, " do not MATCH!") | |
| 181 } | |
| 182 sample_info <- sample_info[common_samples, ] | |
| 183 window_counts <- window_counts[, common_samples] | |
| 184 # Now make sure that sample_info$type contains only "IP" and "SMI" | |
| 185 type_check <- setdiff(unique(sample_info$type), c("IP", "SMI")) | |
| 186 if (length(type_check) != 0) { | |
| 187 stop("The second column in ", sampleinfo_file, " should contain analysis types:'IP' or 'SMI' only. Found unknown value(s): ", | |
| 188 paste(type_check, collapse = ", ")) | |
| 189 } | |
| 190 ``` | |
| 191 | |
| 192 We make sure that only IP and SMI are in the right factor level order | |
| 193 | |
| 194 ```{r sampleFactor} | |
| 195 sample_info <- sample_info %>% mutate(type = factor(type, levels = c("SMI", "IP"))) | |
| 196 ``` | |
| 197 | |
| 198 | |
| 199 | |
| 200 We create the DEWSeq object | |
| 201 | |
| 202 ```{r dewseqInit} | |
| 203 ddw <- DESeqDataSetFromSlidingWindows(countData = window_counts, | |
| 204 colData = sample_info, | |
| 205 annotObj = annotation_file, | |
| 206 tidy = FALSE, | |
| 207 design = ~type) | |
| 208 ``` | |
| 209 | |
| 210 | |
| 211 ## Prefiltering | |
| 212 | |
| 213 ```{r prefiltering1} | |
| 214 # remove all empty windows | |
| 215 keep <- rowSums(counts(ddw)) >= 1 | |
| 216 ddw <- ddw[keep, ] | |
| 217 ``` | |
| 218 | |
| 219 | |
| 220 ## Estimating size factors | |
| 221 | |
| 222 | |
| 223 ```{r size factors} | |
| 224 ddw <- estimateSizeFactors(ddw) | |
| 225 sizeFactors(ddw) | |
| 226 ``` | |
| 227 | |
| 228 ### estimate size factors for only protein_coding genes | |
| 229 | |
| 230 ```{r protein_coding_size_factors} | |
| 231 ddw_mrnas <- ddw[rowData(ddw)[, "gene_type"] == "protein_coding", ] | |
| 232 ddw_mrnas <- estimateSizeFactors(ddw_mrnas) | |
| 233 ``` | |
| 234 | |
| 235 ### estimate size factors without significant windows | |
| 236 | |
| 237 ```{r size_factors_no_sig_windows} | |
| 238 ddw_tmp <- ddw | |
| 239 ddw_tmp <- estimateDispersions(ddw_tmp, fitType = "local", quiet = TRUE) | |
| 240 if (lrt) { | |
| 241 ddw_tmp <- nbinomLRT(ddw_tmp, full = ~type, reduced = ~1) | |
| 242 }else { | |
| 243 ddw_tmp <- nbinomWaldTest(ddw_tmp) | |
| 244 } | |
| 245 | |
| 246 tmp_significant_windows <- | |
| 247 results(ddw_tmp, | |
| 248 contrast = c("type", "IP", "SMI"), | |
| 249 tidy = TRUE, | |
| 250 filterFun = ihw) %>% | |
| 251 dplyr::filter(padj < p_value_cutoff) %>% | |
| 252 .[["row"]] | |
| 253 rm(ddw_tmp) | |
| 254 ``` | |
| 255 | |
| 256 estimate the size factors without the significant windows. | |
| 257 | |
| 258 ```{r final_size_factors} | |
| 259 ddw_mrnas <- ddw_mrnas[!rownames(ddw_mrnas) %in% tmp_significant_windows, ] | |
| 260 ddw_mrnas <- estimateSizeFactors(ddw_mrnas) | |
| 261 ``` | |
| 262 | |
| 263 before thresholding: | |
| 264 ```{r threshold1} | |
| 265 dim(ddw) | |
| 266 ``` | |
| 267 | |
| 268 Now threshold the windows read count table. | |
| 269 ```{r threshold2} | |
| 270 keep_exp <- which(rowSums(counts(ddw) > min_count) >= min_sample) | |
| 271 ddw <- ddw[keep_exp, ] | |
| 272 ``` | |
| 273 | |
| 274 after thresholding: | |
| 275 ```{r threshold3} | |
| 276 dim(ddw) | |
| 277 ``` | |
| 278 assign size factors | |
| 279 | |
| 280 ```{r final_assign} | |
| 281 sizeFactors(ddw) <- sizeFactors(ddw_mrnas) | |
| 282 rm(list = c("tmp_significant_windows", "ddw_mrnas")) | |
| 283 sizeFactors(ddw) | |
| 284 ``` | |
| 285 | |
| 286 | |
| 287 ## Differential window analysis | |
| 288 | |
| 289 ### Dispersion estimates | |
| 290 | |
| 291 ```{r source, echo = FALSE, eval = FALSE} | |
| 292 # source: https://support.bioconductor.org/p/81094/ | |
| 293 ``` | |
| 294 We fit parametric and local fit, and decide the best fit following this [Bioconductor post](https://support.bioconductor.org/p/81094/) | |
| 295 | |
| 296 ```{r parametric_dispersion} | |
| 297 parametric_ddw <- estimateDispersions(ddw, fitType = "parametric") | |
| 298 if (decide_fit) { | |
| 299 local_ddw <- estimateDispersions(ddw, fitType = "local") | |
| 300 } | |
| 301 | |
| 302 ``` | |
| 303 | |
| 304 This is the dispersion estimate for parametric fit | |
| 305 ```{r plot parametric fit, fig.wide=TRUE} | |
| 306 plotDispEsts(parametric_ddw, main = "Parametric fit") | |
| 307 ``` | |
| 308 | |
| 309 This is the dispersion estimate for local fit, given automated decision fitting is enabled: | |
| 310 ```{r plot local fit, fig.wide = TRUE} | |
| 311 if (decide_fit) { | |
| 312 plotDispEsts(local_ddw, main = "Local fit") | |
| 313 } | |
| 314 | |
| 315 ``` | |
| 316 | |
| 317 This will get the residuals for either fit, only for automated decision fitting | |
| 318 ```{r residual fit} | |
| 319 parametric_resid <- na.omit(with(mcols(parametric_ddw), abs(log(dispGeneEst) - log(dispFit)))) | |
| 320 if (decide_fit) { | |
| 321 local_resid <- na.omit(with(mcols(local_ddw), abs(log(dispGeneEst) - log(dispFit)))) | |
| 322 resid_df <- data.frame(residuals = c(parametric_resid, local_resid), | |
| 323 fitType = c(rep("parametric", length(parametric_resid)), | |
| 324 rep("local", length(local_resid)))) | |
| 325 summary(resid_df) | |
| 326 } | |
| 327 | |
| 328 ``` | |
| 329 | |
| 330 and we plot histograms of the fits | |
| 331 | |
| 332 ```{r plot residual histograms, fig.wide = TRUE} | |
| 333 if (decide_fit) { | |
| 334 ggplot(resid_df, aes(x = residuals, fill = fitType)) + | |
| 335 scale_fill_manual(values = c("darkred", "darkblue")) + geom_histogram(alpha = 0.5, position = "identity", bins = 100) + | |
| 336 theme_bw() | |
| 337 } | |
| 338 | |
| 339 ``` | |
| 340 | |
| 341 Now, we will decide for the better fit based on median | |
| 342 | |
| 343 ```{r choose_fit} | |
| 344 summary(parametric_resid) | |
| 345 if (decide_fit) { | |
| 346 summary(local_resid) | |
| 347 if (median(local_resid) <= median(parametric_resid)) { | |
| 348 cat("chosen fitType: local") | |
| 349 ddw <- local_ddw | |
| 350 }else { | |
| 351 cat("chosen fitType: parametric") | |
| 352 ddw <- parametric_ddw | |
| 353 } | |
| 354 rm(local_ddw, parametric_ddw, resid_df, parametric_resid, local_resid) | |
| 355 }else { | |
| 356 ddw <- parametric_ddw | |
| 357 rm(parametric_ddw) | |
| 358 } | |
| 359 | |
| 360 ``` | |
| 361 | |
| 362 | |
| 363 | |
| 364 ### Wald test or LRT | |
| 365 | |
| 366 ```{r wald_or_LRT} | |
| 367 if (lrt) { | |
| 368 ddw <- nbinomLRT(ddw, full = ~type, reduced = ~1) | |
| 369 }else { | |
| 370 ddw <- nbinomWaldTest(ddw) | |
| 371 } | |
| 372 | |
| 373 ``` | |
| 374 | |
| 375 | |
| 376 ### Significance testing | |
| 377 | |
| 378 ```{r sig_windows} | |
| 379 result_windows <- resultsDEWSeq(ddw, | |
| 380 contrast = c("type", "IP", "SMI"), | |
| 381 tidy = TRUE) %>% as_tibble | |
| 382 | |
| 383 result_windows | |
| 384 ``` | |
| 385 | |
| 386 | |
| 387 ### Multiple hypothesis correction with IHW | |
| 388 | |
| 389 You might be interested to correct for multiple hypothesis testing with IHW. | |
| 390 | |
| 391 Decide on overlap correction based on the parameter `overlap_correction` | |
| 392 | |
| 393 ```{r ihw} | |
| 394 if (overlap_correction && ihw_filt) { | |
| 395 result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pSlidingWindows ~ baseMean, | |
| 396 data = result_windows, | |
| 397 alpha = p_value_cutoff, | |
| 398 nfolds = 10)) | |
| 399 padj_col <- "p_adj_IHW" | |
| 400 }else if (!overlap_correction && ihw_filt) { | |
| 401 result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pvalue ~ baseMean, | |
| 402 data = result_windows, | |
| 403 alpha = p_value_cutoff, | |
| 404 nfolds = 10)) | |
| 405 padj_col <- "p_adj_IHW" | |
| 406 }else if (overlap_correction && !ihw_filt) { | |
| 407 padj_col <- "pSlidingWindows.adj" | |
| 408 }else { | |
| 409 result_windows[, "p_adj"] <- p.adjust(result_windows$pvalue, method = "BH") | |
| 410 padj_col <- "p_adj" | |
| 411 } | |
| 412 | |
| 413 ``` | |
| 414 | |
| 415 Determine significant windows | |
| 416 | |
| 417 ```{r filter_sig_windows} | |
| 418 result_windows <- result_windows %>% | |
| 419 mutate(significant = result_windows[, padj_col] < p_value_cutoff) | |
| 420 sig_windows <- sum(result_windows$significant) | |
| 421 ``` | |
| 422 | |
| 423 | |
| 424 `r sig_windows` windows are significant | |
| 425 | |
| 426 | |
| 427 ### Combining windows | |
| 428 | |
| 429 ```{r, reg1, message=FALSE, eval=TRUE, include=FALSE} | |
| 430 if (sig_windows > 0) { | |
| 431 result_regions <- extractRegions(windowRes = result_windows, padjCol = padj_col, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble | |
| 432 }else { | |
| 433 message("Cannot find significant windows in this dataset. Try lowering the p-value and log2FoldChange thresholds!") | |
| 434 } | |
| 435 ``` | |
| 436 | |
| 437 ```{r extractRegion, eval=FALSE} | |
| 438 if (sig_windows > 0) { | |
| 439 result_regions <- extractRegions(windowRes = result_windows, padjCol = padj_col, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble | |
| 440 } | |
| 441 ``` | |
| 442 | |
| 443 | |
| 444 ### Writing Bed file | |
| 445 | |
| 446 ```{r writing bed file} | |
| 447 if (sig_windows > 1) { | |
| 448 toBED(windowRes = result_windows, regionRes = result_regions, padjThresh = p_value_cutoff, padjCol = padj_col, fileName = output_bed_file) | |
| 449 }else { | |
| 450 message("This analysis does not have enough <=1 significant windows to create BED file for visualization") | |
| 451 } | |
| 452 ``` | |
| 453 | |
| 454 | |
| 455 ## Save Session | |
| 456 ```{r save_data} | |
| 457 # save enriched windows, gzip results file if the file suffix is .gz | |
| 458 if (grepl("\\.gz$", output_windows_file, ignore.case = TRUE)) { | |
| 459 gz_out <- gzfile(output_windows_file, "w") | |
| 460 write.table(result_windows, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) | |
| 461 close(gz_out) | |
| 462 }else { | |
| 463 write.table(result_windows, file = output_windows_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) | |
| 464 } | |
| 465 # save enriched regions | |
| 466 if (sig_windows > 0) { | |
| 467 if (grepl("\\.gz$", output_regions_file, ignore.case = TRUE)) { | |
| 468 gz_out <- gzfile(output_regions_file, "w") | |
| 469 write.table(result_regions, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) | |
| 470 close(gz_out) | |
| 471 }else { | |
| 472 write.table(result_regions, file = output_regions_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) | |
| 473 } | |
| 474 } | |
| 475 # save session | |
| 476 # Warning! session images can be heavy! | |
| 477 if (nchar(output_rdata) > 5) { | |
| 478 save.image(file = output_rdata) | |
| 479 } | |
| 480 ``` | |
| 481 | |
| 482 ## Session Info | |
| 483 | |
| 484 ```{r session info} | |
| 485 sessionInfo() | |
| 486 ``` |
