diff 02_read_counts_tss/countsAroundTSS.R @ 0:e4a5943b9258 draft

Uploaded
author dktanwar
date Tue, 12 Sep 2017 14:33:45 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/02_read_counts_tss/countsAroundTSS.R	Tue Sep 12 14:33:45 2017 -0400
@@ -0,0 +1,178 @@
+## How to execute this tool
+# $Rscript my_r_tool.R --input input.csv --output output.csv
+
+# Send R errors to stderr
+options(show.error.messages = F, error = function(){cat(geterrmessage(), file = stderr()); q("no", 1, F)})
+
+# Avoid crashing Galaxy with an UTF8 error on German LC settings
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
+
+# Check for packages requirement -----
+suppressMessages(source("https://bioconductor.org/biocLite.R"))
+
+list.of.packages <- c("GenomicRanges",
+                      "Rsamtools",
+                      "rtracklayer",
+                      "parallel",
+                      "foreach",
+                      "doParallel",
+                      "GenomicFeatures",
+                      "Gviz",
+                      "BSgenome.Hsapiens.UCSC.hg19",
+                      "org.Hs.eg.db",
+                      "BSgenome.Mmusculus.UCSC.mm10",
+                      "org.Mm.eg.db",
+                      "optparse",
+                      "getopt")
+
+new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
+if(length(new.packages)) biocLite(new.packages)
+
+# packages required
+tmp <- suppressMessages(lapply(list.of.packages, require, quietly =T,
+                               warn.conflicts = F, character.only = TRUE))
+rm(tmp)
+
+
+# Command line options ----
+# option_list <- list(
+#   make_option(c("-b", "--bamfile"), type="character", default=NULL, 
+#               help="bamfile for which read counts are to be calculated around TSS", metavar="character"),
+#   make_option(c("-e", "--genome"), type="character", default = NULL,
+#               help="genome: hg19/ mm10", metavar="character"),
+#   make_option(c("-n", "--TSSn"), type="character", default = NULL,
+#               help="region upstream TSS for which the read counts are to be calculated", metavar="character"),
+#   make_option(c("-p", "--TSSp"), type="character", default = NULL,
+#               help="region downstream TSS for which the read counts are to be calculated", metavar="character"),
+#   make_option(c("-u", "--TxDbUCSC"), type="character", default = NULL,
+#               help="Transcripts database: If you already have a sqlite Transcript database for hg19/ mm10, provide it, or do not enter anything. Software will create a database to use from UCSC.", metavar="character")
+# )
+# 
+# opt_parser <- OptionParser(option_list=option_list)
+# opt <- parse_args(opt_parser)
+
+# Take in trailing command line arguments
+args <- commandArgs(trailingOnly = TRUE)
+# args <- c("ENCFF027UTM.bam", "hg19", 500, 500, "transcripts_mm10_UCSC.sqlite")
+
+# Get options using the spec as defined by the enclosed list
+# Options are read from the default: commandArgs(TRUE)
+option_specification = matrix(c(
+  'input1', 'i1', 2, 'character',
+  'input2', 'i2', 2, 'character',
+  'input3', 'i3', 2, 'integer',
+  'input4', 'i4', 2, 'integer',
+  'input5', 'i5', 2, 'character',
+  'output', 'o', 2, 'character'
+), byrow=TRUE, ncol=4);
+
+# Parse options
+options = getopt(option_specification)
+
+if (is.null(options$input1) | is.null(options$input2)){
+  stop("Please provide right input parameters. See help!", call.=FALSE)
+}
+
+
+# opt$genome <- "mm10"
+# opt$TxDbUCSC <- "transcripts_mm10_UCSC.sqlite"
+# opt$bamfile <- "ENCFF027UTM.bam"
+# opt$TSSn <- 500
+# opt$TSSp <- 500
+
+# Transcript database ----
+txdb <- NULL
+if(is.null(options$input5)){
+  txdb <- makeTxDbFromUCSC(genome = options$input2, tablename="refGene")
+  saveDb(txdb, file = paste("transcripts_", options$input2, "_UCSC.sqlite", sep = ""))
+} else{
+  txdb <- loadDb(options$input5)
+}
+
+txs <- transcripts(txdb, columns = c("gene_id", "tx_id"))
+txs <- txs[sapply(split(seq_along(txs), sapply(txs$gene_id, "[", 1)),
+                  function(x) x[which.max(width(txs)[x])])]
+
+geneID <- txs$"gene_id"
+genes <- as.character(geneID)
+
+
+# Bamfiles and chromosomes
+# bamfiles <- list.files(path = opt$bamdir, pattern = ".bam$", full.names = T)
+bamfiles <- options$input1
+
+chrs <- scanBamHeader(bamfiles)[[1]]$targets
+
+
+# TSS regions for calculationg counts ----
+tss <- suppressWarnings(GRanges(seqnames(txs), 
+                                IRanges(start = ifelse(as.character(strand(txs)) == "+", 
+                                                       start(txs), end(txs)) - as.numeric(options$input3),
+                                        width = as.numeric(options$input3) + as.numeric(options$input4)), 
+                                strand = strand(txs), seqlengths = chrs))
+
+
+# Functions ----
+mylapply <- function(...) {
+  if(require(parallel) && .Platform$OS.type == "unix")
+    mclapply(..., mc.cores = detectCores(), mc.preschedule = F)
+  else
+    lapply(...)
+}
+
+coverageForChr <- function(seqname, bamfile, chrs, fragmentSize=0, ...) {
+  message(", ", seqname, appendLF=FALSE)
+  shift <- round(fragmentSize/2)
+  param <- ScanBamParam(what=c("pos","qwidth","strand"), 
+                        which=GRanges(seqname, IRanges(1, chrs[seqname])), 
+                        flag=scanBamFlag(isUnmappedQuery=FALSE))
+  x <- scanBam(bamfile, ..., param=param)[[1]]
+  coverage(IRanges(ifelse(x[["strand"]]=="+", x[["pos"]]+shift, 
+                          x[["pos"]]+x[["qwidth"]]-shift), width=1), width=chrs[seqname])
+}
+
+
+countsForRegions <- function(bamfiles, regions, fragmentSize=0) {
+  names(regions) <- seq_along(regions)
+  chrs <- seqlengths(regions)
+  DF <- do.call(cbind, lapply(bamfiles, function(bf) {
+    message("counting alignments in ",basename(bf),appendLF=FALSE)
+    cover <- mylapply(names(chrs), coverageForChr, bamfile=bf, chrs=chrs, fragmentSize=fragmentSize)
+    names(cover) <- names(chrs)
+    cover <- RleList(unlist(cover), compress=FALSE)
+    vws <- Views(cover, as(regions,"RangesList"))
+    vs <- unlist(viewSums(vws), use.names=FALSE)
+    vs <- vs[match(names(regions), unlist(lapply(vws,names),use.names=FALSE))]
+    df <- DataFrame(vs)
+    colnames(df) <- sub(".bam","",basename(bf))
+    message(": done")
+    df
+  }))
+}
+
+
+
+# Counting reads ----
+cnts <- countsForRegions(bamfiles = bamfiles, regions = tss, fragmentSize = 150)
+cnts <- data.frame(Genes = genes, cnts, stringsAsFactors = F, check.names = F)
+
+# Annotating genes ----
+anno <- NULL
+if(options$input2 == "hg19"){
+  anno <- select(org.Hs.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME"))
+} else if (options$input2 == "mm10"){
+  anno <- select(org.Mm.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME"))
+} else {
+  stop("Please provide correct reference genome. See help!", call.=FALSE)
+}
+
+cnts <- merge(anno, cnts, by.x = "ENTREZID", by.y = "Genes")
+
+index <- which(is.na(cnts$SYMBOL))
+cnts$SYMBOL[index] <- cnts$ENTREZID[index]
+
+write.table(cnts, file = options$output, sep = "\t", row.names = F, quote = F)
+
+sessionInfo()
\ No newline at end of file