Mercurial > repos > dktanwar > test2
diff 02_read_counts_tss/countsAroundTSS.R @ 0:e4a5943b9258 draft
Uploaded
author | dktanwar |
---|---|
date | Tue, 12 Sep 2017 14:33:45 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/02_read_counts_tss/countsAroundTSS.R Tue Sep 12 14:33:45 2017 -0400 @@ -0,0 +1,178 @@ +## How to execute this tool +# $Rscript my_r_tool.R --input input.csv --output output.csv + +# Send R errors to stderr +options(show.error.messages = F, error = function(){cat(geterrmessage(), file = stderr()); q("no", 1, F)}) + +# Avoid crashing Galaxy with an UTF8 error on German LC settings +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) + +# Check for packages requirement ----- +suppressMessages(source("https://bioconductor.org/biocLite.R")) + +list.of.packages <- c("GenomicRanges", + "Rsamtools", + "rtracklayer", + "parallel", + "foreach", + "doParallel", + "GenomicFeatures", + "Gviz", + "BSgenome.Hsapiens.UCSC.hg19", + "org.Hs.eg.db", + "BSgenome.Mmusculus.UCSC.mm10", + "org.Mm.eg.db", + "optparse", + "getopt") + +new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] +if(length(new.packages)) biocLite(new.packages) + +# packages required +tmp <- suppressMessages(lapply(list.of.packages, require, quietly =T, + warn.conflicts = F, character.only = TRUE)) +rm(tmp) + + +# Command line options ---- +# option_list <- list( +# make_option(c("-b", "--bamfile"), type="character", default=NULL, +# help="bamfile for which read counts are to be calculated around TSS", metavar="character"), +# make_option(c("-e", "--genome"), type="character", default = NULL, +# help="genome: hg19/ mm10", metavar="character"), +# make_option(c("-n", "--TSSn"), type="character", default = NULL, +# help="region upstream TSS for which the read counts are to be calculated", metavar="character"), +# make_option(c("-p", "--TSSp"), type="character", default = NULL, +# help="region downstream TSS for which the read counts are to be calculated", metavar="character"), +# make_option(c("-u", "--TxDbUCSC"), type="character", default = NULL, +# help="Transcripts database: If you already have a sqlite Transcript database for hg19/ mm10, provide it, or do not enter anything. Software will create a database to use from UCSC.", metavar="character") +# ) +# +# opt_parser <- OptionParser(option_list=option_list) +# opt <- parse_args(opt_parser) + +# Take in trailing command line arguments +args <- commandArgs(trailingOnly = TRUE) +# args <- c("ENCFF027UTM.bam", "hg19", 500, 500, "transcripts_mm10_UCSC.sqlite") + +# Get options using the spec as defined by the enclosed list +# Options are read from the default: commandArgs(TRUE) +option_specification = matrix(c( + 'input1', 'i1', 2, 'character', + 'input2', 'i2', 2, 'character', + 'input3', 'i3', 2, 'integer', + 'input4', 'i4', 2, 'integer', + 'input5', 'i5', 2, 'character', + 'output', 'o', 2, 'character' +), byrow=TRUE, ncol=4); + +# Parse options +options = getopt(option_specification) + +if (is.null(options$input1) | is.null(options$input2)){ + stop("Please provide right input parameters. See help!", call.=FALSE) +} + + +# opt$genome <- "mm10" +# opt$TxDbUCSC <- "transcripts_mm10_UCSC.sqlite" +# opt$bamfile <- "ENCFF027UTM.bam" +# opt$TSSn <- 500 +# opt$TSSp <- 500 + +# Transcript database ---- +txdb <- NULL +if(is.null(options$input5)){ + txdb <- makeTxDbFromUCSC(genome = options$input2, tablename="refGene") + saveDb(txdb, file = paste("transcripts_", options$input2, "_UCSC.sqlite", sep = "")) +} else{ + txdb <- loadDb(options$input5) +} + +txs <- transcripts(txdb, columns = c("gene_id", "tx_id")) +txs <- txs[sapply(split(seq_along(txs), sapply(txs$gene_id, "[", 1)), + function(x) x[which.max(width(txs)[x])])] + +geneID <- txs$"gene_id" +genes <- as.character(geneID) + + +# Bamfiles and chromosomes +# bamfiles <- list.files(path = opt$bamdir, pattern = ".bam$", full.names = T) +bamfiles <- options$input1 + +chrs <- scanBamHeader(bamfiles)[[1]]$targets + + +# TSS regions for calculationg counts ---- +tss <- suppressWarnings(GRanges(seqnames(txs), + IRanges(start = ifelse(as.character(strand(txs)) == "+", + start(txs), end(txs)) - as.numeric(options$input3), + width = as.numeric(options$input3) + as.numeric(options$input4)), + strand = strand(txs), seqlengths = chrs)) + + +# Functions ---- +mylapply <- function(...) { + if(require(parallel) && .Platform$OS.type == "unix") + mclapply(..., mc.cores = detectCores(), mc.preschedule = F) + else + lapply(...) +} + +coverageForChr <- function(seqname, bamfile, chrs, fragmentSize=0, ...) { + message(", ", seqname, appendLF=FALSE) + shift <- round(fragmentSize/2) + param <- ScanBamParam(what=c("pos","qwidth","strand"), + which=GRanges(seqname, IRanges(1, chrs[seqname])), + flag=scanBamFlag(isUnmappedQuery=FALSE)) + x <- scanBam(bamfile, ..., param=param)[[1]] + coverage(IRanges(ifelse(x[["strand"]]=="+", x[["pos"]]+shift, + x[["pos"]]+x[["qwidth"]]-shift), width=1), width=chrs[seqname]) +} + + +countsForRegions <- function(bamfiles, regions, fragmentSize=0) { + names(regions) <- seq_along(regions) + chrs <- seqlengths(regions) + DF <- do.call(cbind, lapply(bamfiles, function(bf) { + message("counting alignments in ",basename(bf),appendLF=FALSE) + cover <- mylapply(names(chrs), coverageForChr, bamfile=bf, chrs=chrs, fragmentSize=fragmentSize) + names(cover) <- names(chrs) + cover <- RleList(unlist(cover), compress=FALSE) + vws <- Views(cover, as(regions,"RangesList")) + vs <- unlist(viewSums(vws), use.names=FALSE) + vs <- vs[match(names(regions), unlist(lapply(vws,names),use.names=FALSE))] + df <- DataFrame(vs) + colnames(df) <- sub(".bam","",basename(bf)) + message(": done") + df + })) +} + + + +# Counting reads ---- +cnts <- countsForRegions(bamfiles = bamfiles, regions = tss, fragmentSize = 150) +cnts <- data.frame(Genes = genes, cnts, stringsAsFactors = F, check.names = F) + +# Annotating genes ---- +anno <- NULL +if(options$input2 == "hg19"){ + anno <- select(org.Hs.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME")) +} else if (options$input2 == "mm10"){ + anno <- select(org.Mm.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME")) +} else { + stop("Please provide correct reference genome. See help!", call.=FALSE) +} + +cnts <- merge(anno, cnts, by.x = "ENTREZID", by.y = "Genes") + +index <- which(is.na(cnts$SYMBOL)) +cnts$SYMBOL[index] <- cnts$ENTREZID[index] + +write.table(cnts, file = options$output, sep = "\t", row.names = F, quote = F) + +sessionInfo() \ No newline at end of file