Mercurial > repos > dktanwar > test2
view 02_read_counts_tss/countsAroundTSS.R @ 0:e4a5943b9258 draft
Uploaded
author | dktanwar |
---|---|
date | Tue, 12 Sep 2017 14:33:45 -0400 |
parents | |
children |
line wrap: on
line source
## How to execute this tool # $Rscript my_r_tool.R --input input.csv --output output.csv # Send R errors to stderr options(show.error.messages = F, error = function(){cat(geterrmessage(), file = stderr()); q("no", 1, F)}) # Avoid crashing Galaxy with an UTF8 error on German LC settings loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") options(stringAsfactors = FALSE, useFancyQuotes = FALSE) # Check for packages requirement ----- suppressMessages(source("https://bioconductor.org/biocLite.R")) list.of.packages <- c("GenomicRanges", "Rsamtools", "rtracklayer", "parallel", "foreach", "doParallel", "GenomicFeatures", "Gviz", "BSgenome.Hsapiens.UCSC.hg19", "org.Hs.eg.db", "BSgenome.Mmusculus.UCSC.mm10", "org.Mm.eg.db", "optparse", "getopt") new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if(length(new.packages)) biocLite(new.packages) # packages required tmp <- suppressMessages(lapply(list.of.packages, require, quietly =T, warn.conflicts = F, character.only = TRUE)) rm(tmp) # Command line options ---- # option_list <- list( # make_option(c("-b", "--bamfile"), type="character", default=NULL, # help="bamfile for which read counts are to be calculated around TSS", metavar="character"), # make_option(c("-e", "--genome"), type="character", default = NULL, # help="genome: hg19/ mm10", metavar="character"), # make_option(c("-n", "--TSSn"), type="character", default = NULL, # help="region upstream TSS for which the read counts are to be calculated", metavar="character"), # make_option(c("-p", "--TSSp"), type="character", default = NULL, # help="region downstream TSS for which the read counts are to be calculated", metavar="character"), # make_option(c("-u", "--TxDbUCSC"), type="character", default = NULL, # help="Transcripts database: If you already have a sqlite Transcript database for hg19/ mm10, provide it, or do not enter anything. Software will create a database to use from UCSC.", metavar="character") # ) # # opt_parser <- OptionParser(option_list=option_list) # opt <- parse_args(opt_parser) # Take in trailing command line arguments args <- commandArgs(trailingOnly = TRUE) # args <- c("ENCFF027UTM.bam", "hg19", 500, 500, "transcripts_mm10_UCSC.sqlite") # Get options using the spec as defined by the enclosed list # Options are read from the default: commandArgs(TRUE) option_specification = matrix(c( 'input1', 'i1', 2, 'character', 'input2', 'i2', 2, 'character', 'input3', 'i3', 2, 'integer', 'input4', 'i4', 2, 'integer', 'input5', 'i5', 2, 'character', 'output', 'o', 2, 'character' ), byrow=TRUE, ncol=4); # Parse options options = getopt(option_specification) if (is.null(options$input1) | is.null(options$input2)){ stop("Please provide right input parameters. See help!", call.=FALSE) } # opt$genome <- "mm10" # opt$TxDbUCSC <- "transcripts_mm10_UCSC.sqlite" # opt$bamfile <- "ENCFF027UTM.bam" # opt$TSSn <- 500 # opt$TSSp <- 500 # Transcript database ---- txdb <- NULL if(is.null(options$input5)){ txdb <- makeTxDbFromUCSC(genome = options$input2, tablename="refGene") saveDb(txdb, file = paste("transcripts_", options$input2, "_UCSC.sqlite", sep = "")) } else{ txdb <- loadDb(options$input5) } txs <- transcripts(txdb, columns = c("gene_id", "tx_id")) txs <- txs[sapply(split(seq_along(txs), sapply(txs$gene_id, "[", 1)), function(x) x[which.max(width(txs)[x])])] geneID <- txs$"gene_id" genes <- as.character(geneID) # Bamfiles and chromosomes # bamfiles <- list.files(path = opt$bamdir, pattern = ".bam$", full.names = T) bamfiles <- options$input1 chrs <- scanBamHeader(bamfiles)[[1]]$targets # TSS regions for calculationg counts ---- tss <- suppressWarnings(GRanges(seqnames(txs), IRanges(start = ifelse(as.character(strand(txs)) == "+", start(txs), end(txs)) - as.numeric(options$input3), width = as.numeric(options$input3) + as.numeric(options$input4)), strand = strand(txs), seqlengths = chrs)) # Functions ---- mylapply <- function(...) { if(require(parallel) && .Platform$OS.type == "unix") mclapply(..., mc.cores = detectCores(), mc.preschedule = F) else lapply(...) } coverageForChr <- function(seqname, bamfile, chrs, fragmentSize=0, ...) { message(", ", seqname, appendLF=FALSE) shift <- round(fragmentSize/2) param <- ScanBamParam(what=c("pos","qwidth","strand"), which=GRanges(seqname, IRanges(1, chrs[seqname])), flag=scanBamFlag(isUnmappedQuery=FALSE)) x <- scanBam(bamfile, ..., param=param)[[1]] coverage(IRanges(ifelse(x[["strand"]]=="+", x[["pos"]]+shift, x[["pos"]]+x[["qwidth"]]-shift), width=1), width=chrs[seqname]) } countsForRegions <- function(bamfiles, regions, fragmentSize=0) { names(regions) <- seq_along(regions) chrs <- seqlengths(regions) DF <- do.call(cbind, lapply(bamfiles, function(bf) { message("counting alignments in ",basename(bf),appendLF=FALSE) cover <- mylapply(names(chrs), coverageForChr, bamfile=bf, chrs=chrs, fragmentSize=fragmentSize) names(cover) <- names(chrs) cover <- RleList(unlist(cover), compress=FALSE) vws <- Views(cover, as(regions,"RangesList")) vs <- unlist(viewSums(vws), use.names=FALSE) vs <- vs[match(names(regions), unlist(lapply(vws,names),use.names=FALSE))] df <- DataFrame(vs) colnames(df) <- sub(".bam","",basename(bf)) message(": done") df })) } # Counting reads ---- cnts <- countsForRegions(bamfiles = bamfiles, regions = tss, fragmentSize = 150) cnts <- data.frame(Genes = genes, cnts, stringsAsFactors = F, check.names = F) # Annotating genes ---- anno <- NULL if(options$input2 == "hg19"){ anno <- select(org.Hs.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME")) } else if (options$input2 == "mm10"){ anno <- select(org.Mm.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME")) } else { stop("Please provide correct reference genome. See help!", call.=FALSE) } cnts <- merge(anno, cnts, by.x = "ENTREZID", by.y = "Genes") index <- which(is.na(cnts$SYMBOL)) cnts$SYMBOL[index] <- cnts$ENTREZID[index] write.table(cnts, file = options$output, sep = "\t", row.names = F, quote = F) sessionInfo()