Mercurial > repos > dktanwar > test2
comparison 02_read_counts_tss/countsAroundTSS.R @ 0:e4a5943b9258 draft
Uploaded
author | dktanwar |
---|---|
date | Tue, 12 Sep 2017 14:33:45 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e4a5943b9258 |
---|---|
1 ## How to execute this tool | |
2 # $Rscript my_r_tool.R --input input.csv --output output.csv | |
3 | |
4 # Send R errors to stderr | |
5 options(show.error.messages = F, error = function(){cat(geterrmessage(), file = stderr()); q("no", 1, F)}) | |
6 | |
7 # Avoid crashing Galaxy with an UTF8 error on German LC settings | |
8 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
9 | |
10 options(stringAsfactors = FALSE, useFancyQuotes = FALSE) | |
11 | |
12 # Check for packages requirement ----- | |
13 suppressMessages(source("https://bioconductor.org/biocLite.R")) | |
14 | |
15 list.of.packages <- c("GenomicRanges", | |
16 "Rsamtools", | |
17 "rtracklayer", | |
18 "parallel", | |
19 "foreach", | |
20 "doParallel", | |
21 "GenomicFeatures", | |
22 "Gviz", | |
23 "BSgenome.Hsapiens.UCSC.hg19", | |
24 "org.Hs.eg.db", | |
25 "BSgenome.Mmusculus.UCSC.mm10", | |
26 "org.Mm.eg.db", | |
27 "optparse", | |
28 "getopt") | |
29 | |
30 new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] | |
31 if(length(new.packages)) biocLite(new.packages) | |
32 | |
33 # packages required | |
34 tmp <- suppressMessages(lapply(list.of.packages, require, quietly =T, | |
35 warn.conflicts = F, character.only = TRUE)) | |
36 rm(tmp) | |
37 | |
38 | |
39 # Command line options ---- | |
40 # option_list <- list( | |
41 # make_option(c("-b", "--bamfile"), type="character", default=NULL, | |
42 # help="bamfile for which read counts are to be calculated around TSS", metavar="character"), | |
43 # make_option(c("-e", "--genome"), type="character", default = NULL, | |
44 # help="genome: hg19/ mm10", metavar="character"), | |
45 # make_option(c("-n", "--TSSn"), type="character", default = NULL, | |
46 # help="region upstream TSS for which the read counts are to be calculated", metavar="character"), | |
47 # make_option(c("-p", "--TSSp"), type="character", default = NULL, | |
48 # help="region downstream TSS for which the read counts are to be calculated", metavar="character"), | |
49 # make_option(c("-u", "--TxDbUCSC"), type="character", default = NULL, | |
50 # help="Transcripts database: If you already have a sqlite Transcript database for hg19/ mm10, provide it, or do not enter anything. Software will create a database to use from UCSC.", metavar="character") | |
51 # ) | |
52 # | |
53 # opt_parser <- OptionParser(option_list=option_list) | |
54 # opt <- parse_args(opt_parser) | |
55 | |
56 # Take in trailing command line arguments | |
57 args <- commandArgs(trailingOnly = TRUE) | |
58 # args <- c("ENCFF027UTM.bam", "hg19", 500, 500, "transcripts_mm10_UCSC.sqlite") | |
59 | |
60 # Get options using the spec as defined by the enclosed list | |
61 # Options are read from the default: commandArgs(TRUE) | |
62 option_specification = matrix(c( | |
63 'input1', 'i1', 2, 'character', | |
64 'input2', 'i2', 2, 'character', | |
65 'input3', 'i3', 2, 'integer', | |
66 'input4', 'i4', 2, 'integer', | |
67 'input5', 'i5', 2, 'character', | |
68 'output', 'o', 2, 'character' | |
69 ), byrow=TRUE, ncol=4); | |
70 | |
71 # Parse options | |
72 options = getopt(option_specification) | |
73 | |
74 if (is.null(options$input1) | is.null(options$input2)){ | |
75 stop("Please provide right input parameters. See help!", call.=FALSE) | |
76 } | |
77 | |
78 | |
79 # opt$genome <- "mm10" | |
80 # opt$TxDbUCSC <- "transcripts_mm10_UCSC.sqlite" | |
81 # opt$bamfile <- "ENCFF027UTM.bam" | |
82 # opt$TSSn <- 500 | |
83 # opt$TSSp <- 500 | |
84 | |
85 # Transcript database ---- | |
86 txdb <- NULL | |
87 if(is.null(options$input5)){ | |
88 txdb <- makeTxDbFromUCSC(genome = options$input2, tablename="refGene") | |
89 saveDb(txdb, file = paste("transcripts_", options$input2, "_UCSC.sqlite", sep = "")) | |
90 } else{ | |
91 txdb <- loadDb(options$input5) | |
92 } | |
93 | |
94 txs <- transcripts(txdb, columns = c("gene_id", "tx_id")) | |
95 txs <- txs[sapply(split(seq_along(txs), sapply(txs$gene_id, "[", 1)), | |
96 function(x) x[which.max(width(txs)[x])])] | |
97 | |
98 geneID <- txs$"gene_id" | |
99 genes <- as.character(geneID) | |
100 | |
101 | |
102 # Bamfiles and chromosomes | |
103 # bamfiles <- list.files(path = opt$bamdir, pattern = ".bam$", full.names = T) | |
104 bamfiles <- options$input1 | |
105 | |
106 chrs <- scanBamHeader(bamfiles)[[1]]$targets | |
107 | |
108 | |
109 # TSS regions for calculationg counts ---- | |
110 tss <- suppressWarnings(GRanges(seqnames(txs), | |
111 IRanges(start = ifelse(as.character(strand(txs)) == "+", | |
112 start(txs), end(txs)) - as.numeric(options$input3), | |
113 width = as.numeric(options$input3) + as.numeric(options$input4)), | |
114 strand = strand(txs), seqlengths = chrs)) | |
115 | |
116 | |
117 # Functions ---- | |
118 mylapply <- function(...) { | |
119 if(require(parallel) && .Platform$OS.type == "unix") | |
120 mclapply(..., mc.cores = detectCores(), mc.preschedule = F) | |
121 else | |
122 lapply(...) | |
123 } | |
124 | |
125 coverageForChr <- function(seqname, bamfile, chrs, fragmentSize=0, ...) { | |
126 message(", ", seqname, appendLF=FALSE) | |
127 shift <- round(fragmentSize/2) | |
128 param <- ScanBamParam(what=c("pos","qwidth","strand"), | |
129 which=GRanges(seqname, IRanges(1, chrs[seqname])), | |
130 flag=scanBamFlag(isUnmappedQuery=FALSE)) | |
131 x <- scanBam(bamfile, ..., param=param)[[1]] | |
132 coverage(IRanges(ifelse(x[["strand"]]=="+", x[["pos"]]+shift, | |
133 x[["pos"]]+x[["qwidth"]]-shift), width=1), width=chrs[seqname]) | |
134 } | |
135 | |
136 | |
137 countsForRegions <- function(bamfiles, regions, fragmentSize=0) { | |
138 names(regions) <- seq_along(regions) | |
139 chrs <- seqlengths(regions) | |
140 DF <- do.call(cbind, lapply(bamfiles, function(bf) { | |
141 message("counting alignments in ",basename(bf),appendLF=FALSE) | |
142 cover <- mylapply(names(chrs), coverageForChr, bamfile=bf, chrs=chrs, fragmentSize=fragmentSize) | |
143 names(cover) <- names(chrs) | |
144 cover <- RleList(unlist(cover), compress=FALSE) | |
145 vws <- Views(cover, as(regions,"RangesList")) | |
146 vs <- unlist(viewSums(vws), use.names=FALSE) | |
147 vs <- vs[match(names(regions), unlist(lapply(vws,names),use.names=FALSE))] | |
148 df <- DataFrame(vs) | |
149 colnames(df) <- sub(".bam","",basename(bf)) | |
150 message(": done") | |
151 df | |
152 })) | |
153 } | |
154 | |
155 | |
156 | |
157 # Counting reads ---- | |
158 cnts <- countsForRegions(bamfiles = bamfiles, regions = tss, fragmentSize = 150) | |
159 cnts <- data.frame(Genes = genes, cnts, stringsAsFactors = F, check.names = F) | |
160 | |
161 # Annotating genes ---- | |
162 anno <- NULL | |
163 if(options$input2 == "hg19"){ | |
164 anno <- select(org.Hs.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME")) | |
165 } else if (options$input2 == "mm10"){ | |
166 anno <- select(org.Mm.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME")) | |
167 } else { | |
168 stop("Please provide correct reference genome. See help!", call.=FALSE) | |
169 } | |
170 | |
171 cnts <- merge(anno, cnts, by.x = "ENTREZID", by.y = "Genes") | |
172 | |
173 index <- which(is.na(cnts$SYMBOL)) | |
174 cnts$SYMBOL[index] <- cnts$ENTREZID[index] | |
175 | |
176 write.table(cnts, file = options$output, sep = "\t", row.names = F, quote = F) | |
177 | |
178 sessionInfo() |