0
|
1 ## How to execute this tool
|
|
2 # $Rscript my_r_tool.R --input input.csv --output output.csv
|
|
3
|
|
4 # Send R errors to stderr
|
|
5 options(show.error.messages = F, error = function(){cat(geterrmessage(), file = stderr()); q("no", 1, F)})
|
|
6
|
|
7 # Avoid crashing Galaxy with an UTF8 error on German LC settings
|
|
8 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
|
|
9
|
|
10 options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
|
|
11
|
|
12 # Check for packages requirement -----
|
|
13 suppressMessages(source("https://bioconductor.org/biocLite.R"))
|
|
14
|
|
15 list.of.packages <- c("GenomicRanges",
|
|
16 "Rsamtools",
|
|
17 "rtracklayer",
|
|
18 "parallel",
|
|
19 "foreach",
|
|
20 "doParallel",
|
|
21 "GenomicFeatures",
|
|
22 "Gviz",
|
|
23 "BSgenome.Hsapiens.UCSC.hg19",
|
|
24 "org.Hs.eg.db",
|
|
25 "BSgenome.Mmusculus.UCSC.mm10",
|
|
26 "org.Mm.eg.db",
|
|
27 "optparse",
|
|
28 "getopt")
|
|
29
|
|
30 new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
|
|
31 if(length(new.packages)) biocLite(new.packages)
|
|
32
|
|
33 # packages required
|
|
34 tmp <- suppressMessages(lapply(list.of.packages, require, quietly =T,
|
|
35 warn.conflicts = F, character.only = TRUE))
|
|
36 rm(tmp)
|
|
37
|
|
38
|
|
39 # Command line options ----
|
|
40 # option_list <- list(
|
|
41 # make_option(c("-b", "--bamfile"), type="character", default=NULL,
|
|
42 # help="bamfile for which read counts are to be calculated around TSS", metavar="character"),
|
|
43 # make_option(c("-e", "--genome"), type="character", default = NULL,
|
|
44 # help="genome: hg19/ mm10", metavar="character"),
|
|
45 # make_option(c("-n", "--TSSn"), type="character", default = NULL,
|
|
46 # help="region upstream TSS for which the read counts are to be calculated", metavar="character"),
|
|
47 # make_option(c("-p", "--TSSp"), type="character", default = NULL,
|
|
48 # help="region downstream TSS for which the read counts are to be calculated", metavar="character"),
|
|
49 # make_option(c("-u", "--TxDbUCSC"), type="character", default = NULL,
|
|
50 # help="Transcripts database: If you already have a sqlite Transcript database for hg19/ mm10, provide it, or do not enter anything. Software will create a database to use from UCSC.", metavar="character")
|
|
51 # )
|
|
52 #
|
|
53 # opt_parser <- OptionParser(option_list=option_list)
|
|
54 # opt <- parse_args(opt_parser)
|
|
55
|
|
56 # Take in trailing command line arguments
|
|
57 args <- commandArgs(trailingOnly = TRUE)
|
|
58 # args <- c("ENCFF027UTM.bam", "hg19", 500, 500, "transcripts_mm10_UCSC.sqlite")
|
|
59
|
|
60 # Get options using the spec as defined by the enclosed list
|
|
61 # Options are read from the default: commandArgs(TRUE)
|
|
62 option_specification = matrix(c(
|
|
63 'input1', 'i1', 2, 'character',
|
|
64 'input2', 'i2', 2, 'character',
|
|
65 'input3', 'i3', 2, 'integer',
|
|
66 'input4', 'i4', 2, 'integer',
|
|
67 'input5', 'i5', 2, 'character',
|
|
68 'output', 'o', 2, 'character'
|
|
69 ), byrow=TRUE, ncol=4);
|
|
70
|
|
71 # Parse options
|
|
72 options = getopt(option_specification)
|
|
73
|
|
74 if (is.null(options$input1) | is.null(options$input2)){
|
|
75 stop("Please provide right input parameters. See help!", call.=FALSE)
|
|
76 }
|
|
77
|
|
78
|
|
79 # opt$genome <- "mm10"
|
|
80 # opt$TxDbUCSC <- "transcripts_mm10_UCSC.sqlite"
|
|
81 # opt$bamfile <- "ENCFF027UTM.bam"
|
|
82 # opt$TSSn <- 500
|
|
83 # opt$TSSp <- 500
|
|
84
|
|
85 # Transcript database ----
|
|
86 txdb <- NULL
|
|
87 if(is.null(options$input5)){
|
|
88 txdb <- makeTxDbFromUCSC(genome = options$input2, tablename="refGene")
|
|
89 saveDb(txdb, file = paste("transcripts_", options$input2, "_UCSC.sqlite", sep = ""))
|
|
90 } else{
|
|
91 txdb <- loadDb(options$input5)
|
|
92 }
|
|
93
|
|
94 txs <- transcripts(txdb, columns = c("gene_id", "tx_id"))
|
|
95 txs <- txs[sapply(split(seq_along(txs), sapply(txs$gene_id, "[", 1)),
|
|
96 function(x) x[which.max(width(txs)[x])])]
|
|
97
|
|
98 geneID <- txs$"gene_id"
|
|
99 genes <- as.character(geneID)
|
|
100
|
|
101
|
|
102 # Bamfiles and chromosomes
|
|
103 # bamfiles <- list.files(path = opt$bamdir, pattern = ".bam$", full.names = T)
|
|
104 bamfiles <- options$input1
|
|
105
|
|
106 chrs <- scanBamHeader(bamfiles)[[1]]$targets
|
|
107
|
|
108
|
|
109 # TSS regions for calculationg counts ----
|
|
110 tss <- suppressWarnings(GRanges(seqnames(txs),
|
|
111 IRanges(start = ifelse(as.character(strand(txs)) == "+",
|
|
112 start(txs), end(txs)) - as.numeric(options$input3),
|
|
113 width = as.numeric(options$input3) + as.numeric(options$input4)),
|
|
114 strand = strand(txs), seqlengths = chrs))
|
|
115
|
|
116
|
|
117 # Functions ----
|
|
118 mylapply <- function(...) {
|
|
119 if(require(parallel) && .Platform$OS.type == "unix")
|
|
120 mclapply(..., mc.cores = detectCores(), mc.preschedule = F)
|
|
121 else
|
|
122 lapply(...)
|
|
123 }
|
|
124
|
|
125 coverageForChr <- function(seqname, bamfile, chrs, fragmentSize=0, ...) {
|
|
126 message(", ", seqname, appendLF=FALSE)
|
|
127 shift <- round(fragmentSize/2)
|
|
128 param <- ScanBamParam(what=c("pos","qwidth","strand"),
|
|
129 which=GRanges(seqname, IRanges(1, chrs[seqname])),
|
|
130 flag=scanBamFlag(isUnmappedQuery=FALSE))
|
|
131 x <- scanBam(bamfile, ..., param=param)[[1]]
|
|
132 coverage(IRanges(ifelse(x[["strand"]]=="+", x[["pos"]]+shift,
|
|
133 x[["pos"]]+x[["qwidth"]]-shift), width=1), width=chrs[seqname])
|
|
134 }
|
|
135
|
|
136
|
|
137 countsForRegions <- function(bamfiles, regions, fragmentSize=0) {
|
|
138 names(regions) <- seq_along(regions)
|
|
139 chrs <- seqlengths(regions)
|
|
140 DF <- do.call(cbind, lapply(bamfiles, function(bf) {
|
|
141 message("counting alignments in ",basename(bf),appendLF=FALSE)
|
|
142 cover <- mylapply(names(chrs), coverageForChr, bamfile=bf, chrs=chrs, fragmentSize=fragmentSize)
|
|
143 names(cover) <- names(chrs)
|
|
144 cover <- RleList(unlist(cover), compress=FALSE)
|
|
145 vws <- Views(cover, as(regions,"RangesList"))
|
|
146 vs <- unlist(viewSums(vws), use.names=FALSE)
|
|
147 vs <- vs[match(names(regions), unlist(lapply(vws,names),use.names=FALSE))]
|
|
148 df <- DataFrame(vs)
|
|
149 colnames(df) <- sub(".bam","",basename(bf))
|
|
150 message(": done")
|
|
151 df
|
|
152 }))
|
|
153 }
|
|
154
|
|
155
|
|
156
|
|
157 # Counting reads ----
|
|
158 cnts <- countsForRegions(bamfiles = bamfiles, regions = tss, fragmentSize = 150)
|
|
159 cnts <- data.frame(Genes = genes, cnts, stringsAsFactors = F, check.names = F)
|
|
160
|
|
161 # Annotating genes ----
|
|
162 anno <- NULL
|
|
163 if(options$input2 == "hg19"){
|
|
164 anno <- select(org.Hs.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME"))
|
|
165 } else if (options$input2 == "mm10"){
|
|
166 anno <- select(org.Mm.eg.db, as.character(cnts$Genes), c("SYMBOL", "GENENAME"))
|
|
167 } else {
|
|
168 stop("Please provide correct reference genome. See help!", call.=FALSE)
|
|
169 }
|
|
170
|
|
171 cnts <- merge(anno, cnts, by.x = "ENTREZID", by.y = "Genes")
|
|
172
|
|
173 index <- which(is.na(cnts$SYMBOL))
|
|
174 cnts$SYMBOL[index] <- cnts$ENTREZID[index]
|
|
175
|
|
176 write.table(cnts, file = options$output, sep = "\t", row.names = F, quote = F)
|
|
177
|
|
178 sessionInfo() |