0
|
1 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
|
|
2 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
|
|
3 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
|
|
4 ## Nov 2011 th17
|
|
5 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>,
|
|
6 ## NYU - Center for Genomics and Systems Biology
|
|
7 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
|
|
8 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
|
|
9 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
|
|
10
|
|
11 rm(list=ls())
|
|
12 debug=F
|
|
13 script.version=0.1
|
|
14 print.error <- function(){
|
|
15 cat("
|
|
16 NAME
|
|
17 map_ChIP_peaks_to_genes.R
|
|
18
|
|
19 DESCRIPTION
|
|
20 This script takes a MACS tab delimited file as input and
|
|
21 produces two files:
|
|
22
|
|
23 1) genes.xls
|
|
24 A gene centered table that stores the peaks that are
|
|
25 proximal (within x[kb] of TSS) and distal (within the
|
|
26 gene's region + y[kb] upstream or downstream of TSS and
|
|
27 TES respectively).
|
|
28
|
|
29 2) peaks.xls
|
|
30 A peak centered table that equals the MACS input table plus
|
|
31 colums for proximal and distal targets of each peak and
|
|
32 their distance from TSS (if exist). For gene.xls script also
|
|
33 gives a poisson model based -log10(pval) score for proximal
|
|
34 and genewide enrichment of peaks for each gene with at least
|
|
35 one prox/dist peak associated with it.
|
|
36
|
|
37 ARGUMENTS
|
|
38 input_file=path
|
|
39 Path to MACS file.
|
|
40
|
|
41 refseq_table=path
|
|
42 Path to refseq table (gives TSS/TES locations for all genes).
|
|
43
|
|
44 path_output=path
|
|
45 Path to save genes.xls and peaks.xls output files.
|
|
46
|
|
47 tss.dist=N
|
|
48 The absolute distance from TSS where we connect a peak to
|
|
49 gene (for proximal peaks).
|
|
50
|
|
51 gene.wide.dist=N
|
|
52 The absolute distance from TSS or TES where we connect a peak
|
|
53 to gene (for peaks hitting anywhere in gene).
|
|
54
|
|
55 effective.genome.size=N
|
|
56 The effective mappable genome size (for mm9 the value is
|
|
57 1.87e9. For hg19 the value is 2.7e9 (from MACS manual)).
|
|
58
|
|
59 macs.skip.lines=N
|
|
60 Number of lines to skip in input_file.
|
|
61
|
|
62 EXAMPLE
|
|
63 Rscript map_ChIP_peaks_to_genes_v.1.R \\
|
|
64 input_file=SL971_SL970_peaks.xls \\
|
|
65 refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\
|
|
66 path_output=./ \\
|
|
67 tss.dist=5000 \\
|
|
68 gene.wide.dist=10000 \\
|
|
69 effective.genome.size=1.87e9 \\
|
|
70 macs.skip.lines=23
|
|
71
|
|
72 CITATION
|
|
73 Please cite us if you used this script: The transcription
|
|
74 factor network regulating Th17 lineage specification and
|
|
75 function. Maria Ciofani, Aviv Madar, Carolina Galan, Kieran
|
|
76 Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard
|
|
77 Bonneau and Dan R. Littman et. al. (in preperation).
|
|
78
|
|
79 ")
|
|
80 }
|
|
81
|
|
82 # retrieve args
|
|
83 if(debug==T){
|
|
84 cmd.args <- c(
|
|
85 # "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls",
|
|
86 "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls",
|
|
87 "refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt",
|
|
88 "path_output=/Users/aviv/Desktop/test_script/",
|
|
89 "tss.dist=5000",
|
|
90 "tes.dist=10000",
|
|
91 "effective.genome.size=1.87e9",
|
|
92 "gene.wide.dist=10000",
|
|
93 "macs.skip.lines=23"
|
|
94 )
|
|
95 } else {
|
|
96 cmd.args <- commandArgs();
|
|
97 }
|
|
98
|
|
99 if(length(grep("--version",cmd.args))){
|
|
100 cat("version",script.version,"\n")
|
|
101 q()
|
|
102 }
|
|
103
|
|
104 arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
|
|
105 args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])
|
|
106
|
|
107 arg.nms <- c("input_file","refseq_table","path_output","tss.dist",
|
|
108 "gene.wide.dist","effective.genome.size","macs.skip.lines")
|
|
109 arg.val <- character(length=length(arg.nms))
|
|
110 for(i in 1:length(arg.nms)){
|
|
111 ix <- which(arg.names.cmd.line==arg.nms[i])
|
|
112 if(length(ix)==1){
|
|
113 arg.val[i] <- args.val.cmd.line[ix]
|
|
114 } else {
|
|
115 stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
|
|
116
|
|
117 }
|
|
118 }
|
|
119 if(debug==T){
|
|
120 print(paste(arg.nms,"=",arg.val))
|
|
121 }
|
|
122 # the files here adhere to tab delim format
|
|
123 path.input.macs <- arg.val[1]
|
|
124 path.input.refseq <- arg.val[2]
|
|
125 path.output <- arg.val[3]
|
|
126 tss.dist <- as.numeric(arg.val[4])
|
|
127 gene.wide.dist <- as.numeric(arg.val[5])
|
|
128 effective.genome.size <- as.numeric(arg.val[6])
|
|
129 macs.skip.lines=as.numeric(arg.val[7])
|
|
130
|
|
131 if(debug==T){
|
|
132 # if(1){
|
|
133 load("/Users/aviv/Desktop/r.u.RData")
|
|
134 gn.nms <- rownames(r.u)
|
|
135 } else {
|
|
136 ##################
|
|
137 ### step 1 handle refseq table file (read, make unique)
|
|
138 cat("reading refseq table",path.input.refseq,"\n")
|
|
139 refseq <- read.delim(file=path.input.refseq)
|
|
140 # refseq can have many transcripts for each gene
|
|
141 # here i make it have only one transcript for each gene (the longest one)
|
|
142 cat("for transcripts matching more than one gene keeping only the longest transcript\n")
|
|
143 refseq$name2 <- as.character(refseq$name2)
|
|
144 refseq$chrom <- as.character(refseq$chrom)
|
|
145 refseq$strand <- as.character(refseq$strand)
|
|
146 gn.nms <- unique(refseq$name2)
|
|
147 #x <- sort(table(refseq$name2),decreasing=T)
|
|
148 #gn.nms.non.unique <- names(x)[which(x>1)]
|
|
149 #gn.nms.unique <- names(x)[which(x==1)]
|
|
150 # create refseq unique r.u
|
|
151 n <- length(gn.nms)
|
|
152 r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms)
|
|
153 for(i in 1:n){
|
|
154 ix <- which(refseq$name2==gn.nms[i])
|
|
155 if(length(ix)==0) {
|
|
156 error("could not find gene", ng.nms[i], "in refseq table. Bailing out...\n")
|
|
157 } else if (length(ix)>1){
|
|
158 l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) )
|
|
159 l.max <- max(l)
|
|
160 ix <- ix[which(l==l.max)[1]]
|
|
161 }
|
|
162 r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")]
|
|
163 }
|
|
164 r.u[,"txStart"] <- as.numeric(r.u[,"txStart"])
|
|
165 r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"])
|
|
166
|
|
167 # switch TSS and TES if we have chr "-"
|
|
168 cat("correcting TSS, TES assiments in refseq table based on strand\n")
|
|
169 ix <- which(r.u$strand=="-")
|
|
170 tmp.tes <- r.u$txStart[ix]
|
|
171 tmp.tss <- r.u$txEnd[ix]
|
|
172 r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes)
|
|
173 # ############
|
|
174 }
|
|
175
|
|
176
|
|
177 #### step 2 read macs file
|
|
178 cat("reading Macs file",path.input.macs,"\n")
|
|
179
|
|
180 # read macs file
|
|
181 M <- read.delim(file=path.input.macs,skip=macs.skip.lines)
|
|
182 trgt.prox <- NA
|
|
183 trgt.dist <- NA
|
|
184 dtss.prox <- NA
|
|
185 dtss.dist <- NA
|
|
186 M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist)
|
|
187 M.annot[is.na(M.annot)] <- ""
|
|
188 # get the number of genes
|
|
189 m <- length(gn.nms)
|
|
190 if(debug==T){
|
|
191 # m <- 100
|
|
192 }
|
|
193 f.nm.peak <- paste(sep="",path.output,"peaks.xls")
|
|
194 f.nm.gene <- paste(sep="",path.output,"genes.xls")
|
|
195 # for each genes in the expt (j goes over genes)
|
|
196 cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval",
|
|
197 "prox_max_pval","genewide_max_pval",
|
|
198 "prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n")
|
|
199 peaks.summit <- (M[,"start"]+M[,"summit"])
|
|
200 cat(sep="","mapping MACS peaks to genes\n")
|
|
201 for (j in 1:m){
|
|
202 if(j%%20==0){cat(".")}
|
|
203 tss <- r.u[j,"txStart"]
|
|
204 tes <- r.u[j,"txEnd"]
|
|
205 gn.lngth <- abs(tss-tes)
|
|
206 strand <- r.u[j,"strand"]
|
|
207 chr <- r.u[j,"chrom"]
|
|
208 if(strand=="+"){
|
|
209 d.tss.all <- peaks.summit-tss
|
|
210 d.tes.all <- peaks.summit-tes
|
|
211 } else {
|
|
212 d.tss.all <- tss-peaks.summit
|
|
213 d.tes.all <- tes-peaks.summit
|
|
214 }
|
|
215 ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )),
|
|
216 which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE)
|
|
217 ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE)
|
|
218 ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)]
|
|
219 ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)]
|
|
220 n.peaks.prox <- length(ix.prox)
|
|
221 n.peaks.distal <- length(ix.distal)
|
|
222 # if there is at least one peak hitting gene j
|
|
223 if(n.peaks.distal > 0){
|
|
224 # for each peak (l goes over peaks)
|
|
225 peaks.line <- paste(sep="","(")
|
|
226 for (k in 1:n.peaks.distal){
|
|
227 if(k>1){
|
|
228 peaks.line <- paste(sep="",peaks.line,";(")
|
|
229 }
|
|
230 d.tss <- d.tss.all[ ix.distal[k] ]
|
|
231 d.tes <- d.tes.all[ ix.distal[k] ]
|
|
232 if(sign(d.tss)!=sign(d.tes)){
|
|
233 class="intra"
|
|
234 } else if(d.tss<=0){
|
|
235 class="upstream"
|
|
236 } else if(d.tss>0){
|
|
237 class="downstream"
|
|
238 }
|
|
239 peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ],
|
|
240 d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")")
|
|
241 # handle M.annot
|
|
242 dtss <- d.tss.all[ ix.distal[k] ]
|
|
243 prev.trgt <- M.annot[ix.distal[k],"trgt.dist"]
|
|
244 if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
|
|
245 if(abs(dtss)<abs(as.numeric(M.annot[ix.distal[k],"dtss.dist"]))){
|
|
246 M.annot[ix.distal[k],"dtss.dist"] <- dtss
|
|
247 M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
|
|
248 }
|
|
249 } else { # if peak does not have trgt add current trgt
|
|
250 M.annot[ix.distal[k],"dtss.dist"] <- dtss
|
|
251 M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
|
|
252 }
|
|
253 # calc the pval for these many peaks in proximal region and in gene-wide region
|
|
254 }
|
|
255 if(n.peaks.prox > 0){
|
|
256 mean.pval.prox <- mean(M[ ix.prox ,7])
|
|
257 max.pval.prox <- max(M[ ix.prox ,7])
|
|
258 expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox))
|
|
259 # lambda = num peaks / genome size * searched region
|
|
260 lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2)
|
|
261 pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE))
|
|
262 # handle M.annot
|
|
263 for(k in 1:n.peaks.prox){
|
|
264 dtss <- d.tss.all[ ix.prox[k] ]
|
|
265 prev.trgt <- M.annot[ix.prox[k],"trgt.prox"]
|
|
266 if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
|
|
267 if(abs(dtss)<abs(as.numeric(M.annot[ix.prox[k],"dtss.prox"]))){
|
|
268 M.annot[ix.prox[k],"dtss.prox"] <- dtss
|
|
269 M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
|
|
270 }
|
|
271 } else { # if peak does not have trgt add current trgt
|
|
272 M.annot[ix.prox[k],"dtss.prox"] <- dtss
|
|
273 M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
|
|
274 }
|
|
275 }
|
|
276 } else {
|
|
277 mean.pval.prox <- "NA"
|
|
278 max.pval.prox <- "NA"
|
|
279 pval.pois.prox <- "NA"
|
|
280 }
|
|
281 mean.pval.distal <- mean(M[ ix.distal ,7])
|
|
282 max.pval.distal <- max(M[ ix.distal ,7])
|
|
283 expected.num.peaks.genome.wide.distal <- length(which(M[,7]>=mean.pval.distal))
|
|
284 # lambda = num peaks / genome size * searched region
|
|
285 lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2)
|
|
286 pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE))
|
|
287 # pring peaks for gene j
|
|
288 core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand,
|
|
289 mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal,
|
|
290 pval.pois.prox,pval.pois.distal)
|
|
291 cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n")
|
|
292 }
|
|
293 }
|
|
294 cat("Done!\n")
|
|
295
|
|
296 cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t")
|
|
297 write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t")
|
|
298
|
|
299
|
|
300
|
|
301
|
|
302
|
|
303
|
|
304
|