comparison mtls_analyze/map_ChIP_peaks_to_genes.R @ 4:b465306d00ba draft default tip

Uploaded
author kmace
date Mon, 23 Jul 2012 13:00:15 -0400
parents a903c48f05b4
children
comparison
equal deleted inserted replaced
3:a0306edbf2f8 4:b465306d00ba
1 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
2 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
3 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
4 ## Nov 2011 th17
5 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>,
6 ## NYU - Center for Genomics and Systems Biology
7 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
8 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
9 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
10
11 rm(list=ls())
12 debug=F
13 script.version=0.1
14 print.error <- function(){
15 cat("
16 NAME
17 map_ChIP_peaks_to_genes.R
18
19 DESCRIPTION
20 This script takes a MACS tab delimited file as input and
21 produces two files:
22
23 1) genes.xls
24 A gene centered table that stores the peaks that are
25 proximal (within x[kb] of TSS) and distal (within the
26 gene's region + y[kb] upstream or downstream of TSS and
27 TES respectively).
28
29 2) peaks.xls
30 A peak centered table that equals the MACS input table plus
31 colums for proximal and distal targets of each peak and
32 their distance from TSS (if exist). For gene.xls script also
33 gives a poisson model based -log10(pval) score for proximal
34 and genewide enrichment of peaks for each gene with at least
35 one prox/dist peak associated with it.
36
37 ARGUMENTS
38 input_file=path
39 Path to MACS file.
40
41 refseq_table=path
42 Path to refseq table (gives TSS/TES locations for all genes).
43
44 path_output=path
45 Path to save genes.xls and peaks.xls output files.
46
47 tss.dist=N
48 The absolute distance from TSS where we connect a peak to
49 gene (for proximal peaks).
50
51 gene.wide.dist=N
52 The absolute distance from TSS or TES where we connect a peak
53 to gene (for peaks hitting anywhere in gene).
54
55 effective.genome.size=N
56 The effective mappable genome size (for mm9 the value is
57 1.87e9. For hg19 the value is 2.7e9 (from MACS manual)).
58
59 macs.skip.lines=N
60 Number of lines to skip in input_file.
61
62 EXAMPLE
63 Rscript map_ChIP_peaks_to_genes_v.1.R \\
64 input_file=SL971_SL970_peaks.xls \\
65 refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\
66 path_output=./ \\
67 tss.dist=5000 \\
68 gene.wide.dist=10000 \\
69 effective.genome.size=1.87e9 \\
70 macs.skip.lines=23
71
72 CITATION
73 Please cite us if you used this script: The transcription
74 factor network regulating Th17 lineage specification and
75 function. Maria Ciofani, Aviv Madar, Carolina Galan, Kieran
76 Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard
77 Bonneau and Dan R. Littman et. al. (in preperation).
78
79 ")
80 }
81
82 # retrieve args
83 if(debug==T){
84 cmd.args <- c(
85 # "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls",
86 "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls",
87 "refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt",
88 "path_output=/Users/aviv/Desktop/test_script/",
89 "tss.dist=5000",
90 "tes.dist=10000",
91 "effective.genome.size=1.87e9",
92 "gene.wide.dist=10000",
93 "macs.skip.lines=23"
94 )
95 } else {
96 cmd.args <- commandArgs();
97 }
98
99 if(length(grep("--version",cmd.args))){
100 cat("version",script.version,"\n")
101 q()
102 }
103
104 arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1])
105 args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2])
106
107 arg.nms <- c("input_file","refseq_table","path_output","tss.dist",
108 "gene.wide.dist","effective.genome.size","macs.skip.lines")
109 arg.val <- character(length=length(arg.nms))
110 for(i in 1:length(arg.nms)){
111 ix <- which(arg.names.cmd.line==arg.nms[i])
112 if(length(ix)==1){
113 arg.val[i] <- args.val.cmd.line[ix]
114 } else {
115 stop("######could not find ",arg.nms[i]," arg######\n\n",print.error())
116
117 }
118 }
119 if(debug==T){
120 print(paste(arg.nms,"=",arg.val))
121 }
122 # the files here adhere to tab delim format
123 path.input.macs <- arg.val[1]
124 path.input.refseq <- arg.val[2]
125 path.output <- arg.val[3]
126 tss.dist <- as.numeric(arg.val[4])
127 gene.wide.dist <- as.numeric(arg.val[5])
128 effective.genome.size <- as.numeric(arg.val[6])
129 macs.skip.lines=as.numeric(arg.val[7])
130
131 if(debug==T){
132 # if(1){
133 load("/Users/aviv/Desktop/r.u.RData")
134 gn.nms <- rownames(r.u)
135 } else {
136 ##################
137 ### step 1 handle refseq table file (read, make unique)
138 cat("reading refseq table",path.input.refseq,"\n")
139 refseq <- read.delim(file=path.input.refseq)
140 # refseq can have many transcripts for each gene
141 # here i make it have only one transcript for each gene (the longest one)
142 cat("for transcripts matching more than one gene keeping only the longest transcript\n")
143 refseq$name2 <- as.character(refseq$name2)
144 refseq$chrom <- as.character(refseq$chrom)
145 refseq$strand <- as.character(refseq$strand)
146 gn.nms <- unique(refseq$name2)
147 #x <- sort(table(refseq$name2),decreasing=T)
148 #gn.nms.non.unique <- names(x)[which(x>1)]
149 #gn.nms.unique <- names(x)[which(x==1)]
150 # create refseq unique r.u
151 n <- length(gn.nms)
152 r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms)
153 for(i in 1:n){
154 ix <- which(refseq$name2==gn.nms[i])
155 if(length(ix)==0) {
156 error("could not find gene", ng.nms[i], "in refseq table. Bailing out...\n")
157 } else if (length(ix)>1){
158 l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) )
159 l.max <- max(l)
160 ix <- ix[which(l==l.max)[1]]
161 }
162 r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")]
163 }
164 r.u[,"txStart"] <- as.numeric(r.u[,"txStart"])
165 r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"])
166
167 # switch TSS and TES if we have chr "-"
168 cat("correcting TSS, TES assiments in refseq table based on strand\n")
169 ix <- which(r.u$strand=="-")
170 tmp.tes <- r.u$txStart[ix]
171 tmp.tss <- r.u$txEnd[ix]
172 r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes)
173 # ############
174 }
175
176
177 #### step 2 read macs file
178 cat("reading Macs file",path.input.macs,"\n")
179
180 # read macs file
181 M <- read.delim(file=path.input.macs,skip=macs.skip.lines)
182 trgt.prox <- NA
183 trgt.dist <- NA
184 dtss.prox <- NA
185 dtss.dist <- NA
186 M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist)
187 M.annot[is.na(M.annot)] <- ""
188 # get the number of genes
189 m <- length(gn.nms)
190 if(debug==T){
191 # m <- 100
192 }
193 f.nm.peak <- paste(sep="",path.output,"peaks.xls")
194 f.nm.gene <- paste(sep="",path.output,"genes.xls")
195 # for each genes in the expt (j goes over genes)
196 cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval",
197 "prox_max_pval","genewide_max_pval",
198 "prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n")
199 peaks.summit <- (M[,"start"]+M[,"summit"])
200 cat(sep="","mapping MACS peaks to genes\n")
201 for (j in 1:m){
202 if(j%%20==0){cat(".")}
203 tss <- r.u[j,"txStart"]
204 tes <- r.u[j,"txEnd"]
205 gn.lngth <- abs(tss-tes)
206 strand <- r.u[j,"strand"]
207 chr <- r.u[j,"chrom"]
208 if(strand=="+"){
209 d.tss.all <- peaks.summit-tss
210 d.tes.all <- peaks.summit-tes
211 } else {
212 d.tss.all <- tss-peaks.summit
213 d.tes.all <- tes-peaks.summit
214 }
215 ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )),
216 which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE)
217 ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE)
218 ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)]
219 ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)]
220 n.peaks.prox <- length(ix.prox)
221 n.peaks.distal <- length(ix.distal)
222 # if there is at least one peak hitting gene j
223 if(n.peaks.distal > 0){
224 # for each peak (l goes over peaks)
225 peaks.line <- paste(sep="","(")
226 for (k in 1:n.peaks.distal){
227 if(k>1){
228 peaks.line <- paste(sep="",peaks.line,";(")
229 }
230 d.tss <- d.tss.all[ ix.distal[k] ]
231 d.tes <- d.tes.all[ ix.distal[k] ]
232 if(sign(d.tss)!=sign(d.tes)){
233 class="intra"
234 } else if(d.tss<=0){
235 class="upstream"
236 } else if(d.tss>0){
237 class="downstream"
238 }
239 peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ],
240 d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")")
241 # handle M.annot
242 dtss <- d.tss.all[ ix.distal[k] ]
243 prev.trgt <- M.annot[ix.distal[k],"trgt.dist"]
244 if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
245 if(abs(dtss)<abs(as.numeric(M.annot[ix.distal[k],"dtss.dist"]))){
246 M.annot[ix.distal[k],"dtss.dist"] <- dtss
247 M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
248 }
249 } else { # if peak does not have trgt add current trgt
250 M.annot[ix.distal[k],"dtss.dist"] <- dtss
251 M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j]
252 }
253 # calc the pval for these many peaks in proximal region and in gene-wide region
254 }
255 if(n.peaks.prox > 0){
256 mean.pval.prox <- mean(M[ ix.prox ,7])
257 max.pval.prox <- max(M[ ix.prox ,7])
258 expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox))
259 # lambda = num peaks / genome size * searched region
260 lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2)
261 pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE))
262 # handle M.annot
263 for(k in 1:n.peaks.prox){
264 dtss <- d.tss.all[ ix.prox[k] ]
265 prev.trgt <- M.annot[ix.prox[k],"trgt.prox"]
266 if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss
267 if(abs(dtss)<abs(as.numeric(M.annot[ix.prox[k],"dtss.prox"]))){
268 M.annot[ix.prox[k],"dtss.prox"] <- dtss
269 M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
270 }
271 } else { # if peak does not have trgt add current trgt
272 M.annot[ix.prox[k],"dtss.prox"] <- dtss
273 M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j]
274 }
275 }
276 } else {
277 mean.pval.prox <- "NA"
278 max.pval.prox <- "NA"
279 pval.pois.prox <- "NA"
280 }
281 mean.pval.distal <- mean(M[ ix.distal ,7])
282 max.pval.distal <- max(M[ ix.distal ,7])
283 expected.num.peaks.genome.wide.distal <- length(which(M[,7]>=mean.pval.distal))
284 # lambda = num peaks / genome size * searched region
285 lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2)
286 pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE))
287 # pring peaks for gene j
288 core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand,
289 mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal,
290 pval.pois.prox,pval.pois.distal)
291 cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n")
292 }
293 }
294 cat("Done!\n")
295
296 cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t")
297 write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t")
298
299
300
301
302
303
304