Mercurial > repos > kmace > mtls_analysis
comparison mtls_analyze/map_ChIP_peaks_to_genes.R @ 4:b465306d00ba draft default tip
Uploaded
author | kmace |
---|---|
date | Mon, 23 Jul 2012 13:00:15 -0400 |
parents | a903c48f05b4 |
children |
comparison
equal
deleted
inserted
replaced
3:a0306edbf2f8 | 4:b465306d00ba |
---|---|
1 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. | |
2 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ | |
3 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' | |
4 ## Nov 2011 th17 | |
5 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>, | |
6 ## NYU - Center for Genomics and Systems Biology | |
7 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. | |
8 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ | |
9 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' | |
10 | |
11 rm(list=ls()) | |
12 debug=F | |
13 script.version=0.1 | |
14 print.error <- function(){ | |
15 cat(" | |
16 NAME | |
17 map_ChIP_peaks_to_genes.R | |
18 | |
19 DESCRIPTION | |
20 This script takes a MACS tab delimited file as input and | |
21 produces two files: | |
22 | |
23 1) genes.xls | |
24 A gene centered table that stores the peaks that are | |
25 proximal (within x[kb] of TSS) and distal (within the | |
26 gene's region + y[kb] upstream or downstream of TSS and | |
27 TES respectively). | |
28 | |
29 2) peaks.xls | |
30 A peak centered table that equals the MACS input table plus | |
31 colums for proximal and distal targets of each peak and | |
32 their distance from TSS (if exist). For gene.xls script also | |
33 gives a poisson model based -log10(pval) score for proximal | |
34 and genewide enrichment of peaks for each gene with at least | |
35 one prox/dist peak associated with it. | |
36 | |
37 ARGUMENTS | |
38 input_file=path | |
39 Path to MACS file. | |
40 | |
41 refseq_table=path | |
42 Path to refseq table (gives TSS/TES locations for all genes). | |
43 | |
44 path_output=path | |
45 Path to save genes.xls and peaks.xls output files. | |
46 | |
47 tss.dist=N | |
48 The absolute distance from TSS where we connect a peak to | |
49 gene (for proximal peaks). | |
50 | |
51 gene.wide.dist=N | |
52 The absolute distance from TSS or TES where we connect a peak | |
53 to gene (for peaks hitting anywhere in gene). | |
54 | |
55 effective.genome.size=N | |
56 The effective mappable genome size (for mm9 the value is | |
57 1.87e9. For hg19 the value is 2.7e9 (from MACS manual)). | |
58 | |
59 macs.skip.lines=N | |
60 Number of lines to skip in input_file. | |
61 | |
62 EXAMPLE | |
63 Rscript map_ChIP_peaks_to_genes_v.1.R \\ | |
64 input_file=SL971_SL970_peaks.xls \\ | |
65 refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\ | |
66 path_output=./ \\ | |
67 tss.dist=5000 \\ | |
68 gene.wide.dist=10000 \\ | |
69 effective.genome.size=1.87e9 \\ | |
70 macs.skip.lines=23 | |
71 | |
72 CITATION | |
73 Please cite us if you used this script: The transcription | |
74 factor network regulating Th17 lineage specification and | |
75 function. Maria Ciofani, Aviv Madar, Carolina Galan, Kieran | |
76 Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard | |
77 Bonneau and Dan R. Littman et. al. (in preperation). | |
78 | |
79 ") | |
80 } | |
81 | |
82 # retrieve args | |
83 if(debug==T){ | |
84 cmd.args <- c( | |
85 # "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls", | |
86 "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls", | |
87 "refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt", | |
88 "path_output=/Users/aviv/Desktop/test_script/", | |
89 "tss.dist=5000", | |
90 "tes.dist=10000", | |
91 "effective.genome.size=1.87e9", | |
92 "gene.wide.dist=10000", | |
93 "macs.skip.lines=23" | |
94 ) | |
95 } else { | |
96 cmd.args <- commandArgs(); | |
97 } | |
98 | |
99 if(length(grep("--version",cmd.args))){ | |
100 cat("version",script.version,"\n") | |
101 q() | |
102 } | |
103 | |
104 arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1]) | |
105 args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2]) | |
106 | |
107 arg.nms <- c("input_file","refseq_table","path_output","tss.dist", | |
108 "gene.wide.dist","effective.genome.size","macs.skip.lines") | |
109 arg.val <- character(length=length(arg.nms)) | |
110 for(i in 1:length(arg.nms)){ | |
111 ix <- which(arg.names.cmd.line==arg.nms[i]) | |
112 if(length(ix)==1){ | |
113 arg.val[i] <- args.val.cmd.line[ix] | |
114 } else { | |
115 stop("######could not find ",arg.nms[i]," arg######\n\n",print.error()) | |
116 | |
117 } | |
118 } | |
119 if(debug==T){ | |
120 print(paste(arg.nms,"=",arg.val)) | |
121 } | |
122 # the files here adhere to tab delim format | |
123 path.input.macs <- arg.val[1] | |
124 path.input.refseq <- arg.val[2] | |
125 path.output <- arg.val[3] | |
126 tss.dist <- as.numeric(arg.val[4]) | |
127 gene.wide.dist <- as.numeric(arg.val[5]) | |
128 effective.genome.size <- as.numeric(arg.val[6]) | |
129 macs.skip.lines=as.numeric(arg.val[7]) | |
130 | |
131 if(debug==T){ | |
132 # if(1){ | |
133 load("/Users/aviv/Desktop/r.u.RData") | |
134 gn.nms <- rownames(r.u) | |
135 } else { | |
136 ################## | |
137 ### step 1 handle refseq table file (read, make unique) | |
138 cat("reading refseq table",path.input.refseq,"\n") | |
139 refseq <- read.delim(file=path.input.refseq) | |
140 # refseq can have many transcripts for each gene | |
141 # here i make it have only one transcript for each gene (the longest one) | |
142 cat("for transcripts matching more than one gene keeping only the longest transcript\n") | |
143 refseq$name2 <- as.character(refseq$name2) | |
144 refseq$chrom <- as.character(refseq$chrom) | |
145 refseq$strand <- as.character(refseq$strand) | |
146 gn.nms <- unique(refseq$name2) | |
147 #x <- sort(table(refseq$name2),decreasing=T) | |
148 #gn.nms.non.unique <- names(x)[which(x>1)] | |
149 #gn.nms.unique <- names(x)[which(x==1)] | |
150 # create refseq unique r.u | |
151 n <- length(gn.nms) | |
152 r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms) | |
153 for(i in 1:n){ | |
154 ix <- which(refseq$name2==gn.nms[i]) | |
155 if(length(ix)==0) { | |
156 error("could not find gene", ng.nms[i], "in refseq table. Bailing out...\n") | |
157 } else if (length(ix)>1){ | |
158 l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) ) | |
159 l.max <- max(l) | |
160 ix <- ix[which(l==l.max)[1]] | |
161 } | |
162 r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")] | |
163 } | |
164 r.u[,"txStart"] <- as.numeric(r.u[,"txStart"]) | |
165 r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"]) | |
166 | |
167 # switch TSS and TES if we have chr "-" | |
168 cat("correcting TSS, TES assiments in refseq table based on strand\n") | |
169 ix <- which(r.u$strand=="-") | |
170 tmp.tes <- r.u$txStart[ix] | |
171 tmp.tss <- r.u$txEnd[ix] | |
172 r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes) | |
173 # ############ | |
174 } | |
175 | |
176 | |
177 #### step 2 read macs file | |
178 cat("reading Macs file",path.input.macs,"\n") | |
179 | |
180 # read macs file | |
181 M <- read.delim(file=path.input.macs,skip=macs.skip.lines) | |
182 trgt.prox <- NA | |
183 trgt.dist <- NA | |
184 dtss.prox <- NA | |
185 dtss.dist <- NA | |
186 M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist) | |
187 M.annot[is.na(M.annot)] <- "" | |
188 # get the number of genes | |
189 m <- length(gn.nms) | |
190 if(debug==T){ | |
191 # m <- 100 | |
192 } | |
193 f.nm.peak <- paste(sep="",path.output,"peaks.xls") | |
194 f.nm.gene <- paste(sep="",path.output,"genes.xls") | |
195 # for each genes in the expt (j goes over genes) | |
196 cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval", | |
197 "prox_max_pval","genewide_max_pval", | |
198 "prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n") | |
199 peaks.summit <- (M[,"start"]+M[,"summit"]) | |
200 cat(sep="","mapping MACS peaks to genes\n") | |
201 for (j in 1:m){ | |
202 if(j%%20==0){cat(".")} | |
203 tss <- r.u[j,"txStart"] | |
204 tes <- r.u[j,"txEnd"] | |
205 gn.lngth <- abs(tss-tes) | |
206 strand <- r.u[j,"strand"] | |
207 chr <- r.u[j,"chrom"] | |
208 if(strand=="+"){ | |
209 d.tss.all <- peaks.summit-tss | |
210 d.tes.all <- peaks.summit-tes | |
211 } else { | |
212 d.tss.all <- tss-peaks.summit | |
213 d.tes.all <- tes-peaks.summit | |
214 } | |
215 ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )), | |
216 which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE) | |
217 ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE) | |
218 ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)] | |
219 ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)] | |
220 n.peaks.prox <- length(ix.prox) | |
221 n.peaks.distal <- length(ix.distal) | |
222 # if there is at least one peak hitting gene j | |
223 if(n.peaks.distal > 0){ | |
224 # for each peak (l goes over peaks) | |
225 peaks.line <- paste(sep="","(") | |
226 for (k in 1:n.peaks.distal){ | |
227 if(k>1){ | |
228 peaks.line <- paste(sep="",peaks.line,";(") | |
229 } | |
230 d.tss <- d.tss.all[ ix.distal[k] ] | |
231 d.tes <- d.tes.all[ ix.distal[k] ] | |
232 if(sign(d.tss)!=sign(d.tes)){ | |
233 class="intra" | |
234 } else if(d.tss<=0){ | |
235 class="upstream" | |
236 } else if(d.tss>0){ | |
237 class="downstream" | |
238 } | |
239 peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ], | |
240 d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")") | |
241 # handle M.annot | |
242 dtss <- d.tss.all[ ix.distal[k] ] | |
243 prev.trgt <- M.annot[ix.distal[k],"trgt.dist"] | |
244 if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss | |
245 if(abs(dtss)<abs(as.numeric(M.annot[ix.distal[k],"dtss.dist"]))){ | |
246 M.annot[ix.distal[k],"dtss.dist"] <- dtss | |
247 M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j] | |
248 } | |
249 } else { # if peak does not have trgt add current trgt | |
250 M.annot[ix.distal[k],"dtss.dist"] <- dtss | |
251 M.annot[ix.distal[k],"trgt.dist"] <- gn.nms[j] | |
252 } | |
253 # calc the pval for these many peaks in proximal region and in gene-wide region | |
254 } | |
255 if(n.peaks.prox > 0){ | |
256 mean.pval.prox <- mean(M[ ix.prox ,7]) | |
257 max.pval.prox <- max(M[ ix.prox ,7]) | |
258 expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox)) | |
259 # lambda = num peaks / genome size * searched region | |
260 lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2) | |
261 pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE)) | |
262 # handle M.annot | |
263 for(k in 1:n.peaks.prox){ | |
264 dtss <- d.tss.all[ ix.prox[k] ] | |
265 prev.trgt <- M.annot[ix.prox[k],"trgt.prox"] | |
266 if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss | |
267 if(abs(dtss)<abs(as.numeric(M.annot[ix.prox[k],"dtss.prox"]))){ | |
268 M.annot[ix.prox[k],"dtss.prox"] <- dtss | |
269 M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j] | |
270 } | |
271 } else { # if peak does not have trgt add current trgt | |
272 M.annot[ix.prox[k],"dtss.prox"] <- dtss | |
273 M.annot[ix.prox[k],"trgt.prox"] <- gn.nms[j] | |
274 } | |
275 } | |
276 } else { | |
277 mean.pval.prox <- "NA" | |
278 max.pval.prox <- "NA" | |
279 pval.pois.prox <- "NA" | |
280 } | |
281 mean.pval.distal <- mean(M[ ix.distal ,7]) | |
282 max.pval.distal <- max(M[ ix.distal ,7]) | |
283 expected.num.peaks.genome.wide.distal <- length(which(M[,7]>=mean.pval.distal)) | |
284 # lambda = num peaks / genome size * searched region | |
285 lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2) | |
286 pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE)) | |
287 # pring peaks for gene j | |
288 core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand, | |
289 mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal, | |
290 pval.pois.prox,pval.pois.distal) | |
291 cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n") | |
292 } | |
293 } | |
294 cat("Done!\n") | |
295 | |
296 cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t") | |
297 write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t") | |
298 | |
299 | |
300 | |
301 | |
302 | |
303 | |
304 |