annotate mtls_analyze/cluster_peaks.R @ 4:b465306d00ba draft default tip

Uploaded
author kmace
date Mon, 23 Jul 2012 13:00:15 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
b465306d00ba Uploaded
kmace
parents:
diff changeset
1 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
b465306d00ba Uploaded
kmace
parents:
diff changeset
2 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
b465306d00ba Uploaded
kmace
parents:
diff changeset
3 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
b465306d00ba Uploaded
kmace
parents:
diff changeset
4 ## Feb 2012 th17
b465306d00ba Uploaded
kmace
parents:
diff changeset
5 ## Bonneau lab - "Aviv Madar" <am2654@nyu.edu>,
b465306d00ba Uploaded
kmace
parents:
diff changeset
6 ## NYU - Center for Genomics and Systems Biology
b465306d00ba Uploaded
kmace
parents:
diff changeset
7 ## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-.
b465306d00ba Uploaded
kmace
parents:
diff changeset
8 ## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \
b465306d00ba Uploaded
kmace
parents:
diff changeset
9 ##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' '
b465306d00ba Uploaded
kmace
parents:
diff changeset
10
b465306d00ba Uploaded
kmace
parents:
diff changeset
11 rm(list=ls())
b465306d00ba Uploaded
kmace
parents:
diff changeset
12 debug=F
b465306d00ba Uploaded
kmace
parents:
diff changeset
13 verbose=F
b465306d00ba Uploaded
kmace
parents:
diff changeset
14 script.version=0.1
b465306d00ba Uploaded
kmace
parents:
diff changeset
15 print.error <- function(){
b465306d00ba Uploaded
kmace
parents:
diff changeset
16 cat("
b465306d00ba Uploaded
kmace
parents:
diff changeset
17 DESCRIPTIION:
b465306d00ba Uploaded
kmace
parents:
diff changeset
18 cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where
b465306d00ba Uploaded
kmace
parents:
diff changeset
19 each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
b465306d00ba Uploaded
kmace
parents:
diff changeset
20 fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits.
b465306d00ba Uploaded
kmace
parents:
diff changeset
21 If you use .bed files, the files must contain no header, and at least the five columns:
b465306d00ba Uploaded
kmace
parents:
diff changeset
22 1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared
b465306d00ba Uploaded
kmace
parents:
diff changeset
23 interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits.
b465306d00ba Uploaded
kmace
parents:
diff changeset
24
b465306d00ba Uploaded
kmace
parents:
diff changeset
25 INPUT:
b465306d00ba Uploaded
kmace
parents:
diff changeset
26 1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
b465306d00ba Uploaded
kmace
parents:
diff changeset
27 2.path_output: path to save generated MTL cluster file (where to save mtls.xls)
b465306d00ba Uploaded
kmace
parents:
diff changeset
28 3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
b465306d00ba Uploaded
kmace
parents:
diff changeset
29 4.input_type: the type of input file used (MACS or BED; defaults to MACS)
b465306d00ba Uploaded
kmace
parents:
diff changeset
30 5.mtl_type: interval or summit (defaults to summit)
b465306d00ba Uploaded
kmace
parents:
diff changeset
31 6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit)
b465306d00ba Uploaded
kmace
parents:
diff changeset
32
b465306d00ba Uploaded
kmace
parents:
diff changeset
33 EXAMPLE RUN:
b465306d00ba Uploaded
kmace
parents:
diff changeset
34 cluster_peaks.R
b465306d00ba Uploaded
kmace
parents:
diff changeset
35 --input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
b465306d00ba Uploaded
kmace
parents:
diff changeset
36 --input_type MACS
b465306d00ba Uploaded
kmace
parents:
diff changeset
37 --path_output results/
b465306d00ba Uploaded
kmace
parents:
diff changeset
38 --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
b465306d00ba Uploaded
kmace
parents:
diff changeset
39 --dist_summits 100
b465306d00ba Uploaded
kmace
parents:
diff changeset
40 --mtl_type summit
b465306d00ba Uploaded
kmace
parents:
diff changeset
41
b465306d00ba Uploaded
kmace
parents:
diff changeset
42 Please cite us if you used this script:
b465306d00ba Uploaded
kmace
parents:
diff changeset
43 The transcription factor network regulating Th17 lineage specification and function.
b465306d00ba Uploaded
kmace
parents:
diff changeset
44 Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
b465306d00ba Uploaded
kmace
parents:
diff changeset
45 Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
46 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
47
b465306d00ba Uploaded
kmace
parents:
diff changeset
48 ############# helper functions:
b465306d00ba Uploaded
kmace
parents:
diff changeset
49 ## split.macs.list.to.chrmosomes
b465306d00ba Uploaded
kmace
parents:
diff changeset
50 # input:
b465306d00ba Uploaded
kmace
parents:
diff changeset
51 # - macs.list: a list of macs expts: here are a few lines of one expt
b465306d00ba Uploaded
kmace
parents:
diff changeset
52 ## chr start end length summit tags #NAME? fold_enrichment FDR(%)
b465306d00ba Uploaded
kmace
parents:
diff changeset
53 ## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05
b465306d00ba Uploaded
kmace
parents:
diff changeset
54 ## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09
b465306d00ba Uploaded
kmace
parents:
diff changeset
55 ## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09
b465306d00ba Uploaded
kmace
parents:
diff changeset
56 # - expts: a list of the expts names from macs.list that you want to process
b465306d00ba Uploaded
kmace
parents:
diff changeset
57 # - chr: chrmosomes names
b465306d00ba Uploaded
kmace
parents:
diff changeset
58 # output:
b465306d00ba Uploaded
kmace
parents:
diff changeset
59 # - x: a list with as many elements as chr specified by input.
b465306d00ba Uploaded
kmace
parents:
diff changeset
60 # -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS)
b465306d00ba Uploaded
kmace
parents:
diff changeset
61 split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){
b465306d00ba Uploaded
kmace
parents:
diff changeset
62 x <- list()
b465306d00ba Uploaded
kmace
parents:
diff changeset
63 n <- length(expts)
b465306d00ba Uploaded
kmace
parents:
diff changeset
64 for(i in 1:n){
b465306d00ba Uploaded
kmace
parents:
diff changeset
65 e <- expts[i] #experiment name
b465306d00ba Uploaded
kmace
parents:
diff changeset
66 cat("wroking on expt", e,"\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
67 x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]])
b465306d00ba Uploaded
kmace
parents:
diff changeset
68 names(x[[e]]) <- chr
b465306d00ba Uploaded
kmace
parents:
diff changeset
69 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
70 return(x)
b465306d00ba Uploaded
kmace
parents:
diff changeset
71 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
72 # a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m
b465306d00ba Uploaded
kmace
parents:
diff changeset
73 # input:
b465306d00ba Uploaded
kmace
parents:
diff changeset
74 # - r is chromosome
b465306d00ba Uploaded
kmace
parents:
diff changeset
75 # - m is macs matrix for expt e from above function
b465306d00ba Uploaded
kmace
parents:
diff changeset
76 split.one.macs.expt.by.chromosome.no.pml <- function(r,m){
b465306d00ba Uploaded
kmace
parents:
diff changeset
77 ix.chr.i <- which(m[,"chr"]==r)
b465306d00ba Uploaded
kmace
parents:
diff changeset
78 # cat("working on",r,"\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
79 o <- list()
b465306d00ba Uploaded
kmace
parents:
diff changeset
80 o[[r]] <- m[ix.chr.i,]
b465306d00ba Uploaded
kmace
parents:
diff changeset
81 o[[r]]$peak.id <- ix.chr.i
b465306d00ba Uploaded
kmace
parents:
diff changeset
82 return(o[[r]])
b465306d00ba Uploaded
kmace
parents:
diff changeset
83 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
84
b465306d00ba Uploaded
kmace
parents:
diff changeset
85 ## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
b465306d00ba Uploaded
kmace
parents:
diff changeset
86 # also match these summit locations with corresponding:
b465306d00ba Uploaded
kmace
parents:
diff changeset
87 # pvals, tfs, peak start and peak end
b465306d00ba Uploaded
kmace
parents:
diff changeset
88 # input:
b465306d00ba Uploaded
kmace
parents:
diff changeset
89 # - chr: a list of chromosome names
b465306d00ba Uploaded
kmace
parents:
diff changeset
90 # - macs.list.per.chrom: a list of macs peaks for each chromosome
b465306d00ba Uploaded
kmace
parents:
diff changeset
91 # output:
b465306d00ba Uploaded
kmace
parents:
diff changeset
92 # - o: a list each chormosome ruler as an element
b465306d00ba Uploaded
kmace
parents:
diff changeset
93 make.ruler.no.pml <- function(chr,macs.list.per.chrom){
b465306d00ba Uploaded
kmace
parents:
diff changeset
94 x <- macs.list.per.chrom
b465306d00ba Uploaded
kmace
parents:
diff changeset
95 o <- list()
b465306d00ba Uploaded
kmace
parents:
diff changeset
96 for(j in 1:length(chr)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
97 r <- chr[j] # chrmosome we go over
b465306d00ba Uploaded
kmace
parents:
diff changeset
98 s <- numeric()
b465306d00ba Uploaded
kmace
parents:
diff changeset
99 pval <- numeric()
b465306d00ba Uploaded
kmace
parents:
diff changeset
100 tf.to.s <- character()
b465306d00ba Uploaded
kmace
parents:
diff changeset
101 start <- numeric()
b465306d00ba Uploaded
kmace
parents:
diff changeset
102 end <- numeric()
b465306d00ba Uploaded
kmace
parents:
diff changeset
103 trtmnts <- names(x) # the treatments name (expt names)
b465306d00ba Uploaded
kmace
parents:
diff changeset
104 ## debug parameters ###
b465306d00ba Uploaded
kmace
parents:
diff changeset
105 ## which experiment peaks come from
b465306d00ba Uploaded
kmace
parents:
diff changeset
106 expt <- character()
b465306d00ba Uploaded
kmace
parents:
diff changeset
107 ## what was the peak id in that experiment
b465306d00ba Uploaded
kmace
parents:
diff changeset
108 peak.ids <- numeric()
b465306d00ba Uploaded
kmace
parents:
diff changeset
109 ## this will allow us to always back track from a cluster to the actual peaks in it
b465306d00ba Uploaded
kmace
parents:
diff changeset
110 ## debug params end ###
b465306d00ba Uploaded
kmace
parents:
diff changeset
111 for(i in 1:length(trtmnts)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
112 o[[r]] <- list()
b465306d00ba Uploaded
kmace
parents:
diff changeset
113 e <- trtmnts[i] #experiment name
b465306d00ba Uploaded
kmace
parents:
diff changeset
114 tf <- strsplit(e,"_")[[1]][1]
b465306d00ba Uploaded
kmace
parents:
diff changeset
115 s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"])
b465306d00ba Uploaded
kmace
parents:
diff changeset
116 pval <- c(pval,x[[e]][[r]][,"score"])
b465306d00ba Uploaded
kmace
parents:
diff changeset
117 start <- c(start,x[[e]][[r]][,"start"])
b465306d00ba Uploaded
kmace
parents:
diff changeset
118 end <- c(end,x[[e]][[r]][,"end"])
b465306d00ba Uploaded
kmace
parents:
diff changeset
119 expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"])))
b465306d00ba Uploaded
kmace
parents:
diff changeset
120 peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"])
b465306d00ba Uploaded
kmace
parents:
diff changeset
121 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
122 ix <- sort(s,index.return=TRUE)$ix
b465306d00ba Uploaded
kmace
parents:
diff changeset
123 o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix])
b465306d00ba Uploaded
kmace
parents:
diff changeset
124 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
125 return(o)
b465306d00ba Uploaded
kmace
parents:
diff changeset
126 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
127
b465306d00ba Uploaded
kmace
parents:
diff changeset
128 ## add cluster memberships based on ruler
b465306d00ba Uploaded
kmace
parents:
diff changeset
129 ## require no more than d.cut distance between tf summits
b465306d00ba Uploaded
kmace
parents:
diff changeset
130 ## cur.l is the current loci number (or cluster number)
b465306d00ba Uploaded
kmace
parents:
diff changeset
131 assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){
b465306d00ba Uploaded
kmace
parents:
diff changeset
132 cur.l <- 0
b465306d00ba Uploaded
kmace
parents:
diff changeset
133 for(j in 1:length(ruler)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
134 s <- ruler[[j]][["summit"]]
b465306d00ba Uploaded
kmace
parents:
diff changeset
135 l <- numeric(length=length(s))
b465306d00ba Uploaded
kmace
parents:
diff changeset
136 if(length(l)>0){
b465306d00ba Uploaded
kmace
parents:
diff changeset
137 cur.l <- cur.l+1
b465306d00ba Uploaded
kmace
parents:
diff changeset
138 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
139 if(length(l)==1){
b465306d00ba Uploaded
kmace
parents:
diff changeset
140 l[1] <- cur.l
b465306d00ba Uploaded
kmace
parents:
diff changeset
141 } else if(length(l)>1) {
b465306d00ba Uploaded
kmace
parents:
diff changeset
142 l[1] <- cur.l
b465306d00ba Uploaded
kmace
parents:
diff changeset
143 for(i in 2:length(l)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
144 d <- s[i]-s[i-1] # assumes s is sorted increasingly
b465306d00ba Uploaded
kmace
parents:
diff changeset
145 if(d>d.cut){
b465306d00ba Uploaded
kmace
parents:
diff changeset
146 cur.l <- cur.l+1
b465306d00ba Uploaded
kmace
parents:
diff changeset
147 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
148 l[i] <- cur.l
b465306d00ba Uploaded
kmace
parents:
diff changeset
149 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
150 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
151 ruler[[j]][["mtl.id"]] <- l
b465306d00ba Uploaded
kmace
parents:
diff changeset
152 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
153 return(ruler)
b465306d00ba Uploaded
kmace
parents:
diff changeset
154 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
155
b465306d00ba Uploaded
kmace
parents:
diff changeset
156 ## add cluster memberships based on ruler
b465306d00ba Uploaded
kmace
parents:
diff changeset
157 ## require that peaks span will have a non-empty intersection
b465306d00ba Uploaded
kmace
parents:
diff changeset
158 ## cur.mtl is the current loci number (or cluster number)
b465306d00ba Uploaded
kmace
parents:
diff changeset
159 assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){
b465306d00ba Uploaded
kmace
parents:
diff changeset
160 cur.mtl <- 0
b465306d00ba Uploaded
kmace
parents:
diff changeset
161 for(j in 1:length(ruler)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
162 s <- ruler[[j]][["start"]]
b465306d00ba Uploaded
kmace
parents:
diff changeset
163 e <- ruler[[j]][["end"]]
b465306d00ba Uploaded
kmace
parents:
diff changeset
164 l <- numeric(length=length(s))
b465306d00ba Uploaded
kmace
parents:
diff changeset
165 if(length(l)>0){
b465306d00ba Uploaded
kmace
parents:
diff changeset
166 cur.mtl <- cur.mtl+1
b465306d00ba Uploaded
kmace
parents:
diff changeset
167 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
168 if(length(l)==1){
b465306d00ba Uploaded
kmace
parents:
diff changeset
169 l[1] <- cur.mtl
b465306d00ba Uploaded
kmace
parents:
diff changeset
170 } else if(length(l)>1) {
b465306d00ba Uploaded
kmace
parents:
diff changeset
171 l[1] <- cur.mtl
b465306d00ba Uploaded
kmace
parents:
diff changeset
172 cur.e <- e[1] # the right-most bp belonging to the current mtl
b465306d00ba Uploaded
kmace
parents:
diff changeset
173 for(i in 2:length(l)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
174 if(cur.e<s[i]){
b465306d00ba Uploaded
kmace
parents:
diff changeset
175 cur.mtl <- cur.mtl+1
b465306d00ba Uploaded
kmace
parents:
diff changeset
176 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
177 l[i] <- cur.mtl
b465306d00ba Uploaded
kmace
parents:
diff changeset
178 cur.e <- max(cur.e,e[i])
b465306d00ba Uploaded
kmace
parents:
diff changeset
179 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
180 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
181 ruler[[j]][["mtl.id"]] <- l
b465306d00ba Uploaded
kmace
parents:
diff changeset
182 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
183 return(ruler)
b465306d00ba Uploaded
kmace
parents:
diff changeset
184 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
185
b465306d00ba Uploaded
kmace
parents:
diff changeset
186 ## here we create a list of TF enriched loci (clusters)
b465306d00ba Uploaded
kmace
parents:
diff changeset
187 ## input:
b465306d00ba Uploaded
kmace
parents:
diff changeset
188 ## - ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end)
b465306d00ba Uploaded
kmace
parents:
diff changeset
189 ## output:
b465306d00ba Uploaded
kmace
parents:
diff changeset
190 ## -ll: a list of clusters ll where each cluster (elem in ll holds):
b465306d00ba Uploaded
kmace
parents:
diff changeset
191 ## - mtl.id: the current loci (elem in ll)
b465306d00ba Uploaded
kmace
parents:
diff changeset
192 ## - s: summits vector as before
b465306d00ba Uploaded
kmace
parents:
diff changeset
193 ## - pval: pvals vector matching the summits
b465306d00ba Uploaded
kmace
parents:
diff changeset
194 ## - tfs: a vector of tfs matching s, the summits in loci l
b465306d00ba Uploaded
kmace
parents:
diff changeset
195 ## - spans: a vector of spans matching s, the summits in loci l, where spans is the dist between start and end of a peak
b465306d00ba Uploaded
kmace
parents:
diff changeset
196 create.cluster.list.no.pml <- function(ruler){
b465306d00ba Uploaded
kmace
parents:
diff changeset
197 tmp <- list()
b465306d00ba Uploaded
kmace
parents:
diff changeset
198 ll <- list()
b465306d00ba Uploaded
kmace
parents:
diff changeset
199 for(j in 1:length(ruler)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
200 r <- names(ruler)[j]
b465306d00ba Uploaded
kmace
parents:
diff changeset
201 # cat("working on ",r,"\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
202 x <- ruler[[j]] # for short typing let x stand for ruler[[chr[j]]]
b465306d00ba Uploaded
kmace
parents:
diff changeset
203 l.vec <- unique(x[["mtl.id"]]) # the clusters ids on j chr (only unique)
b465306d00ba Uploaded
kmace
parents:
diff changeset
204 n <- length(l.vec) # iterate over n clusters
b465306d00ba Uploaded
kmace
parents:
diff changeset
205 tmp[[j]] <- lapply(1:n,get.cluster.params.no.pml,x=x,l.vec=l.vec,r=r)
b465306d00ba Uploaded
kmace
parents:
diff changeset
206 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
207 ## concatenate tmp into one list
b465306d00ba Uploaded
kmace
parents:
diff changeset
208 command <- paste(sep="","c(",paste(sep="","tmp[[",1:length(tmp),"]]",collapse=","),")")
b465306d00ba Uploaded
kmace
parents:
diff changeset
209 ll=eval(parse(text=command))
b465306d00ba Uploaded
kmace
parents:
diff changeset
210 return(ll)
b465306d00ba Uploaded
kmace
parents:
diff changeset
211 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
212 get.cluster.params.no.pml <- function(i,x,l.vec,r){
b465306d00ba Uploaded
kmace
parents:
diff changeset
213 # ix <- which(x[["l"]]==l.vec[i])
b465306d00ba Uploaded
kmace
parents:
diff changeset
214 # l <- l.vec[i]
b465306d00ba Uploaded
kmace
parents:
diff changeset
215 ix <- which(x[["mtl.id"]]==l.vec[i])
b465306d00ba Uploaded
kmace
parents:
diff changeset
216 l <- l.vec[i]
b465306d00ba Uploaded
kmace
parents:
diff changeset
217 start <- min(x[["start"]][ix])
b465306d00ba Uploaded
kmace
parents:
diff changeset
218 end <- max(x[["end"]][ix])
b465306d00ba Uploaded
kmace
parents:
diff changeset
219 # s <- x[["s"]][ix]
b465306d00ba Uploaded
kmace
parents:
diff changeset
220 summit <- x[["summit"]][ix]
b465306d00ba Uploaded
kmace
parents:
diff changeset
221 # pval <- x[["pval"]][ix]
b465306d00ba Uploaded
kmace
parents:
diff changeset
222 score <- x[["score"]][ix]
b465306d00ba Uploaded
kmace
parents:
diff changeset
223 # pval.mean <- mean(pval)
b465306d00ba Uploaded
kmace
parents:
diff changeset
224 score.mean <- mean(score)
b465306d00ba Uploaded
kmace
parents:
diff changeset
225 span.tfs <- x[["end"]][ix]-x[["start"]][ix]
b465306d00ba Uploaded
kmace
parents:
diff changeset
226 span.l <- end-start
b465306d00ba Uploaded
kmace
parents:
diff changeset
227 peak.ids <- x[["peak.ids"]][ix]
b465306d00ba Uploaded
kmace
parents:
diff changeset
228 expt <- x[["expt"]][ix]
b465306d00ba Uploaded
kmace
parents:
diff changeset
229 expt.alphanum.sorted <- sort(x[["expt"]][ix])
b465306d00ba Uploaded
kmace
parents:
diff changeset
230
b465306d00ba Uploaded
kmace
parents:
diff changeset
231 chr <- rep(r,length(ix))
b465306d00ba Uploaded
kmace
parents:
diff changeset
232 return(list(mtl.id=l,chr=chr,expt=expt,expt.alphanum.sorted=expt.alphanum.sorted,start=start,
b465306d00ba Uploaded
kmace
parents:
diff changeset
233 end=end,summit=summit,score=score,score.mean=score.mean,span.tfs=span.tfs,
b465306d00ba Uploaded
kmace
parents:
diff changeset
234 span.l=span.l,peak.ids=peak.ids))
b465306d00ba Uploaded
kmace
parents:
diff changeset
235 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
236
b465306d00ba Uploaded
kmace
parents:
diff changeset
237 ## pretty print (tab deim) to file requested elements out of chip cluster list, ll.
b465306d00ba Uploaded
kmace
parents:
diff changeset
238 ## input:
b465306d00ba Uploaded
kmace
parents:
diff changeset
239 ## - ll: a cluster list
b465306d00ba Uploaded
kmace
parents:
diff changeset
240 ## - f.nm: a file name (include path) to where you want files to print
b465306d00ba Uploaded
kmace
parents:
diff changeset
241 ## - tfs: a list of the tfs we want to print the file for (the same as the tfs used for the peak clustering)
b465306d00ba Uploaded
kmace
parents:
diff changeset
242 ## output
b465306d00ba Uploaded
kmace
parents:
diff changeset
243 ## - a tab delim file with clusers as rows and elems tab delim for each cluster
b465306d00ba Uploaded
kmace
parents:
diff changeset
244 print.ll.verbose.all <- function(ll,f.nm="ll.xls"){
b465306d00ba Uploaded
kmace
parents:
diff changeset
245 options(digits=5)
b465306d00ba Uploaded
kmace
parents:
diff changeset
246 cat(file=f.nm,names(ll[[1]]),sep="\t")
b465306d00ba Uploaded
kmace
parents:
diff changeset
247 cat(file=f.nm,"\n",append=TRUE)
b465306d00ba Uploaded
kmace
parents:
diff changeset
248 for(i in 1:length(ll)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
249 line <- ll[[i]][[1]] # put MTL number
b465306d00ba Uploaded
kmace
parents:
diff changeset
250 line <- paste(line,ll[[i]][[2]][1],sep="\t")
b465306d00ba Uploaded
kmace
parents:
diff changeset
251 for(j in 3:length(ll[[i]])){
b465306d00ba Uploaded
kmace
parents:
diff changeset
252 val <- ll[[i]][[j]]
b465306d00ba Uploaded
kmace
parents:
diff changeset
253 if(length(val) == 1){
b465306d00ba Uploaded
kmace
parents:
diff changeset
254 line <- paste(line,val,sep="\t")
b465306d00ba Uploaded
kmace
parents:
diff changeset
255 } else {
b465306d00ba Uploaded
kmace
parents:
diff changeset
256 line <- paste(line,paste(sep="",unlist(ll[[i]][j]),collapse="_"),sep="\t")
b465306d00ba Uploaded
kmace
parents:
diff changeset
257 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
258 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
259 cat(file=f.nm,line,"\n",append=TRUE)
b465306d00ba Uploaded
kmace
parents:
diff changeset
260 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
261 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
262 # read command line paramters that are not optional
b465306d00ba Uploaded
kmace
parents:
diff changeset
263 read.cmd.line.params.must <- function(args.nms, cmd.line.args){
b465306d00ba Uploaded
kmace
parents:
diff changeset
264 if(length(grep("--version",cmd.line.args))){
b465306d00ba Uploaded
kmace
parents:
diff changeset
265 cat("version",script.version,"\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
266 q()
b465306d00ba Uploaded
kmace
parents:
diff changeset
267 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
268 args <- sapply(strsplit(cmd.line.args," "),function(i) i)
b465306d00ba Uploaded
kmace
parents:
diff changeset
269 vals <- character(length(args.nms))
b465306d00ba Uploaded
kmace
parents:
diff changeset
270 # split cmd.line to key and value pairs
b465306d00ba Uploaded
kmace
parents:
diff changeset
271 for(i in 1:length(args.nms)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
272 ix <- grep(args.nms[i],args)
b465306d00ba Uploaded
kmace
parents:
diff changeset
273 if(length(ix)>1){
b465306d00ba Uploaded
kmace
parents:
diff changeset
274 stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error())
b465306d00ba Uploaded
kmace
parents:
diff changeset
275 } else if (length(ix)==0){
b465306d00ba Uploaded
kmace
parents:
diff changeset
276 stop("could not find ",args.nms[i],". Bailing out...\n",print.error())
b465306d00ba Uploaded
kmace
parents:
diff changeset
277 } else {
b465306d00ba Uploaded
kmace
parents:
diff changeset
278 vals[i] <- args[ix+1]
b465306d00ba Uploaded
kmace
parents:
diff changeset
279 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
280 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
281 return(vals)
b465306d00ba Uploaded
kmace
parents:
diff changeset
282 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
283 # read command line paramters that are optional, if an optional param is not give functions return na for this param
b465306d00ba Uploaded
kmace
parents:
diff changeset
284 read.cmd.line.params.optional <- function(args.nms, cmd.line.args){
b465306d00ba Uploaded
kmace
parents:
diff changeset
285 args <- sapply(strsplit(cmd.line.args," "),function(i) i)
b465306d00ba Uploaded
kmace
parents:
diff changeset
286 vals <- character(length(args.nms))
b465306d00ba Uploaded
kmace
parents:
diff changeset
287 # split cmd.line to key and value pairs
b465306d00ba Uploaded
kmace
parents:
diff changeset
288 for(i in 1:length(args.nms)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
289 ix <- grep(args.nms[i],args)
b465306d00ba Uploaded
kmace
parents:
diff changeset
290 if(length(ix)>1){
b465306d00ba Uploaded
kmace
parents:
diff changeset
291 stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error())
b465306d00ba Uploaded
kmace
parents:
diff changeset
292 } else if (length(ix)==0){ # if --param was not written in cmd line
b465306d00ba Uploaded
kmace
parents:
diff changeset
293 vals[i] <- NA
b465306d00ba Uploaded
kmace
parents:
diff changeset
294 } else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value
b465306d00ba Uploaded
kmace
parents:
diff changeset
295 vals[i] <- args[ix+1]
b465306d00ba Uploaded
kmace
parents:
diff changeset
296 } else { # otherwise
b465306d00ba Uploaded
kmace
parents:
diff changeset
297 vals[i] <- NA
b465306d00ba Uploaded
kmace
parents:
diff changeset
298 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
299 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
300 return(vals)
b465306d00ba Uploaded
kmace
parents:
diff changeset
301 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
302
b465306d00ba Uploaded
kmace
parents:
diff changeset
303 ############# Code:
b465306d00ba Uploaded
kmace
parents:
diff changeset
304 # retrieve args
b465306d00ba Uploaded
kmace
parents:
diff changeset
305
b465306d00ba Uploaded
kmace
parents:
diff changeset
306 if(debug==T){
b465306d00ba Uploaded
kmace
parents:
diff changeset
307 cmd.args <- c(
b465306d00ba Uploaded
kmace
parents:
diff changeset
308 "--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls",
b465306d00ba Uploaded
kmace
parents:
diff changeset
309 #"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed",
b465306d00ba Uploaded
kmace
parents:
diff changeset
310 "--input_type MACS", #BED
b465306d00ba Uploaded
kmace
parents:
diff changeset
311 "--path_output ./results/",
b465306d00ba Uploaded
kmace
parents:
diff changeset
312 "--expt_names macs1::macs2::macs3",
b465306d00ba Uploaded
kmace
parents:
diff changeset
313 #"--expt_names bed1::bed2::bed3",
b465306d00ba Uploaded
kmace
parents:
diff changeset
314 "--mtl_type interval", #interval summit
b465306d00ba Uploaded
kmace
parents:
diff changeset
315 "--dist_summits 100"
b465306d00ba Uploaded
kmace
parents:
diff changeset
316 )
b465306d00ba Uploaded
kmace
parents:
diff changeset
317 } else {
b465306d00ba Uploaded
kmace
parents:
diff changeset
318 cmd.args <- commandArgs(trailingOnly = T);
b465306d00ba Uploaded
kmace
parents:
diff changeset
319 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
320
b465306d00ba Uploaded
kmace
parents:
diff changeset
321 # if(length(grep("--version",cmd.args))){
b465306d00ba Uploaded
kmace
parents:
diff changeset
322 # cat("version",script.version,"\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
323 # q()
b465306d00ba Uploaded
kmace
parents:
diff changeset
324 # }
b465306d00ba Uploaded
kmace
parents:
diff changeset
325 args.nms.must <- c(
b465306d00ba Uploaded
kmace
parents:
diff changeset
326 "--input_files", #1
b465306d00ba Uploaded
kmace
parents:
diff changeset
327 "--path_output", #2
b465306d00ba Uploaded
kmace
parents:
diff changeset
328 "--expt_names" #3
b465306d00ba Uploaded
kmace
parents:
diff changeset
329 )
b465306d00ba Uploaded
kmace
parents:
diff changeset
330
b465306d00ba Uploaded
kmace
parents:
diff changeset
331 # all numeric params must come at the end of args.nms.optional
b465306d00ba Uploaded
kmace
parents:
diff changeset
332 n.start.numeric <- 3
b465306d00ba Uploaded
kmace
parents:
diff changeset
333 args.nms.optional <- c(
b465306d00ba Uploaded
kmace
parents:
diff changeset
334 "--input_type", #1
b465306d00ba Uploaded
kmace
parents:
diff changeset
335 "--mtl_type", #2
b465306d00ba Uploaded
kmace
parents:
diff changeset
336 "--dist_summits" #3
b465306d00ba Uploaded
kmace
parents:
diff changeset
337 )
b465306d00ba Uploaded
kmace
parents:
diff changeset
338
b465306d00ba Uploaded
kmace
parents:
diff changeset
339 # get must parameters
b465306d00ba Uploaded
kmace
parents:
diff changeset
340 vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args)
b465306d00ba Uploaded
kmace
parents:
diff changeset
341 if(verbose){
b465306d00ba Uploaded
kmace
parents:
diff changeset
342 cat("\nfinshed reading vals: \n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
343 cat("\nvals.must", unlist(vals.must), "\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
344 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
345 input.files <- strsplit(vals.must[1],"::")[[1]]
b465306d00ba Uploaded
kmace
parents:
diff changeset
346 if(length(input.files)==1){
b465306d00ba Uploaded
kmace
parents:
diff changeset
347 cat("only provided one MACS file to cluster.")
b465306d00ba Uploaded
kmace
parents:
diff changeset
348 print.error()
b465306d00ba Uploaded
kmace
parents:
diff changeset
349 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
350 path.output <- vals.must[2]
b465306d00ba Uploaded
kmace
parents:
diff changeset
351 expt.names <- strsplit(vals.must[3],"::")[[1]]
b465306d00ba Uploaded
kmace
parents:
diff changeset
352 # get optional parameters
b465306d00ba Uploaded
kmace
parents:
diff changeset
353 vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args)
b465306d00ba Uploaded
kmace
parents:
diff changeset
354 if(verbose){
b465306d00ba Uploaded
kmace
parents:
diff changeset
355 cat("\nvals.optional:", unlist(vals.optional),"\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
356 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
357 if(is.na(vals.optional[1])){
b465306d00ba Uploaded
kmace
parents:
diff changeset
358 input.type <- "MACS"
b465306d00ba Uploaded
kmace
parents:
diff changeset
359 } else {
b465306d00ba Uploaded
kmace
parents:
diff changeset
360 input.type <- vals.optional[1]
b465306d00ba Uploaded
kmace
parents:
diff changeset
361 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
362 if(is.na(vals.optional[2])){
b465306d00ba Uploaded
kmace
parents:
diff changeset
363 mtl.type <- "interval"
b465306d00ba Uploaded
kmace
parents:
diff changeset
364 } else {
b465306d00ba Uploaded
kmace
parents:
diff changeset
365 mtl.type <- vals.optional[2]
b465306d00ba Uploaded
kmace
parents:
diff changeset
366 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
367 if(is.na(vals.optional[2])){
b465306d00ba Uploaded
kmace
parents:
diff changeset
368 dist.summits <- 100
b465306d00ba Uploaded
kmace
parents:
diff changeset
369 } else {
b465306d00ba Uploaded
kmace
parents:
diff changeset
370 dist.summits <- as.numeric(vals.optional[3])
b465306d00ba Uploaded
kmace
parents:
diff changeset
371 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
372
b465306d00ba Uploaded
kmace
parents:
diff changeset
373 # read MACS files
b465306d00ba Uploaded
kmace
parents:
diff changeset
374 unique.chr <- character()
b465306d00ba Uploaded
kmace
parents:
diff changeset
375 infile.list <- list()
b465306d00ba Uploaded
kmace
parents:
diff changeset
376 col.nms.keepers <- c("chr","start","end","summit","score")
b465306d00ba Uploaded
kmace
parents:
diff changeset
377 for(i in 1:length(input.files)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
378 e <- expt.names[i]
b465306d00ba Uploaded
kmace
parents:
diff changeset
379 if(toupper(input.type)=="MACS"){
b465306d00ba Uploaded
kmace
parents:
diff changeset
380 tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F)
b465306d00ba Uploaded
kmace
parents:
diff changeset
381 columns <- c("chr","start","end","summit","pvalue")
b465306d00ba Uploaded
kmace
parents:
diff changeset
382 ix.cols <- numeric()
b465306d00ba Uploaded
kmace
parents:
diff changeset
383 for(j in 1:length(columns)){
b465306d00ba Uploaded
kmace
parents:
diff changeset
384 ix.j <- grep(columns[j],names(tmp))
b465306d00ba Uploaded
kmace
parents:
diff changeset
385 if(length(ix.j)==0){
b465306d00ba Uploaded
kmace
parents:
diff changeset
386 stop("can't find (grep) column ",columns[j]," in MACS input file ", e)
b465306d00ba Uploaded
kmace
parents:
diff changeset
387 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
388 ix.cols <- c(ix.cols,ix.j)
b465306d00ba Uploaded
kmace
parents:
diff changeset
389 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
390 infile.list[[e]] <- tmp[,ix.cols]
b465306d00ba Uploaded
kmace
parents:
diff changeset
391 colnames(infile.list[[e]]) <- col.nms.keepers
b465306d00ba Uploaded
kmace
parents:
diff changeset
392 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
393 if(toupper(input.type)=="BED"){
b465306d00ba Uploaded
kmace
parents:
diff changeset
394 tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE)
b465306d00ba Uploaded
kmace
parents:
diff changeset
395 tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column
b465306d00ba Uploaded
kmace
parents:
diff changeset
396 colnames(tmp) <- col.nms.keepers
b465306d00ba Uploaded
kmace
parents:
diff changeset
397 infile.list[[e]] <- tmp[,1:5]
b465306d00ba Uploaded
kmace
parents:
diff changeset
398 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
399 unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"]))
b465306d00ba Uploaded
kmace
parents:
diff changeset
400 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
401
b465306d00ba Uploaded
kmace
parents:
diff changeset
402 unique.chr <- sort(unique.chr)
b465306d00ba Uploaded
kmace
parents:
diff changeset
403
b465306d00ba Uploaded
kmace
parents:
diff changeset
404 # take all macs files and put peaks together on each chromosome
b465306d00ba Uploaded
kmace
parents:
diff changeset
405 # (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls)
b465306d00ba Uploaded
kmace
parents:
diff changeset
406 cat("splitting input files per chromosome\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
407 x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr)
b465306d00ba Uploaded
kmace
parents:
diff changeset
408 cat("adding peaks from all input files into chromosome rulers\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
409 ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x)
b465306d00ba Uploaded
kmace
parents:
diff changeset
410 cat("add MTL membership to the ruler\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
411
b465306d00ba Uploaded
kmace
parents:
diff changeset
412 if(mtl.type=="interval"){
b465306d00ba Uploaded
kmace
parents:
diff changeset
413 ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler)
b465306d00ba Uploaded
kmace
parents:
diff changeset
414 } else {
b465306d00ba Uploaded
kmace
parents:
diff changeset
415 ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits)
b465306d00ba Uploaded
kmace
parents:
diff changeset
416 }
b465306d00ba Uploaded
kmace
parents:
diff changeset
417
b465306d00ba Uploaded
kmace
parents:
diff changeset
418 cat("creating MTL list\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
419 ll <- create.cluster.list.no.pml(ruler=ruler)
b465306d00ba Uploaded
kmace
parents:
diff changeset
420 cat("writing MTL table\n")
b465306d00ba Uploaded
kmace
parents:
diff changeset
421 f.nm <- paste(sep="",path.output,"mtls",".xls")
b465306d00ba Uploaded
kmace
parents:
diff changeset
422 print.ll.verbose.all(ll=ll,f.nm=f.nm)
b465306d00ba Uploaded
kmace
parents:
diff changeset
423
b465306d00ba Uploaded
kmace
parents:
diff changeset
424
b465306d00ba Uploaded
kmace
parents:
diff changeset
425
b465306d00ba Uploaded
kmace
parents:
diff changeset
426
b465306d00ba Uploaded
kmace
parents:
diff changeset
427
b465306d00ba Uploaded
kmace
parents:
diff changeset
428
b465306d00ba Uploaded
kmace
parents:
diff changeset
429
b465306d00ba Uploaded
kmace
parents:
diff changeset
430
b465306d00ba Uploaded
kmace
parents:
diff changeset
431