comparison mtls_analyze/heatmap.R @ 0:a903c48f05b4

Added non-integrated scripts to galaxy
author kmace
date Thu, 22 Mar 2012 15:21:00 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a903c48f05b4
1 rm(list = ls())
2
3 GeneratePeakMatrix <- function(experiments, scores) {
4 # Generates a score matrix for all mtls.
5 #
6 # Args:
7 # experiments: A list of underscore deliminated experiments for each mtl.
8 # There should never be a completely empty item in this list.
9 # for eg. experiments[1] = expA_expD_expB.
10 # scores: A list of underscore deliminated scores for each mtl, the length
11 # of these scores should be identicle to the length of the
12 # experiments. eg. scores[1] = 55_33_245.
13 #
14 # Returns:
15 # The peak score matrix for all mtls.
16 experiments <- sapply(experiments, function(x) strsplit(x, split = "_"))
17 scores <- sapply(scores, function(x) strsplit(x, split = "_"))
18 unique.experiments <- unique(unlist(experiments))
19
20 peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments))
21 colnames(peaks) <- unique.experiments
22
23 for(i in 1:length(experiments)){
24 for(j in 1:length(experiments[[i]])){
25 peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j])
26 }
27 }
28 return(peaks)
29 ##################################################################
30 }
31 GetChipData <- function(file.name,
32 proximity = "distal",
33 include.targetless = TRUE) {
34 # Reads in, filters, and organizes mtls data
35 #
36 # Args:
37 # file.name: The path to the mtls file (including the file name).
38 # proximity: Either "distal" or "proximal", defines the gene target distance
39 # from the mtl. Default is "distal".
40 # include.targetless: If TRUE, includes mtls with no targets (applied after
41 # the proximity filter); if not, mtls with no target
42 # will be exclided. Default is TRUE.
43 #
44 # Returns:
45 # An organized list of mtls data with the following elements:
46 # $peaks - a matrix of peak p-values
47 # $targets - a list of underscore deliminated gene targets for each mtl
48 if(param$debug) {
49 print("In GetChipData")
50 }
51 # Set Constants for the mtls file type:
52 MTLNUM <- "l"
53 CHROMOSOME <- "chr"
54 EXPERIMENTS <- "expt"
55 EXPERIMENTS.SORTED <- "expt.alphanum.sorted"
56 START <- "start"
57 END <- "end"
58 SUMMIT <- "s"
59 PVALS <- "pval"
60 PVAL.MEAN <- "pval.mean"
61 SPANS <- "span.tfs"
62 SPAN.TOTAL <- "span.l"
63 PEAK.IDS <- "peak.ids" # TODO(kieran): Find out what this is
64 PROXIMAL.TARGETS <- "trgt.prox"
65 DISTAL.TARGETS <- "trgt.dist"
66 PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox"
67 DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist"
68
69
70 #Get chip data in from files:
71 TARGETS <- switch(proximity,
72 distal = DISTAL.TARGETS,
73 proximal = PROXIMAL.TARGETS,
74 stop(" Bad proximity argument supplied.")) # Default
75
76 keep.columns <- c(EXPERIMENTS, PVALS, TARGETS)
77
78 file <- read.delim(file.name, header=T, as.is=T)[, keep.columns]
79
80 if (!include.targetless) {
81 ix.has.trgt <- which(as.character(file[, TARGETS])!="")
82 file <- file[ix.has.trgt, ]
83 }
84
85 chip = list()
86
87 chip$peaks = GeneratePeakMatrix(file[, EXPERIMENTS], file[, PVALS])
88 chip$targets = file[ , TARGETS]
89 return (chip)
90 }
91
92
93
94 GenerateKMOrder <- function(data, km) {
95 # generates the order of clusters from high to low mean values
96 #
97 # Args:
98 # data: A matrix of scores that have already been clustered.
99 # km: The K-means object generated from running kmeans on data. Another
100 # method could be used so long as it supplies a (km)$cluser list. Must
101 # have the same length as the number of rows in data
102 #
103 # Returns:
104 # cluster.order: The order in which the clusters should be displayed.
105
106 km.cluster = km$cluster
107 clusters = unique(km.cluster)
108 clusters.avg = numeric()
109 for(i in clusters) {
110 clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ]))
111 }
112 if(param$debug) {
113 print ("straight clusters")
114 print (clusters)
115 print ("straigth average")
116 print (clusters.avg)
117 print ("ordered clusters")
118 print (clusters[order(clusters.avg)])
119 print("ordered average")
120 print (clusters.avg[order(clusters.avg)])
121 }
122 return(clusters[rev(order(clusters.avg))])
123 }
124
125 OrderMTL <- function(data, km, cluster.order) {
126 # Orders a matrix of data according to a clustering algorithm
127 #
128 # Args:
129 # data: A matrix of scores that have already been clustered.
130 # km: The K-means object generated from running kmeans on data. Another
131 # method could be used so long as it supplies a (km)$cluser list. Must
132 # have the same length as the number of rows in data
133 # cluster.order: The order in which the clusters should be displayed.
134 # for eg. km.order = c(2, 3, 1) would result in cluster 2
135 # being on top, then cluster 3 and lastly cluster 1.
136 #
137 # Returns:
138 # a list that contains 3 objects:
139 # list$data: the ordered version of the data.
140 # list$color.vector: a list of colors that should be assigned to each row.
141 # list$start.row: the starting row of each cluster in data.
142 number.clusters <- length(cluster.order)
143 cluster.colors <- sample(rainbow(number.clusters))
144
145 # Set up return objects
146 sorted.data <- matrix(,nr=nrow(data), nc=ncol(data))
147 colnames(sorted.data) <- colnames(data)
148 cluster.color.vector = vector(length=length(km$cluster))
149 cluster.start.row = numeric(number.clusters)
150 cluster.start.row[1]=1
151
152 for (i in 1:number.clusters)
153 {
154 current.cluster = cluster.order[i]
155 ix = which(km$cluster == current.cluster)
156 current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1)
157 sorted.data[current.cluster.range, ] = data[ix, ]
158 cluster.color.vector[current.cluster.range] = cluster.colors[i]
159 cluster.start.row[i+1] = (cluster.start.row[i]+length(ix))
160 }
161 ret.list = list()
162 ret.list$data = sorted.data
163 ret.list$color.vector = cluster.color.vector
164 ret.list$cluster.index = cluster.start.row
165 return(ret.list)
166 }
167
168
169 CreateHeatMap <- function(data,
170 km,
171 cluster.order,
172 document.name,
173 document.type = "png",
174 number.colors = 30) {
175 # Generates a heatmap image based for a matrix based on a clustering
176 # algorithm.
177 #
178 # Args:
179 # data: A matrix of scores that have already been clustered, the column
180 # names of this matrix will become the column titels of the heatmap.
181 # km: The K-means object generated from running kmeans on data. Another
182 # method could be used so long as it supplies a (km)$cluser list.
183 # cluster.order: The order in which the clusters should be displayed.
184 # for eg. km.order = c(2, 3, 1) would result in cluster 2
185 # being on top, then cluster 3 and lastly cluster 1.
186 # document.name: A name for the produced file. there is no need to
187 # supply the .png/.pdf in your argument.
188 # document.type: The type of file you want to produce. current options are
189 # png and pdf. Default is pdf.
190 #
191 # Returns:
192 # Nothing to the script that calls it, however it creates an image at the
193 # path specified.
194 if(param$debug) {
195 print("In CreateHeatMap")
196 }
197
198 data.ordered <- OrderMTL(data, km, cluster.order)
199
200
201 #Load Lib
202 library(gplots)
203
204 #Set Color gradient
205 color.ramp = colorRampPalette(c("black",
206 "darkblue",
207 "blue",
208 "yellow",
209 "orange",
210 "red"))(number.colors) #7
211
212 if(document.type == "png") {
213 png(paste(document.name,".png", sep = ""),
214 width=1920,
215 height=2560,
216 res=100,
217 antialias="gray")
218 }
219 else {
220 pdf(paste(document.name,".pdf", sep = ""))
221 }
222
223 heatmap.2(
224 data.ordered$data[,sort(colnames(data.ordered$data))],
225 rowsep = data.ordered$cluster.index[-1],
226 sepwidth = c(0.5, 60),
227 dendrogram = "none",
228 Rowv = F,
229 Colv = F,
230 trace = "none",
231 labRow = F,
232 labCol = sort(colnames(data.ordered$data)),
233 RowSideColors = data.ordered$color.vector,
234 col = color.ramp,
235 cexCol = 1.8,
236 cexRow = 0.8,
237 useRaster=F)
238
239 dev.off()
240
241 }
242
243
244
245
246 ReadCommadLineParameters <- function(argument.names, command.line.arguments) {
247 # Reads the parameters from the command line arguments.
248 #
249 # Args:
250 # argument.names: A list of expected argument names.
251 # command.line.arguments: The list of recieved command line arguments
252 #
253 # Returns:
254 # The arguments for argument.names. As strings In that order.
255
256 if(length(grep("--version",command.line.arguments))){
257 cat("version",script.version,"\n")
258 q()
259 }
260 # Split command line arguments
261 args <- sapply(strsplit(command.line.arguments, " "),function(i) i)
262 vals <- character(length(argument.names))
263 # split cmd.line to key and value pairs
264 for(i in 1:length(argument.names)) {
265 ix <- grep(argument.names[i], args)
266 if(length(ix)>1) {
267 stop("arg ",
268 argument.names[i],
269 " used more than once. Bailing out...\n",
270 PrintParamError())
271 }
272 else if (length(ix)==0) {
273 stop("could not find --",
274 argument.names[i],
275 ". Bailing out...\n",
276 PrintParamError())
277 }
278 vals[i] <- args[ix+1]
279 }
280 return(vals)
281 }
282
283 PrintParamError <- function(){
284 # Prints the usage of the function, shows users what arguments they can use
285 #
286 # Args:
287 # param: A list that contains all the paramaters.
288 #
289 # Returns:
290 # A modified version of the param list, with the default values loaded.
291 cat("
292 DESCRIPTIION:
293 heatmap.R takes a ...
294
295 INPUT:
296 1.--mtls_file path to mtls file.\n
297 2.--cluster_file the destination path for the cluster file.\n
298 3.--heatmap_file the destination path for heatmap image (no extension).\n
299 4.--heatmap_type choice of image type, currently support png and pdf.\n
300 5.--n_clusters number of clusters in the heatmap\n
301 6.--filter_percentage percentage of mtls that will be analysed. for eg. if
302 we make filter_percentage 30, we will take the union of the top mtls in
303 mean, non-zero mean and variance.\n
304
305 EXAMPLE RUN:
306 Rscript heatmap.R
307 --mtls_file mtls.xls
308 --cluster_file output/cluster
309 --heatmap_file output/heatmap
310 --heatmap_type png
311 --n_clusters 13
312 --filter_percentage 60
313
314 ")
315 }
316
317 LoadDefaultParams <- function(param) {
318 # Loads default paramaters for the heatmap application
319 #
320 # Args:
321 # param: A list that contains all the previous paramaters.
322 #
323 # Returns:
324 # A modified version of the param list, with the default values loaded.
325
326 script.version=0.1
327 param$debug = F
328
329 # ChIP data:
330 param$annotated.macs.file = "data/mtls/mtls.xls"
331
332 # Filter:
333 param$filter.percentage <- 100
334
335 # Analyze Clustering:
336 param$analyze.number.of.clusters = F
337 param$analyze.number.of.clusters.sample.size = 10000
338 param$analyze.number.of.clusters.min = 4
339 param$analyze.number.of.clusters.max = 9
340 param$analyze.number.of.clusters.document.name <- "data/analysis/cluster_analysis"
341 param$analyze.number.of.clusters.itterations = 25
342
343 # Clustering:
344 param$clustering.number.of.clusters <- 13
345
346 # Heatmap:
347 param$heatmap.document.name <- "data/heatmap/heatmap"
348 param$heatmap.document.type <- "png"
349 #Cluster Groups:
350 param$cluster.groups.document.name <- "data/clusters/clusters"
351
352 return(param)
353
354 }
355
356
357
358 LoadDefinedParams <- function(param) {
359 # Loads user defined paramaters for the heatmap application
360 #
361 # Args:
362 # param: A list that contains all the previous paramaters.
363 #
364 # Returns:
365 # A modified version of the param list, with the user defined values loaded.
366
367 cmd.args <- commandArgs(trailingOnly = T);
368
369
370 # put the numeric params at the end!
371 n.start.numeric <- 5
372 args.nms <- c( "--mtls_file", #1
373 "--cluster_file", #2
374 "--heatmap_file", #3
375 "--heatmap_type", #4
376 "--n_clusters", #5
377 "--filter_percentage") #6
378
379 # get parameters
380 vals <- ReadCommadLineParameters(argument.names = args.nms,
381 command.line.arguments = cmd.args)
382
383 # check if numeric params are indeed numeric
384 for(i in n.start.numeric:length(vals)){
385 if(is.na(as.numeric(vals[i]))){
386 stop("arg ",args.nms[i]," is not numeric. Bailing out...\n",print.error())
387 }
388 }
389
390 # vals has the same order as args.nms
391
392 # ChIP data:
393 param$annotated.macs.file <- vals[1]
394
395 # Filter:
396 param$filter.percentage <- as.numeric(vals[6])
397
398 # Clustering:
399 param$clustering.number.of.clusters <- as.numeric(vals[5])
400
401 # Heatmap:
402 param$heatmap.document.name <- vals[3]
403 param$heatmap.document.type <- vals[4]
404 #Cluster Groups:
405 param$cluster.groups.document.name <- vals[2]
406
407 return(param)
408 }
409
410
411
412
413 # here we output the fasta file of the targets of each kmeans cluster
414 CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){
415 f.nm = paste(f.nm, ".fasta", sep="")
416 clusters = km.order
417 for(i in 1:length(clusters)){
418 v=trgts[which(k.ix==clusters[i])]
419 v.split = unlist(sapply(v,strsplit, "_"))
420 if(i == 1){
421 cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n")
422 } else {
423 cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE)
424 }
425 cat(sep="\n",file=f.nm,v.split,append=TRUE)
426 }
427 }
428
429
430 GetTopRowsFromMatrix = function(mtrx, percentage = 10)
431 {
432 if (param$debug) {
433 print("In GetTopRowsFromMatrix")
434 }
435 #Store the stats for the mtrx
436 stats = list()
437 stats$mean=apply(mtrx,1,mean)
438 stats$sd=apply(mtrx,1,sd)
439 stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)]))
440
441
442 #Store the indexes for the mtrx
443 index = list()
444 index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix
445 index$sd = sort(stats$sd, decreasing=T, index.return=T)$ix
446 index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix
447
448 #Calculate how many data points we want to take
449 cutoff = floor(length(mtrx[ ,1])*(percentage/100))
450
451 #Extract Indexes and return to caller
452 index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff]))
453
454 return(index)
455 }
456
457 #GetUniq = function(targets.cluster)
458 #{
459 # #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order
460 #
461 # return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))))
462 # #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))
463 #
464 #}
465
466 bindata = function(d,qunts=seq(.4,.9,.1))
467 {
468 tmp=matrix(0,nr=nrow(d),nc=ncol(d))
469 for(j in 1:ncol(d))
470 {
471 bins=quantile(d[,j],qunts)
472 for(i in 1:length(bins))
473 {
474 tmp[which(d[,j]>bins[i]),j] = i
475 }
476 }
477 return(tmp)
478 }
479
480 bindata.non.zero = function(d,qunts=seq(0,0.9,0.1))
481 {
482 tmp=matrix(0,nr=nrow(d),nc=ncol(d))
483 for(j in 1:ncol(d))
484 {
485 ix.non.zero=which(d[,j]!=0)
486 bins=quantile(d[ix.non.zero,j],qunts)
487 for(i in 1:length(bins))
488 {
489 tmp[which(d[,j]>bins[i]),j] = i
490 }
491 }
492
493 return(tmp)
494 }
495
496 bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1))
497 {
498 tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d))
499 ix.non.zero=which(d!=0)
500 bins=quantile(d[ix.non.zero],qunts)
501 for(i in 1:length(bins))
502 {
503 tmp[which(d>bins[i])] = i
504 }
505 return(tmp)
506 }
507
508 print("Finished loading functions")
509
510 param <- list()
511 param <- LoadDefaultParams(param)
512 param <- LoadDefinedParams(param)
513
514 print (param)
515
516 #Load Chip data, alread in pval format
517 chip <- GetChipData(param$annotated.macs.file)
518
519 #Get the top percentages on different criteria
520 ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage)
521 chip$peaks <- chip$peaks[ix.top$union, ]
522 chip$targets <- chip$targets[ix.top$union]
523
524 #Bin data:
525 chip$peaks = bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=30))
526
527
528 #Analyze-cluster
529 if(param$analyze.number.of.clusters)
530 {
531 AnalyzeCluster(data,
532 sample.size = param$analyze.number.of.clusters.sample.size,
533 clusters.min = param$analyze.number.of.clusters.min,
534 clusters.max = param$analyze.number.of.clusters.max,
535 document.name = param$analyze.number.of.clusters.document.name,
536 itterations = param$analyze.number.of.clusters.itterations)
537
538 }
539
540 #Cluster
541 set.seed(1234)
542 km <- kmeans(chip$peaks, param$clustering.number.of.clusters)
543 km.order <- GenerateKMOrder(chip$peaks, km)
544
545 CreateHeatMap(chip$peaks,
546 km,
547 km.order,
548 document.name = param$heatmap.document.name,
549 document.type = param$heatmap.document.type)
550
551 CreateClusterGroups(trgts=chip$targets,
552 k.ix=km$cluster,
553 f.nm = param$cluster.groups.document.name,
554 km.order = km.order)
555