Mercurial > repos > kmace > mtls_analysis
comparison mtls_analyze/heatmap.R @ 0:a903c48f05b4
Added non-integrated scripts to galaxy
author | kmace |
---|---|
date | Thu, 22 Mar 2012 15:21:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a903c48f05b4 |
---|---|
1 rm(list = ls()) | |
2 | |
3 GeneratePeakMatrix <- function(experiments, scores) { | |
4 # Generates a score matrix for all mtls. | |
5 # | |
6 # Args: | |
7 # experiments: A list of underscore deliminated experiments for each mtl. | |
8 # There should never be a completely empty item in this list. | |
9 # for eg. experiments[1] = expA_expD_expB. | |
10 # scores: A list of underscore deliminated scores for each mtl, the length | |
11 # of these scores should be identicle to the length of the | |
12 # experiments. eg. scores[1] = 55_33_245. | |
13 # | |
14 # Returns: | |
15 # The peak score matrix for all mtls. | |
16 experiments <- sapply(experiments, function(x) strsplit(x, split = "_")) | |
17 scores <- sapply(scores, function(x) strsplit(x, split = "_")) | |
18 unique.experiments <- unique(unlist(experiments)) | |
19 | |
20 peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments)) | |
21 colnames(peaks) <- unique.experiments | |
22 | |
23 for(i in 1:length(experiments)){ | |
24 for(j in 1:length(experiments[[i]])){ | |
25 peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j]) | |
26 } | |
27 } | |
28 return(peaks) | |
29 ################################################################## | |
30 } | |
31 GetChipData <- function(file.name, | |
32 proximity = "distal", | |
33 include.targetless = TRUE) { | |
34 # Reads in, filters, and organizes mtls data | |
35 # | |
36 # Args: | |
37 # file.name: The path to the mtls file (including the file name). | |
38 # proximity: Either "distal" or "proximal", defines the gene target distance | |
39 # from the mtl. Default is "distal". | |
40 # include.targetless: If TRUE, includes mtls with no targets (applied after | |
41 # the proximity filter); if not, mtls with no target | |
42 # will be exclided. Default is TRUE. | |
43 # | |
44 # Returns: | |
45 # An organized list of mtls data with the following elements: | |
46 # $peaks - a matrix of peak p-values | |
47 # $targets - a list of underscore deliminated gene targets for each mtl | |
48 if(param$debug) { | |
49 print("In GetChipData") | |
50 } | |
51 # Set Constants for the mtls file type: | |
52 MTLNUM <- "l" | |
53 CHROMOSOME <- "chr" | |
54 EXPERIMENTS <- "expt" | |
55 EXPERIMENTS.SORTED <- "expt.alphanum.sorted" | |
56 START <- "start" | |
57 END <- "end" | |
58 SUMMIT <- "s" | |
59 PVALS <- "pval" | |
60 PVAL.MEAN <- "pval.mean" | |
61 SPANS <- "span.tfs" | |
62 SPAN.TOTAL <- "span.l" | |
63 PEAK.IDS <- "peak.ids" # TODO(kieran): Find out what this is | |
64 PROXIMAL.TARGETS <- "trgt.prox" | |
65 DISTAL.TARGETS <- "trgt.dist" | |
66 PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox" | |
67 DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist" | |
68 | |
69 | |
70 #Get chip data in from files: | |
71 TARGETS <- switch(proximity, | |
72 distal = DISTAL.TARGETS, | |
73 proximal = PROXIMAL.TARGETS, | |
74 stop(" Bad proximity argument supplied.")) # Default | |
75 | |
76 keep.columns <- c(EXPERIMENTS, PVALS, TARGETS) | |
77 | |
78 file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] | |
79 | |
80 if (!include.targetless) { | |
81 ix.has.trgt <- which(as.character(file[, TARGETS])!="") | |
82 file <- file[ix.has.trgt, ] | |
83 } | |
84 | |
85 chip = list() | |
86 | |
87 chip$peaks = GeneratePeakMatrix(file[, EXPERIMENTS], file[, PVALS]) | |
88 chip$targets = file[ , TARGETS] | |
89 return (chip) | |
90 } | |
91 | |
92 | |
93 | |
94 GenerateKMOrder <- function(data, km) { | |
95 # generates the order of clusters from high to low mean values | |
96 # | |
97 # Args: | |
98 # data: A matrix of scores that have already been clustered. | |
99 # km: The K-means object generated from running kmeans on data. Another | |
100 # method could be used so long as it supplies a (km)$cluser list. Must | |
101 # have the same length as the number of rows in data | |
102 # | |
103 # Returns: | |
104 # cluster.order: The order in which the clusters should be displayed. | |
105 | |
106 km.cluster = km$cluster | |
107 clusters = unique(km.cluster) | |
108 clusters.avg = numeric() | |
109 for(i in clusters) { | |
110 clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ])) | |
111 } | |
112 if(param$debug) { | |
113 print ("straight clusters") | |
114 print (clusters) | |
115 print ("straigth average") | |
116 print (clusters.avg) | |
117 print ("ordered clusters") | |
118 print (clusters[order(clusters.avg)]) | |
119 print("ordered average") | |
120 print (clusters.avg[order(clusters.avg)]) | |
121 } | |
122 return(clusters[rev(order(clusters.avg))]) | |
123 } | |
124 | |
125 OrderMTL <- function(data, km, cluster.order) { | |
126 # Orders a matrix of data according to a clustering algorithm | |
127 # | |
128 # Args: | |
129 # data: A matrix of scores that have already been clustered. | |
130 # km: The K-means object generated from running kmeans on data. Another | |
131 # method could be used so long as it supplies a (km)$cluser list. Must | |
132 # have the same length as the number of rows in data | |
133 # cluster.order: The order in which the clusters should be displayed. | |
134 # for eg. km.order = c(2, 3, 1) would result in cluster 2 | |
135 # being on top, then cluster 3 and lastly cluster 1. | |
136 # | |
137 # Returns: | |
138 # a list that contains 3 objects: | |
139 # list$data: the ordered version of the data. | |
140 # list$color.vector: a list of colors that should be assigned to each row. | |
141 # list$start.row: the starting row of each cluster in data. | |
142 number.clusters <- length(cluster.order) | |
143 cluster.colors <- sample(rainbow(number.clusters)) | |
144 | |
145 # Set up return objects | |
146 sorted.data <- matrix(,nr=nrow(data), nc=ncol(data)) | |
147 colnames(sorted.data) <- colnames(data) | |
148 cluster.color.vector = vector(length=length(km$cluster)) | |
149 cluster.start.row = numeric(number.clusters) | |
150 cluster.start.row[1]=1 | |
151 | |
152 for (i in 1:number.clusters) | |
153 { | |
154 current.cluster = cluster.order[i] | |
155 ix = which(km$cluster == current.cluster) | |
156 current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1) | |
157 sorted.data[current.cluster.range, ] = data[ix, ] | |
158 cluster.color.vector[current.cluster.range] = cluster.colors[i] | |
159 cluster.start.row[i+1] = (cluster.start.row[i]+length(ix)) | |
160 } | |
161 ret.list = list() | |
162 ret.list$data = sorted.data | |
163 ret.list$color.vector = cluster.color.vector | |
164 ret.list$cluster.index = cluster.start.row | |
165 return(ret.list) | |
166 } | |
167 | |
168 | |
169 CreateHeatMap <- function(data, | |
170 km, | |
171 cluster.order, | |
172 document.name, | |
173 document.type = "png", | |
174 number.colors = 30) { | |
175 # Generates a heatmap image based for a matrix based on a clustering | |
176 # algorithm. | |
177 # | |
178 # Args: | |
179 # data: A matrix of scores that have already been clustered, the column | |
180 # names of this matrix will become the column titels of the heatmap. | |
181 # km: The K-means object generated from running kmeans on data. Another | |
182 # method could be used so long as it supplies a (km)$cluser list. | |
183 # cluster.order: The order in which the clusters should be displayed. | |
184 # for eg. km.order = c(2, 3, 1) would result in cluster 2 | |
185 # being on top, then cluster 3 and lastly cluster 1. | |
186 # document.name: A name for the produced file. there is no need to | |
187 # supply the .png/.pdf in your argument. | |
188 # document.type: The type of file you want to produce. current options are | |
189 # png and pdf. Default is pdf. | |
190 # | |
191 # Returns: | |
192 # Nothing to the script that calls it, however it creates an image at the | |
193 # path specified. | |
194 if(param$debug) { | |
195 print("In CreateHeatMap") | |
196 } | |
197 | |
198 data.ordered <- OrderMTL(data, km, cluster.order) | |
199 | |
200 | |
201 #Load Lib | |
202 library(gplots) | |
203 | |
204 #Set Color gradient | |
205 color.ramp = colorRampPalette(c("black", | |
206 "darkblue", | |
207 "blue", | |
208 "yellow", | |
209 "orange", | |
210 "red"))(number.colors) #7 | |
211 | |
212 if(document.type == "png") { | |
213 png(paste(document.name,".png", sep = ""), | |
214 width=1920, | |
215 height=2560, | |
216 res=100, | |
217 antialias="gray") | |
218 } | |
219 else { | |
220 pdf(paste(document.name,".pdf", sep = "")) | |
221 } | |
222 | |
223 heatmap.2( | |
224 data.ordered$data[,sort(colnames(data.ordered$data))], | |
225 rowsep = data.ordered$cluster.index[-1], | |
226 sepwidth = c(0.5, 60), | |
227 dendrogram = "none", | |
228 Rowv = F, | |
229 Colv = F, | |
230 trace = "none", | |
231 labRow = F, | |
232 labCol = sort(colnames(data.ordered$data)), | |
233 RowSideColors = data.ordered$color.vector, | |
234 col = color.ramp, | |
235 cexCol = 1.8, | |
236 cexRow = 0.8, | |
237 useRaster=F) | |
238 | |
239 dev.off() | |
240 | |
241 } | |
242 | |
243 | |
244 | |
245 | |
246 ReadCommadLineParameters <- function(argument.names, command.line.arguments) { | |
247 # Reads the parameters from the command line arguments. | |
248 # | |
249 # Args: | |
250 # argument.names: A list of expected argument names. | |
251 # command.line.arguments: The list of recieved command line arguments | |
252 # | |
253 # Returns: | |
254 # The arguments for argument.names. As strings In that order. | |
255 | |
256 if(length(grep("--version",command.line.arguments))){ | |
257 cat("version",script.version,"\n") | |
258 q() | |
259 } | |
260 # Split command line arguments | |
261 args <- sapply(strsplit(command.line.arguments, " "),function(i) i) | |
262 vals <- character(length(argument.names)) | |
263 # split cmd.line to key and value pairs | |
264 for(i in 1:length(argument.names)) { | |
265 ix <- grep(argument.names[i], args) | |
266 if(length(ix)>1) { | |
267 stop("arg ", | |
268 argument.names[i], | |
269 " used more than once. Bailing out...\n", | |
270 PrintParamError()) | |
271 } | |
272 else if (length(ix)==0) { | |
273 stop("could not find --", | |
274 argument.names[i], | |
275 ". Bailing out...\n", | |
276 PrintParamError()) | |
277 } | |
278 vals[i] <- args[ix+1] | |
279 } | |
280 return(vals) | |
281 } | |
282 | |
283 PrintParamError <- function(){ | |
284 # Prints the usage of the function, shows users what arguments they can use | |
285 # | |
286 # Args: | |
287 # param: A list that contains all the paramaters. | |
288 # | |
289 # Returns: | |
290 # A modified version of the param list, with the default values loaded. | |
291 cat(" | |
292 DESCRIPTIION: | |
293 heatmap.R takes a ... | |
294 | |
295 INPUT: | |
296 1.--mtls_file path to mtls file.\n | |
297 2.--cluster_file the destination path for the cluster file.\n | |
298 3.--heatmap_file the destination path for heatmap image (no extension).\n | |
299 4.--heatmap_type choice of image type, currently support png and pdf.\n | |
300 5.--n_clusters number of clusters in the heatmap\n | |
301 6.--filter_percentage percentage of mtls that will be analysed. for eg. if | |
302 we make filter_percentage 30, we will take the union of the top mtls in | |
303 mean, non-zero mean and variance.\n | |
304 | |
305 EXAMPLE RUN: | |
306 Rscript heatmap.R | |
307 --mtls_file mtls.xls | |
308 --cluster_file output/cluster | |
309 --heatmap_file output/heatmap | |
310 --heatmap_type png | |
311 --n_clusters 13 | |
312 --filter_percentage 60 | |
313 | |
314 ") | |
315 } | |
316 | |
317 LoadDefaultParams <- function(param) { | |
318 # Loads default paramaters for the heatmap application | |
319 # | |
320 # Args: | |
321 # param: A list that contains all the previous paramaters. | |
322 # | |
323 # Returns: | |
324 # A modified version of the param list, with the default values loaded. | |
325 | |
326 script.version=0.1 | |
327 param$debug = F | |
328 | |
329 # ChIP data: | |
330 param$annotated.macs.file = "data/mtls/mtls.xls" | |
331 | |
332 # Filter: | |
333 param$filter.percentage <- 100 | |
334 | |
335 # Analyze Clustering: | |
336 param$analyze.number.of.clusters = F | |
337 param$analyze.number.of.clusters.sample.size = 10000 | |
338 param$analyze.number.of.clusters.min = 4 | |
339 param$analyze.number.of.clusters.max = 9 | |
340 param$analyze.number.of.clusters.document.name <- "data/analysis/cluster_analysis" | |
341 param$analyze.number.of.clusters.itterations = 25 | |
342 | |
343 # Clustering: | |
344 param$clustering.number.of.clusters <- 13 | |
345 | |
346 # Heatmap: | |
347 param$heatmap.document.name <- "data/heatmap/heatmap" | |
348 param$heatmap.document.type <- "png" | |
349 #Cluster Groups: | |
350 param$cluster.groups.document.name <- "data/clusters/clusters" | |
351 | |
352 return(param) | |
353 | |
354 } | |
355 | |
356 | |
357 | |
358 LoadDefinedParams <- function(param) { | |
359 # Loads user defined paramaters for the heatmap application | |
360 # | |
361 # Args: | |
362 # param: A list that contains all the previous paramaters. | |
363 # | |
364 # Returns: | |
365 # A modified version of the param list, with the user defined values loaded. | |
366 | |
367 cmd.args <- commandArgs(trailingOnly = T); | |
368 | |
369 | |
370 # put the numeric params at the end! | |
371 n.start.numeric <- 5 | |
372 args.nms <- c( "--mtls_file", #1 | |
373 "--cluster_file", #2 | |
374 "--heatmap_file", #3 | |
375 "--heatmap_type", #4 | |
376 "--n_clusters", #5 | |
377 "--filter_percentage") #6 | |
378 | |
379 # get parameters | |
380 vals <- ReadCommadLineParameters(argument.names = args.nms, | |
381 command.line.arguments = cmd.args) | |
382 | |
383 # check if numeric params are indeed numeric | |
384 for(i in n.start.numeric:length(vals)){ | |
385 if(is.na(as.numeric(vals[i]))){ | |
386 stop("arg ",args.nms[i]," is not numeric. Bailing out...\n",print.error()) | |
387 } | |
388 } | |
389 | |
390 # vals has the same order as args.nms | |
391 | |
392 # ChIP data: | |
393 param$annotated.macs.file <- vals[1] | |
394 | |
395 # Filter: | |
396 param$filter.percentage <- as.numeric(vals[6]) | |
397 | |
398 # Clustering: | |
399 param$clustering.number.of.clusters <- as.numeric(vals[5]) | |
400 | |
401 # Heatmap: | |
402 param$heatmap.document.name <- vals[3] | |
403 param$heatmap.document.type <- vals[4] | |
404 #Cluster Groups: | |
405 param$cluster.groups.document.name <- vals[2] | |
406 | |
407 return(param) | |
408 } | |
409 | |
410 | |
411 | |
412 | |
413 # here we output the fasta file of the targets of each kmeans cluster | |
414 CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ | |
415 f.nm = paste(f.nm, ".fasta", sep="") | |
416 clusters = km.order | |
417 for(i in 1:length(clusters)){ | |
418 v=trgts[which(k.ix==clusters[i])] | |
419 v.split = unlist(sapply(v,strsplit, "_")) | |
420 if(i == 1){ | |
421 cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n") | |
422 } else { | |
423 cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) | |
424 } | |
425 cat(sep="\n",file=f.nm,v.split,append=TRUE) | |
426 } | |
427 } | |
428 | |
429 | |
430 GetTopRowsFromMatrix = function(mtrx, percentage = 10) | |
431 { | |
432 if (param$debug) { | |
433 print("In GetTopRowsFromMatrix") | |
434 } | |
435 #Store the stats for the mtrx | |
436 stats = list() | |
437 stats$mean=apply(mtrx,1,mean) | |
438 stats$sd=apply(mtrx,1,sd) | |
439 stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)])) | |
440 | |
441 | |
442 #Store the indexes for the mtrx | |
443 index = list() | |
444 index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix | |
445 index$sd = sort(stats$sd, decreasing=T, index.return=T)$ix | |
446 index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix | |
447 | |
448 #Calculate how many data points we want to take | |
449 cutoff = floor(length(mtrx[ ,1])*(percentage/100)) | |
450 | |
451 #Extract Indexes and return to caller | |
452 index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff])) | |
453 | |
454 return(index) | |
455 } | |
456 | |
457 #GetUniq = function(targets.cluster) | |
458 #{ | |
459 # #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order | |
460 # | |
461 # return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))) | |
462 # #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))) | |
463 # | |
464 #} | |
465 | |
466 bindata = function(d,qunts=seq(.4,.9,.1)) | |
467 { | |
468 tmp=matrix(0,nr=nrow(d),nc=ncol(d)) | |
469 for(j in 1:ncol(d)) | |
470 { | |
471 bins=quantile(d[,j],qunts) | |
472 for(i in 1:length(bins)) | |
473 { | |
474 tmp[which(d[,j]>bins[i]),j] = i | |
475 } | |
476 } | |
477 return(tmp) | |
478 } | |
479 | |
480 bindata.non.zero = function(d,qunts=seq(0,0.9,0.1)) | |
481 { | |
482 tmp=matrix(0,nr=nrow(d),nc=ncol(d)) | |
483 for(j in 1:ncol(d)) | |
484 { | |
485 ix.non.zero=which(d[,j]!=0) | |
486 bins=quantile(d[ix.non.zero,j],qunts) | |
487 for(i in 1:length(bins)) | |
488 { | |
489 tmp[which(d[,j]>bins[i]),j] = i | |
490 } | |
491 } | |
492 | |
493 return(tmp) | |
494 } | |
495 | |
496 bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1)) | |
497 { | |
498 tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) | |
499 ix.non.zero=which(d!=0) | |
500 bins=quantile(d[ix.non.zero],qunts) | |
501 for(i in 1:length(bins)) | |
502 { | |
503 tmp[which(d>bins[i])] = i | |
504 } | |
505 return(tmp) | |
506 } | |
507 | |
508 print("Finished loading functions") | |
509 | |
510 param <- list() | |
511 param <- LoadDefaultParams(param) | |
512 param <- LoadDefinedParams(param) | |
513 | |
514 print (param) | |
515 | |
516 #Load Chip data, alread in pval format | |
517 chip <- GetChipData(param$annotated.macs.file) | |
518 | |
519 #Get the top percentages on different criteria | |
520 ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage) | |
521 chip$peaks <- chip$peaks[ix.top$union, ] | |
522 chip$targets <- chip$targets[ix.top$union] | |
523 | |
524 #Bin data: | |
525 chip$peaks = bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=30)) | |
526 | |
527 | |
528 #Analyze-cluster | |
529 if(param$analyze.number.of.clusters) | |
530 { | |
531 AnalyzeCluster(data, | |
532 sample.size = param$analyze.number.of.clusters.sample.size, | |
533 clusters.min = param$analyze.number.of.clusters.min, | |
534 clusters.max = param$analyze.number.of.clusters.max, | |
535 document.name = param$analyze.number.of.clusters.document.name, | |
536 itterations = param$analyze.number.of.clusters.itterations) | |
537 | |
538 } | |
539 | |
540 #Cluster | |
541 set.seed(1234) | |
542 km <- kmeans(chip$peaks, param$clustering.number.of.clusters) | |
543 km.order <- GenerateKMOrder(chip$peaks, km) | |
544 | |
545 CreateHeatMap(chip$peaks, | |
546 km, | |
547 km.order, | |
548 document.name = param$heatmap.document.name, | |
549 document.type = param$heatmap.document.type) | |
550 | |
551 CreateClusterGroups(trgts=chip$targets, | |
552 k.ix=km$cluster, | |
553 f.nm = param$cluster.groups.document.name, | |
554 km.order = km.order) | |
555 |