# HG changeset patch # User kmace # Date 1343062815 14400 # Node ID b465306d00bad605eefeedee4529cee4ab5db487 # Parent a0306edbf2f8dd0d001605507a786731dad9f90a Uploaded diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/annotate_mtls.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/annotate_mtls.R Mon Jul 23 13:00:15 2012 -0400 @@ -0,0 +1,79 @@ +debug = F +# Read in Data +if (debug == T) { + mtls <- read.delim("~/code/scorchR-annotate_mtls/mtls.xls", quote="", stringsAsFactors=F) + annotation <- read.delim("~/code/scorchR-annotate_mtls/gene_annotation.txt", header=T, stringsAsFactors=F) + threshold <- 100000 +} else { + cmd <- commandArgs(trailingOnly = T) + mtls <- read.delim(cmd[1], quote="", stringsAsFactors=F) + annotation <- read.delim(cmd[2], header=T, stringsAsFactors=F) + threshold <- as.numeric(cmd[3]) +} + + + +# Get the median summit +summits <- sapply(strsplit(mtls[ ,"summit"], split="_"), function(x) {median(as.numeric(x))}) +# Extract every tss +tss <- apply(annotation,1,function(x) { + if (x["strand"]=="+"){ + return(as.numeric(x["txStart"])) + }else{ + return(as.numeric(x["txEnd"])) + } + }) + + +# Get each Chromosome +chromosomes <- unique( mtls[,"chr"]) + +# Set up output: +trgt <- character(length = length(summits)) + +# Loop through Chromosomes +for (chromosome in chromosomes){ + cat(chromosome, "\n") + chr.ix.tss <- which(annotation[ ,"chrom"]==chromosome) + chr.ix.summits <- which(mtls[ ,"chr"]==chromosome) + + + cat(length(chr.ix.summits), " -summits\n", + length(chr.ix.tss), " -tss\n") + if (length(chr.ix.summits)>0 && length(chr.ix.tss)>0) { + # Loop through Summits + for(summit.ix in chr.ix.summits) { + #cat("Summit = ", summit.ix, class(summit.ix),"\n") + distances <- abs(tss[chr.ix.tss] - summits[summit.ix]) + closest.ix.tss <- which(distances == min(distances))[1] + if(!is.na(closest.ix.tss) && !is.na(distances[closest.ix.tss])) { + #print(closest.ix.tss) + if (distances[closest.ix.tss] < threshold) { + + trgt[summit.ix] <- annotation[chr.ix.tss, "name2"][closest.ix.tss] + +# cat("A distance of ", +# distances[closest.ix.tss], +# " was found for the mtl summit located at: ", +# chromosome, +# ": ", +# summits[summit.ix], +# " and TSS located at: ", +# chromosome, +# ": ", +# tss[chr.ix.tss][closest.ix.tss], +# "\n") + } + } + } + } else { + cat("chromosome", chromosome, " either has no transcripts or no peaks in this experiment. \n") + } + output <- cbind(mtls, trgt) +} + +write.table(output, file="annotated_mtls.xls", sep="\t",row.names=F, quote=F) + + + + diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/cluster_peaks.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/cluster_peaks.R Mon Jul 23 13:00:15 2012 -0400 @@ -0,0 +1,431 @@ +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' +## Feb 2012 th17 +## Bonneau lab - "Aviv Madar" , +## NYU - Center for Genomics and Systems Biology +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' + +rm(list=ls()) +debug=F +verbose=F +script.version=0.1 +print.error <- function(){ + cat(" +DESCRIPTIION: + cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where + each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files) + fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits. + If you use .bed files, the files must contain no header, and at least the five columns: + 1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared + interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits. + +INPUT: + 1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk] + 2.path_output: path to save generated MTL cluster file (where to save mtls.xls) + 3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] + 4.input_type: the type of input file used (MACS or BED; defaults to MACS) + 5.mtl_type: interval or summit (defaults to summit) + 6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit) + +EXAMPLE RUN: + cluster_peaks.R + --input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls + --input_type MACS + --path_output results/ + --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 + --dist_summits 100 + --mtl_type summit + +Please cite us if you used this script: + The transcription factor network regulating Th17 lineage specification and function. + Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, + Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n") +} + +############# helper functions: +## split.macs.list.to.chrmosomes +# input: +# - macs.list: a list of macs expts: here are a few lines of one expt +## chr start end length summit tags #NAME? fold_enrichment FDR(%) +## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05 +## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09 +## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09 +# - expts: a list of the expts names from macs.list that you want to process +# - chr: chrmosomes names +# output: +# - x: a list with as many elements as chr specified by input. +# -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS) +split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){ + x <- list() + n <- length(expts) + for(i in 1:n){ + e <- expts[i] #experiment name + cat("wroking on expt", e,"\n") + x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]]) + names(x[[e]]) <- chr + } + return(x) +} +# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m +# input: +# - r is chromosome +# - m is macs matrix for expt e from above function +split.one.macs.expt.by.chromosome.no.pml <- function(r,m){ + ix.chr.i <- which(m[,"chr"]==r) + # cat("working on",r,"\n") + o <- list() + o[[r]] <- m[ix.chr.i,] + o[[r]]$peak.id <- ix.chr.i + return(o[[r]]) +} + +## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) +# also match these summit locations with corresponding: +# pvals, tfs, peak start and peak end +# input: +# - chr: a list of chromosome names +# - macs.list.per.chrom: a list of macs peaks for each chromosome +# output: +# - o: a list each chormosome ruler as an element +make.ruler.no.pml <- function(chr,macs.list.per.chrom){ + x <- macs.list.per.chrom + o <- list() + for(j in 1:length(chr)){ + r <- chr[j] # chrmosome we go over + s <- numeric() + pval <- numeric() + tf.to.s <- character() + start <- numeric() + end <- numeric() + trtmnts <- names(x) # the treatments name (expt names) + ## debug parameters ### + ## which experiment peaks come from + expt <- character() + ## what was the peak id in that experiment + peak.ids <- numeric() + ## this will allow us to always back track from a cluster to the actual peaks in it + ## debug params end ### + for(i in 1:length(trtmnts)){ + o[[r]] <- list() + e <- trtmnts[i] #experiment name + tf <- strsplit(e,"_")[[1]][1] + s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"]) + pval <- c(pval,x[[e]][[r]][,"score"]) + start <- c(start,x[[e]][[r]][,"start"]) + end <- c(end,x[[e]][[r]][,"end"]) + expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"]))) + peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"]) + } + ix <- sort(s,index.return=TRUE)$ix + o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix]) + } + return(o) +} + +## add cluster memberships based on ruler +## require no more than d.cut distance between tf summits +## cur.l is the current loci number (or cluster number) +assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){ + cur.l <- 0 + for(j in 1:length(ruler)){ + s <- ruler[[j]][["summit"]] + l <- numeric(length=length(s)) + if(length(l)>0){ + cur.l <- cur.l+1 + } + if(length(l)==1){ + l[1] <- cur.l + } else if(length(l)>1) { + l[1] <- cur.l + for(i in 2:length(l)){ + d <- s[i]-s[i-1] # assumes s is sorted increasingly + if(d>d.cut){ + cur.l <- cur.l+1 + } + l[i] <- cur.l + } + } + ruler[[j]][["mtl.id"]] <- l + } + return(ruler) +} + +## add cluster memberships based on ruler +## require that peaks span will have a non-empty intersection +## cur.mtl is the current loci number (or cluster number) +assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){ +cur.mtl <- 0 +for(j in 1:length(ruler)){ + s <- ruler[[j]][["start"]] + e <- ruler[[j]][["end"]] + l <- numeric(length=length(s)) + if(length(l)>0){ + cur.mtl <- cur.mtl+1 + } + if(length(l)==1){ + l[1] <- cur.mtl + } else if(length(l)>1) { + l[1] <- cur.mtl + cur.e <- e[1] # the right-most bp belonging to the current mtl + for(i in 2:length(l)){ + if(cur.e1){ + stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) + } else if (length(ix)==0){ + stop("could not find ",args.nms[i],". Bailing out...\n",print.error()) + } else { + vals[i] <- args[ix+1] + } + } + return(vals) +} +# read command line paramters that are optional, if an optional param is not give functions return na for this param +read.cmd.line.params.optional <- function(args.nms, cmd.line.args){ + args <- sapply(strsplit(cmd.line.args," "),function(i) i) + vals <- character(length(args.nms)) + # split cmd.line to key and value pairs + for(i in 1:length(args.nms)){ + ix <- grep(args.nms[i],args) + if(length(ix)>1){ + stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) + } else if (length(ix)==0){ # if --param was not written in cmd line + vals[i] <- NA + } else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value + vals[i] <- args[ix+1] + } else { # otherwise + vals[i] <- NA + } + } + return(vals) +} + +############# Code: +# retrieve args + +if(debug==T){ + cmd.args <- c( + "--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls", + #"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed", + "--input_type MACS", #BED + "--path_output ./results/", + "--expt_names macs1::macs2::macs3", + #"--expt_names bed1::bed2::bed3", + "--mtl_type interval", #interval summit + "--dist_summits 100" + ) +} else { + cmd.args <- commandArgs(trailingOnly = T); +} + +# if(length(grep("--version",cmd.args))){ +# cat("version",script.version,"\n") +# q() +# } +args.nms.must <- c( + "--input_files", #1 + "--path_output", #2 + "--expt_names" #3 +) + +# all numeric params must come at the end of args.nms.optional +n.start.numeric <- 3 +args.nms.optional <- c( + "--input_type", #1 + "--mtl_type", #2 + "--dist_summits" #3 +) + +# get must parameters +vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args) +if(verbose){ + cat("\nfinshed reading vals: \n") + cat("\nvals.must", unlist(vals.must), "\n") +} +input.files <- strsplit(vals.must[1],"::")[[1]] +if(length(input.files)==1){ + cat("only provided one MACS file to cluster.") + print.error() +} +path.output <- vals.must[2] +expt.names <- strsplit(vals.must[3],"::")[[1]] +# get optional parameters +vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args) +if(verbose){ + cat("\nvals.optional:", unlist(vals.optional),"\n") +} +if(is.na(vals.optional[1])){ + input.type <- "MACS" +} else { + input.type <- vals.optional[1] +} +if(is.na(vals.optional[2])){ + mtl.type <- "interval" +} else { + mtl.type <- vals.optional[2] +} +if(is.na(vals.optional[2])){ + dist.summits <- 100 +} else { + dist.summits <- as.numeric(vals.optional[3]) +} + +# read MACS files +unique.chr <- character() +infile.list <- list() +col.nms.keepers <- c("chr","start","end","summit","score") +for(i in 1:length(input.files)){ + e <- expt.names[i] + if(toupper(input.type)=="MACS"){ + tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F) + columns <- c("chr","start","end","summit","pvalue") + ix.cols <- numeric() + for(j in 1:length(columns)){ + ix.j <- grep(columns[j],names(tmp)) + if(length(ix.j)==0){ + stop("can't find (grep) column ",columns[j]," in MACS input file ", e) + } + ix.cols <- c(ix.cols,ix.j) + } + infile.list[[e]] <- tmp[,ix.cols] + colnames(infile.list[[e]]) <- col.nms.keepers + } + if(toupper(input.type)=="BED"){ + tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE) + tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column + colnames(tmp) <- col.nms.keepers + infile.list[[e]] <- tmp[,1:5] + } + unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"])) +} + +unique.chr <- sort(unique.chr) + +# take all macs files and put peaks together on each chromosome +# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls) +cat("splitting input files per chromosome\n") +x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr) +cat("adding peaks from all input files into chromosome rulers\n") +ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x) +cat("add MTL membership to the ruler\n") + +if(mtl.type=="interval"){ + ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler) +} else { + ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits) +} + +cat("creating MTL list\n") +ll <- create.cluster.list.no.pml(ruler=ruler) +cat("writing MTL table\n") +f.nm <- paste(sep="",path.output,"mtls",".xls") +print.ll.verbose.all(ll=ll,f.nm=f.nm) + + + + + + + + + diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/gtfToMapFriendlyAnnotation.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/gtfToMapFriendlyAnnotation.R Mon Jul 23 13:00:15 2012 -0400 @@ -0,0 +1,102 @@ +rm (list = ls()) +args <- commandArgs() +print(args) +gtf <- read.table(args[6],as.is=T) +colnames(gtf) <- c("chrom", + "source", + "type", + "five_prime", + "three_prime", + "score", + "strand", + "frame", + "nothinggene", + "gene", + "nothingSemi1", + "nothingtranscript", + "transcript", + "nothingSemi2") +gtf.exon <- gtf[which(gtf[,"type"]=="exon"),] +gtf.exon.slim <-gtf.exon[,c("chrom", "five_prime", "three_prime", "strand", "transcript")] +gtf.exon.slim.sort <- gtf.exon.slim[sort.list(gtf.exon.slim[,"transcript"]),] + + + +transcripts <- as.vector(unique(gtf.exon.slim[,"transcript"])) +#transcripts <- as.vector(unique(gtf.exon.slim[,"transcript"])) + + + +output <- matrix(0,nr=length(transcripts),nc=ncol(gtf.exon.slim.sort)) + +all.chrom <- gtf.exon.slim.sort[,"chrom"] +all.five_prime <- gtf.exon.slim.sort[,"five_prime"] +all.three_prime <- gtf.exon.slim.sort[,"three_prime"] +all.strand <- gtf.exon.slim.sort[,"strand"] +all.transcript <- gtf.exon.slim.sort[,"transcript"] + +colnames(output) <- colnames(gtf.exon.slim.sort) + +j <- 0 +i <- 1 + +previous.strand <- gtf.exon.slim.sort[i,"strand"] +previous.chrom <- gtf.exon.slim.sort[i,"chrom"] +previous.transcript <- gtf.exon.slim.sort[i,"transcript"] +previous.five_prime.min <- gtf.exon.slim.sort[i,"five_prime"] +previous.three_prime.max <- gtf.exon.slim.sort[i,"three_prime"] + +# Progress bar: +total <- dim(gtf.exon.slim.sort)[1] +# create progress bar +pb <- txtProgressBar(min = 0, max = total, style = 3) + + + + +for ( i in 1:length(all.transcript)) { + current.transcript <- all.transcript[i] + setTxtProgressBar(pb, i) + if (previous.transcript != current.transcript) { + j <- j + 1 + # Write out the current transcript info + output[j,"chrom"] <- previous.chrom + output[j,"five_prime"] <- previous.five_prime.min + output[j,"three_prime"] <- previous.three_prime.max + output[j,"strand"] <- previous.strand + output[j,"transcript"] <- previous.transcript + + # Save the new transcript info (convert the current to previous) + + previous.strand <- all.strand[i] + previous.chrom <- all.chrom[i] + previous.transcript <- all.transcript[i] # current.transcript + previous.five_prime.min <- all.five_prime[i] + previous.three_prime.max <- all.three_prime[i] + } + else { + previous.five_prime.min <- min(previous.five_prime.min, all.five_prime[i]) + previous.three_prime.max <- max(previous.three_prime.max, all.three_prime[i]) + previous.transcript <- all.transcript[i] # current.transcript + } +} +# Write the last item +j <- j + 1 +# Write out the current transcript info +output[j,"chrom"] <- previous.chrom +output[j,"five_prime"] <- previous.five_prime.min +output[j,"three_prime"] <- previous.three_prime.max +output[j,"strand"] <- previous.strand +output[j,"transcript"] <- previous.transcript + +close(pb) + +colnames(output) <- c("chrom", "txStart", "txEnd", "strand", "name2") +write.table(output, file="gene_annotation.txt", sep="\t", row.names=F) + + #ix <- which(gtf.exon.slim[,"transcript"] == transcripts[i]) + #output[i,"transcript"] <- transcripts[i] + #output[i,"five_prime"] <- min(gtf.exon.slim[ix,"five_prime"]) + #output[i,"three_prime"] <- max(gtf.exon.slim[ix,"three_prime"]) + #output[i,"strand"] <- gtf.exon.slim[ix,"strand"][1] + #output[i,"chrom"] <- gtf.exon.slim[ix,"chrom"][1] diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/heatmap.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/heatmap.R Mon Jul 23 13:00:15 2012 -0400 @@ -0,0 +1,1524 @@ +GeneratePeakMatrix <- function(experiments, scores) { + # Generates a score matrix for all mtls. + # + # Args: + # experiments: A list of underscore deliminated experiments for each mtl. + # There should never be a completely empty item in this list. + # for eg. experiments[1] = expA_expD_expB. + # scores: A list of underscore deliminated scores for each mtl, the length + # of these scores should be identicle to the length of the + # experiments. eg. scores[1] = 55_33_245. + # + # Returns: + # The peak score matrix for all mtls. + experiments <- sapply(experiments, function(x) strsplit(x, split = "_")) + scores <- sapply(scores, function(x) strsplit(x, split = "_")) + unique.experiments <- unique(unlist(experiments)) + + peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments)) + colnames(peaks) <- unique.experiments + + for(i in 1:length(experiments)){ + for(j in 1:length(experiments[[i]])){ + peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j]) + } + } + return(peaks) +################################################################## +} +GetChipData <- function(file.name, + proximity = "distal", + include.targetless = TRUE, + column.order = NA) { + # Reads in, filters, and organizes mtls data + # + # Args: + # file.name: The path to the mtls file (including the file name). + # proximity: Either "distal" or "proximal", defines the gene target distance + # from the mtl. Default is "distal". + # include.targetless: If TRUE, includes mtls with no targets (applied after + # the proximity filter); if not, mtls with no target + # will be exclided. Default is TRUE. + # column.order: An optional vector of column names in the order in which + # they will be presented. If this is left to default (NA) the + # presented order of the chip columns will be the order in + # which they are seen. + # + # Returns: + # An organized list of mtls data with the following elements: + # $peaks - a matrix of peak p-values + # $targets - a list of underscore deliminated gene targets for each mtl + if(param$debug) { + print("In GetChipData") + } + # Set Constants for the mtls file type: + MTLNUM <- "mtl.id" + CHROMOSOME <- "chr" + EXPERIMENTS <- "expt" + EXPERIMENTS.SORTED <- "expt.alphanum.sorted" + START <- "start" + END <- "end" + SUMMIT <- "summit" + SCORES <- "score" + SCORE.MEAN <- "score.mean" + SPANS <- "span.tfs" + SPAN.TOTAL <- "span.l" + PEAK.IDS <- "peak.ids" + TARGETS <- "trgt" +# PROXIMAL.TARGETS <- "trgt.prox" +# DISTAL.TARGETS <- "trgt.dist" +# PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox" +# DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist" + + + #Get chip data in from files: + #TARGETS <- switch(proximity, + # distal = DISTAL.TARGETS, + # proximal = PROXIMAL.TARGETS, + # stop(" Bad proximity argument supplied.")) # Default +cat("param$rna.files = ", param$rna.files, "\n") +if(!is.na(param$rna.files) && param$rna.files != "none") { + keep.columns <- c(EXPERIMENTS, SCORES, TARGETS) +} else { + keep.columns <- c(EXPERIMENTS, SCORES) +} + file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] +if(!is.na(param$rna.files) && param$rna.files != "none") { + if (!include.targetless) { + ix.has.trgt <- which(as.character(file[, TARGETS])!="") + file <- file[ix.has.trgt, ] + } +} + chip = list() + + chip$peaks <- GeneratePeakMatrix(file[, EXPERIMENTS], file[, SCORES]) + if(!is.na(column.order)) { + #If you specify an order, or a subset of experiments to include, this is where + #that gets done. + order <- unlist(strsplit(column.order, split="::")) + chip$peaks <- chip$peaks[,order] + } + if(!is.na(param$rna.files) && param$rna.files != "none") { + chip$targets <- file[ , TARGETS] +} + + return (chip) +} +GetRNADataFromOneFile <- function(file.name, fpkm = "avg") { + # Reads in rna expression data from cufflinks + # + # Args: + # file.name: The path to the cufflinks file (including the file name). + # fpkm: What fpkm is chosen, options are hi, low and avg + # Returns: + # An organized vector of rnaseq data with the following elements: + # names(data) - the genes from the file. eg names(data[1]) = JUND + # data - the fpkm score for that gene data[1] = 31 + if(param$debug) { + cat("In GetRNADataFromOneFile for ",file.name) + } + + # Set Constants for the cufflinks file type: + GENE <- "gene_id" + BUNDLE <- "bundle_id" + CHROMOSOME <- "chr" + LEFT <- "left" + RIGHT <- "right" + FPKM_AVG <- "FPKM" + FPKM_LOW <- "FPKM_conf_lo" + FPKM_HIGH <- "FPKM_conf_hi" + STATUS <- "status" + + + #Get chip data in from files: + FPKM <- switch(fpkm, + hi = FPKM_HIGH, + avg = FPKM_AVG, + low = FPKM_LOW, + stop(" Bad fpkm quality argument supplied.")) # Default + + keep.columns <- c(GENE, FPKM) + + file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] + + rna = vector(mode = "numeric", length = nrow(file)) + + rna <- file[, FPKM] + names(rna) <- file[ , GENE] + return (rna) +} + +GetRNAData <- function(file.names, file.lables = file.names, fpkm = "avg") { + # Reads in rna expression data from cufflinks + # + # Args: + # file.names: The list of paths to the cufflinks files (including the file name). + # fpkm: What fpkm is chosen, options are hi, low and avg + # Returns: + # A matrix of rnaseq data with the following description: + # each col corresponds to an rnaseq run + # eacg row corresponds to a gene + if(param$debug) { + print("In GetRNAData") + } + files <- list() + for(i in 1:length(file.names)) { + files[[i]] <-GetRNADataFromOneFile(file.names[i]) + } + genes <- unique(names(unlist(files))) + scores <- matrix(0,nrow=length(genes),ncol=length(file.names)) + rownames(scores) <- genes + colnames(scores) <- file.names + + for(j in 1:length(file.names)) { + scores[names(files[[j]]),j] <- files[[j]] + } +print("# of cols for scores is ") +print(ncol(scores)) +print ("file lables are ") +print (file.lables) + colnames(scores) <- file.lables + return(scores) + #scores[genes,file.names] <- files[[]] +} + +NormalizeRNA <- function(scores) { + +#add psudocount +scores <- scores+1 + +numerator <- scores[,1:(ncol(scores)/2)] +denominator <- scores[,(ncol(scores)/2 + 1):ncol(scores)] + +#new.scores <- scores[,-which(colnames(scores) == norm.exp)] +#norm.score <- scores[,which(colnames(scores) == norm.exp)] +#return(log2(new.scores/norm.score)) + +return(log2(numerator/denominator)) + +} + +PrepareRNAforHeatmap <- function(new.scores.foldchange){ +new.scores.split <- matrix(0, nr=nrow(new.scores.foldchange), nc=2*ncol(new.scores.foldchange)) +is.even <- function(x){ x %% 2 == 0 } +corresponding.col <- function(x){ceiling(x/2)} +new.col.names <- vector(length = ncol(new.scores.split)) +rownames(new.scores.split) <- rownames(new.scores.foldchange) + for(i in 1:ncol(new.scores.split)) { + if(is.even(i)) {sign <- "down"} else {sign <- "up"} + new.col.names[i] <- paste(colnames(new.scores.foldchange)[corresponding.col(i)], + sign, sep =".") + new.scores.split[,i] <- new.scores.foldchange[,corresponding.col(i)] + if(is.even(i)) { + new.scores.split[which(new.scores.split[,i]>0),i] <- 0 + new.scores.split[,i] <- -1*new.scores.split[,i] + } else { + new.scores.split[which(new.scores.split[,i]<0),i] <- 0 + } + colnames(new.scores.split) <- new.col.names +} +return(new.scores.split)} + +#ConvertExpressionToPval = function(expression, threshold = 1) +#{ +# print("In ConvertExpressionToPval") +# #Generate Normal Stats +# # expression.mean = apply(expression, 2, mean) +# # expression.sd = apply(expression, 2, sd) +# +# ix.above.threshold = which(expression > threshold) +# expression.mean = apply(expression, 2, function(i) mean(i[which(i > threshold)])) +# expression.sd = apply(expression, 2, function(i) sd(i[which(i > threshold)])) +# + +# # CDF from zero to point +# # expression.pval = matrix(,nr=nrow(expression),nc=ncol(expression),dimnames=dimnames(expression)) +# # ix.low = which(expression < expression.mean) +# # ix.upper = which(expression >= expression.mean) +# # expression.pval[ix.low] = (-10 * pnorm(expression[ix.low], mean = expression.mean, sd = expression.sd, log=T) *log10(exp(1))) +# +# expression.pval = (-10 * log10(exp(1)) * pnorm(expression, +# mean = expression.mean, +# sd = expression.sd, +# log=T, +# lower.tail=F +# ) +# ) +# #squash those under threshold to zero +# expression.pval[-ix.above.threshold] = 0 + +# # correct for upper tail (ie from point to inf) +# #Still need to do +# return (expression.pval) +#} +MapExpressiontoChip2 = function(chip.targets, expression) +{ +mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression)) +rownames(mymatrix) <- chip.targets +colnames(mymatrix) <- colnames(expression) + + +j <- 1 #expression counter +i <- 1 #mymatrix counter +expression.genes <- rownames(expression) +previous.expression.gene <- expression.genes[j] + +# Progress bar: +total <- length(chip.targets) +# create progress bar +pb <- txtProgressBar(min = 0, max = total, style = 3) + +for ( i in 1:length(chip.targets)) { + current.target <- chip.targets[i] + setTxtProgressBar(pb, i) + while (current.target > previous.expression.gene && j<=length(expression.genes)) { + j <- j + 1 + previous.expression.gene <- expression.genes[j] + } + if (current.target == previous.expression.gene){ + mymatrix[i,] <- expression[j,] + } +} + +close(pb) + + print("Leaving MapRNAtoChip") + return(mymatrix) +} + + +MapExpressiontoChip = function(chip.targets, expression) +{ + print("In MapRNAtoChip") + # targets = unlist(strsplit(chip$targets[i], "_")) + # exp = matrix(rna$all.pval[targets, ]) + # return (apply(exp, 2, mean)) + + #rna col 1 = th0 overexpressed + #rna col 2 = th17 overexpressed + #rna col 3 = true zscores + + + + + + + #################################################################### + chip.targets.notnull.unique <- unique(chip.targets[which(chip.targets!="")]) + gene.intersect <- intersect(rownames(expression), chip.targets.notnull.unique) + chip.targets.notnull.unique.with.data <- chip.targets.notnull.unique[which(chip.targets.notnull.unique %in% gene.intersect)] + expression.useful.data <- expression[which(rownames(expression) %in% gene.intersect),] + chip.targets.notnull.unique.with.data.sorted <- chip.targets.notnull.unique.with.data[order(chip.targets.notnull.unique.with.data)] + expression.useful.data.sorted <- expression.useful.data[order(rownames(expression.useful.data)),] + #################################################################### + + mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression)) + rownames(mymatrix) <- chip.targets + colnames(mymatrix) <- colnames(expression) + head(chip.targets.notnull.unique.with.data.sorted) + head(rownames(expression.useful.data.sorted)) + if(!identical(chip.targets.notnull.unique.with.data.sorted, rownames(expression.useful.data.sorted))){ + stop("We have a serious problem, chip is not alligned to expression") + } + for(i in 1:length(chip.targets.notnull.unique.with.data.sorted)) + { + mtls.ix <- which(chip.targets == chip.targets.notnull.unique.with.data.sorted[i]) + for(j in 1:length(mtls.ix)){ + mymatrix[mtls.ix[j],] <- expression.useful.data.sorted[i,] + } + } + print("Leaving MapRNAtoChip") + return(mymatrix) + + #return( sapply(chip$targets, function(x) apply(rna$all.pval[unlist(strsplit(chip$targets[3], "_")), ], 2, mean)) ) +} + + + + + +GenerateKMOrder <- function(data, km) { + # generates the order of clusters from high to low mean values + # + # Args: + # data: A matrix of scores that have already been clustered. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. Must + # have the same length as the number of rows in data + # + # Returns: + # cluster.order: The order in which the clusters should be displayed. + + km.cluster = km$cluster + clusters = unique(km.cluster) + clusters.avg = numeric() + for(i in clusters) { + clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ])) + } + if(param$debug) { + print ("straight clusters") + print (clusters) + print ("straigth average") + print (clusters.avg) + print ("ordered clusters") + print (clusters[order(clusters.avg)]) + print("ordered average") + print (clusters.avg[order(clusters.avg)]) + } + return(clusters[rev(order(clusters.avg))]) +} + +OrderMTL <- function(data, km, cluster.order) { + # Orders a matrix of data according to a clustering algorithm + # + # Args: + # data: A matrix of scores that have already been clustered. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. Must + # have the same length as the number of rows in data + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # + # Returns: + # a list that contains 3 objects: + # list$data: the ordered version of the data. + # list$color.vector: a list of colors that should be assigned to each row. + # list$start.row: the starting row of each cluster in data. + number.clusters <- length(cluster.order) + cluster.colors <- sample(rainbow(number.clusters)) + + # Set up return objects + sorted.data <- matrix(,nr=nrow(data), nc=ncol(data)) + colnames(sorted.data) <- colnames(data) + cluster.color.vector = vector(length=length(km$cluster)) + cluster.start.row = numeric(number.clusters) + cluster.start.row[1]=1 + + for (i in 1:number.clusters) + { + current.cluster = cluster.order[i] + ix = which(km$cluster == current.cluster) + current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1) + sorted.data[current.cluster.range, ] = data[ix, ] + cluster.color.vector[current.cluster.range] = cluster.colors[i] + cluster.start.row[i+1] = (cluster.start.row[i]+length(ix)) + } + ret.list = list() + ret.list$data = sorted.data + ret.list$color.vector = cluster.color.vector + ret.list$cluster.index = cluster.start.row + return(ret.list) +} + + +CreateHeatMap <- function(data, + km, + cluster.order, + document.name, + document.type = "png", + number.colors = 30) { + # Generates a heatmap image based for a matrix based on a clustering + # algorithm. + # + # Args: + # data: A matrix of scores that have already been clustered, the column + # names of this matrix will become the column titels of the heatmap. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # document.name: A name for the produced file. there is no need to + # supply the .png/.pdf in your argument. + # document.type: The type of file you want to produce. current options are + # png and pdf. Default is pdf. + # + # Returns: + # Nothing to the script that calls it, however it creates an image at the + # path specified. + if(param$debug) { + print("In CreateHeatMap") + } + + data.ordered <- OrderMTL(data, km, cluster.order) + + + #Load Lib + library(gplots) + + #Set Color gradient + color.ramp = colorRampPalette(c("black", + "darkblue", + "blue", + "yellow", + "orange", + "red"))(number.colors) #7 + + if(document.type == "png") { + png(paste(document.name,".png", sep = ""), + #width=15360,#1920, + #height=204080,#2560, + #res=500, + antialias="gray") + } + else if(document.type == "pdf") { + pdf(paste(document.name,".pdf", sep = "")) + } + else if(document.type == "tiff") { + tiff(paste(document.name,".tiff", sep = ""), + res=800, + pointsize=2, + width=1920, + height=1920) + } + else { + bitmap(paste(document.name,".bmp", sep = ""), + height = 5, + width = 5, + res = 500) + } +#op <- par(mar = rep(0, 4)) + + heatmap.2( + data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))], + rowsep = data.ordered$cluster.index[-1], + sepwidth = c(0.5, ncol(data)/100), + dendrogram = "none", + Rowv = F, + Colv = F, + trace = "none", + labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString), + labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)), + RowSideColors = data.ordered$color.vector, + keysize=0.6, + key=F, + col = color.ramp, + cexCol = 0.8, + cexRow = 0.8) + + dev.off() + +} + +CreateIndividualHeatMap <- function(data, + km, + cluster.order, + color.ramp = colorRampPalette(c("black", + "darkblue", + "blue", + "yellow", + "orange", + "red"))(30)) { + # Generates a heatmap image based for a matrix based on a clustering + # algorithm. + # + # Args: + # data: A matrix of scores that have already been clustered, the column + # names of this matrix will become the column titels of the heatmap. + # km: The K-means object generated from running kmeans on data. Another + # method could be used so long as it supplies a (km)$cluser list. + # cluster.order: The order in which the clusters should be displayed. + # for eg. km.order = c(2, 3, 1) would result in cluster 2 + # being on top, then cluster 3 and lastly cluster 1. + # document.name: A name for the produced file. there is no need to + # supply the .png/.pdf in your argument. + # document.type: The type of file you want to produce. current options are + # png and pdf. Default is pdf. + # + # Returns: + # Nothing to the script that calls it, however it creates an image at the + # path specified. + if(param$debug) { + print("In CreateHeatMap") + } + + data.ordered <- OrderMTL(data, km, cluster.order) + + + #Load Lib + library(gplots) + + + + + heatmap.3( + data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))], + rowsep = data.ordered$cluster.index[-1], + sepwidth = c(0.5, nrow(data.ordered$data)/100), + dendrogram = "none", + Rowv = F, + Colv = F, + trace = "none", + labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString), + labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)), + #sourcRowSideColors = data.ordered$color.vector, + keysize=0.6, + key=F, + col = color.ramp, + cexCol = 0.8, + cexRow = 0.8) + + +} + +ReadCommadLineParameters <- function(argument.names, command.line.arguments, optional = F) { + # Reads the parameters from the command line arguments. + # + # Args: + # argument.names: A list of expected argument names. + # command.line.arguments: The list of recieved command line arguments + # optional: Are the areguments optional, or are they required, default is required + # + # Returns: + # The arguments for argument.names. As strings In that order. + + if(length(grep("--version",command.line.arguments))) { + cat("version",script.version,"\n") + q() + } + # Split command line arguments + args <- sapply(strsplit(command.line.arguments, " "),function(i) i) + vals <- character(length(argument.names)) + # split cmd.line to key and value pairs + for(i in 1:length(argument.names)) { + ix <- grep(argument.names[i], args) + if(length(ix)>1) { + stop("arg ", + argument.names[i], + " used more than once. Bailing out...\n", + PrintParamError()) + } + else if (length(ix)==0 && !optional) { + stop("could not find ", + argument.names[i], + ". Bailing out...\n", + PrintParamError()) + } + else if (length(ix)==0 && optional) { + vals[i] <- NA + } + else { + vals[i] <- args[ix+1] + } + } + return(vals) +} + +PrintParamError <- function(){ + # Prints the usage of the function, shows users what arguments they can use + # + # Args: + # param: A list that contains all the paramaters. + # + # Returns: + # A modified version of the param list, with the default values loaded. + cat(" +DESCRIPTIION: + heatmap.R takes a ... + +INPUT: + 1.--mtls_file: path to mtls file.\n + 2.--cluster_file: the destination path for the output cluster file.\n + 3.--chip_experiment_order: The order of desired chip experiments (optional).\n + 4.--heatmap_file: path for output heatmap image (no extension).\n + 5.--heatmap_type: choice of image format, currently support png, pdf, tiff and bmp (optional)\n + 6.--expression_file: list of expression files to be included in analysis (optional).\n + 7.--expression_name: lables for the expression files (optional).\n + 8.--normalization_file: a list of files to be used for normalization, + they can be the same file, however the number of expression nominated + normalization files must match the number of expression files (optional)\n + 9.--n_clusters: number of clusters\n + 10.--filter_percentage: percentage of mtls that will be analysed. Eg: if + we make filter_percentage 30, we will take the union of the top mtls in + mean, non-zero mean and variance (optional).\n + +EXAMPLE RUN: + Rscript heatmap.R + --mtls_file path/to/mtls.xls + --cluster_file path/to/output/cluster + --chip_experiment_order tf1::tf2::tf5::tf3 + --heatmap_file path/to/output/heatmap + --heatmap_type png + --expression_file path/to/exp1::path/to/exp2 + --expression_name myexp1::myexp2 + --normalization_file path/to/exp3::path/to/exp3 + --n_clusters 13 + --filter_percentage 100 + --include_targetless yes + --number_bins 30 + + ") +} + +#LoadDefaultParams <- function(param) { +# # Loads default paramaters for the heatmap application +# # +# # Args: +# # param: A list that contains all the previous paramaters. +# # +# # Returns: +# # A modified version of the param list, with the default values loaded. + +#script.version=0.1 +#param$debug = F + +## RNA data: +#param$rna.files = "" +#param$rna.normalization = "none" + +## Filter: +#param$filter.percentage <- 100 + +## Clustering: +#param$clustering.number.of.clusters <- 13 + +## Heatmap: +#param$heatmap.document.name <- "heatmap" +#param$heatmap.document.type <- "png" +##Cluster Groups: +#param$cluster.groups.document.name <- "clusters" + +#return(param) + +#} + + + +LoadParams <- function(cmd.args, args.nms, n.start.numeric, optional = F) { + # Loads user defined paramaters for the heatmap application + # + # Args: + # cmd.args: The command line arguments given. + # arg.nms: A list of possible command arguments. + # n.start.numeric: the first argument that should be numeric (alway put + # these last). + # optional: This specifies if the params in cmd.args are optional or + # required. + # Returns: + # A list of values assigned to each argument. + + + vals <- ReadCommadLineParameters(argument.names = args.nms, + command.line.arguments = cmd.args, + optional = optional) + + #check if numeric params are indeed numeric + if(!optional) { + for(i in n.start.numeric:length(vals)){ + if(is.na(as.numeric(vals[i]))){ + stop("arg ",args.nms[i]," is not numeric. Bailing out...\n",print.error()) + } + } +} + return (vals) + +} + +#ValidateParams <- function(params) { +# return(T) +#} + + +########## + +LoadDebugParams <- function(param) { + + cmd.args <- c( + "--mtls_file data/test/mtls.xls", + "--cluster_file data/test/cluster", + "--heatmap_file data/test/heatmap", + "--heatmap_type bmp", + "--n_clusters 13", + "--filter_percentage 100", + "--expression_file /home/kieran/code/scorchR-heatmap/data/expression/rna1.tabular::/home/kieran/code/scorchR-heatmap/data/expression/rna2.tabular", + "--expression_name batf.ko.0::batf.ko.17", + "--normalization_file mean", + "--chip_experiment_order ac::bc::cc::dc::ec", + "--include_targetless yes", + "--number_bins 20" + ) + +args.nms <- c( "--mtls_file", #1 + "--cluster_file", #2 + "--chip_experiment_order", #3 + "--heatmap_file", #4 + "--heatmap_type", #5 + "--expression_file", #6 + "--expression_name", #7 + "--normalization_file", #8 + "--include_targetless", #9 + "--n_clusters", #10 + "--filter_percentage", #11 + "--number_bins") #12 + + +# vals has the same order as args.nms +vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 10, optional = F) + + +# ChIP data: +param$annotated.macs.file <- vals[1] +param$chip.order <- vals[3] +# RNA data: +param$rna.files = vals[6] +param$rna.names = vals[7] +param$rna.normalization.file = vals[8] +param$include.targetless = vals[9] + +# Filter: +param$filter.percentage <- as.numeric(vals[11]) + +# Clustering: +param$number.bins <- as.numeric(vals[12]) +param$clustering.number.of.clusters <- as.numeric(vals[10]) + +# Heatmap: +param$heatmap.document.name <- vals[4] +param$heatmap.document.type <- vals[5] +#Cluster Groups: +param$cluster.groups.document.name <- vals[2] + +return(param) + +} + + + +LoadOptionalParams <- function(param) { + +cmd.args <- commandArgs(trailingOnly = T) + +args.nms <- c( "--chip_experiment_order", #1 + "--expression_file", #2 + "--expression_name", #3 + "--normalization_file", #4 + "--heatmap_type", #5 + "--include_targetless", #6 + "--filter_percentage", #7 + "--number_bins") #8 + + +# vals has the same order as args.nms +vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 7, optional = T) + +# ChIP data: +param$chip.order <- if(!is.na(vals[1])){vals[1]}else{NA} + +# RNA data: +param$rna.files <- if(!is.na(vals[2])){vals[2]}else{"none"} +param$rna.names <- if(!is.na(vals[3])){vals[3]}else{"none"} +param$rna.normalization.file <- if(!is.na(vals[4])){vals[4]}else{"no"} +param$include.targetless <- if(!is.na(vals[6])){vals[6]}else{"yes"} +# Filter: +param$filter.percentage <- if(!is.na(vals[7])){as.numeric(vals[7])}else{100} +param$number.bins <- if(!is.na(vals[8])){as.numeric(vals[8])}else{30} +# Heatmap file (output) +param$heatmap.document.type <- if(!is.na(vals[5])){vals[5]}else{"none"} + +return(param) +} + + +LoadReqiredParams <- function(param){ + +cmd.args <- commandArgs(trailingOnly = T) + +args.nms <- c( "--mtls_file", #1 + "--cluster_file", #2 + "--heatmap_file", #3 + "--n_clusters") #4 + + +# vals has the same order as args.nms +vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 4, optional = F) + +# ChIP data: +param$annotated.macs.file <- vals[1] + +# Clustering +param$clustering.number.of.clusters <- as.numeric(vals[4]) + +# Cluster file (output): +param$cluster.groups.document.name <- vals[2] + + +# Heatmap file (output): +param$heatmap.document.name <- vals[3] + +return(param) + +} + +########### +# here we output the fasta file of the targets of each kmeans cluster +CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ + f.nm = paste(f.nm, ".fasta", sep="") + clusters = km.order + for(i in 1:length(clusters)){ + v=trgts[which(k.ix==clusters[i])] + v.split = unlist(sapply(v,strsplit, "_")) + if(i == 1){ + cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n") + } else { + cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) + } + cat(sep="\n",file=f.nm,v.split,append=TRUE) + } +} + + +PrintClusters <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ + f.nm = paste(f.nm, ".tsv", sep="") + cat(sep="\t",file=f.nm,"row_number/target","cluster","\n") + trgts[which(trgts=="")] <- "no_target" + for(i in 1:length(trgts)){ + cat(sep="",file=f.nm,trgts[i],"\t","cluster_",k.ix[i],"\n",append=TRUE) + } +# if(i == 1){ +# cat(sep="",file=f.nm,"cluster_",i,"\n")#clusters[i],"\n") +# } else { +# cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) +# } + #cat(sep="\n",file=f.nm,v.split,append=TRUE) + } +GetTopRowsFromMatrix = function(mtrx, percentage = 10) +{ + if (param$debug) { + print("In GetTopRowsFromMatrix") + } + #Store the stats for the mtrx + stats = list() + stats$mean=apply(mtrx,1,mean) + stats$sd=apply(mtrx,1,sd) + stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)])) + + + #Store the indexes for the mtrx + index = list() + index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix + index$sd = sort(stats$sd, decreasing=T, index.return=T)$ix + index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix + + #Calculate how many data points we want to take + cutoff = floor(length(mtrx[ ,1])*(percentage/100)) + + #Extract Indexes and return to caller + index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff])) + + return(index) +} + +#GetUniq = function(targets.cluster) +#{ +# #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order +# +# return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))) +# #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))) +# +#} + +bindata = function(d,qunts=seq(.4,.9,.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) + for(j in 1:ncol(d)) + { + bins=quantile(d[,j],qunts) + for(i in 1:length(bins)) + { + tmp[which(d[,j]>bins[i]),j] = i + } + } + return(tmp) +} + +bindata.non.zero = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d)) + for(j in 1:ncol(d)) + { + ix.non.zero=which(d[,j]!=0) + bins=quantile(d[ix.non.zero,j],qunts) + for(i in 1:length(bins)) + { + tmp[which(d[,j]>bins[i]),j] = i + } + } + + return(tmp) +} +bindata.matrix = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) + #ix.non.zero=which(d!=0) + bins=quantile(d[],qunts) + for(i in 1:length(bins)) + { + tmp[which(d>bins[i])] = i + } + return(tmp) +} +bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1)) +{ + tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) + ix.non.zero=which(d!=0) + bins=quantile(d[ix.non.zero],qunts) + for(i in 1:length(bins)) + { + tmp[which(d>bins[i])] = i + } + return(tmp) +} +heatmap.3 <- function (x, Rowv = TRUE, Colv = if (symm) "Rowv" else TRUE, + distfun = dist, hclustfun = hclust, dendrogram = c("both", + "row", "column", "none"), symm = FALSE, scale = c("none", + "row", "column"), na.rm = TRUE, revC = identical(Colv, + "Rowv"), add.expr, breaks, symbreaks = min(x < 0, na.rm = TRUE) || + scale != "none", col = "heat.colors", colsep, rowsep, + sepcolor = "white", sepwidth = c(0.05, 0.05), cellnote, notecex = 1, + notecol = "cyan", na.color = par("bg"), trace = c("column", + "row", "both", "none"), tracecol = "cyan", hline = median(breaks), + vline = median(breaks), linecol = tracecol, margins = c(5, + 5), ColSideColors, RowSideColors, cexRow = 0.2 + 1/log10(nr), + cexCol = 0.2 + 1/log10(nc), labRow = NULL, labCol = NULL, + key = TRUE, keysize = 1.5, density.info = c("histogram", + "density", "none"), denscol = tracecol, symkey = min(x < + 0, na.rm = TRUE) || symbreaks, densadj = 0.25, main = NULL, + xlab = NULL, ylab = NULL, lmat = NULL, lhei = NULL, lwid = NULL, + ...) +{ + scale01 <- function(x, low = min(x), high = max(x)) { + x <- (x - low)/(high - low) + x + } + retval <- list() + scale <- if (symm && missing(scale)) + "none" + else match.arg(scale) + dendrogram <- match.arg(dendrogram) + trace <- match.arg(trace) + density.info <- match.arg(density.info) + if (length(col) == 1 && is.character(col)) + col <- get(col, mode = "function") + if (!missing(breaks) && (scale != "none")) + warning("Using scale=\"row\" or scale=\"column\" when breaks are", + "specified can produce unpredictable results.", "Please consider using only one or the other.") + if (is.null(Rowv) || is.na(Rowv)) + Rowv <- FALSE + if (is.null(Colv) || is.na(Colv)) + Colv <- FALSE + else if (Colv == "Rowv" && !isTRUE(Rowv)) + Colv <- FALSE + if (length(di <- dim(x)) != 2 || !is.numeric(x)) + stop("`x' must be a numeric matrix") + nr <- di[1] + nc <- di[2] + if (nr <= 1 || nc <= 1) + stop("`x' must have at least 2 rows and 2 columns") + if (!is.numeric(margins) || length(margins) != 2) + stop("`margins' must be a numeric vector of length 2") + if (missing(cellnote)) + cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x)) + if (!inherits(Rowv, "dendrogram")) { + if (((!isTRUE(Rowv)) || (is.null(Rowv))) && (dendrogram %in% + c("both", "row"))) { + if (is.logical(Colv) && (Colv)) + dendrogram <- "column" + else dedrogram <- "none" + warning("Discrepancy: Rowv is FALSE, while dendrogram is `", + dendrogram, "'. Omitting row dendogram.") + } + } + if (!inherits(Colv, "dendrogram")) { + if (((!isTRUE(Colv)) || (is.null(Colv))) && (dendrogram %in% + c("both", "column"))) { + if (is.logical(Rowv) && (Rowv)) + dendrogram <- "row" + else dendrogram <- "none" + warning("Discrepancy: Colv is FALSE, while dendrogram is `", + dendrogram, "'. Omitting column dendogram.") + } + } + if (inherits(Rowv, "dendrogram")) { + ddr <- Rowv + rowInd <- order.dendrogram(ddr) + } + else if (is.integer(Rowv)) { + hcr <- hclustfun(distfun(x)) + ddr <- as.dendrogram(hcr) + ddr <- reorder(ddr, Rowv) + rowInd <- order.dendrogram(ddr) + if (nr != length(rowInd)) + stop("row dendrogram ordering gave index of wrong length") + } + else if (isTRUE(Rowv)) { + Rowv <- rowMeans(x, na.rm = na.rm) + hcr <- hclustfun(distfun(x)) + ddr <- as.dendrogram(hcr) + ddr <- reorder(ddr, Rowv) + rowInd <- order.dendrogram(ddr) + if (nr != length(rowInd)) + stop("row dendrogram ordering gave index of wrong length") + } + else { + rowInd <- nr:1 + } + if (inherits(Colv, "dendrogram")) { + ddc <- Colv + colInd <- order.dendrogram(ddc) + } + else if (identical(Colv, "Rowv")) { + if (nr != nc) + stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") + if (exists("ddr")) { + ddc <- ddr + colInd <- order.dendrogram(ddc) + } + else colInd <- rowInd + } + else if (is.integer(Colv)) { + hcc <- hclustfun(distfun(if (symm) + x + else t(x))) + ddc <- as.dendrogram(hcc) + ddc <- reorder(ddc, Colv) + colInd <- order.dendrogram(ddc) + if (nc != length(colInd)) + stop("column dendrogram ordering gave index of wrong length") + } + else if (isTRUE(Colv)) { + Colv <- colMeans(x, na.rm = na.rm) + hcc <- hclustfun(distfun(if (symm) + x + else t(x))) + ddc <- as.dendrogram(hcc) + ddc <- reorder(ddc, Colv) + colInd <- order.dendrogram(ddc) + if (nc != length(colInd)) + stop("column dendrogram ordering gave index of wrong length") + } + else { + colInd <- 1:nc + } + retval$rowInd <- rowInd + retval$colInd <- colInd + retval$call <- match.call() + x <- x[rowInd, colInd] + x.unscaled <- x + cellnote <- cellnote[rowInd, colInd] + if (is.null(labRow)) + labRow <- if (is.null(rownames(x))) + (1:nr)[rowInd] + else rownames(x) + else labRow <- labRow[rowInd] + if (is.null(labCol)) + labCol <- if (is.null(colnames(x))) + (1:nc)[colInd] + else colnames(x) + else labCol <- labCol[colInd] + if (scale == "row") { + retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm) + x <- sweep(x, 1, rm) + retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm) + x <- sweep(x, 1, sx, "/") + } + else if (scale == "column") { + retval$colMeans <- rm <- colMeans(x, na.rm = na.rm) + x <- sweep(x, 2, rm) + retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm) + x <- sweep(x, 2, sx, "/") + } + if (missing(breaks) || is.null(breaks) || length(breaks) < + 1) { + if (missing(col) || is.function(col)) + breaks <- 16 + else breaks <- length(col) + 1 + } + if (length(breaks) == 1) { + if (!symbreaks) + breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm), + length = breaks) + else { + extreme <- max(abs(x), na.rm = TRUE) + breaks <- seq(-extreme, extreme, length = breaks) + } + } + nbr <- length(breaks) + ncol <- length(breaks) - 1 + if (class(col) == "function") + col <- col(ncol) + min.breaks <- min(breaks) + max.breaks <- max(breaks) + x[x < min.breaks] <- min.breaks + x[x > max.breaks] <- max.breaks + # if (missing(lhei) || is.null(lhei)) + # lhei <- c(keysize, 4) + # if (missing(lwid) || is.null(lwid)) + # lwid <- c(keysize, 4) + # if (missing(lmat) || is.null(lmat)) { + # lmat <- rbind(4:3, 2:1) + # if (!missing(ColSideColors)) { + # if (!is.character(ColSideColors) || length(ColSideColors) != + # nc) + # stop("'ColSideColors' must be a character vector of length ncol(x)") + # lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + + # 1) + # lhei <- c(lhei[1], 0.2, lhei[2]) + # } + # if (!missing(RowSideColors)) { + # if (!is.character(RowSideColors) || length(RowSideColors) != + # nr) + # stop("'RowSideColors' must be a character vector of length nrow(x)") + # lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - + # 1), 1), lmat[, 2] + 1) + # lwid <- c(lwid[1], 0.2, lwid[2]) + # } + # lmat[is.na(lmat)] <- 0 + # } + # if (length(lhei) != nrow(lmat)) + # stop("lhei must have length = nrow(lmat) = ", nrow(lmat)) + # if (length(lwid) != ncol(lmat)) + # stop("lwid must have length = ncol(lmat) =", ncol(lmat)) + # op <- par(no.readonly = TRUE) + # on.exit(par(op)) + # layout(lmat, widths = lwid, heights = lhei, respect = FALSE) + if (!missing(RowSideColors)) { + par(mar = c(margins[1], 0, 0, 0.5)) + image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE) + } + if (!missing(ColSideColors)) { + par(mar = c(0.5, 0, 0, margins[2])) + image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE) + } + par(mar = c(margins[1], 0, 0, margins[2])) + x <- t(x) + cellnote <- t(cellnote) + if (revC) { + iy <- nr:1 + if (exists("ddr")) + ddr <- rev(ddr) + x <- x[, iy] + cellnote <- cellnote[, iy] + } + else iy <- 1:nr + image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + + c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col, + breaks = breaks, ...) + retval$carpet <- x + if (exists("ddr")) + retval$rowDendrogram <- ddr + if (exists("ddc")) + retval$colDendrogram <- ddc + retval$breaks <- breaks + retval$col <- col + if (!invalid(na.color) & any(is.na(x))) { + mmat <- ifelse(is.na(x), 1, NA) + image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "", + col = na.color, add = TRUE) + } + axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, + cex.axis = cexCol) + if (!is.null(xlab)) + mtext(xlab, side = 1, line = margins[1] - 1.25) + axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, + cex.axis = cexRow) + if (!is.null(ylab)) + mtext(ylab, side = 4, line = margins[2] - 1.25) + if (!missing(add.expr)) + eval(substitute(add.expr)) + if (!missing(colsep)) + for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0, + length(csep)), xright = csep + 0.5 + sepwidth[1], + ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1, + col = sepcolor, border = sepcolor) + if (!missing(rowsep)) + for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) + + 1 - rsep) - 0.5, xright = nrow(x) + 1, ytop = (ncol(x) + + 1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1, + col = sepcolor, border = sepcolor) + min.scale <- min(breaks) + max.scale <- max(breaks) + x.scaled <- scale01(t(x), min.scale, max.scale) + if (trace %in% c("both", "column")) { + retval$vline <- vline + vline.vals <- scale01(vline, min.scale, max.scale) + for (i in colInd) { + if (!is.null(vline)) { + abline(v = i - 0.5 + vline.vals, col = linecol, + lty = 2) + } + xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5 + xv <- c(xv[1], xv) + yv <- 1:length(xv) - 0.5 + lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") + } + } + if (trace %in% c("both", "row")) { + retval$hline <- hline + hline.vals <- scale01(hline, min.scale, max.scale) + for (i in rowInd) { + if (!is.null(hline)) { + abline(h = i + hline, col = linecol, lty = 2) + } + yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5 + yv <- rev(c(yv[1], yv)) + xv <- length(yv):1 - 0.5 + lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") + } + } + if (!missing(cellnote)) + text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote), + col = notecol, cex = notecex) + par(mar = c(margins[1], 0, 0, 0)) + # if (dendrogram %in% c("both", "row")) { + # plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") + # } + # else plot.new() + par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2])) + # if (dendrogram %in% c("both", "column")) { + # plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") + # } + # else plot.new() + if (!is.null(main)) + title(main, cex.main = 1.5 * op[["cex.main"]]) + # if (key) { + # par(mar = c(5, 4, 2, 1), cex = 0.75) + # tmpbreaks <- breaks + # if (symkey) { + # max.raw <- max(abs(c(x, breaks)), na.rm = TRUE) + # min.raw <- -max.raw + # tmpbreaks[1] <- -max(abs(x), na.rm = TRUE) + # tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE) + # } + # else { + # min.raw <- min(x, na.rm = TRUE) + # max.raw <- max(x, na.rm = TRUE) + # } + # z <- seq(min.raw, max.raw, length = length(col)) + # image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks, + # xaxt = "n", yaxt = "n") + # par(usr = c(0, 1, 0, 1)) + # lv <- pretty(breaks) + # xv <- scale01(as.numeric(lv), min.raw, max.raw) + # axis(1, at = xv, labels = lv) + # if (scale == "row") + # mtext(side = 1, "Row Z-Score", line = 2) + # else if (scale == "column") + # mtext(side = 1, "Column Z-Score", line = 2) + # else mtext(side = 1, "Value", line = 2) + # if (density.info == "density") { + # dens <- density(x, adjust = densadj, na.rm = TRUE) + # omit <- dens$x < min(breaks) | dens$x > max(breaks) + # dens$x <- dens$x[-omit] + # dens$y <- dens$y[-omit] + # dens$x <- scale01(dens$x, min.raw, max.raw) + # lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol, + # lwd = 1) + # axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y)) + # title("Color Key\nand Density Plot") + # par(cex = 0.5) + # mtext(side = 2, "Density", line = 2) + # } + # else if (density.info == "histogram") { + # h <- hist(x, plot = FALSE, breaks = breaks) + # hx <- scale01(breaks, min.raw, max.raw) + # hy <- c(h$counts, h$counts[length(h$counts)]) + # lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s", + # col = denscol) + # axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy)) + # title("Color Key\nand Histogram") + # par(cex = 0.5) + # mtext(side = 2, "Count", line = 2) + # } + # else title("Color Key") + # } + # else plot.new() + retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)], + high = retval$breaks[-1], color = retval$col) + invisible(retval) +} + +#rm(list=ls()) +param <- list() +param$debug <- FALSE #T + +print("Finished loading functions") + + +if(param$debug) { + param <- LoadDebugParams(param) +} else { + param <- LoadReqiredParams(param) + param <- LoadOptionalParams(param) +} +print (param) +Rprof() #Remove me +# Load Chip data, alread in pval format +# Column.order need to be a vecor of strings, that correspond to the order (and +# inclustion) of chip experiments. +chip <- GetChipData(param$annotated.macs.file, column.order = param$chip.order) + +#Get the top percentages on different criteria +#ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage) +#chip$peaks <- chip$peaks[ix.top$union, ] +#chip$targets <- chip$targets[ix.top$union] +#Bin data: +chip$peaks <- bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=(param$number.bins+1))) + +file.names <- unlist(strsplit(param$rna.files, split="::")) +print(file.names) +file.lables <- unlist(strsplit(param$rna.names, split="::")) +print(file.lables) + +library("RColorBrewer") +color.2 = rev(colorRampPalette(c("darkblue", "steelblue", "lightgrey"))(param$number.bins)) +if(!is.na(file.names) && file.names != "none") { + #rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg") + #do differential expression if the user specifies such: + if(param$rna.normalization != "no") { + if(param$rna.normalization != "mean"){ + norm.file.names <- unlist(strsplit(param$rna.normalization.file, split="::")) + all.file.names <- c(file.names, norm.file.names) + all.file.lables <- c(file.lables, norm.file.names) + print (all.file.names) + print (all.file.lables) + rna.scores <- GetRNAData(all.file.names, file.lables = all.file.lables, fpkm = "avg") + rna.scores <- NormalizeRNA(scores = rna.scores) + rna.scores.sign <- rna.scores/abs(rna.scores) + rna.scores.sign[which(is.nan(rna.scores.sign))] <- 0 + rna.scores <- bindata.non.zero.matrix(abs(rna.scores), qunts = seq(0, 0.9, length=(param$number.bins+1))) + rna.scores <- rna.scores.sign * rna.scores + color.2 = colorRampPalette(c("darkblue", "steelblue", "lightgrey", "pink", "darkred"))(param$number.bins) + } else { + print("we are normalizing by average") + print("file lables are:") + print(file.lables) + rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg") + rna.scores <- t(apply(rna.scores,1, function(x) { if(mean(x)!=0){ return(x/mean(x)) } else { return(x) } })) + rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1))) + } + } else { + rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg") + rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1))) + } + + # chip$peaks <- chip$peaks[order(chip$targets), ] + # rna.scores <- rna.scores[order(rownames(rna.scores)), ] + # chip$targets <- chip$targets[order(chip$targets)] + + rna.scores.mapped <- MapExpressiontoChip(chip.targets = chip$targets, expression=rna.scores) + all.data <- cbind(chip$peaks, rna.scores.mapped) + if(param$include.targetless!="yes"){ + all.data <- all.data[-which(chip$targets==""),] + chip$peaks <- chip$peaks[-which(chip$targets==""),] + rna.scores.mapped <- rna.scores.mapped[-which(chip$targets==""),] + chip$targets <- chip$targets[-which(chip$targets=="")] + } +} else { + all.data <- chip$peaks +} + +print("Clustering") +set.seed(1234) +km <- kmeans(all.data, param$clustering.number.of.clusters, iter.max = 50) +print("Ordering") +km.order <- GenerateKMOrder(all.data, km) + +##AM edits## +km.new.order <- numeric(length(km$cluster)) +for(i in 1:length(km.order)){ + cur.clst <- km.order[i] + ix <- which(km$cluster==cur.clst) + km.new.order[ix] <- i +} +km.order <- 1:length(km.order) +km$cluster <- km.new.order +## Done AM ## +print("Creating image") +#bmp(param$heatmap.document.name, 640, 480)#1280, 960) +bmp(paste(param$heatmap.document.name,".bmp", sep = ""), 5120, 3840)#2560, 1920) +#pdf(paste(param$heatmap.document.name,".pdf", sep = "")) +color.1 <- c("#000000", colorRampPalette(c("blue", "yellow","orange","red"))(param$number.bins)) +if(!is.na(file.names) && file.names != "none") { + PrintClusters(trgts=chip$targets, + k.ix=km$cluster, + f.nm = param$cluster.groups.document.name, + km.order = NA) + + #if(param$rna.normalization != "no") { + # data.split <- cbind(chip$peaks, PrepareRNAforHeatmap(rna.scores.mapped)) + #} else { + # data.split <- cbind(chip$peaks, rna.scores.mapped) + #} + expression.width.multiplier <- 2 + if(ncol(rna.scores.mapped)==1) { + rna.scores.mapped <- cbind(rna.scores.mapped, rna.scores.mapped) + expression.width.multiplier <- 1 + } + + layout(matrix(c(1,2,2,5, + 1,3,4,6, + 1,3,4,7, + 1,3,4,8, + 1,3,4,9), + 5,4, + byrow=T), + widths=c(1,2*ncol(chip$peaks),expression.width.multiplier*ncol(rna.scores.mapped),1), + heights=c(1,10,2,10,2)) + #1 + plot.new() + #2 + plot.new() + #3 + print("Creating peak image") + CreateIndividualHeatMap(chip$peaks, + km, + km.order, color.ramp=color.1) + #4 + print("Creating rna image") + CreateIndividualHeatMap(rna.scores.mapped, + km, + km.order, color.ramp=color.2) + #5 + plot.new() + #6 + image(t(matrix(1:param$number.bins,nc=1)), col=color.1) + #7 + plot.new() + #8 + image(t(matrix(1:param$number.bins,nc=1)), col=color.2) + #9 + plot.new() +} else { + PrintClusters(trgts=1:nrow(chip$peaks), + k.ix=km$cluster, + f.nm = param$cluster.groups.document.name, + km.order = NA) + + + layout(matrix(c(1,2,2, + 1,3,4, + 1,3,5),3,3,byrow=T), + widths=c(1,3*ncol(chip$peaks),1), + heights=c(1,8,1)) + #1 + plot.new() + #2 + plot.new() + #3 + CreateIndividualHeatMap(chip$peaks, + km, + km.order, color.ramp=color.1) + #4 + image(t(matrix(1:param$number.bins,nc=1)), col=color.1) + #5 + plot.new() + +} + +dev.off() +Rprof(NULL) +profile <- summaryRprof() +print(str(profile)) +print(profile) +#save.image("test/testData.RData") diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/map_ChIP_peaks_to_genes.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/map_ChIP_peaks_to_genes.R Mon Jul 23 13:00:15 2012 -0400 @@ -0,0 +1,304 @@ +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' +## Nov 2011 th17 +## Bonneau lab - "Aviv Madar" , +## NYU - Center for Genomics and Systems Biology +## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. +## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ +##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' + +rm(list=ls()) +debug=F +script.version=0.1 +print.error <- function(){ + cat(" +NAME + map_ChIP_peaks_to_genes.R + +DESCRIPTION + This script takes a MACS tab delimited file as input and + produces two files: + + 1) genes.xls + A gene centered table that stores the peaks that are + proximal (within x[kb] of TSS) and distal (within the + gene's region + y[kb] upstream or downstream of TSS and + TES respectively). + + 2) peaks.xls + A peak centered table that equals the MACS input table plus + colums for proximal and distal targets of each peak and + their distance from TSS (if exist). For gene.xls script also + gives a poisson model based -log10(pval) score for proximal + and genewide enrichment of peaks for each gene with at least + one prox/dist peak associated with it. + +ARGUMENTS + input_file=path + Path to MACS file. + + refseq_table=path + Path to refseq table (gives TSS/TES locations for all genes). + + path_output=path + Path to save genes.xls and peaks.xls output files. + + tss.dist=N + The absolute distance from TSS where we connect a peak to + gene (for proximal peaks). + + gene.wide.dist=N + The absolute distance from TSS or TES where we connect a peak + to gene (for peaks hitting anywhere in gene). + + effective.genome.size=N + The effective mappable genome size (for mm9 the value is + 1.87e9. For hg19 the value is 2.7e9 (from MACS manual)). + + macs.skip.lines=N + Number of lines to skip in input_file. + +EXAMPLE + Rscript map_ChIP_peaks_to_genes_v.1.R \\ + input_file=SL971_SL970_peaks.xls \\ + refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\ + path_output=./ \\ + tss.dist=5000 \\ + gene.wide.dist=10000 \\ + effective.genome.size=1.87e9 \\ + macs.skip.lines=23 + +CITATION + Please cite us if you used this script: The transcription + factor network regulating Th17 lineage specification and + function. Maria Ciofani, Aviv Madar, Carolina Galan, Kieran + Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard + Bonneau and Dan R. Littman et. al. (in preperation). + +") +} + +# retrieve args +if(debug==T){ + cmd.args <- c( + # "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls", + "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls", + "refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt", + "path_output=/Users/aviv/Desktop/test_script/", + "tss.dist=5000", + "tes.dist=10000", + "effective.genome.size=1.87e9", + "gene.wide.dist=10000", + "macs.skip.lines=23" + ) +} else { + cmd.args <- commandArgs(); +} + +if(length(grep("--version",cmd.args))){ + cat("version",script.version,"\n") + q() +} + +arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1]) +args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2]) + +arg.nms <- c("input_file","refseq_table","path_output","tss.dist", + "gene.wide.dist","effective.genome.size","macs.skip.lines") +arg.val <- character(length=length(arg.nms)) +for(i in 1:length(arg.nms)){ + ix <- which(arg.names.cmd.line==arg.nms[i]) + if(length(ix)==1){ + arg.val[i] <- args.val.cmd.line[ix] + } else { + stop("######could not find ",arg.nms[i]," arg######\n\n",print.error()) + + } +} +if(debug==T){ + print(paste(arg.nms,"=",arg.val)) +} +# the files here adhere to tab delim format +path.input.macs <- arg.val[1] +path.input.refseq <- arg.val[2] +path.output <- arg.val[3] +tss.dist <- as.numeric(arg.val[4]) +gene.wide.dist <- as.numeric(arg.val[5]) +effective.genome.size <- as.numeric(arg.val[6]) +macs.skip.lines=as.numeric(arg.val[7]) + +if(debug==T){ +# if(1){ + load("/Users/aviv/Desktop/r.u.RData") + gn.nms <- rownames(r.u) +} else { + ################## + ### step 1 handle refseq table file (read, make unique) + cat("reading refseq table",path.input.refseq,"\n") + refseq <- read.delim(file=path.input.refseq) + # refseq can have many transcripts for each gene + # here i make it have only one transcript for each gene (the longest one) + cat("for transcripts matching more than one gene keeping only the longest transcript\n") + refseq$name2 <- as.character(refseq$name2) + refseq$chrom <- as.character(refseq$chrom) + refseq$strand <- as.character(refseq$strand) + gn.nms <- unique(refseq$name2) + #x <- sort(table(refseq$name2),decreasing=T) + #gn.nms.non.unique <- names(x)[which(x>1)] + #gn.nms.unique <- names(x)[which(x==1)] + # create refseq unique r.u + n <- length(gn.nms) + r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms) + for(i in 1:n){ + ix <- which(refseq$name2==gn.nms[i]) + if(length(ix)==0) { + error("could not find gene", ng.nms[i], "in refseq table. Bailing out...\n") + } else if (length(ix)>1){ + l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) ) + l.max <- max(l) + ix <- ix[which(l==l.max)[1]] + } + r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")] + } + r.u[,"txStart"] <- as.numeric(r.u[,"txStart"]) + r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"]) + + # switch TSS and TES if we have chr "-" + cat("correcting TSS, TES assiments in refseq table based on strand\n") + ix <- which(r.u$strand=="-") + tmp.tes <- r.u$txStart[ix] + tmp.tss <- r.u$txEnd[ix] + r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes) + # ############ +} + + +#### step 2 read macs file +cat("reading Macs file",path.input.macs,"\n") + +# read macs file +M <- read.delim(file=path.input.macs,skip=macs.skip.lines) +trgt.prox <- NA +trgt.dist <- NA +dtss.prox <- NA +dtss.dist <- NA +M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist) +M.annot[is.na(M.annot)] <- "" +# get the number of genes +m <- length(gn.nms) +if(debug==T){ + # m <- 100 +} +f.nm.peak <- paste(sep="",path.output,"peaks.xls") +f.nm.gene <- paste(sep="",path.output,"genes.xls") +# for each genes in the expt (j goes over genes) +cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval", + "prox_max_pval","genewide_max_pval", + "prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n") +peaks.summit <- (M[,"start"]+M[,"summit"]) +cat(sep="","mapping MACS peaks to genes\n") +for (j in 1:m){ + if(j%%20==0){cat(".")} + tss <- r.u[j,"txStart"] + tes <- r.u[j,"txEnd"] + gn.lngth <- abs(tss-tes) + strand <- r.u[j,"strand"] + chr <- r.u[j,"chrom"] + if(strand=="+"){ + d.tss.all <- peaks.summit-tss + d.tes.all <- peaks.summit-tes + } else { + d.tss.all <- tss-peaks.summit + d.tes.all <- tes-peaks.summit + } + ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )), + which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE) + ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE) + ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)] + ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)] + n.peaks.prox <- length(ix.prox) + n.peaks.distal <- length(ix.distal) + # if there is at least one peak hitting gene j + if(n.peaks.distal > 0){ + # for each peak (l goes over peaks) + peaks.line <- paste(sep="","(") + for (k in 1:n.peaks.distal){ + if(k>1){ + peaks.line <- paste(sep="",peaks.line,";(") + } + d.tss <- d.tss.all[ ix.distal[k] ] + d.tes <- d.tes.all[ ix.distal[k] ] + if(sign(d.tss)!=sign(d.tes)){ + class="intra" + } else if(d.tss<=0){ + class="upstream" + } else if(d.tss>0){ + class="downstream" + } + peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ], + d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")") + # handle M.annot + dtss <- d.tss.all[ ix.distal[k] ] + prev.trgt <- M.annot[ix.distal[k],"trgt.dist"] + if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss + if(abs(dtss) 0){ + mean.pval.prox <- mean(M[ ix.prox ,7]) + max.pval.prox <- max(M[ ix.prox ,7]) + expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox)) + # lambda = num peaks / genome size * searched region + lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2) + pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE)) + # handle M.annot + for(k in 1:n.peaks.prox){ + dtss <- d.tss.all[ ix.prox[k] ] + prev.trgt <- M.annot[ix.prox[k],"trgt.prox"] + if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss + if(abs(dtss)=mean.pval.distal)) + # lambda = num peaks / genome size * searched region + lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2) + pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE)) + # pring peaks for gene j + core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand, + mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal, + pval.pois.prox,pval.pois.distal) + cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n") + } +} +cat("Done!\n") + +cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t") +write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t") + + + + + + + diff -r a0306edbf2f8 -r b465306d00ba mtls_analyze/mtls_analyze.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/mtls_analyze.xml Mon Jul 23 13:00:15 2012 -0400 @@ -0,0 +1,333 @@ + + + Merge multiple ChIP-seq experiments, alligning their peaks to MTLs (Multi + Transcription Factor Loci(us)) and optionally incorperate expression + + /bin/bash $shscript + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +#!/bin/bash + +#import os +#set $path = $os.path.abspath($__app__.config.tool_path) + + +## Set symbols so that they are not incorrectly interpreted: +#set $dollar = chr(36) +#set $gt = chr(62) +#set $lt = chr(60) +#set $ad = chr(38) +#set $bs = chr(92) + +echo $map_rna ${ad}${gt}${gt} $log +echo "This is the Bash log file: " ${ad}${gt}${gt} $log +############################################################################### +## Convert the gtf file to a file that aviv's script can hadel +#if str($map_rna)=='yes' + echo "Converting gtf file" ${ad}${gt}${gt} $log + Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file ${ad}${gt}${gt} $log + echo "done converting gtf file" ${ad}${gt}${gt} $log +#end if +############################################################################### +## Get ChIP data in correctly formated strings and annotate if nessisary. +#set $sep = '::' +#for $i, $chip in enumerate( $chip_tracks ) + #if $i==0 + echo "Chip Files:" ${ad}${gt}${gt} $log + echo "The first file label is: ${chip.name}" ${ad}${gt}${gt} $log + echo "The first file path is: ${chip.file}" ${ad}${gt}${gt} $log + chip_labels=${chip.name} + chip_paths=${chip.file} + #else + echo "The next file label is: ${chip.name}" ${ad}${gt}${gt} $log + echo "The next file path is: ${chip.file}" ${ad}${gt}${gt} $log + chip_labels=${dollar}chip_labels${sep}${chip.name} + chip_paths=${dollar}chip_paths${sep}${chip.file} + #end if +#end for + +echo chip paths are - ${dollar}chip_paths ${ad}${gt}${gt} $log +echo chip labels are - ${dollar}chip_labels ${ad}${gt}${gt} $log + +############################################################################### +## Cluster peaks + +Rscript $path/visualization/cluster_peaks.R \ +--input_files ${dollar}chip_paths \ +--input_type $chipInputFormat \ +--path_output ./ \ +--expt_names ${dollar}chip_labels \ +--dist_summits $summitDistance \ +--mtl_type $mtlType ${ad}${gt}${gt} $log + +############################################################################### +## Annotate mtls.xls if nessisary +#if str($map_rna)=="yes" + echo "annotating mtls.xls..." ${ad}${gt}${gt} $log + Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log +#end if +############################################################################### +## If rna is specified, then get RNA data in correctly formated strings: +#if str($map_rna)=='yes' + #set $sep = '::' + #for $i, $rna in enumerate( $rna_tracks ) + #if $i==0 + echo "The first file label is: ${rna.name}" ${ad}${gt}${gt} $log + echo "The first file path is: ${rna.file}" ${ad}${gt}${gt} $log + rna_labels=${rna.name} + rna_paths=${rna.file} + rna_norm_paths=${rna.norm} + #else + echo "The next file label is: ${rna.name}" ${ad}${gt}${gt} $log + echo "The next file path is: ${rna.file}" ${ad}${gt}${gt} $log + rna_labels=${dollar}rna_labels${sep}${rna.name} + rna_paths=${dollar}rna_paths${sep}${rna.file} + rna_norm_paths=${dollar}rna_norm_paths${sep}${rna.norm} + #end if + #end for + echo rna paths are - ${dollar}rna_paths ${ad}${gt}${gt} $log + echo rna labels are - ${dollar}rna_labels ${ad}${gt}${gt} $log + echo rna norm files are - ${dollar}rna_norm_paths ${ad}${gt}${gt} $log +#end if +############################################################################### + +#if str($normalize_rna)=='no' + echo "Normalization by file is set to no" ${ad}${gt}${gt} $log + rna_norm_paths=no +#end if + +#if str($use_mean)=='yes' + echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log + rna_norm_paths=mean +#end if + +#if str($map_rna)=='no' + mtls_file=mtls.xls + rna_paths=none + rna_labels=none +#else + mtls_file=annotated_mtls.xls +#end if + +echo " +Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \ +--cluster_file ./cluster \ +--chip_experiment_order ${dollar}chip_labels \ +--heatmap_file ./heatmap \ +--heatmap_type bmp \ +--n_clusters $numClusters \ +--filter_percentage 100 \ +--expression_file ${dollar}rna_paths \ +--expression_name ${dollar}rna_labels \ +--normalization_file ${dollar}rna_norm_paths \ +${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log + +Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \ +--cluster_file ./cluster \ +--chip_experiment_order ${dollar}chip_labels \ +--heatmap_file ./heatmap \ +--heatmap_type bmp \ +--n_clusters $numClusters \ +--filter_percentage 100 \ +--number_bins $numberBins \ +--include_targetless $includeTargetless \ +--expression_file ${dollar}rna_paths \ +--expression_name ${dollar}rna_labels \ +--normalization_file ${dollar}rna_norm_paths \ +${ad}${gt}${gt} $log + +ls ${ad}${gt}${gt} $log + + + + +################################################################## +#if str($map_rna)=='yes' + mv ./annotated_mtls.xls $mtls +#else + mv ./mtls.xls $mtls +#end if +mv ./heatmap.* $heatmap_image +mv ./cluster.tsv $cluster_assignments + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool will merge peaks form multiple chip-seq experiments, creating MTLs for +each overlapping region. It will then cluster each MTL based on the score of +each peak within each MTL (using K-means clustering, with k set by user). A +heatmap is then generated from the resulting cluster along with the MTLs +generated. This module in writin in R and is will be made available on github +and bioconductor. This work was done by Kieran Mace and Aviv Madar. + +**NEED IMPROVEMENT** + +----- + +**Parameters** + +- **Input files** contains either macs or BED files to be merged. This list of files must be two or larger. +- **Experiment names** contains the name given to each track. +- **Summit distance** is the cuttoff distance (in BP) to be included in an MTL. This option is not used with the summit option below +- **Input Format** Either bed of MACS file format, all files must be of one type. Defaults to MACS +- **MTL Type** Either interval or summit (defaults to summit). +- **Number clusters** the value of k for kmeans clustering. +- **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores). +----- + +**Output** + +- **XLS file** is the tab-delimited file containing the MTL data. +- **PNG file** is the heatmap image generated after clustering the MTL data. + +----- + +**script parameter list of Chip-Cluster** + +Options: +DESCRIPTIION: + cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where + each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files) + fall within a certain distance between summits from eachother. + +INPUT: + 1.path_input=path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk] + 2.path_output=path to save generated MTL cluster file (where to save mtls.xls) + 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] + 4.dist.summits=maximum distance between summits belonging to the same MTL (defaults to 100) + 5.input_type=the type of input file used (MACS or .bed; defaults to MACS) + 6.mtl_type=interval or summit (defaults to summit) + +EXAMPLE RUN: + cluster_peaks.R + --input_macs_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls + --input_type MACS + --path_output results/ + --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 + --dist_summits 100 + --mtl_type summit + + DESCRIPTIION: + heatmap.R takes a ... + +INPUT: + 1.--mtls_file path to mtls file. + + 2.--cluster_file the destination path for the cluster file. + + 3.--heatmap_file the destination path for heatmap image (no extension). + + 4.--heatmap_type choice of image type, currently support png and pdf. + + 5.--n_clusters number of clusters in the heatmap + + 6.--filter_percentage percentage of mtls that will be analysed. for eg. if + we make filter_percentage 30, we will take the union of the top mtls in + mean, non-zero mean and variance. + + +EXAMPLE RUN: + Rscript heatmap.R + --mtls_file mtls.xls + --cluster_file output/cluster + --heatmap_file output/heatmap + --heatmap_type png + --n_clusters 13 + --filter_percentage 60 + +Please cite us if you used this script: + The transcription factor network regulating Th17 lineage specification and function. + Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, + Richard Bonneau and Dan R. Littman et. al. (in preperation) + + + +