# HG changeset patch # User kmace # Date 1343062772 14400 # Node ID a0306edbf2f8dd0d001605507a786731dad9f90a # Parent a665e0ad63db8d70fc7b89556d461b8ac660f28f Deleted selected files diff -r a665e0ad63db -r a0306edbf2f8 mtls_analyze/mtls_analyze/annotate_mtls.R --- a/mtls_analyze/mtls_analyze/annotate_mtls.R Mon Jul 23 12:58:55 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -debug = F -# Read in Data -if (debug == T) { - mtls <- read.delim("~/code/scorchR-annotate_mtls/mtls.xls", quote="", stringsAsFactors=F) - annotation <- read.delim("~/code/scorchR-annotate_mtls/gene_annotation.txt", header=T, stringsAsFactors=F) - threshold <- 100000 -} else { - cmd <- commandArgs(trailingOnly = T) - mtls <- read.delim(cmd[1], quote="", stringsAsFactors=F) - annotation <- read.delim(cmd[2], header=T, stringsAsFactors=F) - threshold <- as.numeric(cmd[3]) -} - - - -# Get the median summit -summits <- sapply(strsplit(mtls[ ,"summit"], split="_"), function(x) {median(as.numeric(x))}) -# Extract every tss -tss <- apply(annotation,1,function(x) { - if (x["strand"]=="+"){ - return(as.numeric(x["txStart"])) - }else{ - return(as.numeric(x["txEnd"])) - } - }) - - -# Get each Chromosome -chromosomes <- unique( mtls[,"chr"]) - -# Set up output: -trgt <- character(length = length(summits)) - -# Loop through Chromosomes -for (chromosome in chromosomes){ - cat(chromosome, "\n") - chr.ix.tss <- which(annotation[ ,"chrom"]==chromosome) - chr.ix.summits <- which(mtls[ ,"chr"]==chromosome) - - - cat(length(chr.ix.summits), " -summits\n", - length(chr.ix.tss), " -tss\n") - if (length(chr.ix.summits)>0 && length(chr.ix.tss)>0) { - # Loop through Summits - for(summit.ix in chr.ix.summits) { - #cat("Summit = ", summit.ix, class(summit.ix),"\n") - distances <- abs(tss[chr.ix.tss] - summits[summit.ix]) - closest.ix.tss <- which(distances == min(distances))[1] - if(!is.na(closest.ix.tss) && !is.na(distances[closest.ix.tss])) { - #print(closest.ix.tss) - if (distances[closest.ix.tss] < threshold) { - - trgt[summit.ix] <- annotation[chr.ix.tss, "name2"][closest.ix.tss] - -# cat("A distance of ", -# distances[closest.ix.tss], -# " was found for the mtl summit located at: ", -# chromosome, -# ": ", -# summits[summit.ix], -# " and TSS located at: ", -# chromosome, -# ": ", -# tss[chr.ix.tss][closest.ix.tss], -# "\n") - } - } - } - } else { - cat("chromosome", chromosome, " either has no transcripts or no peaks in this experiment. \n") - } - output <- cbind(mtls, trgt) -} - -write.table(output, file="annotated_mtls.xls", sep="\t",row.names=F, quote=F) - - - - diff -r a665e0ad63db -r a0306edbf2f8 mtls_analyze/mtls_analyze/cluster_peaks.R --- a/mtls_analyze/mtls_analyze/cluster_peaks.R Mon Jul 23 12:58:55 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,431 +0,0 @@ -## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. -## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ -##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' -## Feb 2012 th17 -## Bonneau lab - "Aviv Madar" , -## NYU - Center for Genomics and Systems Biology -## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. -## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ -##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' - -rm(list=ls()) -debug=F -verbose=F -script.version=0.1 -print.error <- function(){ - cat(" -DESCRIPTIION: - cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where - each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files) - fall within a certain distance from eachother. If you cluster MTLs based on summits, you need to specify dist.summits. - If you use .bed files, the files must contain no header, and at least the five columns: - 1. chrom, 2. chromStart, 3. chromEnd, 4. name, and 5.score. 2 and 3 represent the end points used for MTLs defined based on a shared - interval. (2+3)/2 is used as the summit of each row if used for MTLs defined based on proximity of summits. - -INPUT: - 1.input_files: path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk] - 2.path_output: path to save generated MTL cluster file (where to save mtls.xls) - 3.expt_names: user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] - 4.input_type: the type of input file used (MACS or BED; defaults to MACS) - 5.mtl_type: interval or summit (defaults to summit) - 6.dist.summits: maximum distance between summits belonging to the same MTL (defaults to 100; only used if mtl_type is summit) - -EXAMPLE RUN: - cluster_peaks.R - --input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls - --input_type MACS - --path_output results/ - --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 - --dist_summits 100 - --mtl_type summit - -Please cite us if you used this script: - The transcription factor network regulating Th17 lineage specification and function. - Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, - Richard Bonneau and Dan R. Littman et. al. (in preperation)\n\n") -} - -############# helper functions: -## split.macs.list.to.chrmosomes -# input: -# - macs.list: a list of macs expts: here are a few lines of one expt -## chr start end length summit tags #NAME? fold_enrichment FDR(%) -## chr1 4322210 4323069 860 494 55 158.95 6.03 0.05 -## chr1 4797749 4798368 620 211 29 119.82 3.47 0.09 -## chr1 4848182 4849113 932 494 46 105.42 2.9 0.09 -# - expts: a list of the expts names from macs.list that you want to process -# - chr: chrmosomes names -# output: -# - x: a list with as many elements as chr specified by input. -# -x[[i]]:macs list per chr, with peak.id column added (these are the row numbers of MACS) -split.macs.list.to.chrmosomes.no.pml <- function(macs.list,expts,chr="chr1"){ - x <- list() - n <- length(expts) - for(i in 1:n){ - e <- expts[i] #experiment name - cat("wroking on expt", e,"\n") - x[[e]] <- lapply(chr,split.one.macs.expt.by.chromosome.no.pml,m=macs.list[[e]]) - names(x[[e]]) <- chr - } - return(x) -} -# a helper function for spliat.macs.list.to.chrmosomes, gives for one chromosome the macs rows for expt MACS matrix m -# input: -# - r is chromosome -# - m is macs matrix for expt e from above function -split.one.macs.expt.by.chromosome.no.pml <- function(r,m){ - ix.chr.i <- which(m[,"chr"]==r) - # cat("working on",r,"\n") - o <- list() - o[[r]] <- m[ix.chr.i,] - o[[r]]$peak.id <- ix.chr.i - return(o[[r]]) -} - -## make.ruler makes a ruler: a line per chromosome with the locations of all tf binding sites (sorted from start of chromosome to end) -# also match these summit locations with corresponding: -# pvals, tfs, peak start and peak end -# input: -# - chr: a list of chromosome names -# - macs.list.per.chrom: a list of macs peaks for each chromosome -# output: -# - o: a list each chormosome ruler as an element -make.ruler.no.pml <- function(chr,macs.list.per.chrom){ - x <- macs.list.per.chrom - o <- list() - for(j in 1:length(chr)){ - r <- chr[j] # chrmosome we go over - s <- numeric() - pval <- numeric() - tf.to.s <- character() - start <- numeric() - end <- numeric() - trtmnts <- names(x) # the treatments name (expt names) - ## debug parameters ### - ## which experiment peaks come from - expt <- character() - ## what was the peak id in that experiment - peak.ids <- numeric() - ## this will allow us to always back track from a cluster to the actual peaks in it - ## debug params end ### - for(i in 1:length(trtmnts)){ - o[[r]] <- list() - e <- trtmnts[i] #experiment name - tf <- strsplit(e,"_")[[1]][1] - s <- c(s,x[[e]][[r]][,"start"]+x[[e]][[r]][,"summit"]) - pval <- c(pval,x[[e]][[r]][,"score"]) - start <- c(start,x[[e]][[r]][,"start"]) - end <- c(end,x[[e]][[r]][,"end"]) - expt <- c(expt,rep(e,length(x[[e]][[r]][,"end"]))) - peak.ids <- c(peak.ids,x[[e]][[r]][,"peak.id"]) - } - ix <- sort(s,index.return=TRUE)$ix - o[[r]] <- list(summit=s[ix],score=pval[ix],expt=expt[ix],start=start[ix],end=end[ix],peak.ids=peak.ids[ix]) - } - return(o) -} - -## add cluster memberships based on ruler -## require no more than d.cut distance between tf summits -## cur.l is the current loci number (or cluster number) -assign.clusters.ids.to.peaks.based.on.summits <- function(ruler,d.cut){ - cur.l <- 0 - for(j in 1:length(ruler)){ - s <- ruler[[j]][["summit"]] - l <- numeric(length=length(s)) - if(length(l)>0){ - cur.l <- cur.l+1 - } - if(length(l)==1){ - l[1] <- cur.l - } else if(length(l)>1) { - l[1] <- cur.l - for(i in 2:length(l)){ - d <- s[i]-s[i-1] # assumes s is sorted increasingly - if(d>d.cut){ - cur.l <- cur.l+1 - } - l[i] <- cur.l - } - } - ruler[[j]][["mtl.id"]] <- l - } - return(ruler) -} - -## add cluster memberships based on ruler -## require that peaks span will have a non-empty intersection -## cur.mtl is the current loci number (or cluster number) -assign.clusters.ids.to.peaks.based.on.intervals <- function(ruler){ -cur.mtl <- 0 -for(j in 1:length(ruler)){ - s <- ruler[[j]][["start"]] - e <- ruler[[j]][["end"]] - l <- numeric(length=length(s)) - if(length(l)>0){ - cur.mtl <- cur.mtl+1 - } - if(length(l)==1){ - l[1] <- cur.mtl - } else if(length(l)>1) { - l[1] <- cur.mtl - cur.e <- e[1] # the right-most bp belonging to the current mtl - for(i in 2:length(l)){ - if(cur.e1){ - stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) - } else if (length(ix)==0){ - stop("could not find ",args.nms[i],". Bailing out...\n",print.error()) - } else { - vals[i] <- args[ix+1] - } - } - return(vals) -} -# read command line paramters that are optional, if an optional param is not give functions return na for this param -read.cmd.line.params.optional <- function(args.nms, cmd.line.args){ - args <- sapply(strsplit(cmd.line.args," "),function(i) i) - vals <- character(length(args.nms)) - # split cmd.line to key and value pairs - for(i in 1:length(args.nms)){ - ix <- grep(args.nms[i],args) - if(length(ix)>1){ - stop("arg ",args.nms[i]," used more than once. Bailing out...\n",print.error()) - } else if (length(ix)==0){ # if --param was not written in cmd line - vals[i] <- NA - } else if(( (ix+1) <= length(args) ) & ( length(grep("--",args[ix+1])) == 0) ){ # if --param was written in cmd line AND was followed by a value - vals[i] <- args[ix+1] - } else { # otherwise - vals[i] <- NA - } - } - return(vals) -} - -############# Code: -# retrieve args - -if(debug==T){ - cmd.args <- c( - "--input_files data/xls/SL10571_SL10565_peaks.xls::data/xls/SL10570_SL10564_peaks.xls::data/xls/SL10572_SL10566_peaks.xls", - #"--input_files data/bed/SL10571_SL10565_peaks.bed::data/bed/SL10570_SL10564_peaks.bed::data/bed/SL10572_SL10566_peaks.bed", - "--input_type MACS", #BED - "--path_output ./results/", - "--expt_names macs1::macs2::macs3", - #"--expt_names bed1::bed2::bed3", - "--mtl_type interval", #interval summit - "--dist_summits 100" - ) -} else { - cmd.args <- commandArgs(trailingOnly = T); -} - -# if(length(grep("--version",cmd.args))){ -# cat("version",script.version,"\n") -# q() -# } -args.nms.must <- c( - "--input_files", #1 - "--path_output", #2 - "--expt_names" #3 -) - -# all numeric params must come at the end of args.nms.optional -n.start.numeric <- 3 -args.nms.optional <- c( - "--input_type", #1 - "--mtl_type", #2 - "--dist_summits" #3 -) - -# get must parameters -vals.must <- read.cmd.line.params.must(args.nms = args.nms.must, cmd.line.args = cmd.args) -if(verbose){ - cat("\nfinshed reading vals: \n") - cat("\nvals.must", unlist(vals.must), "\n") -} -input.files <- strsplit(vals.must[1],"::")[[1]] -if(length(input.files)==1){ - cat("only provided one MACS file to cluster.") - print.error() -} -path.output <- vals.must[2] -expt.names <- strsplit(vals.must[3],"::")[[1]] -# get optional parameters -vals.optional <- read.cmd.line.params.optional(args.nms = args.nms.optional, cmd.line.args = cmd.args) -if(verbose){ - cat("\nvals.optional:", unlist(vals.optional),"\n") -} -if(is.na(vals.optional[1])){ - input.type <- "MACS" -} else { - input.type <- vals.optional[1] -} -if(is.na(vals.optional[2])){ - mtl.type <- "interval" -} else { - mtl.type <- vals.optional[2] -} -if(is.na(vals.optional[2])){ - dist.summits <- 100 -} else { - dist.summits <- as.numeric(vals.optional[3]) -} - -# read MACS files -unique.chr <- character() -infile.list <- list() -col.nms.keepers <- c("chr","start","end","summit","score") -for(i in 1:length(input.files)){ - e <- expt.names[i] - if(toupper(input.type)=="MACS"){ - tmp <- read.delim(file=input.files[i],comment.char="#",stringsAsFactors = F) - columns <- c("chr","start","end","summit","pvalue") - ix.cols <- numeric() - for(j in 1:length(columns)){ - ix.j <- grep(columns[j],names(tmp)) - if(length(ix.j)==0){ - stop("can't find (grep) column ",columns[j]," in MACS input file ", e) - } - ix.cols <- c(ix.cols,ix.j) - } - infile.list[[e]] <- tmp[,ix.cols] - colnames(infile.list[[e]]) <- col.nms.keepers - } - if(toupper(input.type)=="BED"){ - tmp <- read.delim(file=input.files[i],stringsAsFactors = F,header = FALSE) - tmp[,4] <- tmp[,2]+(tmp[,3]-tmp[,2])/2 # define summit and put istead of name column - colnames(tmp) <- col.nms.keepers - infile.list[[e]] <- tmp[,1:5] - } - unique.chr <- unique(c(unique.chr,infile.list[[ e ]][,"chr"])) -} - -unique.chr <- sort(unique.chr) - -# take all macs files and put peaks together on each chromosome -# (as if each chromosome is a ruler and we specify where in the ruler each peak summit falls) -cat("splitting input files per chromosome\n") -x <- split.macs.list.to.chrmosomes.no.pml(macs.list=infile.list,expts=expt.names,chr=unique.chr) -cat("adding peaks from all input files into chromosome rulers\n") -ruler <- make.ruler.no.pml(chr=unique.chr,macs.list.per.chrom=x) -cat("add MTL membership to the ruler\n") - -if(mtl.type=="interval"){ - ruler <- assign.clusters.ids.to.peaks.based.on.intervals(ruler) -} else { - ruler <- assign.clusters.ids.to.peaks.based.on.summits(ruler,d.cut=dist.summits) -} - -cat("creating MTL list\n") -ll <- create.cluster.list.no.pml(ruler=ruler) -cat("writing MTL table\n") -f.nm <- paste(sep="",path.output,"mtls",".xls") -print.ll.verbose.all(ll=ll,f.nm=f.nm) - - - - - - - - - diff -r a665e0ad63db -r a0306edbf2f8 mtls_analyze/mtls_analyze/gtfToMapFriendlyAnnotation.R --- a/mtls_analyze/mtls_analyze/gtfToMapFriendlyAnnotation.R Mon Jul 23 12:58:55 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,102 +0,0 @@ -rm (list = ls()) -args <- commandArgs() -print(args) -gtf <- read.table(args[6],as.is=T) -colnames(gtf) <- c("chrom", - "source", - "type", - "five_prime", - "three_prime", - "score", - "strand", - "frame", - "nothinggene", - "gene", - "nothingSemi1", - "nothingtranscript", - "transcript", - "nothingSemi2") -gtf.exon <- gtf[which(gtf[,"type"]=="exon"),] -gtf.exon.slim <-gtf.exon[,c("chrom", "five_prime", "three_prime", "strand", "transcript")] -gtf.exon.slim.sort <- gtf.exon.slim[sort.list(gtf.exon.slim[,"transcript"]),] - - - -transcripts <- as.vector(unique(gtf.exon.slim[,"transcript"])) -#transcripts <- as.vector(unique(gtf.exon.slim[,"transcript"])) - - - -output <- matrix(0,nr=length(transcripts),nc=ncol(gtf.exon.slim.sort)) - -all.chrom <- gtf.exon.slim.sort[,"chrom"] -all.five_prime <- gtf.exon.slim.sort[,"five_prime"] -all.three_prime <- gtf.exon.slim.sort[,"three_prime"] -all.strand <- gtf.exon.slim.sort[,"strand"] -all.transcript <- gtf.exon.slim.sort[,"transcript"] - -colnames(output) <- colnames(gtf.exon.slim.sort) - -j <- 0 -i <- 1 - -previous.strand <- gtf.exon.slim.sort[i,"strand"] -previous.chrom <- gtf.exon.slim.sort[i,"chrom"] -previous.transcript <- gtf.exon.slim.sort[i,"transcript"] -previous.five_prime.min <- gtf.exon.slim.sort[i,"five_prime"] -previous.three_prime.max <- gtf.exon.slim.sort[i,"three_prime"] - -# Progress bar: -total <- dim(gtf.exon.slim.sort)[1] -# create progress bar -pb <- txtProgressBar(min = 0, max = total, style = 3) - - - - -for ( i in 1:length(all.transcript)) { - current.transcript <- all.transcript[i] - setTxtProgressBar(pb, i) - if (previous.transcript != current.transcript) { - j <- j + 1 - # Write out the current transcript info - output[j,"chrom"] <- previous.chrom - output[j,"five_prime"] <- previous.five_prime.min - output[j,"three_prime"] <- previous.three_prime.max - output[j,"strand"] <- previous.strand - output[j,"transcript"] <- previous.transcript - - # Save the new transcript info (convert the current to previous) - - previous.strand <- all.strand[i] - previous.chrom <- all.chrom[i] - previous.transcript <- all.transcript[i] # current.transcript - previous.five_prime.min <- all.five_prime[i] - previous.three_prime.max <- all.three_prime[i] - } - else { - previous.five_prime.min <- min(previous.five_prime.min, all.five_prime[i]) - previous.three_prime.max <- max(previous.three_prime.max, all.three_prime[i]) - previous.transcript <- all.transcript[i] # current.transcript - } -} -# Write the last item -j <- j + 1 -# Write out the current transcript info -output[j,"chrom"] <- previous.chrom -output[j,"five_prime"] <- previous.five_prime.min -output[j,"three_prime"] <- previous.three_prime.max -output[j,"strand"] <- previous.strand -output[j,"transcript"] <- previous.transcript - -close(pb) - -colnames(output) <- c("chrom", "txStart", "txEnd", "strand", "name2") -write.table(output, file="gene_annotation.txt", sep="\t", row.names=F) - - #ix <- which(gtf.exon.slim[,"transcript"] == transcripts[i]) - #output[i,"transcript"] <- transcripts[i] - #output[i,"five_prime"] <- min(gtf.exon.slim[ix,"five_prime"]) - #output[i,"three_prime"] <- max(gtf.exon.slim[ix,"three_prime"]) - #output[i,"strand"] <- gtf.exon.slim[ix,"strand"][1] - #output[i,"chrom"] <- gtf.exon.slim[ix,"chrom"][1] diff -r a665e0ad63db -r a0306edbf2f8 mtls_analyze/mtls_analyze/heatmap.R --- a/mtls_analyze/mtls_analyze/heatmap.R Mon Jul 23 12:58:55 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1524 +0,0 @@ -GeneratePeakMatrix <- function(experiments, scores) { - # Generates a score matrix for all mtls. - # - # Args: - # experiments: A list of underscore deliminated experiments for each mtl. - # There should never be a completely empty item in this list. - # for eg. experiments[1] = expA_expD_expB. - # scores: A list of underscore deliminated scores for each mtl, the length - # of these scores should be identicle to the length of the - # experiments. eg. scores[1] = 55_33_245. - # - # Returns: - # The peak score matrix for all mtls. - experiments <- sapply(experiments, function(x) strsplit(x, split = "_")) - scores <- sapply(scores, function(x) strsplit(x, split = "_")) - unique.experiments <- unique(unlist(experiments)) - - peaks=matrix(0,nr=length(experiments),nc=length(unique.experiments)) - colnames(peaks) <- unique.experiments - - for(i in 1:length(experiments)){ - for(j in 1:length(experiments[[i]])){ - peaks[i,experiments[[i]][j]]=as.numeric(scores[[i]][j]) - } - } - return(peaks) -################################################################## -} -GetChipData <- function(file.name, - proximity = "distal", - include.targetless = TRUE, - column.order = NA) { - # Reads in, filters, and organizes mtls data - # - # Args: - # file.name: The path to the mtls file (including the file name). - # proximity: Either "distal" or "proximal", defines the gene target distance - # from the mtl. Default is "distal". - # include.targetless: If TRUE, includes mtls with no targets (applied after - # the proximity filter); if not, mtls with no target - # will be exclided. Default is TRUE. - # column.order: An optional vector of column names in the order in which - # they will be presented. If this is left to default (NA) the - # presented order of the chip columns will be the order in - # which they are seen. - # - # Returns: - # An organized list of mtls data with the following elements: - # $peaks - a matrix of peak p-values - # $targets - a list of underscore deliminated gene targets for each mtl - if(param$debug) { - print("In GetChipData") - } - # Set Constants for the mtls file type: - MTLNUM <- "mtl.id" - CHROMOSOME <- "chr" - EXPERIMENTS <- "expt" - EXPERIMENTS.SORTED <- "expt.alphanum.sorted" - START <- "start" - END <- "end" - SUMMIT <- "summit" - SCORES <- "score" - SCORE.MEAN <- "score.mean" - SPANS <- "span.tfs" - SPAN.TOTAL <- "span.l" - PEAK.IDS <- "peak.ids" - TARGETS <- "trgt" -# PROXIMAL.TARGETS <- "trgt.prox" -# DISTAL.TARGETS <- "trgt.dist" -# PROXIMAL.TARGETS.TSS.DISTANCE <- "dtss.prox" -# DISTAL.TARGETS.TSS.DISTANCE <- "dtss.dist" - - - #Get chip data in from files: - #TARGETS <- switch(proximity, - # distal = DISTAL.TARGETS, - # proximal = PROXIMAL.TARGETS, - # stop(" Bad proximity argument supplied.")) # Default -cat("param$rna.files = ", param$rna.files, "\n") -if(!is.na(param$rna.files) && param$rna.files != "none") { - keep.columns <- c(EXPERIMENTS, SCORES, TARGETS) -} else { - keep.columns <- c(EXPERIMENTS, SCORES) -} - file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] -if(!is.na(param$rna.files) && param$rna.files != "none") { - if (!include.targetless) { - ix.has.trgt <- which(as.character(file[, TARGETS])!="") - file <- file[ix.has.trgt, ] - } -} - chip = list() - - chip$peaks <- GeneratePeakMatrix(file[, EXPERIMENTS], file[, SCORES]) - if(!is.na(column.order)) { - #If you specify an order, or a subset of experiments to include, this is where - #that gets done. - order <- unlist(strsplit(column.order, split="::")) - chip$peaks <- chip$peaks[,order] - } - if(!is.na(param$rna.files) && param$rna.files != "none") { - chip$targets <- file[ , TARGETS] -} - - return (chip) -} -GetRNADataFromOneFile <- function(file.name, fpkm = "avg") { - # Reads in rna expression data from cufflinks - # - # Args: - # file.name: The path to the cufflinks file (including the file name). - # fpkm: What fpkm is chosen, options are hi, low and avg - # Returns: - # An organized vector of rnaseq data with the following elements: - # names(data) - the genes from the file. eg names(data[1]) = JUND - # data - the fpkm score for that gene data[1] = 31 - if(param$debug) { - cat("In GetRNADataFromOneFile for ",file.name) - } - - # Set Constants for the cufflinks file type: - GENE <- "gene_id" - BUNDLE <- "bundle_id" - CHROMOSOME <- "chr" - LEFT <- "left" - RIGHT <- "right" - FPKM_AVG <- "FPKM" - FPKM_LOW <- "FPKM_conf_lo" - FPKM_HIGH <- "FPKM_conf_hi" - STATUS <- "status" - - - #Get chip data in from files: - FPKM <- switch(fpkm, - hi = FPKM_HIGH, - avg = FPKM_AVG, - low = FPKM_LOW, - stop(" Bad fpkm quality argument supplied.")) # Default - - keep.columns <- c(GENE, FPKM) - - file <- read.delim(file.name, header=T, as.is=T)[, keep.columns] - - rna = vector(mode = "numeric", length = nrow(file)) - - rna <- file[, FPKM] - names(rna) <- file[ , GENE] - return (rna) -} - -GetRNAData <- function(file.names, file.lables = file.names, fpkm = "avg") { - # Reads in rna expression data from cufflinks - # - # Args: - # file.names: The list of paths to the cufflinks files (including the file name). - # fpkm: What fpkm is chosen, options are hi, low and avg - # Returns: - # A matrix of rnaseq data with the following description: - # each col corresponds to an rnaseq run - # eacg row corresponds to a gene - if(param$debug) { - print("In GetRNAData") - } - files <- list() - for(i in 1:length(file.names)) { - files[[i]] <-GetRNADataFromOneFile(file.names[i]) - } - genes <- unique(names(unlist(files))) - scores <- matrix(0,nrow=length(genes),ncol=length(file.names)) - rownames(scores) <- genes - colnames(scores) <- file.names - - for(j in 1:length(file.names)) { - scores[names(files[[j]]),j] <- files[[j]] - } -print("# of cols for scores is ") -print(ncol(scores)) -print ("file lables are ") -print (file.lables) - colnames(scores) <- file.lables - return(scores) - #scores[genes,file.names] <- files[[]] -} - -NormalizeRNA <- function(scores) { - -#add psudocount -scores <- scores+1 - -numerator <- scores[,1:(ncol(scores)/2)] -denominator <- scores[,(ncol(scores)/2 + 1):ncol(scores)] - -#new.scores <- scores[,-which(colnames(scores) == norm.exp)] -#norm.score <- scores[,which(colnames(scores) == norm.exp)] -#return(log2(new.scores/norm.score)) - -return(log2(numerator/denominator)) - -} - -PrepareRNAforHeatmap <- function(new.scores.foldchange){ -new.scores.split <- matrix(0, nr=nrow(new.scores.foldchange), nc=2*ncol(new.scores.foldchange)) -is.even <- function(x){ x %% 2 == 0 } -corresponding.col <- function(x){ceiling(x/2)} -new.col.names <- vector(length = ncol(new.scores.split)) -rownames(new.scores.split) <- rownames(new.scores.foldchange) - for(i in 1:ncol(new.scores.split)) { - if(is.even(i)) {sign <- "down"} else {sign <- "up"} - new.col.names[i] <- paste(colnames(new.scores.foldchange)[corresponding.col(i)], - sign, sep =".") - new.scores.split[,i] <- new.scores.foldchange[,corresponding.col(i)] - if(is.even(i)) { - new.scores.split[which(new.scores.split[,i]>0),i] <- 0 - new.scores.split[,i] <- -1*new.scores.split[,i] - } else { - new.scores.split[which(new.scores.split[,i]<0),i] <- 0 - } - colnames(new.scores.split) <- new.col.names -} -return(new.scores.split)} - -#ConvertExpressionToPval = function(expression, threshold = 1) -#{ -# print("In ConvertExpressionToPval") -# #Generate Normal Stats -# # expression.mean = apply(expression, 2, mean) -# # expression.sd = apply(expression, 2, sd) -# -# ix.above.threshold = which(expression > threshold) -# expression.mean = apply(expression, 2, function(i) mean(i[which(i > threshold)])) -# expression.sd = apply(expression, 2, function(i) sd(i[which(i > threshold)])) -# - -# # CDF from zero to point -# # expression.pval = matrix(,nr=nrow(expression),nc=ncol(expression),dimnames=dimnames(expression)) -# # ix.low = which(expression < expression.mean) -# # ix.upper = which(expression >= expression.mean) -# # expression.pval[ix.low] = (-10 * pnorm(expression[ix.low], mean = expression.mean, sd = expression.sd, log=T) *log10(exp(1))) -# -# expression.pval = (-10 * log10(exp(1)) * pnorm(expression, -# mean = expression.mean, -# sd = expression.sd, -# log=T, -# lower.tail=F -# ) -# ) -# #squash those under threshold to zero -# expression.pval[-ix.above.threshold] = 0 - -# # correct for upper tail (ie from point to inf) -# #Still need to do -# return (expression.pval) -#} -MapExpressiontoChip2 = function(chip.targets, expression) -{ -mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression)) -rownames(mymatrix) <- chip.targets -colnames(mymatrix) <- colnames(expression) - - -j <- 1 #expression counter -i <- 1 #mymatrix counter -expression.genes <- rownames(expression) -previous.expression.gene <- expression.genes[j] - -# Progress bar: -total <- length(chip.targets) -# create progress bar -pb <- txtProgressBar(min = 0, max = total, style = 3) - -for ( i in 1:length(chip.targets)) { - current.target <- chip.targets[i] - setTxtProgressBar(pb, i) - while (current.target > previous.expression.gene && j<=length(expression.genes)) { - j <- j + 1 - previous.expression.gene <- expression.genes[j] - } - if (current.target == previous.expression.gene){ - mymatrix[i,] <- expression[j,] - } -} - -close(pb) - - print("Leaving MapRNAtoChip") - return(mymatrix) -} - - -MapExpressiontoChip = function(chip.targets, expression) -{ - print("In MapRNAtoChip") - # targets = unlist(strsplit(chip$targets[i], "_")) - # exp = matrix(rna$all.pval[targets, ]) - # return (apply(exp, 2, mean)) - - #rna col 1 = th0 overexpressed - #rna col 2 = th17 overexpressed - #rna col 3 = true zscores - - - - - - - #################################################################### - chip.targets.notnull.unique <- unique(chip.targets[which(chip.targets!="")]) - gene.intersect <- intersect(rownames(expression), chip.targets.notnull.unique) - chip.targets.notnull.unique.with.data <- chip.targets.notnull.unique[which(chip.targets.notnull.unique %in% gene.intersect)] - expression.useful.data <- expression[which(rownames(expression) %in% gene.intersect),] - chip.targets.notnull.unique.with.data.sorted <- chip.targets.notnull.unique.with.data[order(chip.targets.notnull.unique.with.data)] - expression.useful.data.sorted <- expression.useful.data[order(rownames(expression.useful.data)),] - #################################################################### - - mymatrix = matrix(0, nr=length(chip.targets), nc=ncol(expression)) - rownames(mymatrix) <- chip.targets - colnames(mymatrix) <- colnames(expression) - head(chip.targets.notnull.unique.with.data.sorted) - head(rownames(expression.useful.data.sorted)) - if(!identical(chip.targets.notnull.unique.with.data.sorted, rownames(expression.useful.data.sorted))){ - stop("We have a serious problem, chip is not alligned to expression") - } - for(i in 1:length(chip.targets.notnull.unique.with.data.sorted)) - { - mtls.ix <- which(chip.targets == chip.targets.notnull.unique.with.data.sorted[i]) - for(j in 1:length(mtls.ix)){ - mymatrix[mtls.ix[j],] <- expression.useful.data.sorted[i,] - } - } - print("Leaving MapRNAtoChip") - return(mymatrix) - - #return( sapply(chip$targets, function(x) apply(rna$all.pval[unlist(strsplit(chip$targets[3], "_")), ], 2, mean)) ) -} - - - - - -GenerateKMOrder <- function(data, km) { - # generates the order of clusters from high to low mean values - # - # Args: - # data: A matrix of scores that have already been clustered. - # km: The K-means object generated from running kmeans on data. Another - # method could be used so long as it supplies a (km)$cluser list. Must - # have the same length as the number of rows in data - # - # Returns: - # cluster.order: The order in which the clusters should be displayed. - - km.cluster = km$cluster - clusters = unique(km.cluster) - clusters.avg = numeric() - for(i in clusters) { - clusters.avg = c(clusters.avg, mean(data[which(km.cluster == i), ])) - } - if(param$debug) { - print ("straight clusters") - print (clusters) - print ("straigth average") - print (clusters.avg) - print ("ordered clusters") - print (clusters[order(clusters.avg)]) - print("ordered average") - print (clusters.avg[order(clusters.avg)]) - } - return(clusters[rev(order(clusters.avg))]) -} - -OrderMTL <- function(data, km, cluster.order) { - # Orders a matrix of data according to a clustering algorithm - # - # Args: - # data: A matrix of scores that have already been clustered. - # km: The K-means object generated from running kmeans on data. Another - # method could be used so long as it supplies a (km)$cluser list. Must - # have the same length as the number of rows in data - # cluster.order: The order in which the clusters should be displayed. - # for eg. km.order = c(2, 3, 1) would result in cluster 2 - # being on top, then cluster 3 and lastly cluster 1. - # - # Returns: - # a list that contains 3 objects: - # list$data: the ordered version of the data. - # list$color.vector: a list of colors that should be assigned to each row. - # list$start.row: the starting row of each cluster in data. - number.clusters <- length(cluster.order) - cluster.colors <- sample(rainbow(number.clusters)) - - # Set up return objects - sorted.data <- matrix(,nr=nrow(data), nc=ncol(data)) - colnames(sorted.data) <- colnames(data) - cluster.color.vector = vector(length=length(km$cluster)) - cluster.start.row = numeric(number.clusters) - cluster.start.row[1]=1 - - for (i in 1:number.clusters) - { - current.cluster = cluster.order[i] - ix = which(km$cluster == current.cluster) - current.cluster.range <- cluster.start.row[i]:(cluster.start.row[i]+length(ix)-1) - sorted.data[current.cluster.range, ] = data[ix, ] - cluster.color.vector[current.cluster.range] = cluster.colors[i] - cluster.start.row[i+1] = (cluster.start.row[i]+length(ix)) - } - ret.list = list() - ret.list$data = sorted.data - ret.list$color.vector = cluster.color.vector - ret.list$cluster.index = cluster.start.row - return(ret.list) -} - - -CreateHeatMap <- function(data, - km, - cluster.order, - document.name, - document.type = "png", - number.colors = 30) { - # Generates a heatmap image based for a matrix based on a clustering - # algorithm. - # - # Args: - # data: A matrix of scores that have already been clustered, the column - # names of this matrix will become the column titels of the heatmap. - # km: The K-means object generated from running kmeans on data. Another - # method could be used so long as it supplies a (km)$cluser list. - # cluster.order: The order in which the clusters should be displayed. - # for eg. km.order = c(2, 3, 1) would result in cluster 2 - # being on top, then cluster 3 and lastly cluster 1. - # document.name: A name for the produced file. there is no need to - # supply the .png/.pdf in your argument. - # document.type: The type of file you want to produce. current options are - # png and pdf. Default is pdf. - # - # Returns: - # Nothing to the script that calls it, however it creates an image at the - # path specified. - if(param$debug) { - print("In CreateHeatMap") - } - - data.ordered <- OrderMTL(data, km, cluster.order) - - - #Load Lib - library(gplots) - - #Set Color gradient - color.ramp = colorRampPalette(c("black", - "darkblue", - "blue", - "yellow", - "orange", - "red"))(number.colors) #7 - - if(document.type == "png") { - png(paste(document.name,".png", sep = ""), - #width=15360,#1920, - #height=204080,#2560, - #res=500, - antialias="gray") - } - else if(document.type == "pdf") { - pdf(paste(document.name,".pdf", sep = "")) - } - else if(document.type == "tiff") { - tiff(paste(document.name,".tiff", sep = ""), - res=800, - pointsize=2, - width=1920, - height=1920) - } - else { - bitmap(paste(document.name,".bmp", sep = ""), - height = 5, - width = 5, - res = 500) - } -#op <- par(mar = rep(0, 4)) - - heatmap.2( - data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))], - rowsep = data.ordered$cluster.index[-1], - sepwidth = c(0.5, ncol(data)/100), - dendrogram = "none", - Rowv = F, - Colv = F, - trace = "none", - labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString), - labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)), - RowSideColors = data.ordered$color.vector, - keysize=0.6, - key=F, - col = color.ramp, - cexCol = 0.8, - cexRow = 0.8) - - dev.off() - -} - -CreateIndividualHeatMap <- function(data, - km, - cluster.order, - color.ramp = colorRampPalette(c("black", - "darkblue", - "blue", - "yellow", - "orange", - "red"))(30)) { - # Generates a heatmap image based for a matrix based on a clustering - # algorithm. - # - # Args: - # data: A matrix of scores that have already been clustered, the column - # names of this matrix will become the column titels of the heatmap. - # km: The K-means object generated from running kmeans on data. Another - # method could be used so long as it supplies a (km)$cluser list. - # cluster.order: The order in which the clusters should be displayed. - # for eg. km.order = c(2, 3, 1) would result in cluster 2 - # being on top, then cluster 3 and lastly cluster 1. - # document.name: A name for the produced file. there is no need to - # supply the .png/.pdf in your argument. - # document.type: The type of file you want to produce. current options are - # png and pdf. Default is pdf. - # - # Returns: - # Nothing to the script that calls it, however it creates an image at the - # path specified. - if(param$debug) { - print("In CreateHeatMap") - } - - data.ordered <- OrderMTL(data, km, cluster.order) - - - #Load Lib - library(gplots) - - - - - heatmap.3( - data.ordered$data, #data.ordered$data[,sort(colnames(data.ordered$data))], - rowsep = data.ordered$cluster.index[-1], - sepwidth = c(0.5, nrow(data.ordered$data)/100), - dendrogram = "none", - Rowv = F, - Colv = F, - trace = "none", - labRow = F, #sapply(seq(1:length(data.ordered$cluster.index)), toString), - labCol = colnames(data.ordered$data), #sort(colnames(data.ordered$data)), - #sourcRowSideColors = data.ordered$color.vector, - keysize=0.6, - key=F, - col = color.ramp, - cexCol = 0.8, - cexRow = 0.8) - - -} - -ReadCommadLineParameters <- function(argument.names, command.line.arguments, optional = F) { - # Reads the parameters from the command line arguments. - # - # Args: - # argument.names: A list of expected argument names. - # command.line.arguments: The list of recieved command line arguments - # optional: Are the areguments optional, or are they required, default is required - # - # Returns: - # The arguments for argument.names. As strings In that order. - - if(length(grep("--version",command.line.arguments))) { - cat("version",script.version,"\n") - q() - } - # Split command line arguments - args <- sapply(strsplit(command.line.arguments, " "),function(i) i) - vals <- character(length(argument.names)) - # split cmd.line to key and value pairs - for(i in 1:length(argument.names)) { - ix <- grep(argument.names[i], args) - if(length(ix)>1) { - stop("arg ", - argument.names[i], - " used more than once. Bailing out...\n", - PrintParamError()) - } - else if (length(ix)==0 && !optional) { - stop("could not find ", - argument.names[i], - ". Bailing out...\n", - PrintParamError()) - } - else if (length(ix)==0 && optional) { - vals[i] <- NA - } - else { - vals[i] <- args[ix+1] - } - } - return(vals) -} - -PrintParamError <- function(){ - # Prints the usage of the function, shows users what arguments they can use - # - # Args: - # param: A list that contains all the paramaters. - # - # Returns: - # A modified version of the param list, with the default values loaded. - cat(" -DESCRIPTIION: - heatmap.R takes a ... - -INPUT: - 1.--mtls_file: path to mtls file.\n - 2.--cluster_file: the destination path for the output cluster file.\n - 3.--chip_experiment_order: The order of desired chip experiments (optional).\n - 4.--heatmap_file: path for output heatmap image (no extension).\n - 5.--heatmap_type: choice of image format, currently support png, pdf, tiff and bmp (optional)\n - 6.--expression_file: list of expression files to be included in analysis (optional).\n - 7.--expression_name: lables for the expression files (optional).\n - 8.--normalization_file: a list of files to be used for normalization, - they can be the same file, however the number of expression nominated - normalization files must match the number of expression files (optional)\n - 9.--n_clusters: number of clusters\n - 10.--filter_percentage: percentage of mtls that will be analysed. Eg: if - we make filter_percentage 30, we will take the union of the top mtls in - mean, non-zero mean and variance (optional).\n - -EXAMPLE RUN: - Rscript heatmap.R - --mtls_file path/to/mtls.xls - --cluster_file path/to/output/cluster - --chip_experiment_order tf1::tf2::tf5::tf3 - --heatmap_file path/to/output/heatmap - --heatmap_type png - --expression_file path/to/exp1::path/to/exp2 - --expression_name myexp1::myexp2 - --normalization_file path/to/exp3::path/to/exp3 - --n_clusters 13 - --filter_percentage 100 - --include_targetless yes - --number_bins 30 - - ") -} - -#LoadDefaultParams <- function(param) { -# # Loads default paramaters for the heatmap application -# # -# # Args: -# # param: A list that contains all the previous paramaters. -# # -# # Returns: -# # A modified version of the param list, with the default values loaded. - -#script.version=0.1 -#param$debug = F - -## RNA data: -#param$rna.files = "" -#param$rna.normalization = "none" - -## Filter: -#param$filter.percentage <- 100 - -## Clustering: -#param$clustering.number.of.clusters <- 13 - -## Heatmap: -#param$heatmap.document.name <- "heatmap" -#param$heatmap.document.type <- "png" -##Cluster Groups: -#param$cluster.groups.document.name <- "clusters" - -#return(param) - -#} - - - -LoadParams <- function(cmd.args, args.nms, n.start.numeric, optional = F) { - # Loads user defined paramaters for the heatmap application - # - # Args: - # cmd.args: The command line arguments given. - # arg.nms: A list of possible command arguments. - # n.start.numeric: the first argument that should be numeric (alway put - # these last). - # optional: This specifies if the params in cmd.args are optional or - # required. - # Returns: - # A list of values assigned to each argument. - - - vals <- ReadCommadLineParameters(argument.names = args.nms, - command.line.arguments = cmd.args, - optional = optional) - - #check if numeric params are indeed numeric - if(!optional) { - for(i in n.start.numeric:length(vals)){ - if(is.na(as.numeric(vals[i]))){ - stop("arg ",args.nms[i]," is not numeric. Bailing out...\n",print.error()) - } - } -} - return (vals) - -} - -#ValidateParams <- function(params) { -# return(T) -#} - - -########## - -LoadDebugParams <- function(param) { - - cmd.args <- c( - "--mtls_file data/test/mtls.xls", - "--cluster_file data/test/cluster", - "--heatmap_file data/test/heatmap", - "--heatmap_type bmp", - "--n_clusters 13", - "--filter_percentage 100", - "--expression_file /home/kieran/code/scorchR-heatmap/data/expression/rna1.tabular::/home/kieran/code/scorchR-heatmap/data/expression/rna2.tabular", - "--expression_name batf.ko.0::batf.ko.17", - "--normalization_file mean", - "--chip_experiment_order ac::bc::cc::dc::ec", - "--include_targetless yes", - "--number_bins 20" - ) - -args.nms <- c( "--mtls_file", #1 - "--cluster_file", #2 - "--chip_experiment_order", #3 - "--heatmap_file", #4 - "--heatmap_type", #5 - "--expression_file", #6 - "--expression_name", #7 - "--normalization_file", #8 - "--include_targetless", #9 - "--n_clusters", #10 - "--filter_percentage", #11 - "--number_bins") #12 - - -# vals has the same order as args.nms -vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 10, optional = F) - - -# ChIP data: -param$annotated.macs.file <- vals[1] -param$chip.order <- vals[3] -# RNA data: -param$rna.files = vals[6] -param$rna.names = vals[7] -param$rna.normalization.file = vals[8] -param$include.targetless = vals[9] - -# Filter: -param$filter.percentage <- as.numeric(vals[11]) - -# Clustering: -param$number.bins <- as.numeric(vals[12]) -param$clustering.number.of.clusters <- as.numeric(vals[10]) - -# Heatmap: -param$heatmap.document.name <- vals[4] -param$heatmap.document.type <- vals[5] -#Cluster Groups: -param$cluster.groups.document.name <- vals[2] - -return(param) - -} - - - -LoadOptionalParams <- function(param) { - -cmd.args <- commandArgs(trailingOnly = T) - -args.nms <- c( "--chip_experiment_order", #1 - "--expression_file", #2 - "--expression_name", #3 - "--normalization_file", #4 - "--heatmap_type", #5 - "--include_targetless", #6 - "--filter_percentage", #7 - "--number_bins") #8 - - -# vals has the same order as args.nms -vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 7, optional = T) - -# ChIP data: -param$chip.order <- if(!is.na(vals[1])){vals[1]}else{NA} - -# RNA data: -param$rna.files <- if(!is.na(vals[2])){vals[2]}else{"none"} -param$rna.names <- if(!is.na(vals[3])){vals[3]}else{"none"} -param$rna.normalization.file <- if(!is.na(vals[4])){vals[4]}else{"no"} -param$include.targetless <- if(!is.na(vals[6])){vals[6]}else{"yes"} -# Filter: -param$filter.percentage <- if(!is.na(vals[7])){as.numeric(vals[7])}else{100} -param$number.bins <- if(!is.na(vals[8])){as.numeric(vals[8])}else{30} -# Heatmap file (output) -param$heatmap.document.type <- if(!is.na(vals[5])){vals[5]}else{"none"} - -return(param) -} - - -LoadReqiredParams <- function(param){ - -cmd.args <- commandArgs(trailingOnly = T) - -args.nms <- c( "--mtls_file", #1 - "--cluster_file", #2 - "--heatmap_file", #3 - "--n_clusters") #4 - - -# vals has the same order as args.nms -vals <- LoadParams(cmd.args, args.nms, n.start.numeric = 4, optional = F) - -# ChIP data: -param$annotated.macs.file <- vals[1] - -# Clustering -param$clustering.number.of.clusters <- as.numeric(vals[4]) - -# Cluster file (output): -param$cluster.groups.document.name <- vals[2] - - -# Heatmap file (output): -param$heatmap.document.name <- vals[3] - -return(param) - -} - -########### -# here we output the fasta file of the targets of each kmeans cluster -CreateClusterGroups <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ - f.nm = paste(f.nm, ".fasta", sep="") - clusters = km.order - for(i in 1:length(clusters)){ - v=trgts[which(k.ix==clusters[i])] - v.split = unlist(sapply(v,strsplit, "_")) - if(i == 1){ - cat(sep="",file=f.nm,">cluster_",i,"\n")#clusters[i],"\n") - } else { - cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) - } - cat(sep="\n",file=f.nm,v.split,append=TRUE) - } -} - - -PrintClusters <- function(trgts,k.ix,f.nm="output/clust.to.trgts", km.order){ - f.nm = paste(f.nm, ".tsv", sep="") - cat(sep="\t",file=f.nm,"row_number/target","cluster","\n") - trgts[which(trgts=="")] <- "no_target" - for(i in 1:length(trgts)){ - cat(sep="",file=f.nm,trgts[i],"\t","cluster_",k.ix[i],"\n",append=TRUE) - } -# if(i == 1){ -# cat(sep="",file=f.nm,"cluster_",i,"\n")#clusters[i],"\n") -# } else { -# cat(sep="",file=f.nm,">cluster_",i,"\n",append=TRUE)#clusters[i],"\n",append=TRUE) -# } - #cat(sep="\n",file=f.nm,v.split,append=TRUE) - } -GetTopRowsFromMatrix = function(mtrx, percentage = 10) -{ - if (param$debug) { - print("In GetTopRowsFromMatrix") - } - #Store the stats for the mtrx - stats = list() - stats$mean=apply(mtrx,1,mean) - stats$sd=apply(mtrx,1,sd) - stats$nonzero.mean=apply(mtrx, 1, function(x) mean(x[which(x != 0)])) - - - #Store the indexes for the mtrx - index = list() - index$mean = sort(stats$mean, decreasing=T, index.return=T)$ix - index$sd = sort(stats$sd, decreasing=T, index.return=T)$ix - index$nonzero.mean = sort(stats$nonzero.mean, decreasing=T, index.return=T)$ix - - #Calculate how many data points we want to take - cutoff = floor(length(mtrx[ ,1])*(percentage/100)) - - #Extract Indexes and return to caller - index$union = union(index$mean[1:cutoff], union(index$sd[1:cutoff], index$nonzero.mean[1:cutoff])) - - return(index) -} - -#GetUniq = function(targets.cluster) -#{ -# #problem with this function is that it agrigates the list handed to it. after this point you cant maintain order -# -# return(unique(unlist(lapply(targets.cluster, function(i) strsplit(i, "_"))))) -# #return(unlist(lapply(targets.cluster, function(i) strsplit(i, "_")))) -# -#} - -bindata = function(d,qunts=seq(.4,.9,.1)) -{ - tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) - for(j in 1:ncol(d)) - { - bins=quantile(d[,j],qunts) - for(i in 1:length(bins)) - { - tmp[which(d[,j]>bins[i]),j] = i - } - } - return(tmp) -} - -bindata.non.zero = function(d,qunts=seq(0,0.9,0.1)) -{ - tmp=matrix(0,nr=nrow(d),nc=ncol(d)) - for(j in 1:ncol(d)) - { - ix.non.zero=which(d[,j]!=0) - bins=quantile(d[ix.non.zero,j],qunts) - for(i in 1:length(bins)) - { - tmp[which(d[,j]>bins[i]),j] = i - } - } - - return(tmp) -} -bindata.matrix = function(d,qunts=seq(0,0.9,0.1)) -{ - tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) - #ix.non.zero=which(d!=0) - bins=quantile(d[],qunts) - for(i in 1:length(bins)) - { - tmp[which(d>bins[i])] = i - } - return(tmp) -} -bindata.non.zero.matrix = function(d,qunts=seq(0,0.9,0.1)) -{ - tmp=matrix(0,nr=nrow(d),nc=ncol(d),dimnames=dimnames(d)) - ix.non.zero=which(d!=0) - bins=quantile(d[ix.non.zero],qunts) - for(i in 1:length(bins)) - { - tmp[which(d>bins[i])] = i - } - return(tmp) -} -heatmap.3 <- function (x, Rowv = TRUE, Colv = if (symm) "Rowv" else TRUE, - distfun = dist, hclustfun = hclust, dendrogram = c("both", - "row", "column", "none"), symm = FALSE, scale = c("none", - "row", "column"), na.rm = TRUE, revC = identical(Colv, - "Rowv"), add.expr, breaks, symbreaks = min(x < 0, na.rm = TRUE) || - scale != "none", col = "heat.colors", colsep, rowsep, - sepcolor = "white", sepwidth = c(0.05, 0.05), cellnote, notecex = 1, - notecol = "cyan", na.color = par("bg"), trace = c("column", - "row", "both", "none"), tracecol = "cyan", hline = median(breaks), - vline = median(breaks), linecol = tracecol, margins = c(5, - 5), ColSideColors, RowSideColors, cexRow = 0.2 + 1/log10(nr), - cexCol = 0.2 + 1/log10(nc), labRow = NULL, labCol = NULL, - key = TRUE, keysize = 1.5, density.info = c("histogram", - "density", "none"), denscol = tracecol, symkey = min(x < - 0, na.rm = TRUE) || symbreaks, densadj = 0.25, main = NULL, - xlab = NULL, ylab = NULL, lmat = NULL, lhei = NULL, lwid = NULL, - ...) -{ - scale01 <- function(x, low = min(x), high = max(x)) { - x <- (x - low)/(high - low) - x - } - retval <- list() - scale <- if (symm && missing(scale)) - "none" - else match.arg(scale) - dendrogram <- match.arg(dendrogram) - trace <- match.arg(trace) - density.info <- match.arg(density.info) - if (length(col) == 1 && is.character(col)) - col <- get(col, mode = "function") - if (!missing(breaks) && (scale != "none")) - warning("Using scale=\"row\" or scale=\"column\" when breaks are", - "specified can produce unpredictable results.", "Please consider using only one or the other.") - if (is.null(Rowv) || is.na(Rowv)) - Rowv <- FALSE - if (is.null(Colv) || is.na(Colv)) - Colv <- FALSE - else if (Colv == "Rowv" && !isTRUE(Rowv)) - Colv <- FALSE - if (length(di <- dim(x)) != 2 || !is.numeric(x)) - stop("`x' must be a numeric matrix") - nr <- di[1] - nc <- di[2] - if (nr <= 1 || nc <= 1) - stop("`x' must have at least 2 rows and 2 columns") - if (!is.numeric(margins) || length(margins) != 2) - stop("`margins' must be a numeric vector of length 2") - if (missing(cellnote)) - cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x)) - if (!inherits(Rowv, "dendrogram")) { - if (((!isTRUE(Rowv)) || (is.null(Rowv))) && (dendrogram %in% - c("both", "row"))) { - if (is.logical(Colv) && (Colv)) - dendrogram <- "column" - else dedrogram <- "none" - warning("Discrepancy: Rowv is FALSE, while dendrogram is `", - dendrogram, "'. Omitting row dendogram.") - } - } - if (!inherits(Colv, "dendrogram")) { - if (((!isTRUE(Colv)) || (is.null(Colv))) && (dendrogram %in% - c("both", "column"))) { - if (is.logical(Rowv) && (Rowv)) - dendrogram <- "row" - else dendrogram <- "none" - warning("Discrepancy: Colv is FALSE, while dendrogram is `", - dendrogram, "'. Omitting column dendogram.") - } - } - if (inherits(Rowv, "dendrogram")) { - ddr <- Rowv - rowInd <- order.dendrogram(ddr) - } - else if (is.integer(Rowv)) { - hcr <- hclustfun(distfun(x)) - ddr <- as.dendrogram(hcr) - ddr <- reorder(ddr, Rowv) - rowInd <- order.dendrogram(ddr) - if (nr != length(rowInd)) - stop("row dendrogram ordering gave index of wrong length") - } - else if (isTRUE(Rowv)) { - Rowv <- rowMeans(x, na.rm = na.rm) - hcr <- hclustfun(distfun(x)) - ddr <- as.dendrogram(hcr) - ddr <- reorder(ddr, Rowv) - rowInd <- order.dendrogram(ddr) - if (nr != length(rowInd)) - stop("row dendrogram ordering gave index of wrong length") - } - else { - rowInd <- nr:1 - } - if (inherits(Colv, "dendrogram")) { - ddc <- Colv - colInd <- order.dendrogram(ddc) - } - else if (identical(Colv, "Rowv")) { - if (nr != nc) - stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") - if (exists("ddr")) { - ddc <- ddr - colInd <- order.dendrogram(ddc) - } - else colInd <- rowInd - } - else if (is.integer(Colv)) { - hcc <- hclustfun(distfun(if (symm) - x - else t(x))) - ddc <- as.dendrogram(hcc) - ddc <- reorder(ddc, Colv) - colInd <- order.dendrogram(ddc) - if (nc != length(colInd)) - stop("column dendrogram ordering gave index of wrong length") - } - else if (isTRUE(Colv)) { - Colv <- colMeans(x, na.rm = na.rm) - hcc <- hclustfun(distfun(if (symm) - x - else t(x))) - ddc <- as.dendrogram(hcc) - ddc <- reorder(ddc, Colv) - colInd <- order.dendrogram(ddc) - if (nc != length(colInd)) - stop("column dendrogram ordering gave index of wrong length") - } - else { - colInd <- 1:nc - } - retval$rowInd <- rowInd - retval$colInd <- colInd - retval$call <- match.call() - x <- x[rowInd, colInd] - x.unscaled <- x - cellnote <- cellnote[rowInd, colInd] - if (is.null(labRow)) - labRow <- if (is.null(rownames(x))) - (1:nr)[rowInd] - else rownames(x) - else labRow <- labRow[rowInd] - if (is.null(labCol)) - labCol <- if (is.null(colnames(x))) - (1:nc)[colInd] - else colnames(x) - else labCol <- labCol[colInd] - if (scale == "row") { - retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm) - x <- sweep(x, 1, rm) - retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm) - x <- sweep(x, 1, sx, "/") - } - else if (scale == "column") { - retval$colMeans <- rm <- colMeans(x, na.rm = na.rm) - x <- sweep(x, 2, rm) - retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm) - x <- sweep(x, 2, sx, "/") - } - if (missing(breaks) || is.null(breaks) || length(breaks) < - 1) { - if (missing(col) || is.function(col)) - breaks <- 16 - else breaks <- length(col) + 1 - } - if (length(breaks) == 1) { - if (!symbreaks) - breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm), - length = breaks) - else { - extreme <- max(abs(x), na.rm = TRUE) - breaks <- seq(-extreme, extreme, length = breaks) - } - } - nbr <- length(breaks) - ncol <- length(breaks) - 1 - if (class(col) == "function") - col <- col(ncol) - min.breaks <- min(breaks) - max.breaks <- max(breaks) - x[x < min.breaks] <- min.breaks - x[x > max.breaks] <- max.breaks - # if (missing(lhei) || is.null(lhei)) - # lhei <- c(keysize, 4) - # if (missing(lwid) || is.null(lwid)) - # lwid <- c(keysize, 4) - # if (missing(lmat) || is.null(lmat)) { - # lmat <- rbind(4:3, 2:1) - # if (!missing(ColSideColors)) { - # if (!is.character(ColSideColors) || length(ColSideColors) != - # nc) - # stop("'ColSideColors' must be a character vector of length ncol(x)") - # lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + - # 1) - # lhei <- c(lhei[1], 0.2, lhei[2]) - # } - # if (!missing(RowSideColors)) { - # if (!is.character(RowSideColors) || length(RowSideColors) != - # nr) - # stop("'RowSideColors' must be a character vector of length nrow(x)") - # lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - - # 1), 1), lmat[, 2] + 1) - # lwid <- c(lwid[1], 0.2, lwid[2]) - # } - # lmat[is.na(lmat)] <- 0 - # } - # if (length(lhei) != nrow(lmat)) - # stop("lhei must have length = nrow(lmat) = ", nrow(lmat)) - # if (length(lwid) != ncol(lmat)) - # stop("lwid must have length = ncol(lmat) =", ncol(lmat)) - # op <- par(no.readonly = TRUE) - # on.exit(par(op)) - # layout(lmat, widths = lwid, heights = lhei, respect = FALSE) - if (!missing(RowSideColors)) { - par(mar = c(margins[1], 0, 0, 0.5)) - image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE) - } - if (!missing(ColSideColors)) { - par(mar = c(0.5, 0, 0, margins[2])) - image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE) - } - par(mar = c(margins[1], 0, 0, margins[2])) - x <- t(x) - cellnote <- t(cellnote) - if (revC) { - iy <- nr:1 - if (exists("ddr")) - ddr <- rev(ddr) - x <- x[, iy] - cellnote <- cellnote[, iy] - } - else iy <- 1:nr - image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + - c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col, - breaks = breaks, ...) - retval$carpet <- x - if (exists("ddr")) - retval$rowDendrogram <- ddr - if (exists("ddc")) - retval$colDendrogram <- ddc - retval$breaks <- breaks - retval$col <- col - if (!invalid(na.color) & any(is.na(x))) { - mmat <- ifelse(is.na(x), 1, NA) - image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "", - col = na.color, add = TRUE) - } - axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, - cex.axis = cexCol) - if (!is.null(xlab)) - mtext(xlab, side = 1, line = margins[1] - 1.25) - axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, - cex.axis = cexRow) - if (!is.null(ylab)) - mtext(ylab, side = 4, line = margins[2] - 1.25) - if (!missing(add.expr)) - eval(substitute(add.expr)) - if (!missing(colsep)) - for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0, - length(csep)), xright = csep + 0.5 + sepwidth[1], - ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1, - col = sepcolor, border = sepcolor) - if (!missing(rowsep)) - for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) + - 1 - rsep) - 0.5, xright = nrow(x) + 1, ytop = (ncol(x) + - 1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1, - col = sepcolor, border = sepcolor) - min.scale <- min(breaks) - max.scale <- max(breaks) - x.scaled <- scale01(t(x), min.scale, max.scale) - if (trace %in% c("both", "column")) { - retval$vline <- vline - vline.vals <- scale01(vline, min.scale, max.scale) - for (i in colInd) { - if (!is.null(vline)) { - abline(v = i - 0.5 + vline.vals, col = linecol, - lty = 2) - } - xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5 - xv <- c(xv[1], xv) - yv <- 1:length(xv) - 0.5 - lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") - } - } - if (trace %in% c("both", "row")) { - retval$hline <- hline - hline.vals <- scale01(hline, min.scale, max.scale) - for (i in rowInd) { - if (!is.null(hline)) { - abline(h = i + hline, col = linecol, lty = 2) - } - yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5 - yv <- rev(c(yv[1], yv)) - xv <- length(yv):1 - 0.5 - lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") - } - } - if (!missing(cellnote)) - text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote), - col = notecol, cex = notecex) - par(mar = c(margins[1], 0, 0, 0)) - # if (dendrogram %in% c("both", "row")) { - # plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") - # } - # else plot.new() - par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2])) - # if (dendrogram %in% c("both", "column")) { - # plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") - # } - # else plot.new() - if (!is.null(main)) - title(main, cex.main = 1.5 * op[["cex.main"]]) - # if (key) { - # par(mar = c(5, 4, 2, 1), cex = 0.75) - # tmpbreaks <- breaks - # if (symkey) { - # max.raw <- max(abs(c(x, breaks)), na.rm = TRUE) - # min.raw <- -max.raw - # tmpbreaks[1] <- -max(abs(x), na.rm = TRUE) - # tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE) - # } - # else { - # min.raw <- min(x, na.rm = TRUE) - # max.raw <- max(x, na.rm = TRUE) - # } - # z <- seq(min.raw, max.raw, length = length(col)) - # image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks, - # xaxt = "n", yaxt = "n") - # par(usr = c(0, 1, 0, 1)) - # lv <- pretty(breaks) - # xv <- scale01(as.numeric(lv), min.raw, max.raw) - # axis(1, at = xv, labels = lv) - # if (scale == "row") - # mtext(side = 1, "Row Z-Score", line = 2) - # else if (scale == "column") - # mtext(side = 1, "Column Z-Score", line = 2) - # else mtext(side = 1, "Value", line = 2) - # if (density.info == "density") { - # dens <- density(x, adjust = densadj, na.rm = TRUE) - # omit <- dens$x < min(breaks) | dens$x > max(breaks) - # dens$x <- dens$x[-omit] - # dens$y <- dens$y[-omit] - # dens$x <- scale01(dens$x, min.raw, max.raw) - # lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol, - # lwd = 1) - # axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y)) - # title("Color Key\nand Density Plot") - # par(cex = 0.5) - # mtext(side = 2, "Density", line = 2) - # } - # else if (density.info == "histogram") { - # h <- hist(x, plot = FALSE, breaks = breaks) - # hx <- scale01(breaks, min.raw, max.raw) - # hy <- c(h$counts, h$counts[length(h$counts)]) - # lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s", - # col = denscol) - # axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy)) - # title("Color Key\nand Histogram") - # par(cex = 0.5) - # mtext(side = 2, "Count", line = 2) - # } - # else title("Color Key") - # } - # else plot.new() - retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)], - high = retval$breaks[-1], color = retval$col) - invisible(retval) -} - -#rm(list=ls()) -param <- list() -param$debug <- FALSE #T - -print("Finished loading functions") - - -if(param$debug) { - param <- LoadDebugParams(param) -} else { - param <- LoadReqiredParams(param) - param <- LoadOptionalParams(param) -} -print (param) -Rprof() #Remove me -# Load Chip data, alread in pval format -# Column.order need to be a vecor of strings, that correspond to the order (and -# inclustion) of chip experiments. -chip <- GetChipData(param$annotated.macs.file, column.order = param$chip.order) - -#Get the top percentages on different criteria -#ix.top <- GetTopRowsFromMatrix(chip$peaks, percentage = param$filter.percentage) -#chip$peaks <- chip$peaks[ix.top$union, ] -#chip$targets <- chip$targets[ix.top$union] -#Bin data: -chip$peaks <- bindata.non.zero.matrix(chip$peaks, qunts = seq(0, 0.9, length=(param$number.bins+1))) - -file.names <- unlist(strsplit(param$rna.files, split="::")) -print(file.names) -file.lables <- unlist(strsplit(param$rna.names, split="::")) -print(file.lables) - -library("RColorBrewer") -color.2 = rev(colorRampPalette(c("darkblue", "steelblue", "lightgrey"))(param$number.bins)) -if(!is.na(file.names) && file.names != "none") { - #rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg") - #do differential expression if the user specifies such: - if(param$rna.normalization != "no") { - if(param$rna.normalization != "mean"){ - norm.file.names <- unlist(strsplit(param$rna.normalization.file, split="::")) - all.file.names <- c(file.names, norm.file.names) - all.file.lables <- c(file.lables, norm.file.names) - print (all.file.names) - print (all.file.lables) - rna.scores <- GetRNAData(all.file.names, file.lables = all.file.lables, fpkm = "avg") - rna.scores <- NormalizeRNA(scores = rna.scores) - rna.scores.sign <- rna.scores/abs(rna.scores) - rna.scores.sign[which(is.nan(rna.scores.sign))] <- 0 - rna.scores <- bindata.non.zero.matrix(abs(rna.scores), qunts = seq(0, 0.9, length=(param$number.bins+1))) - rna.scores <- rna.scores.sign * rna.scores - color.2 = colorRampPalette(c("darkblue", "steelblue", "lightgrey", "pink", "darkred"))(param$number.bins) - } else { - print("we are normalizing by average") - print("file lables are:") - print(file.lables) - rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg") - rna.scores <- t(apply(rna.scores,1, function(x) { if(mean(x)!=0){ return(x/mean(x)) } else { return(x) } })) - rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1))) - } - } else { - rna.scores <- GetRNAData(file.names, file.lables = file.lables, fpkm = "avg") - rna.scores <- bindata.non.zero.matrix(rna.scores, qunts = seq(0, 0.9, length=(param$number.bins+1))) - } - - # chip$peaks <- chip$peaks[order(chip$targets), ] - # rna.scores <- rna.scores[order(rownames(rna.scores)), ] - # chip$targets <- chip$targets[order(chip$targets)] - - rna.scores.mapped <- MapExpressiontoChip(chip.targets = chip$targets, expression=rna.scores) - all.data <- cbind(chip$peaks, rna.scores.mapped) - if(param$include.targetless!="yes"){ - all.data <- all.data[-which(chip$targets==""),] - chip$peaks <- chip$peaks[-which(chip$targets==""),] - rna.scores.mapped <- rna.scores.mapped[-which(chip$targets==""),] - chip$targets <- chip$targets[-which(chip$targets=="")] - } -} else { - all.data <- chip$peaks -} - -print("Clustering") -set.seed(1234) -km <- kmeans(all.data, param$clustering.number.of.clusters, iter.max = 50) -print("Ordering") -km.order <- GenerateKMOrder(all.data, km) - -##AM edits## -km.new.order <- numeric(length(km$cluster)) -for(i in 1:length(km.order)){ - cur.clst <- km.order[i] - ix <- which(km$cluster==cur.clst) - km.new.order[ix] <- i -} -km.order <- 1:length(km.order) -km$cluster <- km.new.order -## Done AM ## -print("Creating image") -#bmp(param$heatmap.document.name, 640, 480)#1280, 960) -bmp(paste(param$heatmap.document.name,".bmp", sep = ""), 5120, 3840)#2560, 1920) -#pdf(paste(param$heatmap.document.name,".pdf", sep = "")) -color.1 <- c("#000000", colorRampPalette(c("blue", "yellow","orange","red"))(param$number.bins)) -if(!is.na(file.names) && file.names != "none") { - PrintClusters(trgts=chip$targets, - k.ix=km$cluster, - f.nm = param$cluster.groups.document.name, - km.order = NA) - - #if(param$rna.normalization != "no") { - # data.split <- cbind(chip$peaks, PrepareRNAforHeatmap(rna.scores.mapped)) - #} else { - # data.split <- cbind(chip$peaks, rna.scores.mapped) - #} - expression.width.multiplier <- 2 - if(ncol(rna.scores.mapped)==1) { - rna.scores.mapped <- cbind(rna.scores.mapped, rna.scores.mapped) - expression.width.multiplier <- 1 - } - - layout(matrix(c(1,2,2,5, - 1,3,4,6, - 1,3,4,7, - 1,3,4,8, - 1,3,4,9), - 5,4, - byrow=T), - widths=c(1,2*ncol(chip$peaks),expression.width.multiplier*ncol(rna.scores.mapped),1), - heights=c(1,10,2,10,2)) - #1 - plot.new() - #2 - plot.new() - #3 - print("Creating peak image") - CreateIndividualHeatMap(chip$peaks, - km, - km.order, color.ramp=color.1) - #4 - print("Creating rna image") - CreateIndividualHeatMap(rna.scores.mapped, - km, - km.order, color.ramp=color.2) - #5 - plot.new() - #6 - image(t(matrix(1:param$number.bins,nc=1)), col=color.1) - #7 - plot.new() - #8 - image(t(matrix(1:param$number.bins,nc=1)), col=color.2) - #9 - plot.new() -} else { - PrintClusters(trgts=1:nrow(chip$peaks), - k.ix=km$cluster, - f.nm = param$cluster.groups.document.name, - km.order = NA) - - - layout(matrix(c(1,2,2, - 1,3,4, - 1,3,5),3,3,byrow=T), - widths=c(1,3*ncol(chip$peaks),1), - heights=c(1,8,1)) - #1 - plot.new() - #2 - plot.new() - #3 - CreateIndividualHeatMap(chip$peaks, - km, - km.order, color.ramp=color.1) - #4 - image(t(matrix(1:param$number.bins,nc=1)), col=color.1) - #5 - plot.new() - -} - -dev.off() -Rprof(NULL) -profile <- summaryRprof() -print(str(profile)) -print(profile) -#save.image("test/testData.RData") diff -r a665e0ad63db -r a0306edbf2f8 mtls_analyze/mtls_analyze/map_ChIP_peaks_to_genes.R --- a/mtls_analyze/mtls_analyze/map_ChIP_peaks_to_genes.R Mon Jul 23 12:58:55 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,304 +0,0 @@ -## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. -## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ -##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' -## Nov 2011 th17 -## Bonneau lab - "Aviv Madar" , -## NYU - Center for Genomics and Systems Biology -## .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. -## /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ /|/ \|\ / / \ \ / / \ \ -##`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' `-`-' ' ' - -rm(list=ls()) -debug=F -script.version=0.1 -print.error <- function(){ - cat(" -NAME - map_ChIP_peaks_to_genes.R - -DESCRIPTION - This script takes a MACS tab delimited file as input and - produces two files: - - 1) genes.xls - A gene centered table that stores the peaks that are - proximal (within x[kb] of TSS) and distal (within the - gene's region + y[kb] upstream or downstream of TSS and - TES respectively). - - 2) peaks.xls - A peak centered table that equals the MACS input table plus - colums for proximal and distal targets of each peak and - their distance from TSS (if exist). For gene.xls script also - gives a poisson model based -log10(pval) score for proximal - and genewide enrichment of peaks for each gene with at least - one prox/dist peak associated with it. - -ARGUMENTS - input_file=path - Path to MACS file. - - refseq_table=path - Path to refseq table (gives TSS/TES locations for all genes). - - path_output=path - Path to save genes.xls and peaks.xls output files. - - tss.dist=N - The absolute distance from TSS where we connect a peak to - gene (for proximal peaks). - - gene.wide.dist=N - The absolute distance from TSS or TES where we connect a peak - to gene (for peaks hitting anywhere in gene). - - effective.genome.size=N - The effective mappable genome size (for mm9 the value is - 1.87e9. For hg19 the value is 2.7e9 (from MACS manual)). - - macs.skip.lines=N - Number of lines to skip in input_file. - -EXAMPLE - Rscript map_ChIP_peaks_to_genes_v.1.R \\ - input_file=SL971_SL970_peaks.xls \\ - refseq_table=UCSC_mm9_refseq_genes_Sep_15_2011.txt \\ - path_output=./ \\ - tss.dist=5000 \\ - gene.wide.dist=10000 \\ - effective.genome.size=1.87e9 \\ - macs.skip.lines=23 - -CITATION - Please cite us if you used this script: The transcription - factor network regulating Th17 lineage specification and - function. Maria Ciofani, Aviv Madar, Carolina Galan, Kieran - Mace, Ashish Agarwal, Kim Newberry, Richard M. Myers, Richard - Bonneau and Dan R. Littman et. al. (in preperation). - -") -} - -# retrieve args -if(debug==T){ - cmd.args <- c( - # "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL971_SL970_peaks.xls", - "input_file=input/th17/used_for_paper/rawData/MACS_Sep_15_2011/SL3594_SL3592_peaks.xls", - "refseq_table=input/th17/used_for_paper/rawData/UCSC_mm9_refseq_genes_Sep_15_2011.txt", - "path_output=/Users/aviv/Desktop/test_script/", - "tss.dist=5000", - "tes.dist=10000", - "effective.genome.size=1.87e9", - "gene.wide.dist=10000", - "macs.skip.lines=23" - ) -} else { - cmd.args <- commandArgs(); -} - -if(length(grep("--version",cmd.args))){ - cat("version",script.version,"\n") - q() -} - -arg.names.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[1]) -args.val.cmd.line <- sapply(strsplit(cmd.args,"="),function(i) i[2]) - -arg.nms <- c("input_file","refseq_table","path_output","tss.dist", - "gene.wide.dist","effective.genome.size","macs.skip.lines") -arg.val <- character(length=length(arg.nms)) -for(i in 1:length(arg.nms)){ - ix <- which(arg.names.cmd.line==arg.nms[i]) - if(length(ix)==1){ - arg.val[i] <- args.val.cmd.line[ix] - } else { - stop("######could not find ",arg.nms[i]," arg######\n\n",print.error()) - - } -} -if(debug==T){ - print(paste(arg.nms,"=",arg.val)) -} -# the files here adhere to tab delim format -path.input.macs <- arg.val[1] -path.input.refseq <- arg.val[2] -path.output <- arg.val[3] -tss.dist <- as.numeric(arg.val[4]) -gene.wide.dist <- as.numeric(arg.val[5]) -effective.genome.size <- as.numeric(arg.val[6]) -macs.skip.lines=as.numeric(arg.val[7]) - -if(debug==T){ -# if(1){ - load("/Users/aviv/Desktop/r.u.RData") - gn.nms <- rownames(r.u) -} else { - ################## - ### step 1 handle refseq table file (read, make unique) - cat("reading refseq table",path.input.refseq,"\n") - refseq <- read.delim(file=path.input.refseq) - # refseq can have many transcripts for each gene - # here i make it have only one transcript for each gene (the longest one) - cat("for transcripts matching more than one gene keeping only the longest transcript\n") - refseq$name2 <- as.character(refseq$name2) - refseq$chrom <- as.character(refseq$chrom) - refseq$strand <- as.character(refseq$strand) - gn.nms <- unique(refseq$name2) - #x <- sort(table(refseq$name2),decreasing=T) - #gn.nms.non.unique <- names(x)[which(x>1)] - #gn.nms.unique <- names(x)[which(x==1)] - # create refseq unique r.u - n <- length(gn.nms) - r.u <- data.frame(cbind(chrom=rep("NA",n),strand=rep("NA",n),txStart=rep(0,n),txEnd=rep(0,n)),stringsAsFactors=FALSE,row.names=gn.nms) - for(i in 1:n){ - ix <- which(refseq$name2==gn.nms[i]) - if(length(ix)==0) { - error("could not find gene", ng.nms[i], "in refseq table. Bailing out...\n") - } else if (length(ix)>1){ - l <- apply(refseq[ix,c("txStart","txEnd")],1,function(i) abs(i[1]-i[2]) ) - l.max <- max(l) - ix <- ix[which(l==l.max)[1]] - } - r.u[gn.nms[i],c("chrom","strand","txStart","txEnd")] <- refseq[ix,c("chrom","strand","txStart","txEnd")] - } - r.u[,"txStart"] <- as.numeric(r.u[,"txStart"]) - r.u[,"txEnd"] <- as.numeric(r.u[,"txEnd"]) - - # switch TSS and TES if we have chr "-" - cat("correcting TSS, TES assiments in refseq table based on strand\n") - ix <- which(r.u$strand=="-") - tmp.tes <- r.u$txStart[ix] - tmp.tss <- r.u$txEnd[ix] - r.u[ix,c("txStart","txEnd")] <- cbind(tmp.tss,tmp.tes) - # ############ -} - - -#### step 2 read macs file -cat("reading Macs file",path.input.macs,"\n") - -# read macs file -M <- read.delim(file=path.input.macs,skip=macs.skip.lines) -trgt.prox <- NA -trgt.dist <- NA -dtss.prox <- NA -dtss.dist <- NA -M.annot <- cbind(M,trgt.prox,trgt.dist,dtss.prox,dtss.dist) -M.annot[is.na(M.annot)] <- "" -# get the number of genes -m <- length(gn.nms) -if(debug==T){ - # m <- 100 -} -f.nm.peak <- paste(sep="",path.output,"peaks.xls") -f.nm.gene <- paste(sep="",path.output,"genes.xls") -# for each genes in the expt (j goes over genes) -cat(file=f.nm.gene,sep="\t","Gene_ID","prox_n_peak","genewide_n_peak","gn_length","strand","prox_mean_pval","genewide_mean_pval", - "prox_max_pval","genewide_max_pval", - "prox_pois_model_pval","genewide_pois_model_pval","(peak_id,summit,d_TSS,d_TES,class,pval,fold_enrich,FDR),(..)\n") -peaks.summit <- (M[,"start"]+M[,"summit"]) -cat(sep="","mapping MACS peaks to genes\n") -for (j in 1:m){ - if(j%%20==0){cat(".")} - tss <- r.u[j,"txStart"] - tes <- r.u[j,"txEnd"] - gn.lngth <- abs(tss-tes) - strand <- r.u[j,"strand"] - chr <- r.u[j,"chrom"] - if(strand=="+"){ - d.tss.all <- peaks.summit-tss - d.tes.all <- peaks.summit-tes - } else { - d.tss.all <- tss-peaks.summit - d.tes.all <- tes-peaks.summit - } - ix.distal <- sort(union( c(which( abs(d.tss.all) < gene.wide.dist ),which( abs(d.tes.all) < gene.wide.dist )), - which( sign(d.tss.all) != sign(d.tes.all) )),decreasing=TRUE) - ix.prox <- sort(which( abs(d.tss.all) < tss.dist ),decreasing=TRUE) - ix.prox <- ix.prox[which(M[ix.prox,"chr"]==chr)] - ix.distal <- ix.distal[which(M[ix.distal,"chr"]==chr)] - n.peaks.prox <- length(ix.prox) - n.peaks.distal <- length(ix.distal) - # if there is at least one peak hitting gene j - if(n.peaks.distal > 0){ - # for each peak (l goes over peaks) - peaks.line <- paste(sep="","(") - for (k in 1:n.peaks.distal){ - if(k>1){ - peaks.line <- paste(sep="",peaks.line,";(") - } - d.tss <- d.tss.all[ ix.distal[k] ] - d.tes <- d.tes.all[ ix.distal[k] ] - if(sign(d.tss)!=sign(d.tes)){ - class="intra" - } else if(d.tss<=0){ - class="upstream" - } else if(d.tss>0){ - class="downstream" - } - peaks.line <- paste(sep="",peaks.line,paste(sep=",", ix.distal[k], peaks.summit[ ix.distal[k] ], - d.tss, d.tes, class, M[ ix.distal[k] ,7], M[ ix.distal[k] ,8], M[ ix.distal[k] ,9] ),")") - # handle M.annot - dtss <- d.tss.all[ ix.distal[k] ] - prev.trgt <- M.annot[ix.distal[k],"trgt.dist"] - if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss - if(abs(dtss) 0){ - mean.pval.prox <- mean(M[ ix.prox ,7]) - max.pval.prox <- max(M[ ix.prox ,7]) - expected.num.peaks.genome.wide.prox <- length(which(M[,7]>=mean.pval.prox)) - # lambda = num peaks / genome size * searched region - lambda.prox <- expected.num.peaks.genome.wide.prox/effective.genome.size*(tss.dist*2) - pval.pois.prox <- -log10(ppois(n.peaks.prox,lambda.prox,lower.tail=FALSE)) - # handle M.annot - for(k in 1:n.peaks.prox){ - dtss <- d.tss.all[ ix.prox[k] ] - prev.trgt <- M.annot[ix.prox[k],"trgt.prox"] - if(prev.trgt!=""){ # if peak already is with trgt choose one with min dtss - if(abs(dtss)=mean.pval.distal)) - # lambda = num peaks / genome size * searched region - lambda.distal <- expected.num.peaks.genome.wide.distal/effective.genome.size*(gn.lngth+gene.wide.dist*2) - pval.pois.distal <- -log10(ppois(n.peaks.distal,lambda.distal,lower.tail=FALSE)) - # pring peaks for gene j - core.line <- paste(sep="\t",gn.nms[ j ],n.peaks.prox,n.peaks.distal,gn.lngth,strand, - mean.pval.prox,mean.pval.distal,max.pval.prox,max.pval.distal, - pval.pois.prox,pval.pois.distal) - cat(file=f.nm.gene,append=TRUE,sep="",core.line,"\t",peaks.line,"\n") - } -} -cat("Done!\n") - -cat(file=f.nm.peak,colnames(M.annot),"\n",sep="\t") -write.table(file=f.nm.peak,append=T,col.names=F,row.names=F,M.annot,sep="\t") - - - - - - - diff -r a665e0ad63db -r a0306edbf2f8 mtls_analyze/mtls_analyze/mtls_analyze.xml --- a/mtls_analyze/mtls_analyze/mtls_analyze.xml Mon Jul 23 12:58:55 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,333 +0,0 @@ - - - Merge multiple ChIP-seq experiments, alligning their peaks to MTLs (Multi - Transcription Factor Loci(us)) and optionally incorperate expression - - /bin/bash $shscript - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#!/bin/bash - -#import os -#set $path = $os.path.abspath($__app__.config.tool_path) - - -## Set symbols so that they are not incorrectly interpreted: -#set $dollar = chr(36) -#set $gt = chr(62) -#set $lt = chr(60) -#set $ad = chr(38) -#set $bs = chr(92) - -echo $map_rna ${ad}${gt}${gt} $log -echo "This is the Bash log file: " ${ad}${gt}${gt} $log -############################################################################### -## Convert the gtf file to a file that aviv's script can hadel -#if str($map_rna)=='yes' - echo "Converting gtf file" ${ad}${gt}${gt} $log - Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file ${ad}${gt}${gt} $log - echo "done converting gtf file" ${ad}${gt}${gt} $log -#end if -############################################################################### -## Get ChIP data in correctly formated strings and annotate if nessisary. -#set $sep = '::' -#for $i, $chip in enumerate( $chip_tracks ) - #if $i==0 - echo "Chip Files:" ${ad}${gt}${gt} $log - echo "The first file label is: ${chip.name}" ${ad}${gt}${gt} $log - echo "The first file path is: ${chip.file}" ${ad}${gt}${gt} $log - chip_labels=${chip.name} - chip_paths=${chip.file} - #else - echo "The next file label is: ${chip.name}" ${ad}${gt}${gt} $log - echo "The next file path is: ${chip.file}" ${ad}${gt}${gt} $log - chip_labels=${dollar}chip_labels${sep}${chip.name} - chip_paths=${dollar}chip_paths${sep}${chip.file} - #end if -#end for - -echo chip paths are - ${dollar}chip_paths ${ad}${gt}${gt} $log -echo chip labels are - ${dollar}chip_labels ${ad}${gt}${gt} $log - -############################################################################### -## Cluster peaks - -Rscript $path/visualization/cluster_peaks.R \ ---input_files ${dollar}chip_paths \ ---input_type $chipInputFormat \ ---path_output ./ \ ---expt_names ${dollar}chip_labels \ ---dist_summits $summitDistance \ ---mtl_type $mtlType ${ad}${gt}${gt} $log - -############################################################################### -## Annotate mtls.xls if nessisary -#if str($map_rna)=="yes" - echo "annotating mtls.xls..." ${ad}${gt}${gt} $log - Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log -#end if -############################################################################### -## If rna is specified, then get RNA data in correctly formated strings: -#if str($map_rna)=='yes' - #set $sep = '::' - #for $i, $rna in enumerate( $rna_tracks ) - #if $i==0 - echo "The first file label is: ${rna.name}" ${ad}${gt}${gt} $log - echo "The first file path is: ${rna.file}" ${ad}${gt}${gt} $log - rna_labels=${rna.name} - rna_paths=${rna.file} - rna_norm_paths=${rna.norm} - #else - echo "The next file label is: ${rna.name}" ${ad}${gt}${gt} $log - echo "The next file path is: ${rna.file}" ${ad}${gt}${gt} $log - rna_labels=${dollar}rna_labels${sep}${rna.name} - rna_paths=${dollar}rna_paths${sep}${rna.file} - rna_norm_paths=${dollar}rna_norm_paths${sep}${rna.norm} - #end if - #end for - echo rna paths are - ${dollar}rna_paths ${ad}${gt}${gt} $log - echo rna labels are - ${dollar}rna_labels ${ad}${gt}${gt} $log - echo rna norm files are - ${dollar}rna_norm_paths ${ad}${gt}${gt} $log -#end if -############################################################################### - -#if str($normalize_rna)=='no' - echo "Normalization by file is set to no" ${ad}${gt}${gt} $log - rna_norm_paths=no -#end if - -#if str($use_mean)=='yes' - echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log - rna_norm_paths=mean -#end if - -#if str($map_rna)=='no' - mtls_file=mtls.xls - rna_paths=none - rna_labels=none -#else - mtls_file=annotated_mtls.xls -#end if - -echo " -Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \ ---cluster_file ./cluster \ ---chip_experiment_order ${dollar}chip_labels \ ---heatmap_file ./heatmap \ ---heatmap_type bmp \ ---n_clusters $numClusters \ ---filter_percentage 100 \ ---expression_file ${dollar}rna_paths \ ---expression_name ${dollar}rna_labels \ ---normalization_file ${dollar}rna_norm_paths \ -${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log - -Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \ ---cluster_file ./cluster \ ---chip_experiment_order ${dollar}chip_labels \ ---heatmap_file ./heatmap \ ---heatmap_type bmp \ ---n_clusters $numClusters \ ---filter_percentage 100 \ ---number_bins $numberBins \ ---include_targetless $includeTargetless \ ---expression_file ${dollar}rna_paths \ ---expression_name ${dollar}rna_labels \ ---normalization_file ${dollar}rna_norm_paths \ -${ad}${gt}${gt} $log - -ls ${ad}${gt}${gt} $log - - - - -################################################################## -#if str($map_rna)=='yes' - mv ./annotated_mtls.xls $mtls -#else - mv ./mtls.xls $mtls -#end if -mv ./heatmap.* $heatmap_image -mv ./cluster.tsv $cluster_assignments - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool will merge peaks form multiple chip-seq experiments, creating MTLs for -each overlapping region. It will then cluster each MTL based on the score of -each peak within each MTL (using K-means clustering, with k set by user). A -heatmap is then generated from the resulting cluster along with the MTLs -generated. This module in writin in R and is will be made available on github -and bioconductor. This work was done by Kieran Mace and Aviv Madar. - -**NEED IMPROVEMENT** - ------ - -**Parameters** - -- **Input files** contains either macs or BED files to be merged. This list of files must be two or larger. -- **Experiment names** contains the name given to each track. -- **Summit distance** is the cuttoff distance (in BP) to be included in an MTL. This option is not used with the summit option below -- **Input Format** Either bed of MACS file format, all files must be of one type. Defaults to MACS -- **MTL Type** Either interval or summit (defaults to summit). -- **Number clusters** the value of k for kmeans clustering. -- **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores). ------ - -**Output** - -- **XLS file** is the tab-delimited file containing the MTL data. -- **PNG file** is the heatmap image generated after clustering the MTL data. - ------ - -**script parameter list of Chip-Cluster** - -Options: -DESCRIPTIION: - cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimeted file (named mtls.xls) where - each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files) - fall within a certain distance between summits from eachother. - -INPUT: - 1.path_input=path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk] - 2.path_output=path to save generated MTL cluster file (where to save mtls.xls) - 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] - 4.dist.summits=maximum distance between summits belonging to the same MTL (defaults to 100) - 5.input_type=the type of input file used (MACS or .bed; defaults to MACS) - 6.mtl_type=interval or summit (defaults to summit) - -EXAMPLE RUN: - cluster_peaks.R - --input_macs_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls - --input_type MACS - --path_output results/ - --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 - --dist_summits 100 - --mtl_type summit - - DESCRIPTIION: - heatmap.R takes a ... - -INPUT: - 1.--mtls_file path to mtls file. - - 2.--cluster_file the destination path for the cluster file. - - 3.--heatmap_file the destination path for heatmap image (no extension). - - 4.--heatmap_type choice of image type, currently support png and pdf. - - 5.--n_clusters number of clusters in the heatmap - - 6.--filter_percentage percentage of mtls that will be analysed. for eg. if - we make filter_percentage 30, we will take the union of the top mtls in - mean, non-zero mean and variance. - - -EXAMPLE RUN: - Rscript heatmap.R - --mtls_file mtls.xls - --cluster_file output/cluster - --heatmap_file output/heatmap - --heatmap_type png - --n_clusters 13 - --filter_percentage 60 - -Please cite us if you used this script: - The transcription factor network regulating Th17 lineage specification and function. - Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, - Richard Bonneau and Dan R. Littman et. al. (in preperation) - - - -