# HG changeset patch # User ynewton # Date 1355415597 18000 # Node ID 8389b0c211aead9240d6a193a14bf5f376f42126 # Parent a9d8d4b531f702f17a63e7f96824f0b414e7320d Uploaded diff -r a9d8d4b531f7 -r 8389b0c211ae normalize.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/normalize.r Thu Dec 13 11:19:57 2012 -0500 @@ -0,0 +1,245 @@ +#!/usr/bin/Rscript + +#usage, options and doc goes here +argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation. +Usage: + normalize.r input.tab norm_type norm_by > output.tab +Example: + Rscript normalize.r test_matrix.tab median_shift column > output.tab + Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab +Options: + input matrix (annotated by row and column names) + normalization type; available options: + median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals + mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals + t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately + exponential_fit - (only by column) ranks data and transforms exponential CDF + normal_fit - (only by column) ranks data and transforms normal CDF + weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 + weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 + weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5 + weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5 + normalization by: + row + column + normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples + output file is specified through redirect character >") + +read_matrix <- function(in_file){ + header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]] + cl.cols<- 1:length(header) > 1 + data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE) + data_matrix <- as.matrix(data_matrix.df[,cl.cols]) + rownames(data_matrix) <- data_matrix.df[,1] + return(data_matrix) + + #read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")) #separate on white characters + #read_mtrx[,1] + + #return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1))) #separate on white characters + #mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA") + #print(mtrx[1,]) +} + +write_matrix <- function(data_matrix){ + header <- append(c("Genes"), colnames(data_matrix)) + write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) +} + +read_normals <- function(in_file){ + #return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) + return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))) +} + +normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ + if(norm_type == 'MEDIAN_SHIFT'){ + return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list)) + } + else if(norm_type == 'MEAN_SHIFT'){ + return(shift(data_matrix, 'MEAN', normals_list, tumors_list)) + } + else if(norm_type == 'T_STATISTIC'){ + return(compute_z_score(data_matrix, normals_list, tumors_list)) + } + else if(norm_type == 'EXPONENTIAL_FIT'){ + return(fit_distribution(data_matrix, 'EXPONENTIAL')) + } + else if(norm_type == 'NORMAL_FIT'){ + return(fit_distribution(data_matrix, 'NORMAL')) + } + else if(norm_type == 'WEIBULL_0.5_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_0.5')) + } + else if(norm_type == 'WEIBULL_1_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_1')) + } + else if(norm_type == 'WEIBULL_1.5_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_1.5')) + } + else if(norm_type == 'WEIBULL_5_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_5')) + }else{ + write("ERROR: unknown normalization type", stderr()); + q(); + } +} + +shift <- function(data_matrix, shift_type, normals_list, tumors_list){ + return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list))) +} + +shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){ + if(length(normals_list) == 0){ #no normals are specified + if(norm_type == 'MEDIAN'){ + row_stat <- median(data_row) + } + else if(norm_type == 'MEAN'){ + row_stat <- mean(data_row) + } + return(unlist(lapply(data_row, function(x){return(x - row_stat);}))) + } + else{ #normals are specified + normal_values <- data_row[normals_list] + tumor_columns <- data_row[tumors_list] + + if(norm_type == 'MEDIAN'){ + row_stat <- median(normal_values) + } + else if(norm_type == 'MEAN'){ + row_stat <- mean(normal_values) + } + return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);}))) + } +} + +compute_z_score <- function(data_matrix, normals_list, tumors_list){ + return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list))) +} + +t_stat_normalize_row <- function(data_row, normals_list, tumors_list){ + if(length(normals_list) == 0){ #no normals are specified + row_mean <- mean(data_row) + row_sd <- sd(data_row) + return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);}))) + } + else{ #normals are specified + normal_values <- data_row[normals_list] + normal_mean <- mean(normal_values) + normal_sd <- sd(normal_values) + normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);})) + + tumor_values <- data_row[tumors_list] + normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - normal_mean)/normal_sd);})) + + return(append(normalized_normals, normalized_tumors)) + } +} + +rankNA <- function(col){ #originally written by Dan Carlin + col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col))) + return(col) +} + +fit_distribution <- function(data_matrix, dist){ + if(dist == 'EXPONENTIAL'){ + ranked_data_matrix <- apply(data_matrix,1,rankNA) #idea by Dan Carlin + #write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + #write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + return(apply(ranked_data_matrix, 1, qexp)) + } + else if(dist == 'NORMAL'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1)) + } + else if(dist == 'WEIBULL_0.5'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5)) + } + else if(dist == 'WEIBULL_1'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1)) + } + else if(dist == 'WEIBULL_1.5'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5)) + } + else if(dist == 'WEIBULL_5'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5)) + } +} + +main <- function(argv) { + #determine if correct number of arguments are specified and if normals are specified + with_normals = FALSE + + if(length(argv) == 1){ + if(argv==c('--help')){ + write(argspec, stderr()); + q(); + } + } + + if(!(length(argv) == 3 || length(argv) == 4)){ + write("ERROR: invalid number of arguments is specified", stderr()); + q(); + } + + if(length(argv) == 4){ + with_normals = TRUE + normals_file <- argv[4] + } + + #store command line arguments in variables: + input_file <- argv[1] + norm_type <- toupper(argv[2]) + norm_by <- toupper(argv[3]) + + #input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab" + #norm_type <- "MEAN_SHIFT" + #norm_by <- "ROW" + #normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab" + #normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab" + + #read the input file(s): + data_matrix <- read_matrix(input_file) + + if(with_normals){ + normals <- read_normals(normals_file) + if(length(colnames(normals)) == 1){ + normals_indices <- which(colnames(data_matrix) %in% normals) + tumor_indices <- which(!(colnames(data_matrix) %in% normals)) + }else{ + normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])] + normals_numeric <- apply(normals_numeric, 2, as.numeric) + rownames(normals_numeric) <- normals[,1][2:length(normals[,1])] + colnames(normals_numeric) <- normals[1,][2:length(normals[1,])] + + combined_matrix <- cbind(data_matrix, normals_numeric) + tumor_indices <- c(1:length(data_matrix[1,])) + normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,])) + data_matrix <- combined_matrix + } + }else{ + normals_indices <- c() + tumor_indices <- c() + } + + #if normalize by columns then transpose the matrix: + if(norm_by == 'COLUMN'){ + data_matrix <- t(data_matrix) + } + + #normalize: + data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices) + + #if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row: + if(norm_by == 'COLUMN'){ + data_matrix <- t(data_matrix) + } + + write_matrix(data_matrix) +} + +main(commandArgs(TRUE))