Mercurial > repos > ynewton > matrix_normalization

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/normalize.r	Thu Dec 13 11:19:57 2012 -0500
@@ -0,0 +1,245 @@
+#!/usr/bin/Rscript
+
+#usage, options and doc goes here
+argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation.
+Usage:
+    normalize.r input.tab norm_type norm_by > output.tab
+Example:
+	Rscript normalize.r test_matrix.tab median_shift column > output.tab
+	Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab
+Options:
+	input matrix (annotated by row and column names)
+	normalization type; available options:
+		median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals
+		mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals
+		t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately
+		exponential_fit - (only by column) ranks data and transforms exponential CDF
+		normal_fit - (only by column) ranks data and transforms normal CDF
+		weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5
+		weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1
+		weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5
+		weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5
+	normalization by:
+		row
+		column
+	normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples
+	output file is specified through redirect character >")
+
+read_matrix <- function(in_file){
+	header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]]
+	cl.cols<- 1:length(header) > 1
+	data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE)
+	data_matrix <- as.matrix(data_matrix.df[,cl.cols])
+	rownames(data_matrix) <- data_matrix.df[,1]
+	return(data_matrix)
+
+	#read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA"))	#separate on white characters
+	#read_mtrx[,1]
+
+	#return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1)))	#separate on white characters
+	#mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")
+	#print(mtrx[1,])
+}
+
+write_matrix <- function(data_matrix){
+	header <- append(c("Genes"), colnames(data_matrix))
+	write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
+	write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
+}
+
+read_normals <- function(in_file){
+	#return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1])
+	return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE)))
+}
+
+normalize <- function(data_matrix, norm_type, normals_list, tumors_list){
+	if(norm_type == 'MEDIAN_SHIFT'){
+		return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list))
+	}
+	else if(norm_type == 'MEAN_SHIFT'){
+		return(shift(data_matrix, 'MEAN', normals_list, tumors_list))
+	}
+	else if(norm_type == 'T_STATISTIC'){
+		return(compute_z_score(data_matrix, normals_list, tumors_list))
+	}
+	else if(norm_type == 'EXPONENTIAL_FIT'){
+		return(fit_distribution(data_matrix, 'EXPONENTIAL'))
+	}
+	else if(norm_type == 'NORMAL_FIT'){
+		return(fit_distribution(data_matrix, 'NORMAL'))
+	}
+	else if(norm_type == 'WEIBULL_0.5_FIT'){
+		return(fit_distribution(data_matrix, 'WEIBULL_0.5'))
+	}
+	else if(norm_type == 'WEIBULL_1_FIT'){
+		return(fit_distribution(data_matrix, 'WEIBULL_1'))
+	}
+	else if(norm_type == 'WEIBULL_1.5_FIT'){
+		return(fit_distribution(data_matrix, 'WEIBULL_1.5'))
+	}
+	else if(norm_type == 'WEIBULL_5_FIT'){
+		return(fit_distribution(data_matrix, 'WEIBULL_5'))
+	}else{
+		write("ERROR: unknown normalization type", stderr());
+		q();
+	}
+}
+
+shift <- function(data_matrix, shift_type, normals_list, tumors_list){
+	return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list)))
+}
+
+shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){
+	if(length(normals_list) == 0){	#no normals are specified
+		if(norm_type == 'MEDIAN'){
+			row_stat <- median(data_row)
+		}
+		else if(norm_type == 'MEAN'){
+			row_stat <- mean(data_row)
+		}
+		return(unlist(lapply(data_row, function(x){return(x - row_stat);})))
+	}
+	else{	#normals are specified
+		normal_values <- data_row[normals_list]
+		tumor_columns <- data_row[tumors_list]
+
+		if(norm_type == 'MEDIAN'){
+			row_stat <- median(normal_values)
+		}
+		else if(norm_type == 'MEAN'){
+			row_stat <- mean(normal_values)
+		}
+		return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);})))
+	}
+}
+
+compute_z_score <- function(data_matrix, normals_list, tumors_list){
+	return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list)))
+}
+
+t_stat_normalize_row <- function(data_row, normals_list, tumors_list){
+	if(length(normals_list) == 0){	#no normals are specified
+		row_mean <- mean(data_row)
+		row_sd <- sd(data_row)
+		return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);})))
+	}
+	else{	#normals are specified
+		normal_values <- data_row[normals_list]
+		normal_mean <- mean(normal_values)
+		normal_sd <- sd(normal_values)
+		normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);}))
+
+		tumor_values <- data_row[tumors_list]
+		normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - normal_mean)/normal_sd);}))
+
+		return(append(normalized_normals, normalized_tumors))
+	}
+}
+
+rankNA <- function(col){		#originally written by Dan Carlin
+	col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col)))
+	return(col)
+}
+
+fit_distribution <- function(data_matrix, dist){
+	if(dist == 'EXPONENTIAL'){
+		ranked_data_matrix <- apply(data_matrix,1,rankNA)	#idea by Dan Carlin
+		#write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
+		#write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
+		return(apply(ranked_data_matrix, 1, qexp))
+	}
+	else if(dist == 'NORMAL'){
+		ranked_data_matrix <- apply(data_matrix,2,rankNA)
+		return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1))
+	}
+	else if(dist == 'WEIBULL_0.5'){
+		ranked_data_matrix <- apply(data_matrix,2,rankNA)
+		return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5))
+	}
+	else if(dist == 'WEIBULL_1'){
+		ranked_data_matrix <- apply(data_matrix,2,rankNA)
+		return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1))
+	}
+	else if(dist == 'WEIBULL_1.5'){
+		ranked_data_matrix <- apply(data_matrix,2,rankNA)
+		return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5))
+	}
+	else if(dist == 'WEIBULL_5'){
+		ranked_data_matrix <- apply(data_matrix,2,rankNA)
+		return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5))
+	}
+}
+
+main <- function(argv) {
+	#determine if correct number of arguments are specified and if normals are specified
+	with_normals = FALSE
+
+	if(length(argv) == 1){
+		if(argv==c('--help')){
+			write(argspec, stderr());
+			q();
+		}
+	}
+
+	if(!(length(argv) == 3 || length(argv) == 4)){
+		write("ERROR: invalid number of arguments is specified", stderr());
+		q();
+	}
+
+	if(length(argv) == 4){
+		with_normals = TRUE
+		normals_file <- argv[4]
+	}
+
+	#store command line arguments in variables:
+	input_file <- argv[1]
+	norm_type <- toupper(argv[2])
+	norm_by <- toupper(argv[3])
+
+	#input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab"
+	#norm_type <- "MEAN_SHIFT"
+	#norm_by <- "ROW"
+	#normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab"
+	#normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab"
+
+	#read the input file(s):
+	data_matrix <- read_matrix(input_file)
+
+	if(with_normals){
+		normals <- read_normals(normals_file)
+		if(length(colnames(normals)) == 1){
+			normals_indices <- which(colnames(data_matrix) %in% normals)
+			tumor_indices <- which(!(colnames(data_matrix) %in% normals))
+		}else{
+			normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])]
+			normals_numeric <- apply(normals_numeric, 2, as.numeric)
+			rownames(normals_numeric) <- normals[,1][2:length(normals[,1])]
+			colnames(normals_numeric) <- normals[1,][2:length(normals[1,])]
+
+			combined_matrix <- cbind(data_matrix, normals_numeric)
+			tumor_indices <- c(1:length(data_matrix[1,]))
+			normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,]))
+			data_matrix <- combined_matrix
+		}
+	}else{
+		normals_indices <- c()
+		tumor_indices <- c()
+	}
+
+	#if normalize by columns then transpose the matrix:
+	if(norm_by == 'COLUMN'){
+		data_matrix <- t(data_matrix)
+	}
+
+	#normalize:
+	data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices)
+
+	#if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row:
+	if(norm_by == 'COLUMN'){
+		data_matrix <- t(data_matrix)
+	}
+
+	write_matrix(data_matrix)
+}
+
+main(commandArgs(TRUE))