Mercurial > repos > ynewton > matrix_normalization
comparison normalize.r @ 1:8389b0c211ae draft
Uploaded
author | ynewton |
---|---|
date | Thu, 13 Dec 2012 11:19:57 -0500 |
parents | |
children | f3fe0f64fe91 |
comparison
equal
deleted
inserted
replaced
0:a9d8d4b531f7 | 1:8389b0c211ae |
---|---|
1 #!/usr/bin/Rscript | |
2 | |
3 #usage, options and doc goes here | |
4 argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation. | |
5 Usage: | |
6 normalize.r input.tab norm_type norm_by > output.tab | |
7 Example: | |
8 Rscript normalize.r test_matrix.tab median_shift column > output.tab | |
9 Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab | |
10 Options: | |
11 input matrix (annotated by row and column names) | |
12 normalization type; available options: | |
13 median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals | |
14 mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals | |
15 t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately | |
16 exponential_fit - (only by column) ranks data and transforms exponential CDF | |
17 normal_fit - (only by column) ranks data and transforms normal CDF | |
18 weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 | |
19 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 | |
20 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5 | |
21 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5 | |
22 normalization by: | |
23 row | |
24 column | |
25 normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples | |
26 output file is specified through redirect character >") | |
27 | |
28 read_matrix <- function(in_file){ | |
29 header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]] | |
30 cl.cols<- 1:length(header) > 1 | |
31 data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE) | |
32 data_matrix <- as.matrix(data_matrix.df[,cl.cols]) | |
33 rownames(data_matrix) <- data_matrix.df[,1] | |
34 return(data_matrix) | |
35 | |
36 #read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")) #separate on white characters | |
37 #read_mtrx[,1] | |
38 | |
39 #return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1))) #separate on white characters | |
40 #mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA") | |
41 #print(mtrx[1,]) | |
42 } | |
43 | |
44 write_matrix <- function(data_matrix){ | |
45 header <- append(c("Genes"), colnames(data_matrix)) | |
46 write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) | |
47 write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) | |
48 } | |
49 | |
50 read_normals <- function(in_file){ | |
51 #return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) | |
52 return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))) | |
53 } | |
54 | |
55 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ | |
56 if(norm_type == 'MEDIAN_SHIFT'){ | |
57 return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list)) | |
58 } | |
59 else if(norm_type == 'MEAN_SHIFT'){ | |
60 return(shift(data_matrix, 'MEAN', normals_list, tumors_list)) | |
61 } | |
62 else if(norm_type == 'T_STATISTIC'){ | |
63 return(compute_z_score(data_matrix, normals_list, tumors_list)) | |
64 } | |
65 else if(norm_type == 'EXPONENTIAL_FIT'){ | |
66 return(fit_distribution(data_matrix, 'EXPONENTIAL')) | |
67 } | |
68 else if(norm_type == 'NORMAL_FIT'){ | |
69 return(fit_distribution(data_matrix, 'NORMAL')) | |
70 } | |
71 else if(norm_type == 'WEIBULL_0.5_FIT'){ | |
72 return(fit_distribution(data_matrix, 'WEIBULL_0.5')) | |
73 } | |
74 else if(norm_type == 'WEIBULL_1_FIT'){ | |
75 return(fit_distribution(data_matrix, 'WEIBULL_1')) | |
76 } | |
77 else if(norm_type == 'WEIBULL_1.5_FIT'){ | |
78 return(fit_distribution(data_matrix, 'WEIBULL_1.5')) | |
79 } | |
80 else if(norm_type == 'WEIBULL_5_FIT'){ | |
81 return(fit_distribution(data_matrix, 'WEIBULL_5')) | |
82 }else{ | |
83 write("ERROR: unknown normalization type", stderr()); | |
84 q(); | |
85 } | |
86 } | |
87 | |
88 shift <- function(data_matrix, shift_type, normals_list, tumors_list){ | |
89 return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list))) | |
90 } | |
91 | |
92 shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){ | |
93 if(length(normals_list) == 0){ #no normals are specified | |
94 if(norm_type == 'MEDIAN'){ | |
95 row_stat <- median(data_row) | |
96 } | |
97 else if(norm_type == 'MEAN'){ | |
98 row_stat <- mean(data_row) | |
99 } | |
100 return(unlist(lapply(data_row, function(x){return(x - row_stat);}))) | |
101 } | |
102 else{ #normals are specified | |
103 normal_values <- data_row[normals_list] | |
104 tumor_columns <- data_row[tumors_list] | |
105 | |
106 if(norm_type == 'MEDIAN'){ | |
107 row_stat <- median(normal_values) | |
108 } | |
109 else if(norm_type == 'MEAN'){ | |
110 row_stat <- mean(normal_values) | |
111 } | |
112 return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);}))) | |
113 } | |
114 } | |
115 | |
116 compute_z_score <- function(data_matrix, normals_list, tumors_list){ | |
117 return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list))) | |
118 } | |
119 | |
120 t_stat_normalize_row <- function(data_row, normals_list, tumors_list){ | |
121 if(length(normals_list) == 0){ #no normals are specified | |
122 row_mean <- mean(data_row) | |
123 row_sd <- sd(data_row) | |
124 return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);}))) | |
125 } | |
126 else{ #normals are specified | |
127 normal_values <- data_row[normals_list] | |
128 normal_mean <- mean(normal_values) | |
129 normal_sd <- sd(normal_values) | |
130 normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);})) | |
131 | |
132 tumor_values <- data_row[tumors_list] | |
133 normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - normal_mean)/normal_sd);})) | |
134 | |
135 return(append(normalized_normals, normalized_tumors)) | |
136 } | |
137 } | |
138 | |
139 rankNA <- function(col){ #originally written by Dan Carlin | |
140 col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col))) | |
141 return(col) | |
142 } | |
143 | |
144 fit_distribution <- function(data_matrix, dist){ | |
145 if(dist == 'EXPONENTIAL'){ | |
146 ranked_data_matrix <- apply(data_matrix,1,rankNA) #idea by Dan Carlin | |
147 #write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) | |
148 #write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) | |
149 return(apply(ranked_data_matrix, 1, qexp)) | |
150 } | |
151 else if(dist == 'NORMAL'){ | |
152 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
153 return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1)) | |
154 } | |
155 else if(dist == 'WEIBULL_0.5'){ | |
156 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
157 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5)) | |
158 } | |
159 else if(dist == 'WEIBULL_1'){ | |
160 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
161 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1)) | |
162 } | |
163 else if(dist == 'WEIBULL_1.5'){ | |
164 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
165 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5)) | |
166 } | |
167 else if(dist == 'WEIBULL_5'){ | |
168 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
169 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5)) | |
170 } | |
171 } | |
172 | |
173 main <- function(argv) { | |
174 #determine if correct number of arguments are specified and if normals are specified | |
175 with_normals = FALSE | |
176 | |
177 if(length(argv) == 1){ | |
178 if(argv==c('--help')){ | |
179 write(argspec, stderr()); | |
180 q(); | |
181 } | |
182 } | |
183 | |
184 if(!(length(argv) == 3 || length(argv) == 4)){ | |
185 write("ERROR: invalid number of arguments is specified", stderr()); | |
186 q(); | |
187 } | |
188 | |
189 if(length(argv) == 4){ | |
190 with_normals = TRUE | |
191 normals_file <- argv[4] | |
192 } | |
193 | |
194 #store command line arguments in variables: | |
195 input_file <- argv[1] | |
196 norm_type <- toupper(argv[2]) | |
197 norm_by <- toupper(argv[3]) | |
198 | |
199 #input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab" | |
200 #norm_type <- "MEAN_SHIFT" | |
201 #norm_by <- "ROW" | |
202 #normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab" | |
203 #normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab" | |
204 | |
205 #read the input file(s): | |
206 data_matrix <- read_matrix(input_file) | |
207 | |
208 if(with_normals){ | |
209 normals <- read_normals(normals_file) | |
210 if(length(colnames(normals)) == 1){ | |
211 normals_indices <- which(colnames(data_matrix) %in% normals) | |
212 tumor_indices <- which(!(colnames(data_matrix) %in% normals)) | |
213 }else{ | |
214 normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])] | |
215 normals_numeric <- apply(normals_numeric, 2, as.numeric) | |
216 rownames(normals_numeric) <- normals[,1][2:length(normals[,1])] | |
217 colnames(normals_numeric) <- normals[1,][2:length(normals[1,])] | |
218 | |
219 combined_matrix <- cbind(data_matrix, normals_numeric) | |
220 tumor_indices <- c(1:length(data_matrix[1,])) | |
221 normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,])) | |
222 data_matrix <- combined_matrix | |
223 } | |
224 }else{ | |
225 normals_indices <- c() | |
226 tumor_indices <- c() | |
227 } | |
228 | |
229 #if normalize by columns then transpose the matrix: | |
230 if(norm_by == 'COLUMN'){ | |
231 data_matrix <- t(data_matrix) | |
232 } | |
233 | |
234 #normalize: | |
235 data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices) | |
236 | |
237 #if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row: | |
238 if(norm_by == 'COLUMN'){ | |
239 data_matrix <- t(data_matrix) | |
240 } | |
241 | |
242 write_matrix(data_matrix) | |
243 } | |
244 | |
245 main(commandArgs(TRUE)) |