1
|
1 #!/usr/bin/Rscript
|
|
2
|
2
|
3 #Yulia Newton, last updated 20121217 v.3
|
|
4
|
1
|
5 #usage, options and doc goes here
|
|
6 argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation.
|
|
7 Usage:
|
|
8 normalize.r input.tab norm_type norm_by > output.tab
|
|
9 Example:
|
|
10 Rscript normalize.r test_matrix.tab median_shift column > output.tab
|
|
11 Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab
|
|
12 Options:
|
|
13 input matrix (annotated by row and column names)
|
|
14 normalization type; available options:
|
|
15 median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals
|
|
16 mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals
|
|
17 t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately
|
|
18 exponential_fit - (only by column) ranks data and transforms exponential CDF
|
|
19 normal_fit - (only by column) ranks data and transforms normal CDF
|
|
20 weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5
|
|
21 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1
|
|
22 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5
|
|
23 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5
|
|
24 normalization by:
|
|
25 row
|
|
26 column
|
|
27 normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples
|
|
28 output file is specified through redirect character >")
|
|
29
|
|
30 read_matrix <- function(in_file){
|
|
31 header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]]
|
|
32 cl.cols<- 1:length(header) > 1
|
|
33 data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE)
|
|
34 data_matrix <- as.matrix(data_matrix.df[,cl.cols])
|
|
35 rownames(data_matrix) <- data_matrix.df[,1]
|
|
36 return(data_matrix)
|
|
37
|
|
38 #read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")) #separate on white characters
|
|
39 #read_mtrx[,1]
|
|
40
|
|
41 #return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1))) #separate on white characters
|
|
42 #mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")
|
|
43 #print(mtrx[1,])
|
|
44 }
|
|
45
|
|
46 write_matrix <- function(data_matrix){
|
|
47 header <- append(c("Genes"), colnames(data_matrix))
|
|
48 write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
|
|
49 write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
|
|
50 }
|
|
51
|
|
52 read_normals <- function(in_file){
|
|
53 #return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1])
|
|
54 return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE)))
|
|
55 }
|
|
56
|
|
57 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){
|
|
58 if(norm_type == 'MEDIAN_SHIFT'){
|
|
59 return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list))
|
|
60 }
|
|
61 else if(norm_type == 'MEAN_SHIFT'){
|
|
62 return(shift(data_matrix, 'MEAN', normals_list, tumors_list))
|
|
63 }
|
|
64 else if(norm_type == 'T_STATISTIC'){
|
|
65 return(compute_z_score(data_matrix, normals_list, tumors_list))
|
|
66 }
|
|
67 else if(norm_type == 'EXPONENTIAL_FIT'){
|
|
68 return(fit_distribution(data_matrix, 'EXPONENTIAL'))
|
|
69 }
|
|
70 else if(norm_type == 'NORMAL_FIT'){
|
|
71 return(fit_distribution(data_matrix, 'NORMAL'))
|
|
72 }
|
|
73 else if(norm_type == 'WEIBULL_0.5_FIT'){
|
|
74 return(fit_distribution(data_matrix, 'WEIBULL_0.5'))
|
|
75 }
|
|
76 else if(norm_type == 'WEIBULL_1_FIT'){
|
|
77 return(fit_distribution(data_matrix, 'WEIBULL_1'))
|
|
78 }
|
|
79 else if(norm_type == 'WEIBULL_1.5_FIT'){
|
|
80 return(fit_distribution(data_matrix, 'WEIBULL_1.5'))
|
|
81 }
|
|
82 else if(norm_type == 'WEIBULL_5_FIT'){
|
|
83 return(fit_distribution(data_matrix, 'WEIBULL_5'))
|
|
84 }else{
|
|
85 write("ERROR: unknown normalization type", stderr());
|
|
86 q();
|
|
87 }
|
|
88 }
|
|
89
|
|
90 shift <- function(data_matrix, shift_type, normals_list, tumors_list){
|
|
91 return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list)))
|
|
92 }
|
|
93
|
|
94 shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){
|
|
95 if(length(normals_list) == 0){ #no normals are specified
|
|
96 if(norm_type == 'MEDIAN'){
|
|
97 row_stat <- median(data_row)
|
|
98 }
|
|
99 else if(norm_type == 'MEAN'){
|
|
100 row_stat <- mean(data_row)
|
|
101 }
|
|
102 return(unlist(lapply(data_row, function(x){return(x - row_stat);})))
|
|
103 }
|
|
104 else{ #normals are specified
|
|
105 normal_values <- data_row[normals_list]
|
|
106 tumor_columns <- data_row[tumors_list]
|
|
107
|
|
108 if(norm_type == 'MEDIAN'){
|
|
109 row_stat <- median(normal_values)
|
|
110 }
|
|
111 else if(norm_type == 'MEAN'){
|
|
112 row_stat <- mean(normal_values)
|
|
113 }
|
|
114 return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);})))
|
|
115 }
|
|
116 }
|
|
117
|
|
118 compute_z_score <- function(data_matrix, normals_list, tumors_list){
|
|
119 return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list)))
|
|
120 }
|
|
121
|
|
122 t_stat_normalize_row <- function(data_row, normals_list, tumors_list){
|
|
123 if(length(normals_list) == 0){ #no normals are specified
|
|
124 row_mean <- mean(data_row)
|
|
125 row_sd <- sd(data_row)
|
|
126 return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);})))
|
|
127 }
|
|
128 else{ #normals are specified
|
|
129 normal_values <- data_row[normals_list]
|
|
130 normal_mean <- mean(normal_values)
|
|
131 normal_sd <- sd(normal_values)
|
|
132 normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);}))
|
|
133
|
|
134 tumor_values <- data_row[tumors_list]
|
|
135 normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - normal_mean)/normal_sd);}))
|
|
136
|
|
137 return(append(normalized_normals, normalized_tumors))
|
|
138 }
|
|
139 }
|
|
140
|
|
141 rankNA <- function(col){ #originally written by Dan Carlin
|
|
142 col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col)))
|
|
143 return(col)
|
|
144 }
|
|
145
|
|
146 fit_distribution <- function(data_matrix, dist){
|
|
147 if(dist == 'EXPONENTIAL'){
|
|
148 ranked_data_matrix <- apply(data_matrix,1,rankNA) #idea by Dan Carlin
|
|
149 #write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
|
|
150 #write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
|
|
151 return(apply(ranked_data_matrix, 1, qexp))
|
|
152 }
|
|
153 else if(dist == 'NORMAL'){
|
|
154 ranked_data_matrix <- apply(data_matrix,2,rankNA)
|
|
155 return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1))
|
|
156 }
|
|
157 else if(dist == 'WEIBULL_0.5'){
|
|
158 ranked_data_matrix <- apply(data_matrix,2,rankNA)
|
|
159 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5))
|
|
160 }
|
|
161 else if(dist == 'WEIBULL_1'){
|
|
162 ranked_data_matrix <- apply(data_matrix,2,rankNA)
|
|
163 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1))
|
|
164 }
|
|
165 else if(dist == 'WEIBULL_1.5'){
|
|
166 ranked_data_matrix <- apply(data_matrix,2,rankNA)
|
|
167 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5))
|
|
168 }
|
|
169 else if(dist == 'WEIBULL_5'){
|
|
170 ranked_data_matrix <- apply(data_matrix,2,rankNA)
|
|
171 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5))
|
|
172 }
|
|
173 }
|
|
174
|
|
175 main <- function(argv) {
|
|
176 #determine if correct number of arguments are specified and if normals are specified
|
|
177 with_normals = FALSE
|
|
178
|
|
179 if(length(argv) == 1){
|
|
180 if(argv==c('--help')){
|
|
181 write(argspec, stderr());
|
|
182 q();
|
|
183 }
|
|
184 }
|
|
185
|
|
186 if(!(length(argv) == 3 || length(argv) == 4)){
|
|
187 write("ERROR: invalid number of arguments is specified", stderr());
|
|
188 q();
|
|
189 }
|
|
190
|
|
191 if(length(argv) == 4){
|
|
192 with_normals = TRUE
|
|
193 normals_file <- argv[4]
|
|
194 }
|
|
195
|
|
196 #store command line arguments in variables:
|
|
197 input_file <- argv[1]
|
|
198 norm_type <- toupper(argv[2])
|
|
199 norm_by <- toupper(argv[3])
|
|
200
|
|
201 #input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab"
|
|
202 #norm_type <- "MEAN_SHIFT"
|
|
203 #norm_by <- "ROW"
|
|
204 #normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab"
|
|
205 #normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab"
|
|
206
|
|
207 #read the input file(s):
|
|
208 data_matrix <- read_matrix(input_file)
|
|
209
|
|
210 if(with_normals){
|
|
211 normals <- read_normals(normals_file)
|
|
212 if(length(colnames(normals)) == 1){
|
|
213 normals_indices <- which(colnames(data_matrix) %in% normals)
|
|
214 tumor_indices <- which(!(colnames(data_matrix) %in% normals))
|
|
215 }else{
|
|
216 normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])]
|
|
217 normals_numeric <- apply(normals_numeric, 2, as.numeric)
|
|
218 rownames(normals_numeric) <- normals[,1][2:length(normals[,1])]
|
|
219 colnames(normals_numeric) <- normals[1,][2:length(normals[1,])]
|
|
220
|
2
|
221 #select only the intersection of the rows between the two matrices:
|
|
222 normals_numeric <- normals_numeric[rownames(normals_numeric) %in% rownames(data_matrix),]
|
|
223 data_matrix <- data_matrix[rownames(data_matrix) %in% rownames(normals_numeric),]
|
|
224 normals_numeric <- normals_numeric[order(rownames(normals_numeric)),]
|
|
225 data_matrix <- data_matrix[order(rownames(data_matrix)),]
|
|
226
|
1
|
227 combined_matrix <- cbind(data_matrix, normals_numeric)
|
|
228 tumor_indices <- c(1:length(data_matrix[1,]))
|
|
229 normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,]))
|
|
230 data_matrix <- combined_matrix
|
|
231 }
|
|
232 }else{
|
|
233 normals_indices <- c()
|
|
234 tumor_indices <- c()
|
|
235 }
|
|
236
|
|
237 #if normalize by columns then transpose the matrix:
|
|
238 if(norm_by == 'COLUMN'){
|
|
239 data_matrix <- t(data_matrix)
|
|
240 }
|
|
241
|
|
242 #normalize:
|
|
243 data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices)
|
|
244
|
|
245 #if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row:
|
|
246 if(norm_by == 'COLUMN'){
|
|
247 data_matrix <- t(data_matrix)
|
|
248 }
|
|
249
|
|
250 write_matrix(data_matrix)
|
|
251 }
|
|
252
|
|
253 main(commandArgs(TRUE))
|