annotate normalize.r @ 2:f3fe0f64fe91 draft

Uploaded
author ynewton
date Mon, 17 Dec 2012 15:25:38 -0500
parents 8389b0c211ae
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
1 #!/usr/bin/Rscript
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
2
2
f3fe0f64fe91 Uploaded
ynewton
parents: 1
diff changeset
3 #Yulia Newton, last updated 20121217 v.3
f3fe0f64fe91 Uploaded
ynewton
parents: 1
diff changeset
4
1
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
5 #usage, options and doc goes here
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
6 argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation.
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
7 Usage:
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
8 normalize.r input.tab norm_type norm_by > output.tab
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
9 Example:
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
10 Rscript normalize.r test_matrix.tab median_shift column > output.tab
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
11 Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
12 Options:
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
13 input matrix (annotated by row and column names)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
14 normalization type; available options:
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
15 median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
16 mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
17 t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
18 exponential_fit - (only by column) ranks data and transforms exponential CDF
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
19 normal_fit - (only by column) ranks data and transforms normal CDF
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
20 weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
21 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
22 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
23 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
24 normalization by:
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
25 row
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
26 column
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
27 normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
28 output file is specified through redirect character >")
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
29
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
30 read_matrix <- function(in_file){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
31 header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
32 cl.cols<- 1:length(header) > 1
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
33 data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
34 data_matrix <- as.matrix(data_matrix.df[,cl.cols])
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
35 rownames(data_matrix) <- data_matrix.df[,1]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
36 return(data_matrix)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
37
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
38 #read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")) #separate on white characters
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
39 #read_mtrx[,1]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
40
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
41 #return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1))) #separate on white characters
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
42 #mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
43 #print(mtrx[1,])
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
44 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
45
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
46 write_matrix <- function(data_matrix){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
47 header <- append(c("Genes"), colnames(data_matrix))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
48 write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
49 write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
50 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
51
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
52 read_normals <- function(in_file){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
53 #return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1])
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
54 return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE)))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
55 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
56
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
57 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
58 if(norm_type == 'MEDIAN_SHIFT'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
59 return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
60 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
61 else if(norm_type == 'MEAN_SHIFT'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
62 return(shift(data_matrix, 'MEAN', normals_list, tumors_list))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
63 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
64 else if(norm_type == 'T_STATISTIC'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
65 return(compute_z_score(data_matrix, normals_list, tumors_list))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
66 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
67 else if(norm_type == 'EXPONENTIAL_FIT'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
68 return(fit_distribution(data_matrix, 'EXPONENTIAL'))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
69 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
70 else if(norm_type == 'NORMAL_FIT'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
71 return(fit_distribution(data_matrix, 'NORMAL'))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
72 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
73 else if(norm_type == 'WEIBULL_0.5_FIT'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
74 return(fit_distribution(data_matrix, 'WEIBULL_0.5'))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
75 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
76 else if(norm_type == 'WEIBULL_1_FIT'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
77 return(fit_distribution(data_matrix, 'WEIBULL_1'))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
78 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
79 else if(norm_type == 'WEIBULL_1.5_FIT'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
80 return(fit_distribution(data_matrix, 'WEIBULL_1.5'))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
81 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
82 else if(norm_type == 'WEIBULL_5_FIT'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
83 return(fit_distribution(data_matrix, 'WEIBULL_5'))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
84 }else{
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
85 write("ERROR: unknown normalization type", stderr());
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
86 q();
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
87 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
88 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
89
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
90 shift <- function(data_matrix, shift_type, normals_list, tumors_list){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
91 return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list)))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
92 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
93
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
94 shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
95 if(length(normals_list) == 0){ #no normals are specified
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
96 if(norm_type == 'MEDIAN'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
97 row_stat <- median(data_row)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
98 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
99 else if(norm_type == 'MEAN'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
100 row_stat <- mean(data_row)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
101 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
102 return(unlist(lapply(data_row, function(x){return(x - row_stat);})))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
103 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
104 else{ #normals are specified
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
105 normal_values <- data_row[normals_list]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
106 tumor_columns <- data_row[tumors_list]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
107
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
108 if(norm_type == 'MEDIAN'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
109 row_stat <- median(normal_values)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
110 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
111 else if(norm_type == 'MEAN'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
112 row_stat <- mean(normal_values)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
113 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
114 return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);})))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
115 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
116 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
117
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
118 compute_z_score <- function(data_matrix, normals_list, tumors_list){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
119 return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list)))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
120 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
121
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
122 t_stat_normalize_row <- function(data_row, normals_list, tumors_list){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
123 if(length(normals_list) == 0){ #no normals are specified
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
124 row_mean <- mean(data_row)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
125 row_sd <- sd(data_row)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
126 return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);})))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
127 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
128 else{ #normals are specified
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
129 normal_values <- data_row[normals_list]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
130 normal_mean <- mean(normal_values)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
131 normal_sd <- sd(normal_values)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
132 normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);}))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
133
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
134 tumor_values <- data_row[tumors_list]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
135 normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - normal_mean)/normal_sd);}))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
136
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
137 return(append(normalized_normals, normalized_tumors))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
138 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
139 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
140
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
141 rankNA <- function(col){ #originally written by Dan Carlin
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
142 col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col)))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
143 return(col)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
144 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
145
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
146 fit_distribution <- function(data_matrix, dist){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
147 if(dist == 'EXPONENTIAL'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
148 ranked_data_matrix <- apply(data_matrix,1,rankNA) #idea by Dan Carlin
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
149 #write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
150 #write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
151 return(apply(ranked_data_matrix, 1, qexp))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
152 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
153 else if(dist == 'NORMAL'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
154 ranked_data_matrix <- apply(data_matrix,2,rankNA)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
155 return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
156 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
157 else if(dist == 'WEIBULL_0.5'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
158 ranked_data_matrix <- apply(data_matrix,2,rankNA)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
159 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
160 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
161 else if(dist == 'WEIBULL_1'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
162 ranked_data_matrix <- apply(data_matrix,2,rankNA)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
163 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
164 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
165 else if(dist == 'WEIBULL_1.5'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
166 ranked_data_matrix <- apply(data_matrix,2,rankNA)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
167 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
168 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
169 else if(dist == 'WEIBULL_5'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
170 ranked_data_matrix <- apply(data_matrix,2,rankNA)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
171 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
172 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
173 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
174
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
175 main <- function(argv) {
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
176 #determine if correct number of arguments are specified and if normals are specified
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
177 with_normals = FALSE
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
178
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
179 if(length(argv) == 1){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
180 if(argv==c('--help')){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
181 write(argspec, stderr());
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
182 q();
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
183 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
184 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
185
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
186 if(!(length(argv) == 3 || length(argv) == 4)){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
187 write("ERROR: invalid number of arguments is specified", stderr());
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
188 q();
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
189 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
190
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
191 if(length(argv) == 4){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
192 with_normals = TRUE
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
193 normals_file <- argv[4]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
194 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
195
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
196 #store command line arguments in variables:
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
197 input_file <- argv[1]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
198 norm_type <- toupper(argv[2])
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
199 norm_by <- toupper(argv[3])
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
200
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
201 #input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab"
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
202 #norm_type <- "MEAN_SHIFT"
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
203 #norm_by <- "ROW"
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
204 #normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab"
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
205 #normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab"
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
206
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
207 #read the input file(s):
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
208 data_matrix <- read_matrix(input_file)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
209
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
210 if(with_normals){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
211 normals <- read_normals(normals_file)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
212 if(length(colnames(normals)) == 1){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
213 normals_indices <- which(colnames(data_matrix) %in% normals)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
214 tumor_indices <- which(!(colnames(data_matrix) %in% normals))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
215 }else{
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
216 normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
217 normals_numeric <- apply(normals_numeric, 2, as.numeric)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
218 rownames(normals_numeric) <- normals[,1][2:length(normals[,1])]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
219 colnames(normals_numeric) <- normals[1,][2:length(normals[1,])]
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
220
2
f3fe0f64fe91 Uploaded
ynewton
parents: 1
diff changeset
221 #select only the intersection of the rows between the two matrices:
f3fe0f64fe91 Uploaded
ynewton
parents: 1
diff changeset
222 normals_numeric <- normals_numeric[rownames(normals_numeric) %in% rownames(data_matrix),]
f3fe0f64fe91 Uploaded
ynewton
parents: 1
diff changeset
223 data_matrix <- data_matrix[rownames(data_matrix) %in% rownames(normals_numeric),]
f3fe0f64fe91 Uploaded
ynewton
parents: 1
diff changeset
224 normals_numeric <- normals_numeric[order(rownames(normals_numeric)),]
f3fe0f64fe91 Uploaded
ynewton
parents: 1
diff changeset
225 data_matrix <- data_matrix[order(rownames(data_matrix)),]
f3fe0f64fe91 Uploaded
ynewton
parents: 1
diff changeset
226
1
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
227 combined_matrix <- cbind(data_matrix, normals_numeric)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
228 tumor_indices <- c(1:length(data_matrix[1,]))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
229 normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,]))
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
230 data_matrix <- combined_matrix
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
231 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
232 }else{
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
233 normals_indices <- c()
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
234 tumor_indices <- c()
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
235 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
236
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
237 #if normalize by columns then transpose the matrix:
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
238 if(norm_by == 'COLUMN'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
239 data_matrix <- t(data_matrix)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
240 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
241
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
242 #normalize:
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
243 data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
244
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
245 #if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row:
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
246 if(norm_by == 'COLUMN'){
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
247 data_matrix <- t(data_matrix)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
248 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
249
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
250 write_matrix(data_matrix)
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
251 }
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
252
8389b0c211ae Uploaded
ynewton
parents:
diff changeset
253 main(commandArgs(TRUE))