Mercurial > repos > md-anderson-bioinformatics > matrix_manipulation
comparison Matrix_Transformations.py @ 1:f1bcd79cd923 draft default tip
Uploaded
author | insilico-bob |
---|---|
date | Tue, 27 Nov 2018 14:20:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:7f12c81e2083 | 1:f1bcd79cd923 |
---|---|
1 ''' | |
2 Created on Jun 6, 2017 updated Feb 2018 | |
3 | |
4 @author: cjacoby and Bob Brown | |
5 ''' | |
6 import os | |
7 import sys, traceback, argparse | |
8 import numpy as np | |
9 from numpy import size, array | |
10 import warnings | |
11 from Matrix_Validate_import import reader | |
12 #import scipy.stats as ss | |
13 warnings.filterwarnings('error') | |
14 | |
15 #Define argparse Function | |
def get_args():
    """Parse the six positional command-line arguments of the tool.

    Returns:
        argparse.Namespace: with attributes ``input_file_txt``, ``choice``,
        ``axes``, ``scalevalue``, ``offsetvalue`` and ``output_file_txt``.
        No numeric conversion happens here.

    NOTE(review): the help text for ``choice`` lists numeric codes, but the
    caller in this file compares ``choice`` against names such as
    "z_score_normalization" -- confirm which is the intended interface.
    """
    # (argument name, keyword options) in the exact positional order expected.
    positional_spec = (
        ('input_file_txt',
         {'help': 'text file input matrix(include .txt in name)'}),
        ('choice',
         {'type': str,
          'help': 'Choose normalization Method: 1 = Z-score, 2 = Mean Centered, 3 = log2, 4= rank'}),
        ('axes',
         {'type': str,
          'help': 'Choose Axis to normalize On (Row or Column)'}),
        ('scalevalue',
         {'help': 'optional scaling factor for matrix)'}),
        ('offsetvalue',
         {'help': 'optional offset for matrix'}),
        ('output_file_txt',
         {'help': 'text file output matrix(include .txt in name)'}),
    )

    cli = argparse.ArgumentParser()
    for arg_name, options in positional_spec:
        cli.add_argument(arg_name, **options)
    return cli.parse_args()
26 | |
27 | |
def Zscore_row(matrix):
    """Z-score normalize each row of ``matrix`` in place.

    Every value becomes ``(value - row_mean) / row_stdev``, where the mean
    and the sample standard deviation (``ddof=1``) ignore NaN entries.

    Args:
        matrix: 2-D numeric container indexable as ``matrix[i][j]``
            (list of lists or 2-D array). It is modified in place.

    Returns:
        The same ``matrix`` object, normalized.

    Raises:
        ValueError: if a row has fewer than two non-NaN values or a zero
            standard deviation, because a z-score is undefined there.
    """
    for i in range(len(matrix)):
        row_values = np.asarray(matrix[i], dtype=float)

        # A sample standard deviation (ddof=1) needs at least two real values;
        # check up front so the failure is explicit, not a NaN or a warning.
        if np.count_nonzero(~np.isnan(row_values)) < 2:
            raise ValueError(
                "Cannot Z-score row " + str(i + 1)
                + ": fewer than two non-missing values")

        row_mean = np.nanmean(row_values)
        row_stdev = np.nanstd(row_values, ddof=1)

        # A constant row would otherwise divide by zero.
        if row_stdev == 0:
            raise ValueError(
                "Cannot Z-score row " + str(i + 1)
                + ": standard deviation is zero")

        for j in range(len(row_values)):
            matrix[i][j] = (matrix[i][j] - row_mean) / row_stdev
    return matrix
37 | |
38 #Define Z-Score normalization Function | |
def Zscore_col(matrix):
    """Z-score normalize each column of ``matrix`` in place.

    Every value becomes ``(value - col_mean) / col_stdev``, where the mean
    and the sample standard deviation (``ddof=1``) ignore NaN entries.

    Args:
        matrix: 2-D numeric container indexable as ``matrix[i][j]``
            (list of lists or 2-D array). It is modified in place.

    Returns:
        The same ``matrix`` object, normalized. An empty matrix is returned
        unchanged.

    Raises:
        ValueError: if a column has fewer than two non-NaN values or a zero
            standard deviation, because a z-score is undefined there.
    """
    if len(matrix) == 0:
        return matrix

    for col in range(len(matrix[0])):
        col_values = np.asarray([row[col] for row in matrix], dtype=float)

        # A sample standard deviation (ddof=1) needs at least two real values;
        # check up front so the failure is explicit, not a NaN or a warning.
        if np.count_nonzero(~np.isnan(col_values)) < 2:
            raise ValueError(
                "Cannot Z-score column " + str(col + 1)
                + ": fewer than two non-missing values")

        col_mean = np.nanmean(col_values)
        col_stdev = np.nanstd(col_values, ddof=1)

        # A constant column would otherwise divide by zero.
        if col_stdev == 0:
            raise ValueError(
                "Cannot Z-score column " + str(col + 1)
                + ": standard deviation is zero")

        for row in range(len(matrix)):
            matrix[row][col] = (matrix[row][col] - col_mean) / col_stdev
    return matrix
50 | |
51 | |
52 #Define Mean Centered or Median centered normalization Function | |
def MeanMedianCenter_row(matrix, type):
    """Center each row of ``matrix`` in place on its mean or median.

    The center is computed over ALL values of the row (NaN entries are
    ignored) and subtracted from every value of that row. The previous
    implementation skipped the first value of each row when computing the
    center, which disagreed with the column-wise variant.

    Args:
        matrix: 2-D numeric container indexable as ``matrix[i][j]``.
            It is modified in place.
        type: ``"mean"`` to subtract the row mean; any other value
            subtracts the row median. (The name shadows the builtin but is
            kept for caller compatibility.)

    Returns:
        The same ``matrix`` object, centered.
    """
    center_of = np.nanmean if type == "mean" else np.nanmedian

    for i in range(len(matrix)):
        # Use the whole row: row labels are not part of the matrix here.
        row_center = center_of(matrix[i])
        for j in range(len(matrix[i])):
            matrix[i][j] = matrix[i][j] - row_center
    return matrix
66 | |
67 | |
68 #Define mean or median | |
def MeanMedianCenter_col(matrix, type):
    """Center each column of ``matrix`` in place on its mean or median.

    Args:
        matrix: 2-D numeric container indexable as ``matrix[i][j]``.
            It is modified in place.
        type: ``"mean"`` subtracts the NaN-ignoring column mean; any other
            value subtracts the NaN-ignoring column median.

    Returns:
        The same ``matrix`` object, centered.
    """
    for col in range(len(matrix[0])):
        column_values = [line[col] for line in matrix]
        if type == "mean":
            shift = np.nanmean(column_values)
        else:
            shift = np.nanmedian(column_values)

        # Subtract the column's center from every entry of that column.
        for line in matrix:
            line[col] = line[col] - shift
    return matrix
81 | |
82 #Divide by sum of the Row Function | |
def Divide_By_Sum_row(matrix):
    """Divide every value of each row by that row's sum, in place.

    Rows whose sum has an absolute value of at most 0.0001 are left
    untouched and an error line is printed for them.

    Args:
        matrix: 2-D numeric container; ``np.shape(matrix)`` must yield
            exactly two dimensions. It is modified in place.

    Returns:
        The same ``matrix`` object.
    """
    row_count, col_count = np.shape(matrix)

    for r in range(row_count):
        total = sum(matrix[r][:])

        # Guard against dividing by a (near-)zero sum: report and skip the row.
        if not (abs(total) > .0001):
            print("ERROR Cannot divide by Sum almost zero", str(total), " for Row ", str(r + 1))
            continue

        for c in range(col_count):
            matrix[r][c] = matrix[r][c] / total
    return matrix
98 | |
99 | |
100 #Divide by sum of the Column Function | |
def Divide_By_Sum_col(matrix):
    """Divide every value of each column by that column's sum, in place.

    Columns whose sum has an absolute value of at most 0.0001 are left
    untouched and an error line is printed for them. The previous
    implementation never accumulated the column sum (it stayed 0), so no
    column was ever divided.

    Args:
        matrix: 2-D numeric container; ``np.shape(matrix)`` must yield
            exactly two dimensions. It is modified in place.

    Returns:
        The same ``matrix`` object.
    """
    numRow, numCol = np.shape(matrix)

    for i in range(numCol):
        # Actually total the column before deciding whether it can be divided.
        sumValue = sum(matrix[j][i] for j in range(numRow))

        # Guard against dividing by a (near-)zero sum: report and skip.
        if abs(sumValue) > .0001:
            for j in range(numRow):
                matrix[j][i] = matrix[j][i] / sumValue
        else:
            print("ERROR Cannot divide by Sum almost zero", str(sumValue), " for Column ", str(i + 1))
    return matrix
116 | |
117 #scale or add offset to matrix by row | |
def ScaleOffset_row(matrix, scaleValue, offset):
    """Apply ``scaleValue * x + offset`` to every value, row by row, in place.

    Args:
        matrix: 2-D numeric container whose rows support slice assignment.
            It is modified in place.
        scaleValue: multiplicative factor; when its absolute value is at
            most 0.0001 nothing is changed and a message is printed.
        offset: value added after scaling.

    Returns:
        The same ``matrix`` object.
    """
    # Refuse scale factors that are effectively zero.
    if not (abs(scaleValue) > 0.0001):
        print(" Scale facter "+str(scaleValue)+" too small")
        return matrix

    for line in matrix:
        line[:] = [scaleValue * value + offset for value in line]
    return matrix
127 | |
128 #scale or add offset to matrix by column | |
def ScaleOffset_col(matrix, scaleValue, offset):
    """Apply ``scaleValue * x + offset`` to every value, column by column.

    The previous implementation assigned through ``matrix[:][i]`` while
    iterating over the column count. That addressed rows rather than
    columns: for list-of-lists input it wrote into a temporary copy, and it
    raised ``IndexError`` whenever there were more columns than rows.

    Args:
        matrix: 2-D numeric container indexable as ``matrix[i][j]``.
            It is modified in place.
        scaleValue: multiplicative factor; when its absolute value is at
            most 0.0001 nothing is changed and a message is printed.
        offset: value added after scaling.

    Returns:
        The same ``matrix`` object. An empty matrix is returned unchanged.
    """
    # Refuse scale factors that are effectively zero.
    if abs(scaleValue) > 0.0001:
        if len(matrix) == 0:
            return matrix
        for col in range(len(matrix[0])):
            for row in matrix:
                row[col] = scaleValue * row[col] + offset
    else:
        print(" Scale facter "+str(scaleValue)+" too small")
    return matrix
138 | |
139 #Define Log2 normalization Method | |
def Convert2Logs(matrix, logValue, offset):
    """Replace every value with ``log(value + offset)``, in place.

    All values are validated before anything is modified. If any
    ``value + offset`` is <= 0 the failure message is printed and the
    matrix is returned completely unchanged. The previous implementation
    stopped at the first bad value and returned a half-transformed matrix;
    it also re-installed a global "warnings as errors" filter on every call.

    Args:
        matrix: 2-D numeric container indexable as ``matrix[i][j]``.
            It is modified in place on success.
        logValue: ``"log2"`` selects base 2; any other value selects base 10.
        offset: number added to each value before taking the logarithm.

    Returns:
        The same ``matrix`` object (transformed on success, untouched on
        failure).
    """
    log_fn = np.log2 if logValue == "log2" else np.log10

    row_count = len(matrix)
    col_count = len(matrix[0]) if row_count else 0

    # Pass 1: validate. NaN compares False here and is let through,
    # since its logarithm is simply NaN.
    for i in range(row_count):
        for j in range(col_count):
            if matrix[i][j] + offset <= 0:
                print(logValue+" normalization Failed: Encountered elements <= 0, which are invalid inputs for a Log normalization")
                return matrix

    # Pass 2: every input is valid, so the transform cannot fail midway.
    for i in range(row_count):
        for j in range(col_count):
            matrix[i][j] = log_fn(matrix[i][j] + offset)
    return matrix
160 | |
161 #transpose matrix | |
def Transpose(in_mat):
    """Return a new list-of-lists holding the transpose of ``in_mat``.

    Args:
        in_mat: 2-D container; ``np.shape(in_mat)`` must yield exactly two
            dimensions. It is not modified.

    Returns:
        list[list]: ``out[c][r] == in_mat[r][c]`` for every row ``r`` and
        column ``c``.
    """
    row_count, col_count = np.shape(in_mat)
    # One output row per input column.
    return [
        [in_mat[r][c] for r in range(row_count)]
        for c in range(col_count)
    ]
173 | |
174 # restores row and column labels in ouput | |
def labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write ``matrix`` with its row and column labels as tab-separated text.

    The first line holds the column labels, each preceded by a tab. Every
    following line holds one row label followed by that row's values, each
    preceded by a tab.

    Args:
        matrix: 2-D container indexable as ``matrix[i][j]``.
        og_cols: column labels for the header line.
        og_rows: row labels; one output line is written per label.
        output_file_txt: path of the file to (over)write.
    """
    with open(output_file_txt, 'w') as out:
        # Header line: leading tab before each column label.
        header_cells = ['\t' + str(col_label) for col_label in og_cols]
        out.write("".join(header_cells) + '\n')

        for i, row_label in enumerate(og_rows):
            value_cells = [
                '\t' + format(matrix[i][j]) for j in range(len(matrix[0]))
            ]
            out.write(str(row_label) + "".join(value_cells) + '\n')
187 | |
188 #Define Main Function | |
def main():
    """Run one matrix transformation selected on the command line.

    Steps: parse the arguments, convert the scale and offset to floats,
    load the matrix and its labels with ``reader``, apply the transformation
    named by ``choice`` (per ``axes`` where relevant), write the labelled
    result with ``labeler`` and print a one-line summary.

    An unknown choice or axis only prints a message; no output file is
    written in that case. Any exception prints a traceback and exits with
    status 1.

    NOTE(review): "ln_normalization" applies a base-2 logarithm here.
    NOTE(review): ``Rankdata_ByRow`` / ``Rankdata_ByColumn`` are neither
    defined nor imported in the visible source, so the "rank" choice
    presumably fails with NameError -- confirm and restore them.
    """
    try:
        args = get_args()
        scaleValue = float(args.scalevalue)
        offsetValue = float(args.offsetvalue)

        matrix, og_cols, og_rows = reader(args.input_file_txt)
        choice = args.choice
        out_path = args.output_file_txt

        # Axis-dependent choices:
        #   choice -> ({axis: (transform, success message)}, invalid-axis message)
        # Transforms are wrapped in lambdas so each name is only looked up
        # when that particular choice is actually executed.
        axis_ops = {
            "z_score_normalization": (
                {"Row": (lambda m: Zscore_row(m), "zcore, row"),
                 "Column": (lambda m: Zscore_col(m), "zscore, column")},
                "zscore, invalid axis"),
            "mean_center_normalization": (
                {"Row": (lambda m: MeanMedianCenter_row(m, "mean"),
                         "mean-center by row"),
                 "Column": (lambda m: MeanMedianCenter_col(m, "mean"),
                            "mean-center by column")},
                "meancenter, invalid axis"),
            "median_center_normalization": (
                {"Row": (lambda m: MeanMedianCenter_row(m, "median"),
                         "median-center by row"),
                 "Column": (lambda m: MeanMedianCenter_col(m, "median"),
                            "median-center by column")},
                "meancenter, invalid axis"),
            "add_offset": (
                {"Row": (lambda m: ScaleOffset_row(m, 1.0, offsetValue),
                         "offset of "+str(offsetValue)+" by row"),
                 "Column": (lambda m: ScaleOffset_col(m, 1.0, offsetValue),
                            "offset of "+str(offsetValue)+" by column")},
                "offset"+str(offsetValue)+" invalid axis -not row or column"),
            "scale": (
                {"Row": (lambda m: ScaleOffset_row(m, scaleValue, 0.0),
                         "scaling "+str(scaleValue)+" by row"),
                 "Column": (lambda m: ScaleOffset_col(m, scaleValue, 0.0),
                            "scaling "+str(scaleValue)+" by column")},
                "scaling "+str(scaleValue)+" invalid axis"),
            "rank": (
                {"Row": (lambda m: Rankdata_ByRow(m),
                         "performed rank normalization by row"),
                 "Column": (lambda m: Rankdata_ByColumn(m),
                            "performed rank normalization by column")},
                "rank, invalid axis"),
            "divide_by_sum": (
                {"Row": (lambda m: Divide_By_Sum_row(m),
                         "performed divide row N values by row N's sum"),
                 "Column": (lambda m: Divide_By_Sum_col(m),
                            "performed divide column N values by column N's sum")},
                "divide_by_sum, invalid axis"),
        }

        # Whole-matrix logarithms: choice -> (log name, success message)
        log_ops = {
            "ln_normalization": (
                "log2",
                "log2 plus "+str(offsetValue)+" normalization for all values"),
            "log_normalization": (
                "log10",
                "log10 normalization for all values"),
        }

        if choice in axis_ops:
            per_axis, bad_axis_msg = axis_ops[choice]
            if args.axes in per_axis:
                transform, done_msg = per_axis[args.axes]
                matrix = transform(matrix)
                labeler(matrix, og_cols, og_rows, out_path)
                print(done_msg)
            else:
                print(bad_axis_msg)
        elif choice == "transpose":
            matrix = Transpose(matrix)
            # Row and column labels trade places in the transposed output.
            labeler(matrix, og_rows, og_cols, out_path)
            print("transpose mxn matrix to nxm size")
        elif choice in log_ops:
            log_name, done_msg = log_ops[choice]
            matrix = Convert2Logs(matrix, log_name, offsetValue)
            labeler(matrix, og_cols, og_rows, out_path)
            print(done_msg)
        else:
            print("Invalid normalization Choice")

    except Exception:
        traceback.print_exc()
        sys.exit(1)
297 | |
298 | |
# Script entry point: run the selected transformation, then report completion.
if __name__ == "__main__":
    main()
    print("Done")