comparison Matrix_Transformations.py @ 1:f1bcd79cd923 draft default tip

Uploaded
author insilico-bob
date Tue, 27 Nov 2018 14:20:40 -0500
parents
children
comparison
equal deleted inserted replaced
0:7f12c81e2083 1:f1bcd79cd923
1 '''
2 Created on Jun 6, 2017 updated Feb 2018
3
4 @author: cjacoby and Bob Brown
5 '''
6 import os
7 import sys, traceback, argparse
8 import numpy as np
9 from numpy import size, array
10 import warnings
11 from Matrix_Validate_import import reader
12 #import scipy.stats as ss
13 warnings.filterwarnings('error')
14
#Define argparse Function
def get_args():
    """Parse the command-line arguments of the matrix transformation tool.

    All six arguments are positional and therefore required, in this order:
    input file, transformation choice, axis, scale value, offset value and
    output file.  The scale and offset values are returned as the raw
    strings given on the command line.

    Returns:
        argparse.Namespace with attributes ``input_file_txt``, ``choice``,
        ``axes``, ``scalevalue``, ``offsetvalue`` and ``output_file_txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file_txt', help='text file input matrix (include .txt in name)')
    # The help text lists the literal keywords the dispatcher compares against;
    # the old text advertised numeric codes that were never accepted.
    parser.add_argument(
        'choice', type=str,
        help='Transformation to apply: z_score_normalization, '
             'mean_center_normalization, median_center_normalization, '
             'add_offset, scale, transpose, ln_normalization, '
             'log_normalization, rank or divide_by_sum')
    parser.add_argument('axes', type=str, help='Axis to transform on: Row or Column')
    # Positional arguments are always required, so they are no longer
    # described as "optional".
    parser.add_argument('scalevalue', help='scaling factor for matrix (always required; used by the scale choice)')
    parser.add_argument('offsetvalue', help='offset for matrix (always required; used by the add_offset and log choices)')
    parser.add_argument('output_file_txt', help='text file output matrix (include .txt in name)')
    args = parser.parse_args()
    return args
26
27
def Zscore_row(matrix):
    """Z-score normalize every row of ``matrix`` in place.

    Each value becomes ``(value - row_mean) / row_stdev``, where the mean and
    the sample standard deviation (``ddof=1``) are computed with the NaN-aware
    numpy functions, so NaN cells do not contribute to the statistics.

    A row whose standard deviation is exactly zero (all non-NaN values equal)
    cannot be scaled; such a row is only centered, which yields zeros, instead
    of dividing zero by zero.

    Args:
        matrix: two-dimensional, index-assignable matrix (``matrix[i][j]``).

    Returns:
        The same ``matrix`` object, modified in place.
    """
    for i in range(len(matrix)):
        row_mean = np.nanmean(matrix[i])
        row_stdev = np.nanstd(matrix[i], ddof=1)
        if row_stdev == 0:
            # Constant row: dividing by 1 leaves the centered values (zeros).
            print("WARNING Zero standard deviation for Row ", str(i+1), " values centered only")
            row_stdev = 1.0
        for j in range(len(matrix[0])):
            matrix[i][j] = (matrix[i][j] - row_mean) / row_stdev
    return matrix
37
#Define Z-Score normalization Function (by column)
def Zscore_col(matrix):
    """Z-score normalize every column of ``matrix`` in place.

    Each value becomes ``(value - col_mean) / col_stdev``, where the mean and
    the sample standard deviation (``ddof=1``) are computed with the NaN-aware
    numpy functions over the column's values.

    A column whose standard deviation is exactly zero (all non-NaN values
    equal) cannot be scaled; such a column is only centered, which yields
    zeros, instead of dividing zero by zero.

    Args:
        matrix: two-dimensional, index-assignable matrix (``matrix[j][i]``).

    Returns:
        The same ``matrix`` object, modified in place.
    """
    for i in range(len(matrix[0])):
        column = [row[i] for row in matrix]
        col_mean = np.nanmean(column)
        col_stdev = np.nanstd(column, ddof=1)
        if col_stdev == 0:
            # Constant column: dividing by 1 leaves the centered values (zeros).
            print("WARNING Zero standard deviation for Column ", str(i+1), " values centered only")
            col_stdev = 1.0
        for j in range(len(matrix)):
            matrix[j][i] = (matrix[j][i] - col_mean) / col_stdev
    return matrix
50
51
#Define Mean Centered or Median centered normalization Function (by row)
def MeanMedianCenter_row(matrix,type):
    """Subtract each row's mean or median from every value of that row.

    The center is computed over the whole row with the NaN-aware numpy
    functions.  (It was previously computed from ``row[1:]``, silently
    ignoring the first column even though that column was still shifted;
    the column-wise variant uses every value, and this now matches it.)

    Args:
        matrix: two-dimensional, index-assignable matrix (``matrix[i][j]``).
        type: ``"mean"`` selects the mean; any other value selects the median.

    Returns:
        The same ``matrix`` object, modified in place.
    """
    for i in range(len(matrix)):
        if type == "mean":
            center = np.nanmean(matrix[i])
        else:
            center = np.nanmedian(matrix[i])

        for j in range(len(matrix[0])):
            matrix[i][j] = matrix[i][j] - center
    return matrix
66
67
#Define mean or median centering by column
def MeanMedianCenter_col(matrix,type):
    """Subtract each column's mean or median from every value of that column.

    The center is computed with the NaN-aware numpy functions over all rows.

    Args:
        matrix: two-dimensional, index-assignable matrix.
        type: ``"mean"`` selects the mean; any other value selects the median.

    Returns:
        The same ``matrix`` object, modified in place.
    """
    column_count = len(matrix[0])
    for col in range(column_count):
        column_values = [row[col] for row in matrix]
        center = np.nanmean(column_values) if type == "mean" else np.nanmedian(column_values)
        for row in matrix:
            row[col] = row[col] - center
    return matrix
81
#Divide by sum of the Row Function
def Divide_By_Sum_row(matrix):
    """Divide every value of a row by that row's sum, in place.

    A row whose sum has an absolute value of at most 0.0001 is left untouched
    and an error line is printed for it (row numbers are reported 1-based).

    Args:
        matrix: two-dimensional matrix whose shape numpy can determine.

    Returns:
        The same ``matrix`` object, modified in place.
    """
    row_count, col_count = np.shape(matrix)

    for r in range(row_count):
        row_total = sum(matrix[r])

        # Skip rows that cannot be divided safely.
        if not abs(row_total) > .0001:
            print("ERROR Cannot divide by Sum almost zero", str(row_total), " for Row ",str(r+1))
            continue

        for c in range(col_count):
            matrix[r][c] = matrix[r][c]/row_total
    return matrix
98
99
#Divide by sum of the Column Function
def Divide_By_Sum_col(matrix):
    """Divide every value of a column by that column's sum, in place.

    The column sum is now actually computed; previously it was fixed at 0, so
    every column was reported as an error and no value was ever divided.

    A column whose sum has an absolute value of at most 0.0001 is left
    untouched and an error line is printed for it (column numbers are
    reported 1-based).

    Args:
        matrix: two-dimensional matrix whose shape numpy can determine.

    Returns:
        The same ``matrix`` object, modified in place.
    """
    numRow, numCol = np.shape(matrix)

    for i in range(numCol):
        sumValue = sum(matrix[j][i] for j in range(numRow))

        # Only divide when the sum is safely away from zero.
        if abs(sumValue) > .0001:
            for j in range(numRow):
                matrix[j][i] = matrix[j][i] / sumValue
        else:
            print("ERROR Cannot divide by Sum almost zero", str(sumValue), " for Column ",str(i+1))
    return matrix
116
#scale or add offset to matrix by row
def ScaleOffset_row(matrix,scaleValue,offset):
    """Replace every value ``x`` with ``scaleValue*x + offset``, row by row.

    When the absolute scale factor is not greater than 0.0001 nothing is
    changed and a message is printed instead.

    Args:
        matrix: two-dimensional matrix whose rows support slice assignment.
        scaleValue: multiplicative factor.
        offset: additive term applied after scaling.

    Returns:
        The same ``matrix`` object, modified in place.
    """
    if not abs(scaleValue) > 0.0001:
        print (" Scale facter "+str(scaleValue)+" too small")
        return matrix

    for row in matrix:
        # Slice assignment keeps the original row object and updates it in place.
        row[:] = [scaleValue*value+offset for value in row]
    return matrix
127
#scale or add offset to matrix by column
def ScaleOffset_col(matrix,scaleValue,offset):
    """Replace every value ``x`` with ``scaleValue*x + offset``, column by column.

    The previous implementation used the column index as a row index and
    assigned into ``matrix[:]``; for a list that is a throwaway copy, so the
    matrix was never changed, and an IndexError was raised whenever the matrix
    had more columns than rows.  Every cell is now updated explicitly.

    When the absolute scale factor is not greater than 0.0001 nothing is
    changed and a message is printed instead.

    Args:
        matrix: two-dimensional, index-assignable matrix (``matrix[j][i]``).
        scaleValue: multiplicative factor.
        offset: additive term applied after scaling.

    Returns:
        The same ``matrix`` object, modified in place.
    """
    if abs(scaleValue) > 0.0001:
        for i in range(len(matrix[0])):
            for j in range(len(matrix)):
                matrix[j][i] = scaleValue*matrix[j][i]+offset
    else:
        print (" Scale facter "+str(scaleValue)+" too small")
    return matrix
138
#Define Log normalization Method (log2 or log10)
def Convert2Logs(matrix,logValue, offset):
    """Replace every value ``x`` with the logarithm of ``x + offset``, in place.

    Warnings are switched to errors so that an invalid logarithm input
    surfaces as a ``RuntimeWarning`` exception.  At the first such failure a
    message is printed and processing stops; cells visited before the failure
    keep their transformed values and the remaining cells stay unchanged.

    Args:
        matrix: two-dimensional, index-assignable matrix (``matrix[i][j]``).
        logValue: ``"log2"`` selects base 2; any other value selects base 10.
        offset: value added to every cell before taking the logarithm.

    Returns:
        The same ``matrix`` object, fully or partially transformed.
    """
    import warnings
    warnings.filterwarnings('error')

    take_log = np.log2 if logValue == "log2" else np.log10

    for r in range(len(matrix)):
        for c in range(len(matrix[0])):
            try:
                matrix[r][c] = take_log(matrix[r][c]+offset)
            except RuntimeWarning:
                print(logValue+" normalization Failed: Encountered elements <= 0, which are invalid inputs for a Log normalization")
                # Abort the whole traversal on the first invalid cell.
                return matrix
    return matrix
160
#transpose matrix
def Transpose(in_mat):
    """Return a new list-of-lists holding the transpose of ``in_mat``.

    Args:
        in_mat: two-dimensional matrix whose shape numpy can determine.

    Returns:
        A new list with one inner list per input column; ``in_mat`` itself is
        not modified.
    """
    row_count, col_count = np.shape(in_mat)
    return [
        [in_mat[r][c] for r in range(row_count)]
        for c in range(col_count)
    ]
173
# restores row and column labels in output
def labeler(matrix,og_cols,og_rows,output_file_txt):
    """Write ``matrix`` with its labels to a tab-delimited text file.

    The first line holds the column labels, each preceded by a tab (so the
    top-left corner cell is empty).  Every following line holds one row label
    followed by that row's values, each preceded by a tab.

    Args:
        matrix: two-dimensional matrix indexed as ``matrix[i][j]``.
        og_cols: column labels for the header line.
        og_rows: row labels; one output line is written per label.
        output_file_txt: path of the file to create or overwrite.
    """
    with open(output_file_txt,'w') as out:
        header_cells = ''.join('\t' + str(col_label) for col_label in og_cols)
        out.write(header_cells)
        out.write('\n')
        for row_idx, row_label in enumerate(og_rows):
            out.write(str(row_label))
            width = len(matrix[0])
            out.write(''.join('\t' + format(matrix[row_idx][col_idx]) for col_idx in range(width)))
            out.write('\n')
187
#Define Main Function
def main():
    """Run one matrix transformation selected on the command line.

    Parses the arguments, reads the input matrix together with its column and
    row labels, applies the requested transformation, writes the labelled
    result to the output file and prints a short status line.  An unknown
    choice or axis only prints a message; no output file is written in that
    case.  Any exception prints a traceback and exits with status 1.
    """
    try:
        args = get_args()
        scaleValue = float(args.scalevalue)
        offsetValue = float(args.offsetvalue)

        matrix,og_cols,og_rows = reader(args.input_file_txt)
        out_path = args.output_file_txt

        # choice -> (row transform, column transform,
        #            row message, column message, invalid-axis message)
        # Transforms are wrapped in lambdas so names are only looked up when
        # the matching choice is actually requested.
        # NOTE(review): Rankdata_ByRow / Rankdata_ByColumn are not defined in
        # the visible part of this file; the "rank" choice raises NameError
        # unless they are provided elsewhere - confirm.
        axis_ops = {
            "z_score_normalization": (
                lambda m: Zscore_row(m),
                lambda m: Zscore_col(m),
                "zcore, row",
                "zscore, column",
                "zscore, invalid axis"),
            "mean_center_normalization": (
                lambda m: MeanMedianCenter_row(m,"mean"),
                lambda m: MeanMedianCenter_col(m,"mean"),
                "mean-center by row",
                "mean-center by column",
                "meancenter, invalid axis"),
            "median_center_normalization": (
                lambda m: MeanMedianCenter_row(m,"median"),
                lambda m: MeanMedianCenter_col(m,"median"),
                "median-center by row",
                "median-center by column",
                "meancenter, invalid axis"),
            "add_offset": (
                lambda m: ScaleOffset_row(m,1.0,offsetValue),
                lambda m: ScaleOffset_col(m,1.0,offsetValue),
                "offset of "+str(offsetValue)+" by row",
                "offset of "+str(offsetValue)+" by column",
                "offset"+str(offsetValue)+" invalid axis -not row or column"),
            "scale": (
                lambda m: ScaleOffset_row(m,scaleValue,0.0),
                lambda m: ScaleOffset_col(m,scaleValue,0.0),
                "scaling "+str(scaleValue)+" by row",
                "scaling "+str(scaleValue)+" by column",
                "scaling "+str(scaleValue)+" invalid axis"),
            "rank": (
                lambda m: Rankdata_ByRow(m),
                lambda m: Rankdata_ByColumn(m),
                "performed rank normalization by row",
                "performed rank normalization by column",
                "rank, invalid axis"),
            "divide_by_sum": (
                lambda m: Divide_By_Sum_row(m),
                lambda m: Divide_By_Sum_col(m),
                "performed divide row N values by row N's sum",
                "performed divide column N values by column N's sum",
                "divide_by_sum, invalid axis"),
        }

        if args.choice in axis_ops:
            by_row, by_col, row_msg, col_msg, bad_axis_msg = axis_ops[args.choice]
            if args.axes == "Row":
                transform, status = by_row, row_msg
            elif args.axes == "Column":
                transform, status = by_col, col_msg
            else:
                transform, status = None, bad_axis_msg

            if transform is not None:
                matrix = transform(matrix)
                labeler(matrix,og_cols,og_rows,out_path)
            print(status)
        elif args.choice == "transpose":
            matrix = Transpose(matrix)
            # Row and column labels trade places in the transposed output.
            labeler(matrix,og_rows,og_cols,out_path)
            print("transpose mxn matrix to nxm size")
        elif args.choice == "ln_normalization":
            # This choice is dispatched to the base-2 logarithm.
            matrix = Convert2Logs(matrix,"log2",offsetValue)
            labeler(matrix,og_cols,og_rows,out_path)
            print("log2 plus "+str(offsetValue)+" normalization for all values")
        elif args.choice == "log_normalization":
            matrix = Convert2Logs(matrix,"log10",offsetValue)
            labeler(matrix,og_cols,og_rows,out_path)
            print("log10 normalization for all values")
        else:
            print("Invalid normalization Choice")

    except Exception:
        traceback.print_exc()
        sys.exit(1)


# Script entry point.
if __name__ == '__main__':
    main()
    print("Done")