1
|
1 '''
|
|
2 Created on Jun 6, 2017 updated Feb 2018
|
|
3
|
|
4 @author: cjacoby and Bob Brown
|
|
5 '''
|
|
6 import os
|
|
7 import sys, traceback, argparse
|
|
8 import numpy as np
|
|
9 from numpy import size, array
|
|
10 import warnings
|
|
11 from Matrix_Validate_import import reader
|
|
12 #import scipy.stats as ss
|
|
# Promote every warning to an exception for the whole process. Convert2Logs
# relies on this to catch numpy's RuntimeWarning for non-positive log inputs.
warnings.filterwarnings('error')
|
|
14
|
|
15 #Define argparse Function
|
|
def get_args():
    """Parse the six required positional command-line arguments.

    Returns:
        argparse.Namespace with the string attributes ``input_file_txt``,
        ``choice``, ``axes``, ``scalevalue``, ``offsetvalue`` and
        ``output_file_txt``.

    The help text lists the keywords that ``main`` actually dispatches on.
    ``scalevalue`` and ``offsetvalue`` are required positionals: ``main``
    converts both with ``float()`` regardless of the chosen transformation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file_txt', help='text file input matrix(include .txt in name)')
    parser.add_argument(
        'choice',
        type=str,
        help='Choose transformation: z_score_normalization, '
             'mean_center_normalization, median_center_normalization, '
             'add_offset, scale, transpose, ln_normalization (log2), '
             'log_normalization (log10), rank, divide_by_sum',
    )
    parser.add_argument('axes', type=str, help='Choose Axis to normalize On (Row or Column)')
    parser.add_argument(
        'scalevalue',
        help='numeric scaling factor for matrix (always required; applied by "scale")',
    )
    parser.add_argument(
        'offsetvalue',
        help='numeric offset for matrix (always required; applied by '
             '"add_offset", "ln_normalization" and "log_normalization")',
    )
    parser.add_argument('output_file_txt', help='text file output matrix(include .txt in name)')
    return parser.parse_args()
|
|
26
|
|
27
|
|
def Zscore_row(matrix):
    """Z-score normalize every row of ``matrix`` in place.

    Each value becomes ``(value - row_mean) / row_stdev``, where the mean and
    the sample standard deviation (``ddof=1``) ignore NaN entries.

    A row whose non-NaN values are all identical has zero standard deviation,
    so a z-score is undefined. Instead of dividing by zero, such a row is
    centered on its common value (every non-NaN entry becomes 0) and a
    warning is printed.

    Args:
        matrix: Rectangular numeric data indexable as ``matrix[i][j]``;
            modified in place.

    Returns:
        The same ``matrix`` object.
    """
    for i in range(len(matrix)):
        low = np.nanmin(matrix[i])
        if low == np.nanmax(matrix[i]):
            # Constant row: skip nanstd entirely (it would be 0, or undefined
            # for a single value) and center on the exact common value.
            print("WARNING Standard deviation is zero for Row " + str(i + 1)
                  + "; values set to 0")
            center, spread = low, 1.0
        else:
            center = np.nanmean(matrix[i])
            spread = np.nanstd(matrix[i], ddof=1)
        for j in range(len(matrix[0])):
            matrix[i][j] = (matrix[i][j] - center) / spread
    return matrix
|
|
37
|
|
38 #Define Z-Score normalization Function
|
|
def Zscore_col(matrix):
    """Z-score normalize every column of ``matrix`` in place.

    Each value becomes ``(value - col_mean) / col_stdev``, where the mean and
    the sample standard deviation (``ddof=1``) ignore NaN entries.

    A column whose non-NaN values are all identical has zero standard
    deviation, so a z-score is undefined. Instead of dividing by zero, such a
    column is centered on its common value (every non-NaN entry becomes 0)
    and a warning is printed.

    Args:
        matrix: Rectangular numeric data indexable as ``matrix[i][j]``;
            modified in place.

    Returns:
        The same ``matrix`` object.
    """
    for i in range(len(matrix[0])):
        # Extract the column once; it feeds every statistic below.
        column = [row[i] for row in matrix]
        low = np.nanmin(column)
        if low == np.nanmax(column):
            # Constant column: skip nanstd entirely and center on the exact
            # common value so the result is exactly zero.
            print("WARNING Standard deviation is zero for Column " + str(i + 1)
                  + "; values set to 0")
            center, spread = low, 1.0
        else:
            center = np.nanmean(column)
            spread = np.nanstd(column, ddof=1)
        for j in range(len(matrix)):
            matrix[j][i] = (matrix[j][i] - center) / spread
    return matrix
|
|
50
|
|
51
|
|
52 #Define Mean Centered or Median centered normalization Function
|
|
def MeanMedianCenter_row(matrix, type):
    """Subtract each row's mean or median from that row, in place.

    The center is computed over the whole row. Earlier code skipped the first
    element (``row[1:]``), which biased the center: the matrix holds data
    values only, and the column variant already uses every value.

    Args:
        matrix: Rectangular numeric data indexable as ``matrix[i][j]``;
            modified in place.
        type: ``"mean"`` selects the NaN-ignoring mean; any other value
            selects the NaN-ignoring median.

    Returns:
        The same ``matrix`` object.
    """
    center_of = np.nanmean if type == "mean" else np.nanmedian
    for i in range(len(matrix)):
        center = center_of(matrix[i])
        for j in range(len(matrix[0])):
            matrix[i][j] = matrix[i][j] - center
    return matrix
|
|
66
|
|
67
|
|
# Define mean-centered or median-centered normalization by column
|
|
def MeanMedianCenter_col(matrix, type):
    """Subtract each column's mean or median from that column, in place.

    Args:
        matrix: Rectangular numeric data indexable as ``matrix[i][j]``;
            modified in place.
        type: ``"mean"`` selects the NaN-ignoring mean; any other value
            selects the NaN-ignoring median.

    Returns:
        The same ``matrix`` object.
    """
    row_indices = range(len(matrix))
    for col in range(len(matrix[0])):
        column_values = [row[col] for row in matrix]
        if type == "mean":
            center = np.nanmean(column_values)
        else:
            center = np.nanmedian(column_values)
        for r in row_indices:
            matrix[r][col] = matrix[r][col] - center
    return matrix
|
|
81
|
|
82 #Divide by sum of the Row Function
|
|
def Divide_By_Sum_row(matrix):
    """Divide every value in a row by that row's sum, in place.

    A row whose absolute sum is not greater than 0.0001 is left unchanged and
    an error line is printed for it.

    Args:
        matrix: Two-dimensional numeric data (``np.shape`` must yield two
            dimensions), indexable as ``matrix[i][j]``; modified in place.

    Returns:
        The same ``matrix`` object.
    """
    row_count, col_count = np.shape(matrix)

    for r in range(row_count):
        total = sum(matrix[r][:])

        # Negated comparison so a NaN total also takes the error path.
        if not abs(total) > .0001:
            print("ERROR Cannot divide by Sum almost zero", str(total), " for Row ", str(r + 1))
            continue

        for c in range(col_count):
            matrix[r][c] = matrix[r][c] / total
    return matrix
|
|
98
|
|
99
|
|
100 #Divide by sum of the Column Function
|
|
def Divide_By_Sum_col(matrix):
    """Divide every value in a column by that column's sum, in place.

    The column sum is now actually accumulated. It used to be fixed at 0, so
    every column hit the error branch and nothing was ever divided.

    A column whose absolute sum is not greater than 0.0001 is left unchanged
    and an error line is printed for it.

    Args:
        matrix: Two-dimensional numeric data (``np.shape`` must yield two
            dimensions), indexable as ``matrix[i][j]``; modified in place.

    Returns:
        The same ``matrix`` object.
    """
    numRow, numCol = np.shape(matrix)

    for i in range(numCol):
        sumValue = sum(matrix[j][i] for j in range(numRow))

        # Skip columns whose sum is (almost) zero to avoid dividing by it.
        if abs(sumValue) > .0001:
            for j in range(numRow):
                matrix[j][i] = matrix[j][i] / sumValue
        else:
            print("ERROR Cannot divide by Sum almost zero", str(sumValue), " for Column ", str(i + 1))
    return matrix
|
|
116
|
|
117 #scale or add offset to matrix by row
|
|
def ScaleOffset_row(matrix, scaleValue, offset):
    """Apply ``scaleValue * x + offset`` to every value, row by row, in place.

    If the absolute scale factor is not greater than 0.0001, a message is
    printed and the matrix is returned untouched.

    Args:
        matrix: Sequence of rows supporting slice assignment; each row's
            contents are replaced in place.
        scaleValue: Multiplicative factor.
        offset: Additive term applied after scaling.

    Returns:
        The same ``matrix`` object.
    """
    if not abs(scaleValue) > 0.0001:
        print (" Scale facter "+str(scaleValue)+" too small")
        return matrix

    for row_index in range(len(matrix)):
        rescaled = [scaleValue * value + offset for value in matrix[row_index]]
        matrix[row_index][:] = rescaled
    return matrix
|
|
127
|
|
128 #scale or add offset to matrix by column
|
|
def ScaleOffset_col(matrix, scaleValue, offset):
    """Apply ``scaleValue * x + offset`` to every value of ``matrix`` in place.

    The same scale and offset apply to every element, so the result matches
    the row variant. The earlier ``matrix[:][i] = ...`` form was broken: on a
    list of lists it wrote into a temporary copy, and it indexed rows with a
    column index, raising IndexError when there were more columns than rows.

    If the absolute scale factor is not greater than 0.0001, a message is
    printed and the matrix is returned untouched.

    Args:
        matrix: Numeric data indexable as ``matrix[i][j]``; modified in place.
        scaleValue: Multiplicative factor.
        offset: Additive term applied after scaling.

    Returns:
        The same ``matrix`` object.
    """
    if abs(scaleValue) > 0.0001:
        for row in matrix:
            # Element-wise assignment mutates the caller's rows directly.
            for j in range(len(row)):
                row[j] = scaleValue * row[j] + offset
    else:
        print (" Scale facter "+str(scaleValue)+" too small")
    return matrix
|
|
138
|
|
139 #Define Log2 normalization Method
|
|
def Convert2Logs(matrix, logValue, offset):
    """Replace every value with the log of ``value + offset``, in place.

    Values are processed row by row. The global warnings filter is set to
    ``'error'`` so that a numpy RuntimeWarning surfaces as an exception. On
    the first such exception a message is printed and processing stops:
    values already converted stay converted, and the rest are left as they
    were.

    Args:
        matrix: Rectangular numeric data indexable as ``matrix[i][j]``;
            modified in place.
        logValue: ``"log2"`` selects base 2; any other value selects base 10.
            Also used as the prefix of the failure message.
        offset: Added to each value before taking the logarithm.

    Returns:
        The same ``matrix`` object (possibly only partially converted).
    """
    import warnings
    warnings.filterwarnings('error')

    for r in range(len(matrix)):
        for c in range(len(matrix[0])):
            try:
                shifted = matrix[r][c] + offset
                if logValue == "log2":
                    matrix[r][c] = np.log2(shifted)
                else:
                    matrix[r][c] = np.log10(shifted)
            except RuntimeWarning:
                print(logValue+" normalization Failed: Encountered elements <= 0, which are invalid inputs for a Log normalization")
                # Abandon all remaining cells and rows.
                return matrix
    return matrix
|
|
160
|
|
161 #transpose matrix
|
|
def Transpose(in_mat):
    """Return the transpose of ``in_mat`` as a new list of lists.

    Args:
        in_mat: Two-dimensional data (``np.shape`` must yield two
            dimensions), indexable as ``in_mat[i][j]``. Not modified.

    Returns:
        A list with one inner list per input column, each holding that
        column's values in row order.
    """
    row_count, col_count = np.shape(in_mat)
    return [
        [in_mat[r][c] for r in range(row_count)]
        for c in range(col_count)
    ]
|
|
173
|
|
# Restore row and column labels in the output file
|
|
def labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write ``matrix`` with its labels to a tab-separated text file.

    The first line holds a tab followed by each column label (so it starts
    with an empty corner cell). Each later line holds a row label followed by
    a tab and each value of that row, formatted with ``format()``.

    Args:
        matrix: Data indexable as ``matrix[i][j]``; the number of values
            written per row is taken from ``len(matrix[0])``.
        og_cols: Column labels for the header line.
        og_rows: Row labels; one output line is written per label.
        output_file_txt: Path of the file to create or overwrite.
    """
    with open(output_file_txt, 'w') as out:
        out.write("")
        out.writelines('\t' + str(col_label) for col_label in og_cols)
        out.write('\n')
        for row_index, row_label in enumerate(og_rows):
            out.write(str(row_label))
            out.writelines(
                '\t' + format(matrix[row_index][col_index])
                for col_index in range(len(matrix[0]))
            )
            out.write('\n')
|
|
187
|
|
188 #Define Main Function
|
|
def main():
    """Run one matrix transformation chosen on the command line.

    Parses the arguments, converts the scale and offset to floats, reads the
    input with ``reader``, applies the transformation named by ``choice``
    (along ``axes`` where relevant), writes the result with ``labeler`` and
    prints a short status line.

    An unknown choice or axis only prints a message: no output file is
    written and the exit status is not changed. Any ``Exception`` raised
    along the way is reported with a traceback and the process exits with
    status 1.

    NOTE(review): ``Rankdata_ByRow`` and ``Rankdata_ByColumn`` are not defined
    or imported in the visible source, so the "rank" choice presumably fails
    with NameError (reported as a traceback, exit 1) -- confirm.
    """
    try:
        args = get_args()
        scaleValue = float(args.scalevalue)
        offsetValue = float(args.offsetvalue)

        matrix, og_cols, og_rows = reader(args.input_file_txt)
        out_path = args.output_file_txt

        # choice -> ({axis: (transform, success message)}, invalid-axis message)
        # Transforms are wrapped in lambdas so each function name is resolved
        # only when that particular operation is actually requested.
        axis_ops = {
            "z_score_normalization": (
                {
                    "Row": (lambda m: Zscore_row(m), "zcore, row"),
                    "Column": (lambda m: Zscore_col(m), "zscore, column"),
                },
                "zscore, invalid axis",
            ),
            "mean_center_normalization": (
                {
                    "Row": (lambda m: MeanMedianCenter_row(m, "mean"),
                            "mean-center by row"),
                    "Column": (lambda m: MeanMedianCenter_col(m, "mean"),
                               "mean-center by column"),
                },
                "meancenter, invalid axis",
            ),
            "median_center_normalization": (
                {
                    "Row": (lambda m: MeanMedianCenter_row(m, "median"),
                            "median-center by row"),
                    "Column": (lambda m: MeanMedianCenter_col(m, "median"),
                               "median-center by column"),
                },
                "meancenter, invalid axis",
            ),
            "add_offset": (
                {
                    "Row": (lambda m: ScaleOffset_row(m, 1.0, offsetValue),
                            "offset of "+str(offsetValue)+" by row"),
                    "Column": (lambda m: ScaleOffset_col(m, 1.0, offsetValue),
                               "offset of "+str(offsetValue)+" by column"),
                },
                "offset"+str(offsetValue)+" invalid axis -not row or column",
            ),
            "scale": (
                {
                    "Row": (lambda m: ScaleOffset_row(m, scaleValue, 0.0),
                            "scaling "+str(scaleValue)+" by row"),
                    "Column": (lambda m: ScaleOffset_col(m, scaleValue, 0.0),
                               "scaling "+str(scaleValue)+" by column"),
                },
                "scaling "+str(scaleValue)+" invalid axis",
            ),
            "rank": (
                {
                    "Row": (lambda m: Rankdata_ByRow(m),
                            "performed rank normalization by row"),
                    "Column": (lambda m: Rankdata_ByColumn(m),
                               "performed rank normalization by column"),
                },
                "rank, invalid axis",
            ),
            "divide_by_sum": (
                {
                    "Row": (lambda m: Divide_By_Sum_row(m),
                            "performed divide row N values by row N's sum"),
                    "Column": (lambda m: Divide_By_Sum_col(m),
                               "performed divide column N values by column N's sum"),
                },
                "divide_by_sum, invalid axis",
            ),
        }

        # choice -> (transform, success message); the axis argument is ignored.
        whole_matrix_ops = {
            "ln_normalization": (
                lambda m: Convert2Logs(m, "log2", offsetValue),
                "log2 plus "+str(offsetValue)+" normalization for all values",
            ),
            "log_normalization": (
                lambda m: Convert2Logs(m, "log10", offsetValue),
                "log10 normalization for all values",
            ),
        }

        if args.choice in axis_ops:
            by_axis, bad_axis_message = axis_ops[args.choice]
            if args.axes in by_axis:
                transform, done_message = by_axis[args.axes]
                matrix = transform(matrix)
                labeler(matrix, og_cols, og_rows, out_path)
                print(done_message)
            else:
                print(bad_axis_message)
        elif args.choice == "transpose":
            matrix = Transpose(matrix)
            # Row and column labels trade places along with the data.
            labeler(matrix, og_rows, og_cols, out_path)
            print("transpose mxn matrix to nxm size")
        elif args.choice in whole_matrix_ops:
            transform, done_message = whole_matrix_ops[args.choice]
            matrix = transform(matrix)
            labeler(matrix, og_cols, og_rows, out_path)
            print(done_message)
        else:
            print("Invalid normalization Choice")

    except Exception:
        traceback.print_exc()
        sys.exit(1)
|
|
297
|
|
298
|
|
# Script entry point: run the transformation, then report completion.
# NOTE(review): the source's indentation was lost; print("Done") is assumed
# to belong inside this guard -- confirm against the original file.
if __name__ == '__main__':
    main()
    print("Done")
|