Mercurial > repos > md-anderson-bioinformatics > matrix_manipulation
comparison Matrix_Transformations.py @ 1:f1bcd79cd923 draft default tip
Uploaded
| author | insilico-bob |
|---|---|
| date | Tue, 27 Nov 2018 14:20:40 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:7f12c81e2083 | 1:f1bcd79cd923 |
|---|---|
| 1 ''' | |
| 2 Created on Jun 6, 2017 updated Feb 2018 | |
| 3 | |
| 4 @author: cjacoby and Bob Brown | |
| 5 ''' | |
| 6 import os | |
| 7 import sys, traceback, argparse | |
| 8 import numpy as np | |
| 9 from numpy import size, array | |
| 10 import warnings | |
| 11 from Matrix_Validate_import import reader | |
| 12 #import scipy.stats as ss | |
| 13 warnings.filterwarnings('error') | |
| 14 | |
# Define argparse function
def get_args():
    """Parse the six positional command-line arguments.

    All six arguments are positional and therefore required.

    Returns:
        argparse.Namespace with the attributes input_file_txt, choice, axes,
        scalevalue, offsetvalue and output_file_txt.  scalevalue and
        offsetvalue are returned as strings exactly as typed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_file_txt',
        help='text file input matrix (include .txt in name)')
    # The help text lists the literal strings that main() compares against.
    parser.add_argument(
        'choice', type=str,
        help='transformation to apply: z_score_normalization, '
             'mean_center_normalization, median_center_normalization, '
             'add_offset, scale, transpose, ln_normalization, '
             'log_normalization, rank or divide_by_sum')
    parser.add_argument(
        'axes', type=str,
        help='axis to transform along: Row or Column')
    parser.add_argument(
        'scalevalue',
        help='scaling factor for the matrix (used by the scale choice)')
    parser.add_argument(
        'offsetvalue',
        help='offset for the matrix (used by add_offset and the log choices)')
    parser.add_argument(
        'output_file_txt',
        help='text file output matrix (include .txt in name)')
    return parser.parse_args()
| 26 | |
| 27 | |
def Zscore_row(matrix):
    """Z-score normalize every row of *matrix* in place.

    Each value becomes (value - row mean) / row standard deviation, using the
    NaN-ignoring mean and the sample standard deviation (ddof=1).

    Args:
        matrix: indexable rows of numbers supporting ``matrix[i][j]``
            assignment.

    Returns:
        The same *matrix* object, modified in place.

    Raises:
        ValueError: if a row's standard deviation is zero or not finite, so
            the z-score is undefined.  Nothing is modified in that case.
    """
    # Compute and validate the statistics of every row before touching the
    # data, so a bad row never leaves the matrix half-normalized.
    row_stats = []
    for index, row in enumerate(matrix):
        row_mean = np.nanmean(row)
        row_stdev = np.nanstd(row, ddof=1)
        if not np.isfinite(row_stdev) or row_stdev == 0:
            raise ValueError(
                "Cannot z-score row " + str(index + 1)
                + ": standard deviation is zero or undefined")
        row_stats.append((row_mean, row_stdev))

    for row, (row_mean, row_stdev) in zip(matrix, row_stats):
        for j in range(len(row)):
            row[j] = (row[j] - row_mean) / row_stdev
    return matrix
| 37 | |
# Define Z-Score normalization function (by column)
def Zscore_col(matrix):
    """Z-score normalize every column of *matrix* in place.

    Each value becomes (value - column mean) / column standard deviation,
    using the NaN-ignoring mean and the sample standard deviation (ddof=1).

    Args:
        matrix: indexable rows of numbers supporting ``matrix[i][j]``
            assignment.

    Returns:
        The same *matrix* object, modified in place.  An empty matrix is
        returned unchanged.

    Raises:
        ValueError: if a column's standard deviation is zero or not finite,
            so the z-score is undefined.  Nothing is modified in that case.
    """
    if len(matrix) == 0:
        return matrix

    # Validate every column first so a failure leaves the data untouched.
    column_stats = []
    for c in range(len(matrix[0])):
        column = [row[c] for row in matrix]
        col_mean = np.nanmean(column)
        col_stdev = np.nanstd(column, ddof=1)
        if not np.isfinite(col_stdev) or col_stdev == 0:
            raise ValueError(
                "Cannot z-score column " + str(c + 1)
                + ": standard deviation is zero or undefined")
        column_stats.append((col_mean, col_stdev))

    for row in matrix:
        for c, (col_mean, col_stdev) in enumerate(column_stats):
            row[c] = (row[c] - col_mean) / col_stdev
    return matrix
| 50 | |
| 51 | |
# Define mean-centered or median-centered normalization function (by row)
def MeanMedianCenter_row(matrix, type):
    """Subtract each row's mean or median from that row, in place.

    Args:
        matrix: indexable rows of numbers supporting ``matrix[i][j]``
            assignment.
        type: "mean" selects the NaN-ignoring mean; any other value selects
            the NaN-ignoring median.

    Returns:
        The same *matrix* object, modified in place.
    """
    center_of = np.nanmean if type == "mean" else np.nanmedian
    for row in matrix:
        # Use the whole row: every element is data, so the first value must
        # take part in the centre just like it does in the column variant.
        center = center_of(row)
        for j in range(len(row)):
            row[j] = row[j] - center
    return matrix
| 66 | |
| 67 | |
# Define mean-centered or median-centered normalization function (by column)
def MeanMedianCenter_col(matrix, type):
    """Subtract each column's mean or median from that column, in place.

    Args:
        matrix: indexable rows of numbers supporting ``matrix[i][j]``
            assignment.
        type: "mean" selects the NaN-ignoring mean; any other value selects
            the NaN-ignoring median.

    Returns:
        The same *matrix* object, modified in place.  An empty matrix is
        returned unchanged, matching the row variant.
    """
    if len(matrix) == 0:
        return matrix

    center_of = np.nanmean if type == "mean" else np.nanmedian
    for c in range(len(matrix[0])):
        center = center_of([row[c] for row in matrix])
        for row in matrix:
            row[c] = row[c] - center
    return matrix
| 81 | |
# Divide by sum of the row
def Divide_By_Sum_row(matrix):
    """Divide every value by the sum of its row, in place.

    The row sum ignores NaN entries, in line with the NaN-ignoring mean,
    median and standard deviation used by the other transformations in this
    file.  A row whose sum has magnitude <= 0.0001 is left unchanged and an
    error line is printed for it.

    Args:
        matrix: indexable rows of numbers supporting ``matrix[i][j]``
            assignment.

    Returns:
        The same *matrix* object, modified in place.
    """
    for i, row in enumerate(matrix):
        sumValue = np.nansum(row)

        # Skip rows whose sum is (almost) zero rather than dividing by it.
        if abs(sumValue) > .0001:
            for j in range(len(row)):
                row[j] = row[j] / sumValue
        else:
            print("ERROR Cannot divide by Sum almost zero", str(sumValue), " for Row ", str(i + 1))
    return matrix
| 98 | |
| 99 | |
# Divide by sum of the column
def Divide_By_Sum_col(matrix):
    """Divide every value by the sum of its column, in place.

    The column sum ignores NaN entries, in line with the NaN-ignoring mean,
    median and standard deviation used by the other transformations in this
    file.  A column whose sum has magnitude <= 0.0001 is left unchanged and
    an error line is printed for it.

    Args:
        matrix: indexable rows of numbers supporting ``matrix[i][j]``
            assignment.

    Returns:
        The same *matrix* object, modified in place.  An empty matrix is
        returned unchanged.
    """
    if len(matrix) == 0:
        return matrix

    for i in range(len(matrix[0])):
        # Actually accumulate the column total; without it every column
        # would be reported as having a zero sum.
        sumValue = np.nansum([row[i] for row in matrix])

        # Skip columns whose sum is (almost) zero rather than dividing by it.
        if abs(sumValue) > .0001:
            for row in matrix:
                row[i] = row[i] / sumValue
        else:
            print("ERROR Cannot divide by Sum almost zero", str(sumValue), " for Column ", str(i + 1))
    return matrix
| 116 | |
# Scale and/or offset the matrix, walking it row by row
def ScaleOffset_row(matrix, scaleValue, offset):
    """Replace every value x with ``scaleValue * x + offset``, in place.

    Args:
        matrix: indexable rows of numbers supporting slice assignment.
        scaleValue: multiplier; when its magnitude is not above 0.0001 a
            message is printed and the matrix is left unchanged.
        offset: constant added after scaling.

    Returns:
        The same *matrix* object.
    """
    # Guard clause: refuse scale factors that are (almost) zero.
    if not abs(scaleValue) > 0.0001:
        print(" Scale facter " + str(scaleValue) + " too small")
        return matrix

    for row in matrix:
        row[:] = [scaleValue * value + offset for value in row]
    return matrix
| 127 | |
# Scale and/or offset the matrix, walking it column by column
def ScaleOffset_col(matrix, scaleValue, offset):
    """Replace every value x with ``scaleValue * x + offset``, in place.

    The matrix is traversed one column at a time.  Because the same scale and
    offset apply to every element, the result does not depend on the axis.

    Args:
        matrix: indexable rows of numbers supporting ``matrix[i][j]``
            assignment.
        scaleValue: multiplier; when its magnitude is not above 0.0001 a
            message is printed and the matrix is left unchanged.
        offset: constant added after scaling.

    Returns:
        The same *matrix* object, modified in place.
    """
    if abs(scaleValue) > 0.0001:
        if len(matrix) > 0:
            # Index columns by the width of a row and update each cell
            # directly, so non-square matrices are handled correctly.
            for i in range(len(matrix[0])):
                for row in matrix:
                    row[i] = scaleValue * row[i] + offset
    else:
        print(" Scale facter " + str(scaleValue) + " too small")
    return matrix
| 138 | |
# Define log2 / log10 normalization method
def Convert2Logs(matrix, logValue, offset):
    """Replace every value x with log(x + offset), in place.

    Args:
        matrix: indexable rows of numbers supporting ``matrix[i][j]``
            assignment.
        logValue: "log2" selects base 2; any other value selects base 10.
        offset: constant added to each value before taking the logarithm.

    Returns:
        The same *matrix* object.  If any ``x + offset`` is <= 0 a failure
        message is printed and the matrix is returned completely unchanged.
    """
    log_func = np.log2 if logValue == "log2" else np.log10

    # Check every element before converting anything, so a failure can never
    # leave a mix of logged and raw values behind.  NaN compares false here
    # and is passed through to the logarithm, where it stays NaN.
    for row in matrix:
        for value in row:
            if value + offset <= 0:
                print(logValue + " normalization Failed: Encountered elements <= 0, which are invalid inputs for a Log normalization")
                return matrix

    for row in matrix:
        for j in range(len(row)):
            row[j] = log_func(row[j] + offset)
    return matrix
| 160 | |
# Transpose matrix
def Transpose(in_mat):
    """Return a new list-of-lists holding the transpose of *in_mat*.

    Args:
        in_mat: two-dimensional matrix; its shape is taken from ``np.shape``.

    Returns:
        A new list with one inner list per input column.  *in_mat* itself is
        not modified.
    """
    row_count, col_count = np.shape(in_mat)
    return [
        [in_mat[r][c] for r in range(row_count)]
        for c in range(col_count)
    ]
| 173 | |
# Restore row and column labels in the output file
def labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write *matrix* to *output_file_txt* as tab-separated text with labels.

    The first line holds the column labels, each preceded by a tab.  Every
    following line holds one row label followed by that row's values, each
    preceded by a tab and rendered with ``format()``.

    Args:
        matrix: indexable rows of values; one row is written per row label.
        og_cols: column labels for the header line.
        og_rows: row labels, one per output row.
        output_file_txt: path of the file to (over)write.
    """
    with open(output_file_txt, 'w') as out:
        header_cells = ''.join('\t' + str(col_label) for col_label in og_cols)
        out.write(header_cells + '\n')

        for r, row_label in enumerate(og_rows):
            # Row width is taken from the first matrix row.
            width = len(matrix[0])
            value_cells = ''.join(
                '\t' + format(matrix[r][c]) for c in range(width))
            out.write(str(row_label) + value_cells + '\n')
| 187 | |
# Define main function
def _resolve_rank_function(name):
    """Return the module-level rank function called *name*, or raise if absent."""
    func = globals().get(name)
    if func is None:
        raise NotImplementedError(
            "rank normalization is unavailable: " + name + " is not defined")
    return func


def _axis_transforms(scaleValue, offsetValue):
    """Build the dispatch table for the choices that depend on the axis.

    Returns:
        dict mapping each choice name to ``(per_axis, invalid_axis_message)``,
        where ``per_axis`` maps "Row" and "Column" to
        ``(transform, success_message)`` and ``transform`` takes and returns
        the matrix.
    """
    return {
        "z_score_normalization": (
            {"Row": (Zscore_row, "zcore, row"),
             "Column": (Zscore_col, "zscore, column")},
            "zscore, invalid axis"),
        "mean_center_normalization": (
            {"Row": (lambda m: MeanMedianCenter_row(m, "mean"),
                     "mean-center by row"),
             "Column": (lambda m: MeanMedianCenter_col(m, "mean"),
                        "mean-center by column")},
            "meancenter, invalid axis"),
        "median_center_normalization": (
            {"Row": (lambda m: MeanMedianCenter_row(m, "median"),
                     "median-center by row"),
             "Column": (lambda m: MeanMedianCenter_col(m, "median"),
                        "median-center by column")},
            "meancenter, invalid axis"),
        "add_offset": (
            {"Row": (lambda m: ScaleOffset_row(m, 1.0, offsetValue),
                     "offset of " + str(offsetValue) + " by row"),
             "Column": (lambda m: ScaleOffset_col(m, 1.0, offsetValue),
                        "offset of " + str(offsetValue) + " by column")},
            "offset" + str(offsetValue) + " invalid axis -not row or column"),
        "scale": (
            {"Row": (lambda m: ScaleOffset_row(m, scaleValue, 0.0),
                     "scaling " + str(scaleValue) + " by row"),
             "Column": (lambda m: ScaleOffset_col(m, scaleValue, 0.0),
                        "scaling " + str(scaleValue) + " by column")},
            "scaling " + str(scaleValue) + " invalid axis"),
        # The rank functions are looked up only when called, so a missing
        # definition cannot break the other choices.
        "rank": (
            {"Row": (lambda m: _resolve_rank_function("Rankdata_ByRow")(m),
                     "performed rank normalization by row"),
             "Column": (lambda m: _resolve_rank_function("Rankdata_ByColumn")(m),
                        "performed rank normalization by column")},
            "rank, invalid axis"),
        "divide_by_sum": (
            {"Row": (Divide_By_Sum_row,
                     "performed divide row N values by row N's sum"),
             "Column": (Divide_By_Sum_col,
                        "performed divide column N values by column N's sum")},
            "divide_by_sum, invalid axis"),
    }


def main():
    """Read the input matrix, apply the requested transformation, write it out.

    The command-line arguments come from get_args().  On success the labelled
    result is written with labeler() and a one-line summary is printed.  An
    unknown choice or axis only prints a message and writes no output.  Any
    exception prints a traceback and exits with status 1.
    """
    try:
        args = get_args()
        scaleValue = float(args.scalevalue)
        offsetValue = float(args.offsetvalue)

        matrix, og_cols, og_rows = reader(args.input_file_txt)
        axis_choices = _axis_transforms(scaleValue, offsetValue)

        if args.choice in axis_choices:
            per_axis, invalid_axis_message = axis_choices[args.choice]
            if args.axes in per_axis:
                transform, success_message = per_axis[args.axes]
                matrix = transform(matrix)
                labeler(matrix, og_cols, og_rows, args.output_file_txt)
                print(success_message)
            else:
                print(invalid_axis_message)
        elif args.choice == "transpose":
            matrix = Transpose(matrix)
            # Row and column labels swap places along with the data.
            labeler(matrix, og_rows, og_cols, args.output_file_txt)
            print("transpose mxn matrix to nxm size")
        elif args.choice == "ln_normalization":
            # NOTE(review): the choice is named "ln" but a base-2 log is
            # applied and reported -- confirm which is intended.
            matrix = Convert2Logs(matrix, "log2", offsetValue)
            labeler(matrix, og_cols, og_rows, args.output_file_txt)
            print("log2 plus " + str(offsetValue) + " normalization for all values")
        elif args.choice == "log_normalization":
            matrix = Convert2Logs(matrix, "log10", offsetValue)
            labeler(matrix, og_cols, og_rows, args.output_file_txt)
            print("log10 normalization for all values")
        else:
            print("Invalid normalization Choice")

    except Exception:
        traceback.print_exc()
        sys.exit(1)
| 297 | |
| 298 | |
# Script entry point: run the transformation, then report completion.
# "Done" is reached only when main() returns normally, because main() calls
# sys.exit(1) after printing a traceback for any exception.
# NOTE(review): indentation was lost in this copy; print("Done") is assumed
# to sit inside the guard -- confirm against the original file.
if __name__ == '__main__':
    main()
    print("Done")
