1
+ − 1 '''
+ − 2 Created on Jun 7, 2017 modified Feb2018
+ − 3
+ − 4 @author: Bob Brown and cjacoby
+ − 5 '''
+ − 6
+ − 7 import sys, traceback, argparse
+ − 8 import numpy as np
+ − 9 import os
+ − 10 from Matrix_Validate_import import reader, Labeler
+ − 11
+ − 12 #Define The Four Arguments Used in the Program
def get_args():
    """Parse the four positional command-line arguments of the tool.

    Returns:
        argparse.Namespace: carries ``input_file_txt``, ``replacement``,
        ``axes`` and ``output_file_txt`` exactly as typed by the user
        (no validation of the allowed values happens here).
    """
    cli = argparse.ArgumentParser()

    # (argument name, keyword options) in the order they appear on the
    # command line.
    positional_specs = (
        ('input_file_txt',
         {'help': 'tab delimited text file input matrix(include .txt in name)'}),
        ('replacement',
         {'type': str,
          'help': 'Choose Replacement for Missing Value. Valid Choices are strings: "Mean" or "Zero"'}),
        ('axes',
         {'type': str,
          'help': 'Choose Axes to Normalize On (Either "Row" or "Column"'}),
        ('output_file_txt',
         {'help': 'tab delimited text file output name (include .txt in name)'}),
    )
    for arg_name, options in positional_specs:
        cli.add_argument(arg_name, **options)

    return cli.parse_args()
+ − 21
+ − 22
+ − 23 #Define Function to Replace Null Values with Row Mean
def nan_replacer_mean_rows(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its row.

    The matrix is modified in place and also returned.

    Args:
        matrix: indexable 2-D collection of numeric values (rows of cells).

    Returns:
        tuple: ``(matrix, nonNumCnt, nanCnt)`` where ``nonNumCnt`` is always
        0 (this function does not detect non-numeric cells) and ``nanCnt``
        is the number of NaN cells that were replaced.
    """
    non_numeric_total = 0
    replaced_total = 0

    for row in matrix:
        # Mean is taken once per row, before any cell of that row is changed.
        row_mean = np.nanmean(row)
        # Column count comes from the first row, as for a rectangular matrix.
        for col_idx in range(len(matrix[0])):
            if np.isnan(row[col_idx]):
                row[col_idx] = row_mean
                replaced_total += 1

    return matrix, non_numeric_total, replaced_total
+ − 38
+ − 39 #Define Function to Replace Null Values with Column Mean
def nan_replacer_mean_columns(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its column.

    The matrix is modified in place and also returned. An empty matrix is
    returned unchanged instead of raising ``IndexError``.

    Args:
        matrix: indexable 2-D collection of numeric values (rows of cells).

    Returns:
        tuple: ``(matrix, nonNumCnt, nanCnt)`` where ``nonNumCnt`` is always
        0 (this function does not detect non-numeric cells) and ``nanCnt``
        is the number of NaN cells that were replaced.
    """
    nonNumCnt = 0
    nanCnt = 0

    # len() rather than truthiness so that numpy arrays are accepted too.
    if len(matrix) == 0:
        return matrix, nonNumCnt, nanCnt

    for col_idx in range(len(matrix[0])):
        # Column mean is computed once, before any cell in the column changes.
        col_mean = np.nanmean([row[col_idx] for row in matrix])
        for row in matrix:
            if np.isnan(row[col_idx]):
                row[col_idx] = col_mean
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt
+ − 56
+ − 57 #Define Function to Replace Null Values with Zero (axis orientation is irrelevant)
def nan_replacer_zero(matrix):
    """Replace every NaN cell with 0 (axis orientation is irrelevant).

    The matrix is modified in place and also returned.

    Args:
        matrix: indexable 2-D collection of numeric values (rows of cells).

    Returns:
        tuple: ``(matrix, nonNumCnt, nanCnt)`` where ``nonNumCnt`` is always
        0 (this function does not detect non-numeric cells) and ``nanCnt``
        is the number of NaN cells that were replaced.
    """
    nonNumCnt = 0
    nanCnt = 0

    # Loop replacing all null values with zero.
    for row in matrix:
        # Column count comes from the first row, as for a rectangular matrix.
        for col_idx in range(len(matrix[0])):
            if np.isnan(row[col_idx]):
                row[col_idx] = 0
                # Count the replacement so callers can report it, matching
                # the mean-based replacers.
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt
+ − 71
+ − 72 #Define Function to Re-Label Output Matrix
+ − 73 #!!!! not needed no output matrix from Validate tool
def OLD_labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write a one-line placeholder notice to ``output_file_txt``.

    ``matrix``, ``og_cols`` and ``og_rows`` are accepted but not used; only
    the fixed notice is written. The file is created or truncated.
    """
    notice = "Use original input file for further processing\n"
    # The context manager closes the handle; no explicit close is required.
    with open(output_file_txt, 'w') as out_handle:
        out_handle.write(notice)
+ − 79 # f.write("")
+ − 80 # for k in range(0,len(og_cols)):
+ − 81 # f.write('\t' + str(og_cols[k]))
+ − 82 # f.write('\n')
+ − 83 # for i in range(0,len(og_rows)):
+ − 84 # f.write(og_rows[i])
+ − 85 # for j in range(0,len(matrix[0])):
+ − 86 # f.write('\t' + format(matrix[i][j]))
+ − 87 # f.write('\n')
+ − 88
+ − 89 #Main Function
def main():
    """Read the input matrix, replace NaN cells, write the result and exit.

    The replacement strategy is chosen from the ``replacement`` ("Mean" or
    "Zero") and ``axes`` ("Row" or "Column", only used for "Mean")
    command-line arguments.

    Exit status (this function always terminates through ``sys.exit``):
        0   matrix processed and written
        -1  the replacer reported non-numeric, non-NaN cells
        -2  invalid ``replacement`` or ``axes`` argument
    """
    args = get_args()

    matrix, og_cols, og_rows = reader(args.input_file_txt)

    # NOTE(review): the original carried a "TURNED OFF FOR NOW" TODO about
    # whether MDA wants this replacement step; it is kept active here.
    if args.replacement == "Mean":
        if args.axes == "Row":
            replacer = nan_replacer_mean_rows
        elif args.axes == "Column":
            replacer = nan_replacer_mean_columns
        else:
            bad_axis_msg = 'Mean, but given Invalid Axis= ' + str(args.axes)
            print(bad_axis_msg)
            sys.stderr.write(bad_axis_msg)
            # Fail explicitly so an invalid axis is never reported as success.
            sys.exit(-2)
    elif args.replacement == "Zero":
        replacer = nan_replacer_zero
    else:
        # The offending argument here is the replacement, not the axis.
        bad_replacement_msg = 'Invalid Replacement= ' + str(args.replacement)
        print(bad_replacement_msg)
        sys.stderr.write(bad_replacement_msg)
        sys.exit(-2)

    matrix, nonNumCnt, nanCnt = replacer(matrix)
    # Output is written before reporting, as in the original flow.
    Labeler(matrix, og_cols, og_rows, args.output_file_txt)
    _report_and_exit(nonNumCnt, nanCnt, og_cols, og_rows)


def _report_and_exit(nonNumCnt, nanCnt, og_cols, og_rows):
    """Print the non-number / NaN summary and exit with -1 or 0."""
    # Percentages are relative to the size of the data area.
    cell_count = 1.0 * len(og_cols) * len(og_rows)

    if nonNumCnt > 0:
        error_msg = ('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '
                     + str(nonNumCnt)
                     + ', %.2f' % (100.0 * nonNumCnt / cell_count) + '%')
        print(error_msg)
        sys.stderr.write(error_msg)

    if nanCnt > 0:
        print('\nWARNING Matrix has ' + str(nanCnt)
              + ' that is %.2f' % (100.0 * nanCnt / cell_count)
              + '% known NAN identifiers')
    elif nonNumCnt <= 0:
        print('\nMatrix is Good-to-Go -- all numbers in matrix. ')

    if nonNumCnt > 0:
        sys.exit(-1)
    sys.exit(0)
+ − 176
+ − 177
# Script entry point: run the tool only when executed directly, not on import.
if __name__ == '__main__':
    main()
    # Printed only if main() returns normally instead of calling sys.exit().
    print("done")