1
+ − 1 '''
+ − 2 Created on Jun 7, 2017 modified Feb2018
+ − 3
+ − 4 @author: cjacoby and Bob Brown
+ − 5 '''
+ − 6
+ − 7 import sys, traceback, argparse
+ − 8 import numpy as np
+ − 9 import os
+ − 10 #import matplotlib.pyplot as plt
+ − 11 #import matplotlib.pyplot as plt; plt.rcdefaults()
+ − 12
# Define the reading function which pulls the data from a tab-delimited .txt file
def _plot_nan_counts(binCatDict):
    """Show a horizontal bar chart of the per-category missing-value counts.

    binCatDict maps each category key ("", " ", "-", "?", "NA", "N/A",
    "NAN", "Text") to the number of cells counted in that category.
    """
    # Imported lazily: plotting is optional, so matplotlib is only required
    # when a plot is actually requested.
    import matplotlib.pyplot as plt

    # Display names and the binCatDict keys they stand for, in chart order.
    binCat = ["null", "blank", 'hyphen', '?', 'NA', 'N/A', 'NAN', 'text']
    binKeys = ["", " ", '-', '?', 'NA', 'N/A', 'NAN', 'Text']

    binData = []
    for key in binKeys:
        value = binCatDict[key]
        # Zero counts are plotted as 0.01 -- presumably so the bar stays visible.
        binData.append(value + 0.01 if value < 1 else value)

    # See https://pythonspot.com/matplotlib-bar-chart/
    y_pos = np.arange(len(binCat))
    plt.yticks(y_pos, binCat)
    plt.title("Distribution of NAN types (UPPER & lower & MiXeD case combined)")
    plt.ylabel('NAN Types')
    plt.xlabel('Occurrences')
    plt.barh(y_pos, binData, align='center', alpha=0.5)

    # NOTE(review): call sequence kept exactly as in the original; confirm that
    # re-requesting figure 1 via subplots() gives the intended layout.
    fig, ax = plt.subplots(num=1, figsize=(8,3))
    ax.set_title("Data Cell Counts of Not A Number (NAN) Types")

    plt.show()


def reader(input_file_txt, create_plot= False):
    """Read a tab-delimited matrix file that has a header row and a label column.

    The first line supplies the column labels (its first cell is skipped).
    Every following line supplies a row label followed by the data cells.
    Cells equal (case-insensitively) to one of the missing-value tokens
    "", " ", "NAN", "NA", "N/A", "-", "?" are stored as numpy.nan and counted;
    all other cells are converted with float().

    Args:
        input_file_txt: path of the tab-delimited text file to read.
        create_plot: when true, display a bar chart of the missing-value
            counts (requires matplotlib).

    Returns:
        (matrix, column_labels, row_labels) where matrix is a list of rows,
        each row a list of floats, and the labels are lists of strings.

    Raises:
        SystemExit(-1): if a data line has a different number of cells than
            the header line, or -- after the whole file has been scanned -- if
            any cell was neither a number nor a missing-value token.  Up to
            the first 50 such cells are reported on stderr.
    """
    # "NA","N/A","-","?","NAN","NaN","Na","na","n/a",EMPTY/Null, SPACE (blank char)
    nanList = ["", " ", "NAN", "NA", "N/A", "-", "?"]
    binCatDict = {"":0, " ":0, "Text":0, "NA":0, "-":0, "NAN":0, "N/A":0, "?":0}

    column_labels = []
    row_labels = []
    matrix = []
    lengthRow = 0
    nanCnt = 0
    nonNumCnt = 0

    # Plain "r" already gives universal-newline handling on Python 3; the old
    # "rU" flag is rejected by Python 3.11+.  The with-block also guarantees
    # the file is closed when sys.exit() fires inside the loop.
    with open(input_file_txt, "r") as f:
        for row, line in enumerate(f, 1):
            cells = line.strip("\n").split('\t')

            if row == 1:
                # Header line: fixes the expected width of every later line.
                lengthRow = len(cells)
                column_labels = cells[1:]
                continue

            if len(cells) != lengthRow:
                sys.stderr.write("\nERROR matrix row lengths unequal for row 0 and row "+str(row)+"\n")
                sys.exit(-1)

            row_labels.append(str(cells[0]))
            temp = []

            for column, item in enumerate(cells[1:], 1):
                itemUC = item.upper()

                # Test the tokens before float(): float("NAN") succeeds, so
                # converting first would never count NAN cells.
                if itemUC in nanList:
                    temp.append(np.nan)
                    nanCnt += 1
                    binCatDict[itemUC] += 1
                    continue

                try:
                    temp.append(float(item))
                except ValueError:
                    temp.append(np.nan)
                    if nonNumCnt == 0:
                        sys.stderr.write("Start List of up to first 50 Invalid cell values \n")
                    nonNumCnt += 1
                    if nonNumCnt <= 50:
                        sys.stderr.write("At row_column= "+str(row)+"_"+str(column)+' invalid data cell value '+ item+"\n")

            matrix.append(temp)

    binCatDict["Text"] = nonNumCnt

    if create_plot:
        _plot_nan_counts(binCatDict)

    # Every data row has len(column_labels) cells (enforced above), so this is
    # the total cell count; unlike np.shape() it also works for an empty matrix.
    totalCells = len(matrix) * len(column_labels)
    if nanCnt > 0:
        print("WARNING -- Found "+str(nanCnt)+" Valid Non-numbers. Their percent of total matrix data cell values = "+str((100.0*nanCnt)/totalCells)+"% ")
    if nonNumCnt > 0:
        sys.exit(-1)

    return matrix,column_labels,row_labels
+ − 136
+ − 137 #----------------------------------------------------------------------
# Verify Matrix A column_labels match Matrix B row_labels in name and order for A*B
def MatchLabels(column_labels,row_labels):
    """Check that two label lists agree in length, names and order.

    Args:
        column_labels: column labels of the first matrix.
        row_labels: row labels of the second matrix.

    Returns:
        None when both lists have the same length and identical labels at
        every position.

    Raises:
        SystemExit(-11): if the lengths differ, or if any position holds
            different labels.  Diagnostics go to stderr; at most the first
            20 label mismatches are reported.
    """
    if len(column_labels) != len(row_labels):
        # The original called the non-existent sys.err(), which crashed with
        # AttributeError; report on stderr and exit like the mismatch path.
        sys.stderr.write("ERROR 1st matrix column count "+str(len(column_labels))+" not equal 2nd Matrix number row count "+str(len(row_labels))+"\n" )
        sys.exit(-11)

    cnt = 0
    for k, (col_label, row_label) in enumerate(zip(column_labels, row_labels)):
        if col_label != row_label:
            cnt += 1
            sys.stderr.write("ERROR At column & row position "+str(k)+" Matrix 1 column value "+str(col_label)+" not equal 2nd Matrix row value "+str(row_label)+"\n" )
            if cnt >= 20:
                # Cap the report at 20 mismatches.
                break

    if cnt > 0:
        sys.exit(-11)
+ − 152 #----------------------------------------------------------------------
# Restores row and column labels in the output
def Labeler(matrix,column_labels,row_labels,output_file_txt):
    """Write matrix to output_file_txt as tab-delimited text with labels.

    The first line holds the column labels, each preceded by a tab, so the
    top-left cell is empty.  Each following line holds one row label followed
    by that row's values, each preceded by a tab and rendered with format().
    The number of values written per row is the length of the first row.

    Args:
        matrix: sequence of rows, each an indexable sequence of values.
        column_labels: labels written on the header line.
        row_labels: one label per matrix row, indexed in step with matrix.
        output_file_txt: path of the file to create or overwrite.
    """
    with open(output_file_txt, 'w') as out:
        # Header line: the leading tab of the first label skips the label column.
        header = "".join('\t' + str(label) for label in column_labels)
        out.write(header)
        out.write('\n')

        # Width comes from the first row only, exactly as the original did.
        width = len(matrix[0]) if len(matrix) > 0 else 0

        for idx, values in enumerate(matrix):
            out.write(str(row_labels[idx]))
            out.write("".join('\t' + format(values[col]) for col in range(width)))
            out.write('\n')
+ − 171
+ − 172
+ − 173 #----------------------------------------------------------------------
if __name__ == '__main__':
    # Script entry point: read the matrix file named by the first argument.
    if len(sys.argv) < 2:
        # Without this check a missing argument surfaces as a bare IndexError.
        sys.stderr.write("Usage: " + os.path.basename(sys.argv[0]) + " <input_file.txt>\n")
        sys.exit(-1)

    input_file_txt = str(sys.argv[1])

    # reader() exits the process itself when the file contains invalid cells.
    matrix,column_labels,row_labels = reader(input_file_txt)
    print("Done")
+ − 179