Mercurial > repos > md-anderson-bioinformatics > matrix_manipulation
diff Matrix_Filters.py @ 1:f1bcd79cd923 draft default tip
Uploaded
author | insilico-bob |
---|---|
date | Tue, 27 Nov 2018 14:20:40 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Filters.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,694 @@ +''' +Created on Jun 7, 2017 updated Feb2018 + +@author: rbrown and cjacoby +''' + +import sys, traceback, argparse +import numpy as np +from Matrix_Validate_import import reader, Labeler +import math +#import matplotlib.pyplot as plt + +#Define argparse Function +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_file_txt', help='tab delimited text file input matrix(include .txt in name)') + parser.add_argument('choice',type=str, help='Variance Filter Method (Variance or Range)') + parser.add_argument('thresh', help='Thershold for Variance Filtering') + parser.add_argument('axes', help='Axes to Filter on (Either Row or Column') + parser.add_argument('output_file_txt', help='tab delimited text file output name (include .txt in name)') + args = parser.parse_args() + return args + +def Range_Filter_Row(matrix,thresh,row_header_list,column_header_list): + #Create Null Set of Filtered Row(Populated Later) + deletes = [] + minVal = +9999999 + maxVal = -99999 + #Loop to Determine Which Rows have sub-Threshold Range + for i in range(0,len(matrix)): + temp_range = np.max(matrix[i][0::]) - np.min(matrix[i][0::]) + + if temp_range < minVal: minVal = temp_range + elif temp_range > maxVal: maxVal = temp_range + + if temp_range <= float(thresh): + deletes = np.append(deletes,[i],0) + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,0) + filter_rows = np.delete(row_header_list,deletes,0) + filter_cols = column_header_list + return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal + +def Range_Filter_Col(matrix,thresh,row_header_list,column_header_list): + #Create Null Set of Filtered Row(Populated Later) + deletes = [] + minVal = +9999999 + maxVal = -99999 + #Loop to Determine Which Rows have sub-Threshold Variance + for i in range(0,len(matrix[0])): + + temp_range = np.max([row[i] for row in matrix]) - np.min([row[i] for row in matrix]) + + if temp_range < minVal: minVal = temp_range + elif temp_range > maxVal: maxVal = temp_range + + #print(temp_stdev) + if temp_range <= float(thresh): + deletes = np.append(deletes,[i],0) + print(deletes) + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,1) + filter_rows = row_header_list + filter_cols = np.delete(column_header_list,deletes,0) + #np.savetxt('testtest.txt',matrix,delimiter='\t') + + return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal + +#Define Function Which Deletes Sub-Threshold Rows +def Variance_Percent_Filter_row(matrix,cutoff,row_header_list,column_header_list, create_plot= False): +# if create a plot then DO NOT remove DATA only print diagram of variance ranges !!! + +# temp_stdev = np.var(matrix[i][1::]) + #cutoff is the percentile rank of the variance values + cutoff= int(cutoff)/100.0 + if cutoff > 0.99 or cutoff < .01: + sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99") + sys.exit(-8) + + deletes = [] + varianceDict = {} + minVal = +9999999 + maxVal = -99999 + + #Loop to Determine Which Rows have sub-Threshold Variance + for i in range(len(matrix)): + vector = [] + for p in range(len(matrix[0])): + if not math.isnan(matrix[i][p]): + vector.append(matrix[i][p]) + + #temp_stdev = np.var(matrix[:,i]) + if len(vector) > 1: + temp_stdev = np.var(vector) + else: + temp_stdev = 0.0 + + if temp_stdev < minVal: + minVal = temp_stdev + elif temp_stdev > maxVal: + maxVal = temp_stdev + + if temp_stdev not in varianceDict: + varianceDict[temp_stdev] = [i] + else: + tmp= varianceDict[temp_stdev] + tmp.append(i) + varianceDict[temp_stdev] = tmp + + + #calc how many rows to remove + lowerLimit = int(cutoff*len(matrix) +1) + limit = False + cnt = 0 + + for key in sorted(varianceDict.items()): + #rows = varianceDict[key] + rows= key[1] + cnt += len(rows) + if cnt < lowerLimit: #remove rows below percentile cutoff + for j in rows: + deletes = np.append(deletes,[j],0) + #print(deletes) + else: + limit = True + + print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff row = "+str(lowerLimit)+" of "+str(len(matrix))+" rows") + + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,0) + filter_rows = np.delete(row_header_list,deletes,0) + filter_cols = column_header_list + #np.savetxt('testtest.txt',matrix,delimiter='\t') + + """ + if create_plot: + numBins = 10 + binWidth = 1 + binCat = [] + binData = [] + counted = False + incrmnt= (maxVal-minVal)/(numBins-1) + current_bin_max = minVal + incrmnt/2 + cnt = 0 + for key, val in sorted(varianceDict.items()): + if key < current_bin_max: + cnt += len(val) # add all rows having that variance value + counted = False + else: + binData.append(cnt) + cnt= len(val) + binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0))) + current_bin_max += incrmnt + counted = True + + if not counted: + binData.append(cnt) + binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0))) + + tot = sum(binData) + bins = [] + for j in range(numBins): + bins.append(j*binWidth) + + #ttps://pythonspot.com/matplotlib-bar-chart/ + y_pos = np.arange(numBins) + plt.xticks(y_pos, binCat) + plt.title("Distribution of Variance Values by Row") + plt.ylabel('Variance Bin Totals') + plt.xlabel('Variance Value Bins') + #plt.legend() + plt.bar(y_pos, binData, align='center', alpha=0.5) + + fig, ax = plt.subplots(num=1, figsize=(8,3)) + + plt.show() + """ + + + + return matrix,filter_rows,filter_cols ,len(deletes), minVal,maxVal + +def Variance_Percent_Filter_col(matrix,cutoff,row_header_list,column_header_list, create_plot=False): + #cutoff is the percentile rank of the variance values + cutoff= int(cutoff)/100.0 + if cutoff > 0.99 or cutoff < .01: + sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99") + sys.exit(-8) + + deletes = [] + varianceDict = {} + minVal = +9999999 + maxVal = -99999 + lenCol = len(matrix[0]) + + #Loop to Determine Which Rows have sub-Threshold Variance + for i in range(lenCol): + vector = [] + for p in range(len(matrix)): + if not math.isnan(matrix[p][i]): + vector.append(matrix[p][i]) + + #temp_stdev = np.var(matrix[:,i]) + if len(vector) > 1: + temp_stdev = np.var(vector) + else: + temp_stdev = 0.0 + + if temp_stdev < minVal: + minVal = temp_stdev + elif temp_stdev > maxVal: + maxVal = temp_stdev + + if temp_stdev not in varianceDict: + varianceDict[temp_stdev] = [i] + else: + tmp= varianceDict[temp_stdev] + tmp.append(i) + varianceDict[temp_stdev] = tmp + + #print(temp_stdev) + #if temp_stdev <= float(cutoff): + + #calc how many rows to remove + lowerLimit = int(cutoff*lenCol +1) + limit = False + cnt = 0 + + for key in sorted(varianceDict.items()): + #rows = varianceDict[key] + cols= key[1] + cnt += len(cols) + if cnt < lowerLimit: #remove rows below percentile cutoff + for j in cols: + deletes = np.append(deletes,[j],0) + #print(deletes) + else: + limit = True + + print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff column= "+str(lowerLimit)+" of "+str(lenCol)+" columns") + + matrix = np.delete(matrix,deletes,1) + filter_rows = row_header_list + filter_cols = np.delete(column_header_list,deletes,0) + #np.savetxt('testtest.txt',matrix,delimiter='\t') + + """ + if create_plot: + numBins = 10 + binWidth = 1 + binCat = [] + binData = [] + counted = False + incrmnt= (maxVal-minVal)/(numBins-1) + current_bin_max = minVal + incrmnt/2 + cnt = 0 + for key, val in sorted(varianceDict.items()): + if key < current_bin_max: + cnt += len(val) # add all rows having that variance value + counted = False + else: + binData.append(cnt) + cnt= len(val) + binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0))) + current_bin_max += incrmnt + counted = True + + if not counted: + binData.append(cnt) + binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0))) + + tot = sum(binData) + bins = [] + + for j in range(numBins): + bins.append(j*binWidth) + #https://pythonspot.com/matplotlib-bar-chart/ + y_pos = np.arange(numBins) + + plt.xticks(y_pos, binCat) + plt.title("Distribution of Variance Values by Column") + plt.ylabel('Variance Bin Totals') + plt.xlabel('Variance Value Bins') + #plt.legend() + plt.bar(y_pos, binData, align='center', alpha=0.5) + + fig, ax = plt.subplots(num=1, figsize=(8,3)) + plt.show() + """ + + return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal + +def UpperLowerLimit_Filter_Row(upperLower, matrix,cutoff,row_header_list,column_header_list): + #Create Null Set of Filtered Row(Populated Later) + deletes = [] + minVal = +9999999 + maxVal = -99999 + #Loop to Determine Which Rows have sub-Threshold Range + for i in range(0,len(matrix)): + removeRow = False + + for j in range(len(matrix[0])): + val= matrix[i][j] + if not math.isnan(val): + if val <= cutoff and upperLower == 'lower': + removeRow = True + elif val >= cutoff and upperLower == 'upper': + removeRow = True + else: + if val < minVal: minVal = val + if val > maxVal: maxVal = val + + #print(temp_stdev) + if removeRow: + deletes = np.append(deletes,[i],0) + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,0) + filter_rows = np.delete(row_header_list,deletes,0) + filter_cols = column_header_list + + return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal + +def UpperLowerLimit_Filter_Col(upperLower,matrix,cutoff,row_header_list,column_header_list): + #Create Null Set of Filtered Row(Populated Later) + deletes = [] + minVal = +9999999 + maxVal = -99999 + #Loop to Determine Which Rows have sub-Threshold Variance + + for i in range(0,len(matrix[0])): + removeRow = False + + for j in range(len(matrix)): + val= matrix[j][i] + if not math.isnan(val): + if val <= cutoff and upperLower == 'lower': + removeRow = True + elif val >= cutoff and upperLower == 'upper': + removeRow = True + else: + if val < minVal: minVal = val + if val > maxVal: maxVal = val + + #print(temp_stdev) + if removeRow: deletes = np.append(deletes,[i],0) + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,1) + filter_rows = row_header_list + filter_cols = np.delete(column_header_list,deletes,0) + #np.savetxt('testtest.txt',matrix,delimiter='\t') + + return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal + +#========= remove rows with too many NANs in cells +def NAN_Filter_Row(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list): + + try: + #Create Null Set of Filtered Row(Populated Later) + maxFoundNANs = 0 + deletes = [] + #Loop to Determine Which Rows have sub-Threshold Range + for i in range(0,len(matrix)): + #matches= [s for s in matrix[i][0::] if any(nan == s.upper() for nan in nanList)] + #matches= [s for s in matrix[i][:] if s in nanList] + matches= [] + for s in matrix[i]: + if str(s) in nanList: matches.append(s) + + + lenMatches = len(matches) + if lenMatches > maxFoundNANs: maxFoundNANs = lenMatches + + if lenMatches >= maxAllowedNANs: + deletes = np.append(deletes,[i],0) + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,0) + filter_rows = np.delete(row_header_list,deletes,0) + filter_cols = column_header_list + + except Exception as err: + traceback.print_exc() + sys.exit(-4) + + return matrix, filter_rows, filter_cols,len(deletes),maxFoundNANs + +#========= remove Cols with too many NANs + +def NAN_Filter_Column(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list): + + #Create Null Set of Filtered Row(Populated Later) + minNumNANs = 0 + maxFoundNANs = 0 + deletes = [] + #Loop to Determine Which Rows have sub-Threshold Variance + for i in range(0,len(matrix[0])): + matches= [] + for j in range(len(matrix)): + if str(matrix[j][i]) in nanList: matches.append(matrix[j][i]) + + lenMatches = len(matches) + if lenMatches > maxFoundNANs: + maxFoundNANs = lenMatches + + if lenMatches >= maxAllowedNANs: + deletes = np.append(deletes,[i],0) + + #Delete cols with too many NANs + matrix = np.delete(matrix,deletes,1) + filter_rows = row_header_list + filter_cols = np.delete(column_header_list,deletes,0) + #np.savetxt('testtest.txt',matrix,delimiter='\t') + return matrix, filter_rows, filter_cols,len(deletes),maxFoundNANs + + +#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X +def Row_Value_MAD(matrix,cutoff,row_header_list,column_header_list): +#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X +# cutoff is MAX value used to meant to minimize the impact of one outlier + + deletes = [] + minVal = +9999999 + maxVal = -99999 + #Loop to Determine Which Rows have sub-Threshold Range + for i in range(0,len(matrix)): + medianRow = np.median(matrix[i]) + temp = np.median(abs(matrix[i]- medianRow)) +# median (|Xi - Xmedian|) > X => meant to minimize the impact of one outlier + if temp < cutoff: + deletes = np.append(deletes,[i],0) + + if temp < minVal: minVal = temp + if temp > maxVal: maxVal = temp + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,0) + filter_rows = np.delete(row_header_list,deletes,0) + filter_cols = column_header_list + print( "INFO Row MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) ) + + return matrix, filter_rows, filter_cols,len(deletes),maxVal + +#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X +def Col_Value_MAD(matrix,cutoff,row_header_list,column_header_list): +#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X +# cutoff is MAX value used to meant to minimize the impact of one outlier + deletes = [] + minVal = +9999999 + maxVal = -99999 + #Loop to Determine Which Rows have sub-Threshold Range + for i in range(0,len(matrix[0])): + matrixCol= [] + for j in range(len(matrix)): + matrixCol.append(matrix[j][i]) + + medianCol = np.median(matrixCol) + temp = np.median(abs(matrixCol- medianCol)) +# median (|Xi - Xmedian|) > X meant to minimize the impact of one outlier + if temp < cutoff: + deletes = np.append(deletes,[i],0) + + if temp < minVal: minVal = temp + if temp > maxVal: maxVal = temp + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,1) + filter_rows = row_header_list + filter_cols = np.delete(column_header_list,deletes,0) + print( "INFO Column MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) ) + + return matrix, filter_rows, filter_cols,len(deletes),maxVal + + +# if covariance of the data in two columns exceeds a thresehold remove one row list the rows in a separate output +# def CoVariance_Percent_Filter_row_col(matrix,thresh,row_header_list,column_header_list): +# xv= array([8., 9.5, 7.8, 4.2, -7.7, -5.4, 3.2]) +# yv= array([8.9, 2.0, 4.8, -4.2, 2.7, -3.4, -5.9]) +# +# def cov(x,y): +# if (len(x) != len(y) +# [Stop] +# x.bar = mean(x) +# y.bar = mean(y) +# N = len(x) +# Cov = (sum((x-x.bar)*(y-y.bar))) / (N-1.0) +# return(Cov) + +# #Create Null Set of Filtered Row(Populated Later) +# deletes = [] +# +# temp_mean = np.nanmean(matrix[i]) +# temp_stdev = np.nanstd(matrix[i]) +# +# get stddev of each row the calc xi -xj sq +# +# for i in range(0,len(matrix)): +# temp_range = np.max(matrix[i][0::]) - np.min(matrix[i][0::]) +# if temp_range <= float(thresh): +# deletes = np.append(deletes,[i],0) +# +# #Delete Rows sub-Threshold Rows +# matrix = np.delete(matrix,deletes,0) +# filter_rows = np.delete(row_header_list,deletes,0) +# filter_cols = column_header_list +# return(matrix,filter_rows,filter_cols) +# +# #np.savetxt('testtest.txt',matrix,delimiter='\t') +# return(matrix,filter_rows,filter_cols) +# + +#Define Function Which Labels Rows/Columns on Output +#below replace +# def labeler(matrix,filter_rows,filter_cols,output_file_txt): +# +# #Write Data to Specified Text File Output +# with open(output_file_txt,'w') as f: +# f.write("") +# for k in range(0,len(filter_cols)): +# f.write('\t' + filter_cols[k]) +# f.write('\n') +# for i in range(0,len(filter_rows)): +# f.write(filter_rows[i]) +# for j in range(0,len(matrix[0])): +# f.write('\t' + format(matrix[i][j])) +# f.write('\n') + + +#Define Main Function +def main(): + try: + args = get_args() + #sys.stdout.write(str(args)+"\n") +# <option value="LowerLimit">Minimum Absolute(Cell) Values to remove row/column</option> +# <option value="UpperLimit">Maximum Absolute(Cell) Values to remove row/column</option> +# <option value="NANnumber">NAN Number Cells Limit to remove row/column</option> +# <option value="NANpercent">NAN Percent Cells Limit to remove row/column</option> + nanList= ["NAN", "NA", "N/A", "-","?","nan", "na", "n/a"] + + matrix, column_header_list,row_header_list = reader(args.input_file_txt) + #old_reader matrix, row_header_list, column_header_list = reader(args.input_file_txt) + threshold = float(args.thresh) + if threshold < 0.000001: + print('Invalid negative or near-zero threshold chosen = '+str(args.thresh)+" choose positive value") + sys.exit(-4) + +#VariancePercent + if args.choice == "VariancePercent" or args.choice == "VarianceCount": # > percent variance + + if args.axes == "Row": + if args.choice == "VarianceCount": threshold= (1-threshold/len(row_header_list))*100.0 + + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_row(matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using variance percentile < '+str(args.thresh)+ ' by row. Matrix row minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal) + sys.stderr.write('\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + sys.exit(-1) + else: + print('\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + elif args.axes == "Column": + if args.choice == "VarianceCount": threshold= (1-threshold/len(column_header_list))*100.0 + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_col(matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using variance percentile < '+str(args.thresh)+ ' by columns. Matrix columns minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal) + sys.stderr.write('\nNO Filtering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + sys.exit(-1) + else: + print('\nFiltering out columns using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' columns') + else: + print('Invalid Axes ='+str(args.thresh)) + sys.exit(-1) +#LowerLimit + elif args.choice == "LowerLimit": #!! todo is NOT lower or upper limit but range of values + if args.axes == "Row": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('lower',matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using LowerLimit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.stderr.write('\nNO Filtering out rows using LowerLimit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' rows with Lower Limit < '+str(args.thresh)) + elif args.axes == "Column": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('lower', matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using Lower Limit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.stderr.write('\nNO Filtering out rows using Lower Limit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' columns with Lower Limit < '+str(args.thresh)) +#UpperLimit + elif args.choice == "UpperLimit": #!! todo is NOT lower or upper limit but range of values + if args.axes == "Row": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('upper',matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using Upper Limit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.stderr.write('\nNO Filtering out rows using Upper Limit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' rows with UpperLimit < '+str(args.thresh)) + elif args.axes == "Column": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('upper', matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.stderr.write('\nFiltering out rows using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' columns with UpperLimit < '+str(args.thresh)) +#MADlimit + elif args.choice == "MADcount" or args.choice == "MADpercent": #!! is lowerlimit of median absolute deviation medians + threshold= threshold + if args.axes == "Row": + if args.choice == "MADpercent": threshold= len(row_header_list)*threshold/100.0 + + matrix, filter_rows, filter_cols,delCnt,maxVal = Row_Value_MAD(matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal) + sys.stderr.write('\nFiltering out rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' rows using MAD maximum value > '+str(threshold)) + elif args.axes == "Column": + if args.choice == "MADpercent": threshold= len(column_header_list)*threshold/100.0 + + matrix, filter_rows, filter_cols,delCnt,maxVal = Col_Value_MAD(matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal) + sys.stderr.write('\nFiltering out columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' columns using MAD maximum value > '+str(threshold)) +#NANlimit + elif args.choice == "NANlimit" or args.choice == "NANpercent": + maxNANs= int(args.thresh) + val= ' ' + if args.choice == "NANpercent": + n,m = np.shape(matrix) + maxNANs= int(int(args.thresh)*n/100) + val= '%' + if args.axes == "Row": + matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Row(matrix,nanList,maxNANs,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs )) + sys.stderr.write('\nNO Filtering out rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs )) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' rows using NAN limit = or > '+str(args.thresh)+val) + elif args.axes == "Column": + matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Column(matrix, nanList, maxNANs, row_header_list, column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs)) + sys.stderr.write('\nNO Filtering out columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs)) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' columns using NAN limit = or > '+str(args.thresh)+val ) + +# elif args.choice == "covariance": +# if args.axes == "Row": +# matrix, filter_rows, filter_cols = CoVariance_Percent_Filter_row(matrix,args.thresh,row_header_list,column_header_list) +# Labeler(matrix,filter_rows,filter_cols,args.output_file_txt) +# print('Covariance_Filter on row') +# elif args.axes == "Column": +# matrix, filter_rows, filter_cols = CoVariance_Percent_Filter_col(matrix,args.thresh,row_header_list,column_header_list) +# Labeler(matrix,filter_rows,filter_cols,args.output_file_txt) +# print('Covariance_Filter on column') + else: + print('Invalid Axes = '+str(args.axes)) + sys.exit(-1) + else: + print("Invalid Filter Choice = "+str(args.choice)) + sys.exit(-2) + + + except Exception as err: + traceback.print_exc() + sys.exit(-3) + +if __name__ == '__main__': + main() + print("\ndone") + sys.exit(0)