# HG changeset patch # User insilico-bob # Date 1543346440 18000 # Node ID f1bcd79cd923a76ce97dcc843ad28f1c8835283e # Parent 7f12c81e20833740569fbd555d2a83e35f38496a Uploaded diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Filters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Filters.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,694 @@ +''' +Created on Jun 7, 2017 updated Feb2018 + +@author: rbrown and cjacoby +''' + +import sys, traceback, argparse +import numpy as np +from Matrix_Validate_import import reader, Labeler +import math +#import matplotlib.pyplot as plt + +#Define argparse Function +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_file_txt', help='tab delimited text file input matrix(include .txt in name)') + parser.add_argument('choice',type=str, help='Variance Filter Method (Variance or Range)') + parser.add_argument('thresh', help='Thershold for Variance Filtering') + parser.add_argument('axes', help='Axes to Filter on (Either Row or Column') + parser.add_argument('output_file_txt', help='tab delimited text file output name (include .txt in name)') + args = parser.parse_args() + return args + +def Range_Filter_Row(matrix,thresh,row_header_list,column_header_list): + #Create Null Set of Filtered Row(Populated Later) + deletes = [] + minVal = +9999999 + maxVal = -99999 + #Loop to Determine Which Rows have sub-Threshold Range + for i in range(0,len(matrix)): + temp_range = np.max(matrix[i][0::]) - np.min(matrix[i][0::]) + + if temp_range < minVal: minVal = temp_range + elif temp_range > maxVal: maxVal = temp_range + + if temp_range <= float(thresh): + deletes = np.append(deletes,[i],0) + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,0) + filter_rows = np.delete(row_header_list,deletes,0) + filter_cols = column_header_list + return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal + +def Range_Filter_Col(matrix,thresh,row_header_list,column_header_list): + #Create Null Set of 
Filtered Row(Populated Later) + deletes = [] + minVal = +9999999 + maxVal = -99999 + #Loop to Determine Which Rows have sub-Threshold Variance + for i in range(0,len(matrix[0])): + + temp_range = np.max([row[i] for row in matrix]) - np.min([row[i] for row in matrix]) + + if temp_range < minVal: minVal = temp_range + elif temp_range > maxVal: maxVal = temp_range + + #print(temp_stdev) + if temp_range <= float(thresh): + deletes = np.append(deletes,[i],0) + print(deletes) + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,1) + filter_rows = row_header_list + filter_cols = np.delete(column_header_list,deletes,0) + #np.savetxt('testtest.txt',matrix,delimiter='\t') + + return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal + +#Define Function Which Deletes Sub-Threshold Rows +def Variance_Percent_Filter_row(matrix,cutoff,row_header_list,column_header_list, create_plot= False): +# if create a plot then DO NOT remove DATA only print diagram of variance ranges !!! 
+ +# temp_stdev = np.var(matrix[i][1::]) + #cutoff is the percentile rank of the variance values + cutoff= int(cutoff)/100.0 + if cutoff > 0.99 or cutoff < .01: + sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99") + sys.exit(-8) + + deletes = [] + varianceDict = {} + minVal = +9999999 + maxVal = -99999 + + #Loop to Determine Which Rows have sub-Threshold Variance + for i in range(len(matrix)): + vector = [] + for p in range(len(matrix[0])): + if not math.isnan(matrix[i][p]): + vector.append(matrix[i][p]) + + #temp_stdev = np.var(matrix[:,i]) + if len(vector) > 1: + temp_stdev = np.var(vector) + else: + temp_stdev = 0.0 + + if temp_stdev < minVal: + minVal = temp_stdev + elif temp_stdev > maxVal: + maxVal = temp_stdev + + if temp_stdev not in varianceDict: + varianceDict[temp_stdev] = [i] + else: + tmp= varianceDict[temp_stdev] + tmp.append(i) + varianceDict[temp_stdev] = tmp + + + #calc how many rows to remove + lowerLimit = int(cutoff*len(matrix) +1) + limit = False + cnt = 0 + + for key in sorted(varianceDict.items()): + #rows = varianceDict[key] + rows= key[1] + cnt += len(rows) + if cnt < lowerLimit: #remove rows below percentile cutoff + for j in rows: + deletes = np.append(deletes,[j],0) + #print(deletes) + else: + limit = True + + print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff row = "+str(lowerLimit)+" of "+str(len(matrix))+" rows") + + + #Delete Rows sub-Threshold Rows + matrix = np.delete(matrix,deletes,0) + filter_rows = np.delete(row_header_list,deletes,0) + filter_cols = column_header_list + #np.savetxt('testtest.txt',matrix,delimiter='\t') + + """ + if create_plot: + numBins = 10 + binWidth = 1 + binCat = [] + binData = [] + counted = False + incrmnt= (maxVal-minVal)/(numBins-1) + current_bin_max = minVal + incrmnt/2 + cnt = 0 + for key, val in sorted(varianceDict.items()): + if key < current_bin_max: + cnt += len(val) # add all rows having that 
variance value + counted = False + else: + binData.append(cnt) + cnt= len(val) + binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0))) + current_bin_max += incrmnt + counted = True + + if not counted: + binData.append(cnt) + binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0))) + + tot = sum(binData) + bins = [] + for j in range(numBins): + bins.append(j*binWidth) + + #ttps://pythonspot.com/matplotlib-bar-chart/ + y_pos = np.arange(numBins) + plt.xticks(y_pos, binCat) + plt.title("Distribution of Variance Values by Row") + plt.ylabel('Variance Bin Totals') + plt.xlabel('Variance Value Bins') + #plt.legend() + plt.bar(y_pos, binData, align='center', alpha=0.5) + + fig, ax = plt.subplots(num=1, figsize=(8,3)) + + plt.show() + """ + + + + return matrix,filter_rows,filter_cols ,len(deletes), minVal,maxVal + +def Variance_Percent_Filter_col(matrix,cutoff,row_header_list,column_header_list, create_plot=False): + #cutoff is the percentile rank of the variance values + cutoff= int(cutoff)/100.0 + if cutoff > 0.99 or cutoff < .01: + sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99") + sys.exit(-8) + + deletes = [] + varianceDict = {} + minVal = +9999999 + maxVal = -99999 + lenCol = len(matrix[0]) + + #Loop to Determine Which Rows have sub-Threshold Variance + for i in range(lenCol): + vector = [] + for p in range(len(matrix)): + if not math.isnan(matrix[p][i]): + vector.append(matrix[p][i]) + + #temp_stdev = np.var(matrix[:,i]) + if len(vector) > 1: + temp_stdev = np.var(vector) + else: + temp_stdev = 0.0 + + if temp_stdev < minVal: + minVal = temp_stdev + elif temp_stdev > maxVal: + maxVal = temp_stdev + + if temp_stdev not in varianceDict: + varianceDict[temp_stdev] = [i] + else: + tmp= varianceDict[temp_stdev] + tmp.append(i) + varianceDict[temp_stdev] = tmp + + #print(temp_stdev) + #if temp_stdev <= float(cutoff): + + #calc how many rows to remove + lowerLimit = int(cutoff*lenCol +1) + limit = False + cnt = 0 + + 
# --- tail of Variance_Percent_Filter_col: rank grouped variances, delete low ones ---
    # walk variances in ascending order, deleting columns until the percentile
    # cutoff count is reached
    for key in sorted(varianceDict.items()):
        #rows = varianceDict[key]
        cols= key[1]
        cnt += len(cols)
        if cnt < lowerLimit: #remove rows below percentile cutoff
            for j in cols:
                deletes = np.append(deletes,[j],0)
            #print(deletes)
        else:
            limit = True

    print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff column= "+str(lowerLimit)+" of "+str(lenCol)+" columns")

    matrix = np.delete(matrix,deletes,1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list,deletes,0)
    #np.savetxt('testtest.txt',matrix,delimiter='\t')

    """
    if create_plot:
        numBins = 10
        binWidth = 1
        binCat = []
        binData = []
        counted = False
        incrmnt= (maxVal-minVal)/(numBins-1)
        current_bin_max = minVal + incrmnt/2
        cnt = 0
        for key, val in sorted(varianceDict.items()):
            if key < current_bin_max:
                cnt += len(val) # add all rows having that variance value
                counted = False
            else:
                binData.append(cnt)
                cnt= len(val)
                binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0)))
                current_bin_max += incrmnt
                counted = True

        if not counted:
            binData.append(cnt)
            binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0)))

        tot = sum(binData)
        bins = []

        for j in range(numBins):
            bins.append(j*binWidth)
        #https://pythonspot.com/matplotlib-bar-chart/
        y_pos = np.arange(numBins)

        plt.xticks(y_pos, binCat)
        plt.title("Distribution of Variance Values by Column")
        plt.ylabel('Variance Bin Totals')
        plt.xlabel('Variance Value Bins')
        #plt.legend()
        plt.bar(y_pos, binData, align='center', alpha=0.5)

        fig, ax = plt.subplots(num=1, figsize=(8,3))
        plt.show()
    """

    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal

def UpperLowerLimit_Filter_Row(upperLower, matrix,cutoff,row_header_list,column_header_list):
    """Delete every row containing a non-NaN value <= cutoff ('lower') or >= cutoff ('upper').

    min/max are tracked only over values that did NOT trigger removal.
    Returns (matrix, kept row labels, column labels, delete count, minVal, maxVal).
    """
    #Create Null Set of Filtered Row(Populated Later)
    deletes = []
    minVal = +9999999
    maxVal = -99999
    #Loop to Determine Which Rows have sub-Threshold Range
    for i in range(0,len(matrix)):
        removeRow = False

        for j in range(len(matrix[0])):
            val= matrix[i][j]
            if not math.isnan(val):
                if val <= cutoff and upperLower == 'lower':
                    removeRow = True
                elif val >= cutoff and upperLower == 'upper':
                    removeRow = True
                else:
                    if val < minVal: minVal = val
                    if val > maxVal: maxVal = val

        #print(temp_stdev)
        if removeRow:
            deletes = np.append(deletes,[i],0)

    #Delete Rows sub-Threshold Rows
    matrix = np.delete(matrix,deletes,0)
    filter_rows = np.delete(row_header_list,deletes,0)
    filter_cols = column_header_list

    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal

def UpperLowerLimit_Filter_Col(upperLower,matrix,cutoff,row_header_list,column_header_list):
    """Column-axis twin of UpperLowerLimit_Filter_Row: delete every column with a
    non-NaN value beyond the cutoff in the direction given by upperLower."""
    #Create Null Set of Filtered Row(Populated Later)
    deletes = []
    minVal = +9999999
    maxVal = -99999
    #Loop to Determine Which Rows have sub-Threshold Variance

    for i in range(0,len(matrix[0])):
        removeRow = False

        for j in range(len(matrix)):
            val= matrix[j][i]
            if not math.isnan(val):
                if val <= cutoff and upperLower == 'lower':
                    removeRow = True
                elif val >= cutoff and upperLower == 'upper':
                    removeRow = True
                else:
                    if val < minVal: minVal = val
                    if val > maxVal: maxVal = val

        #print(temp_stdev)
        if removeRow: deletes = np.append(deletes,[i],0)

    #Delete Rows sub-Threshold Rows
    matrix = np.delete(matrix,deletes,1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list,deletes,0)
    #np.savetxt('testtest.txt',matrix,delimiter='\t')

    return matrix, filter_rows, filter_cols,len(deletes),maxVal

#========= remove rows with too many NANs in cells
def NAN_Filter_Row(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list):
    """Delete every row containing >= maxAllowedNANs cells whose string form is in nanList.

    Returns (matrix, kept row labels, column labels, delete count, largest NaN count seen).
    Any unexpected error aborts the process with exit code -4.
    """

    try:
        #Create Null Set of Filtered Row(Populated Later)
        maxFoundNANs = 0
        deletes = []
        #Loop to Determine Which Rows have sub-Threshold Range
        for i in range(0,len(matrix)):
            #matches= [s for s in matrix[i][0::] if any(nan == s.upper() for nan in nanList)]
            #matches= [s for s in matrix[i][:] if s in nanList]
            matches= []
            for s in matrix[i]:
                # compare the string form so numeric nan and text markers both match
                if str(s) in nanList: matches.append(s)


            lenMatches = len(matches)
            if lenMatches > maxFoundNANs: maxFoundNANs = lenMatches

            if lenMatches >= maxAllowedNANs:
                deletes = np.append(deletes,[i],0)

        #Delete Rows sub-Threshold Rows
        matrix = np.delete(matrix,deletes,0)
        filter_rows = np.delete(row_header_list,deletes,0)
        filter_cols = column_header_list

    except Exception as err:
        traceback.print_exc()
        sys.exit(-4)

    return matrix, filter_rows, filter_cols,len(deletes),maxFoundNANs

#========= remove Cols with too many NANs

def NAN_Filter_Column(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list):
    """Column-axis twin of NAN_Filter_Row: delete columns with >= maxAllowedNANs NaN markers."""

    #Create Null Set of Filtered Row(Populated Later)
    minNumNANs = 0
    maxFoundNANs = 0
    deletes = []
    #Loop to Determine Which Rows have sub-Threshold Variance
    for i in range(0,len(matrix[0])):
        matches= []
        for j in range(len(matrix)):
            if str(matrix[j][i]) in nanList: matches.append(matrix[j][i])

        lenMatches = len(matches)
        if lenMatches > maxFoundNANs:
            maxFoundNANs = lenMatches

        if lenMatches >= maxAllowedNANs:
            deletes = np.append(deletes,[i],0)

    #Delete cols with too many NANs
    matrix = np.delete(matrix,deletes,1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list,deletes,0)
    #np.savetxt('testtest.txt',matrix,delimiter='\t')
    return matrix, filter_rows, filter_cols,len(deletes),maxFoundNANs


#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
def Row_Value_MAD(matrix,cutoff,row_header_list,column_header_list):
#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
# cutoff is MAX value used to meant to minimize the impact of one outlier
    """Delete every row whose median absolute deviation (MAD) is below cutoff.

    Returns (matrix, kept row labels, column labels, delete count, largest MAD seen).
    """

    deletes = []
    minVal = +9999999
    maxVal = -99999
    #Loop to Determine Which Rows have sub-Threshold Range
    for i in range(0,len(matrix)):
        medianRow = np.median(matrix[i])
        temp = np.median(abs(matrix[i]- medianRow))
# median (|Xi - Xmedian|) > X => meant to minimize the impact of one outlier
        if temp < cutoff:
            deletes = np.append(deletes,[i],0)

        if temp < minVal: minVal = temp
        if temp > maxVal: maxVal = temp

    #Delete Rows sub-Threshold Rows
    matrix = np.delete(matrix,deletes,0)
    filter_rows = np.delete(row_header_list,deletes,0)
    filter_cols = column_header_list
    print( "INFO Row MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )

    return matrix, filter_rows, filter_cols,len(deletes),maxVal

#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
def Col_Value_MAD(matrix,cutoff,row_header_list,column_header_list):
#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
# cutoff is MAX value used to meant to minimize the impact of one outlier
    """Column-axis twin of Row_Value_MAD: delete columns whose MAD is below cutoff."""
    deletes = []
    minVal = +9999999
    maxVal = -99999
    #Loop to Determine Which Rows have sub-Threshold Range
    for i in range(0,len(matrix[0])):
        matrixCol= []
        for j in range(len(matrix)):
            matrixCol.append(matrix[j][i])

        medianCol = np.median(matrixCol)
        temp = np.median(abs(matrixCol- medianCol))
# median (|Xi - Xmedian|) > X meant to minimize the impact of one outlier
        if temp < cutoff:
            deletes = np.append(deletes,[i],0)

        if temp < minVal: minVal = temp
        if temp > maxVal: maxVal = temp

    #Delete Rows sub-Threshold Rows
    matrix = np.delete(matrix,deletes,1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list,deletes,0)
    print( "INFO Column MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )

    return matrix, filter_rows, filter_cols,len(deletes),maxVal


# if covariance of the data in two columns exceeds a thresehold remove one row list the rows in a separate output
# def CoVariance_Percent_Filter_row_col(matrix,thresh,row_header_list,column_header_list):
# xv= array([8., 9.5, 7.8, 4.2, -7.7, -5.4, 3.2])
# yv= array([8.9, 2.0, 4.8, -4.2, 2.7, -3.4, -5.9])
#
+# def cov(x,y): +# if (len(x) != len(y) +# [Stop] +# x.bar = mean(x) +# y.bar = mean(y) +# N = len(x) +# Cov = (sum((x-x.bar)*(y-y.bar))) / (N-1.0) +# return(Cov) + +# #Create Null Set of Filtered Row(Populated Later) +# deletes = [] +# +# temp_mean = np.nanmean(matrix[i]) +# temp_stdev = np.nanstd(matrix[i]) +# +# get stddev of each row the calc xi -xj sq +# +# for i in range(0,len(matrix)): +# temp_range = np.max(matrix[i][0::]) - np.min(matrix[i][0::]) +# if temp_range <= float(thresh): +# deletes = np.append(deletes,[i],0) +# +# #Delete Rows sub-Threshold Rows +# matrix = np.delete(matrix,deletes,0) +# filter_rows = np.delete(row_header_list,deletes,0) +# filter_cols = column_header_list +# return(matrix,filter_rows,filter_cols) +# +# #np.savetxt('testtest.txt',matrix,delimiter='\t') +# return(matrix,filter_rows,filter_cols) +# + +#Define Function Which Labels Rows/Columns on Output +#below replace +# def labeler(matrix,filter_rows,filter_cols,output_file_txt): +# +# #Write Data to Specified Text File Output +# with open(output_file_txt,'w') as f: +# f.write("") +# for k in range(0,len(filter_cols)): +# f.write('\t' + filter_cols[k]) +# f.write('\n') +# for i in range(0,len(filter_rows)): +# f.write(filter_rows[i]) +# for j in range(0,len(matrix[0])): +# f.write('\t' + format(matrix[i][j])) +# f.write('\n') + + +#Define Main Function +def main(): + try: + args = get_args() + #sys.stdout.write(str(args)+"\n") +# +# +# +# + nanList= ["NAN", "NA", "N/A", "-","?","nan", "na", "n/a"] + + matrix, column_header_list,row_header_list = reader(args.input_file_txt) + #old_reader matrix, row_header_list, column_header_list = reader(args.input_file_txt) + threshold = float(args.thresh) + if threshold < 0.000001: + print('Invalid negative or near-zero threshold chosen = '+str(args.thresh)+" choose positive value") + sys.exit(-4) + +#VariancePercent + if args.choice == "VariancePercent" or args.choice == "VarianceCount": # > percent variance + + if args.axes == "Row": + if 
args.choice == "VarianceCount": threshold= (1-threshold/len(row_header_list))*100.0 + + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_row(matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using variance percentile < '+str(args.thresh)+ ' by row. Matrix row minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal) + sys.stderr.write('\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + sys.exit(-1) + else: + print('\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + elif args.axes == "Column": + if args.choice == "VarianceCount": threshold= (1-threshold/len(column_header_list))*100.0 + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_col(matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using variance percentile < '+str(args.thresh)+ ' by columns. Matrix columns minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal) + sys.stderr.write('\nNO Filtering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + sys.exit(-1) + else: + print('\nFiltering out columns using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' columns') + else: + print('Invalid Axes ='+str(args.thresh)) + sys.exit(-1) +#LowerLimit + elif args.choice == "LowerLimit": #!! 
todo is NOT lower or upper limit but range of values + if args.axes == "Row": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('lower',matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using LowerLimit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.stderr.write('\nNO Filtering out rows using LowerLimit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' rows with Lower Limit < '+str(args.thresh)) + elif args.axes == "Column": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('lower', matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using Lower Limit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.stderr.write('\nNO Filtering out rows using Lower Limit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' columns with Lower Limit < '+str(args.thresh)) +#UpperLimit + elif args.choice == "UpperLimit": #!! todo is NOT lower or upper limit but range of values + if args.axes == "Row": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('upper',matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using Upper Limit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.stderr.write('\nNO Filtering out rows using Upper Limit < '+str(args.thresh)+ ' by row. 
Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' rows with UpperLimit < '+str(args.thresh)) + elif args.axes == "Column": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('upper', matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.stderr.write('\nFiltering out rows using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' columns with UpperLimit < '+str(args.thresh)) +#MADlimit + elif args.choice == "MADcount" or args.choice == "MADpercent": #!! is lowerlimit of median absolute deviation medians + threshold= threshold + if args.axes == "Row": + if args.choice == "MADpercent": threshold= len(row_header_list)*threshold/100.0 + + matrix, filter_rows, filter_cols,delCnt,maxVal = Row_Value_MAD(matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal) + sys.stderr.write('\nFiltering out rows using MAD < '+str(threshold)+ ' by row. 
Matrix row MAD maximum value= %.2f' % maxVal) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' rows using MAD maximum value > '+str(threshold)) + elif args.axes == "Column": + if args.choice == "MADpercent": threshold= len(column_header_list)*threshold/100.0 + + matrix, filter_rows, filter_cols,delCnt,maxVal = Col_Value_MAD(matrix,threshold,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal) + sys.stderr.write('\nFiltering out columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' columns using MAD maximum value > '+str(threshold)) +#NANlimit + elif args.choice == "NANlimit" or args.choice == "NANpercent": + maxNANs= int(args.thresh) + val= ' ' + if args.choice == "NANpercent": + n,m = np.shape(matrix) + maxNANs= int(int(args.thresh)*n/100) + val= '%' + if args.axes == "Row": + matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Row(matrix,nanList,maxNANs,row_header_list,column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs )) + sys.stderr.write('\nNO Filtering out rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. 
Matrix row max NAN count is =' + str(maxFoundNANs )) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' rows using NAN limit = or > '+str(args.thresh)+val) + elif args.axes == "Column": + matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Column(matrix, nanList, maxNANs, row_header_list, column_header_list) + Labeler(matrix,filter_cols,filter_rows,args.output_file_txt) + if delCnt < 1: + print('\nNO Filtering occurred for columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs)) + sys.stderr.write('\nNO Filtering out columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs)) + sys.exit(-1) + else: + print('\nFiltered out '+str(delCnt)+' columns using NAN limit = or > '+str(args.thresh)+val ) + +# elif args.choice == "covariance": +# if args.axes == "Row": +# matrix, filter_rows, filter_cols = CoVariance_Percent_Filter_row(matrix,args.thresh,row_header_list,column_header_list) +# Labeler(matrix,filter_rows,filter_cols,args.output_file_txt) +# print('Covariance_Filter on row') +# elif args.axes == "Column": +# matrix, filter_rows, filter_cols = CoVariance_Percent_Filter_col(matrix,args.thresh,row_header_list,column_header_list) +# Labeler(matrix,filter_rows,filter_cols,args.output_file_txt) +# print('Covariance_Filter on column') + else: + print('Invalid Axes = '+str(args.axes)) + sys.exit(-1) + else: + print("Invalid Filter Choice = "+str(args.choice)) + sys.exit(-2) + + + except Exception as err: + traceback.print_exc() + sys.exit(-3) + +if __name__ == '__main__': + main() + print("\ndone") + sys.exit(0) diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Filters.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Filters.xml Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,58 @@ + + + rows or columns based on specified threshold + Matrix_Filters.py '$p_input' '$extra.choice' '$extra.thresh' '$axes' 
'$output_file' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Manipulation.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Manipulation.sh Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,42 @@ +#echo "1: " $1 # tool directory +#echo "2: " $2 # manipulation option +#echo "3: " $3 # input file +#echo "4: " $4 # output file +#echo "5: " $5 # choice +#echo "6: " $6 # thresh +#echo "7: " $7 # axis +#echo "8: " $8 # transpose +#echo "9: " $9 # input2 +#echo "10: " ${10} # offsetvalue +#echo "11: " ${11} # scalevalue +#echo "12: " ${12} +#echo "13: " ${13} +#echo "14: " ${14} +#echo "15: " ${15} +#echo "16: " ${16} + +#echo "tool directory is: " $1 +if [ "$2" = "Matrix_Filters" ]; then + echo "filter chosen" + #python $__tool_directory__/Matrix_Filters.py '$p_input '${manipulation.extra.choice}' '${manipulation.extra.thresh}' '${manipulation.axis}' '$output_file' + python $1/Matrix_Filters.py $3 $5 $6 $7 $4 +elif [ "$2" = "Matrix_Multiply" ]; then + echo "multiply chosen" + #python '$__tool_directory__/Matrix_Multiply.py' '$p_input' '${manipulation.extra.transpose}' '${manipulation.extra.input2}' '${manipulation.extra.choice}' '$output_file' + python $1/Matrix_Multiply.py $3 $8 $9 $5 $4 +elif [ "$2" = "Matrix_Statistics" ]; then + echo "statistics chosen" + #python '$__tool_directory__/Matrix_Statistics.py' '$p_input' '$choice' '$cutoff' '$axis' '$out_file' + python $1/Matrix_Statistics.py $3 $5 $6 $7 $4 +elif [ "$2" = "Matrix_Transformations" ]; then + echo "transform chosen" + #python '$__tool_directory__/Matrix_Transformations.py' '$p_input' '$choice' '$axis' '$scalevalue' '$offsetvalue' '$output_file' + python $1/Matrix_Transformations.py $3 $5 $7 ${11} ${10} $4 +elif [ "$2" = "Matrix_Validations" ]; then + echo "validations chosen" + #python '$__tool_directory__/Matrix_Validations.py' '$p_input' '${manipulation.extra.choice}' 
'${manipulation.extra.axis}' '$output_file' + python $1/Matrix_Validations.py $3 $5 $7 $4 +else + echo "no valid choice made" +fi + diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Manipulation.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Manipulation.xml Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,406 @@ + + + Data manipulation before heatmap creation + + + $__tool_directory__/Matrix_Manipulation.sh '$__tool_directory__' '${manipulation.option}' '$p_input' '$output_file' + '${manipulation.extra.choice}' '${manipulation.extra.thresh}' '${manipulation.extra.axis}' + '${manipulation.extra.transpose}' '${manipulation.extra.input2}' '${manipulation.extra.offsetvalue}' '${manipulation.extra.scalevalue}' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Manipulation_Tool_Shed.zip Binary file Matrix_Manipulation_Tool_Shed.zip has changed diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Multiply.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Multiply.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,173 @@ +''' +Created on March 6, 2018 + +@author: Bob Brown based on John Weinstein's algorithm +''' + +import os +import re +import shutil +import traceback +import sys, traceback, argparse +import numpy as np +import warnings 
+#import scipy.stats as ss +from Matrix_Validate_import import reader, Labeler, MatchLabels +import math +warnings.filterwarnings('error') + +# John Weinsteins algorithm by bob brown https://discover.nci.nih.gov/CorrelateMatrices/help.do +#http://www.blog.pythonlibrary.org/2014/04/30/reading-excel-spreadsheets-with-python-and-xlrd/ + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_file1', help='text file input matrix(include .txt in name)') + parser.add_argument('transpose', type=str, help='transpose matrix 1?') + parser.add_argument('input_file2', help='text file input matrix(include .txt in name)') + parser.add_argument('choice', type=str, help='Choose Normalization Method: 1 = Z-score, 2 = Mean Centered, 3 = log2, 4= rank') +# parser.add_argument('scaleValue', help='optional scaling factor for matrix)') + parser.add_argument('out_fileName', help='text file output matrix(include .txt in name)') + args = parser.parse_args() + if args.transpose == "": args.transpose = 'n' + return args + + +def Matrix_Multiply(matrix1, matrix2): + + try: +#TODO handle NANs + + matrixOut= np.dot(matrix1, matrix2) + + + except Exception as err: + traceback.print_exc() + sys.exit(-5) + + return(matrixOut) + + +#CorrelateMatrices correlation acorss 2 martices https://discover.nci.nih.gov/CorrelateMatrices/home.do +def Correlate_Matrices(matrix1, matrix2): + + #try: + # Leave both matrices as size axn and bxn and treat a is column and b as row + #matrix1T = Transpose(matrix1) + +#TODO handle NANs + numRows1,numColumns1= np.shape(matrix1) + + numRows2,numColumns2= np.shape(matrix2) + matrixOut= [] + + if numColumns1 != numRows2: + print("ERROR number columns Matrix 1 ", str(numColumns1), " not equal number rows for Matrix 2 ",str(numRows2)) + sys.exit(-1) +#TODO need to look for NANs?? 
+ + for i in range(numRows1): + vectorM1 = matrix1[i][:] + meanVec1 = np.nanmean(vectorM1) + varStdDev1 = np.nanstd(vectorM1, ddof=1) + lowStdDev1 = False + #if equals zero + if abs(varStdDev1) < .000001: + print("ERROR Variance value almost zero", str(varStdDev1), " for Matrix 1 Row ",str(i+1)) + lowStdDev1= True + correlationRow= [] + + for j in range(numColumns2): + vectorM2 = [] + for t in range(numRows2): + vectorM2.append(matrix2[t][j]) + meanVec2 = np.nanmean(vectorM2) + varStdDev2 = np.nanstd(vectorM2, ddof=1) + lowStdDev2= False + #if equals zero + if abs(varStdDev2) < .000001: + print("ERROR Variance value almost zero", str(varStdDev2), " for Matrix 2 Column ",str(j+1)) + lowStdDev2= True + + covarStdDev12= 0 + + if not lowStdDev1 and not lowStdDev2: + #try: + for pos in range(len(vectorM1)): + covarStdDev12 += ((vectorM1[pos]-meanVec1)/varStdDev1)*((vectorM2[pos]-meanVec2)/varStdDev2) +# bottom= (numColumns1 -1)*(varStdDev1*varStdDev2) +# correlationRow.append( covarStdDev12/bottom) + correlationRow.append( covarStdDev12/(numColumns1 -1)) + #except: bad value because of NAN or other + else: + correlationRow.append("divide by 0") # cannot calculate correlation var too small + + matrixOut.append(correlationRow) + +# except Exception as err: +# traceback.print_exc() +# sys.exit(-6) + + return(matrixOut) + +#---------------------------------------------------------------------- +def Transpose(in_mat): + out_mat = [] + numRows,numColumns= np.shape(in_mat) + + for i in range(numColumns): + temp= [] + for j in range(numRows): + temp.append(in_mat[j][i]) + out_mat.append(temp) + #print( str(out_mat)) + return out_mat + + +#---------------------------------------------------------------------- +if __name__ == "__main__": + +# input_file1 = "/Users/bobbrown/Desktop/Gene-by-var.txt" +# input_file2 = "/Users/bobbrown/Desktop/var-by-sample.txt" +# out_fileName = "/Users/bobbrown/Desktop/MatixMult-1-2-Out.txt" +# selection = "MatrixMultiply" +#TODO address NANs ??? 
+ + try: + args = get_args() + selection= args.choice + + matrix1,column_labels1,row_labels1 = reader(args.input_file1) # to be transposed later + matrix2,column_labels2,row_labels2 = reader(args.input_file2) + + + if args.transpose == 'y' or args.input_file1 == args.input_file2: + matrix1 = Transpose(matrix1) + print("\n>>>NOTICE Transposed first matrix so matrix 1 columns = Matrix 2 number rows ") + temp = row_labels1 #swap labels for output matrix + row_labels1 = column_labels1 #swap labels for output matrix + column_labels1= temp #swap labels for output matrix + + MatchLabels(column_labels1,row_labels2) # verify labels and their order match + + if len(column_labels1) != len(row_labels2): + print("\n>>> ERROR attempting to multiple Matrices of incompatible dimensions ") + print("First Matrix is "+str(len(row_labels1))+" by "+str(len(column_labels1))+" where second Matrix is "+str(len(row_labels2))+" by "+str(len(column_labels2))+"\n") + print("Matrices must have dimensions AxB and BxC. A can equal C (square matrices)") + sys.exit(-1) + + if selection == "MatrixMultiply": + matrixOut= Matrix_Multiply(matrix1, matrix2 ) + + elif selection == "Corr2Matrices" or selection == "Corr1Matrix": + matrixOut = Correlate_Matrices(matrix1, matrix2) + + Labeler(matrixOut,column_labels2,row_labels1,args.out_fileName) + + print("Matrix Multiply "+str(len(row_labels1))+" by "+str(len(column_labels1))+" Matrix 1 by "+str(len(row_labels2))+" by "+str(len(column_labels2))+" matrix 2") + print("Output Matrix dimensions are "+str(len(row_labels1))+" by "+str(len(column_labels2))+"\n") + + except Exception as err: + traceback.print_exc() + sys.exit(-3) + + sys.exit(0) \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Multiply.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Multiply.xml Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,52 @@ + + + one matrix using one or two matrices + Matrix_Multiply.py '$extra.input1' '$extra.transpose' '$extra.input2'
'$extra.choice' '$output_file' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Statistics.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Statistics.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,145 @@ +''' +Created on Feb2018 + +@author: bob brown +''' + +import sys, traceback, argparse +import numpy as np +from Matrix_Validate_import import reader +#import matplotlib.pyplot as plt +from Matrix_Filters import Variance_Percent_Filter_row, Variance_Percent_Filter_col + +#Define argparse Function +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_file_txt', help='tab delimited text file input matrix(include .txt in name)') + parser.add_argument('choice',type=str, help='Variance Filter Method (Variance or Range)') + parser.add_argument('thresh', help='Thershold for Variance Filtering') + parser.add_argument('axes', help='Axes to Filter on (Either Row or Column') + parser.add_argument('output_file_txt', help='tab delimited text file output name (include .txt in name)') + args = parser.parse_args() + return args + + +#Define Function Which Labels Rows/Columns on Output +def labeler(matrix,filter_rows,filter_cols,output_file_txt): + + #Write Data to Specified Text File Output + with open(output_file_txt,'w') as f: + f.write("") + for k in range(0,len(filter_cols)): + f.write('\t' + filter_cols[k]) + f.write('\n') + for i in range(0,len(filter_rows)): + f.write(filter_rows[i]) + for j in range(0,len(matrix[0])): + f.write('\t' + format(matrix[i][j])) + f.write('\n') + + +def Histo(matrix): + numBins= 20 + data = [] +# numRow,numCol= np.shape(matrix) + for i in range(len(matrix[0])): + data.append(np.nanmean([row[i] for row in matrix])) + +# print(str(np.nanmean([row[i] for row in matrix]))) + +#https://stackoverflow.com/questions/5328556/histogram-matplotlib + #bins = [0, 40, 60, 75, 90, 110, 125, 140, 160, 200] + minBin = 
int(min(data)-0.5) + maxBin = int(max(data)+0.5) + binWidth = float(maxBin-minBin)/numBins + bins= [] + """ + for j in range(numBins): + bins.append(minBin+ j*binWidth) + #bins= 20 + n, bins, patches = plt.hist(data,bins, normed=False) + #n, bins, patches = plt.hist(data,bins, normed=1, color='green') + #hist, bins = np.histogram(data, bins=bins) + width = np.diff(bins) + center = (minBin + bins[1:]) / 2 + + cm = plt.cm.get_cmap('RdYlBu_r') + #col = (n-n.min())/(n.max()-n.min()) + for c, p in zip(bins, patches): + plt.setp( p, 'facecolor', cm(c/numBins)) + fig, ax = plt.subplots(num=1, figsize=(8,3)) + ax.set_title("Distribution of Column Means") + #ax.bar(center,bins, align='center', width=width) + #ax.bar(center, hist, align='center', width=width) + #ax.set_xticks(bins) +# fig.savefig("/Users/bobbrown/Desktop/Matrix-tools-Test-output/Column_Mean_Histogram.png") + + plt.show() + """ + return() + +#========== test create variable number output files in Galaxy +def CreateFiles(output_file_info): + + for i in range(3): + fd= open( output_file_info, 'w') + fd.write('File number = '+ str(i)+"\n") + fd.close() + + return() + +#================== + + #Define Main Function +def main(): + try: + args = get_args() + #sys.stdout.write(str(args)+"\n") + nanList= ["NAN", "NA", "N/A", "-","?","nan", "na", "n/a"] + + matrix, og_cols,og_rows = reader(args.input_file_txt) + #old_reader matrix, og_rows, og_cols = reader(args.input_file_txt) +# if float(args.thresh) < 0.000001: +# print('Invalid negative threshold chosen = '+str(args.thresh)+" choose positive value") +# sys.exit(-4) + + if args.choice == "Histogram": + Histo(matrix) + elif args.choice == "CreateFiles": + CreateFiles(args.output_file_info) + + elif args.choice == "Variance": + if args.axes == "Row": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_row(matrix,1,og_rows,og_cols,True) + labeler(matrix,filter_rows,filter_cols,args.output_file_txt) +# if delCnt < 1: +# print('\nNO 
Filtering occurred for rows using variance < '+str(args.thresh)+ ' by row. Matrix row minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal) +# sys.stderr.write('\nFiltering out rows using variance < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') +# sys.exit(-1) +# else: +# print('\nFiltering out rows using variance < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') + elif args.axes == "Column": + matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_col(matrix,1,og_rows,og_cols,True) + labeler(matrix,filter_rows,filter_cols,args.output_file_txt) +# if delCnt < 1: +# print('\nNO Filtering occurred for columns using variance < '+str(args.thresh)+ ' by columns. Matrix columns minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal) +# sys.stderr.write('\nFiltering out rows using variance < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows') +# sys.exit(-1) +# else: +# print('\nFiltering out columns using variance < '+str(args.thresh)+ ' removed '+str(delCnt)+' columns') + else: + print('Invalid Axes = '+str(args.axes)) + sys.exit(-1) + else: + print("Invalid Filter Choice = "+str(args.choice)) + sys.exit(-2) + + + except Exception as err: + traceback.print_exc() + sys.exit(-3) + +if __name__ == '__main__': + main() + print("\nFini") + sys.exit(0) \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Statistics.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Statistics.xml Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,23 @@ + + + View metadata about Heat Map Matrix + Matrix_Statistics.py '$input' '$choice' '$cutoff' '$axes' '$out_file' + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Transformations.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Transformations.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,301 @@ +''' +Created on Jun 6, 2017 updated Feb 2018 + +@author: cjacoby and Bob Brown +''' 
+import os +import sys, traceback, argparse +import numpy as np +from numpy import size, array +import warnings +from Matrix_Validate_import import reader +#import scipy.stats as ss +warnings.filterwarnings('error') + +#Define argparse Function +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_file_txt', help='text file input matrix(include .txt in name)') + parser.add_argument('choice', type=str, help='Choose normalization Method: 1 = Z-score, 2 = Mean Centered, 3 = log2, 4= rank') + parser.add_argument('axes', type=str, help='Choose Axis to normalize On (Row or Column)') + parser.add_argument('scalevalue', help='optional scaling factor for matrix)') + parser.add_argument('offsetvalue', help='optional offset for matrix') + parser.add_argument('output_file_txt', help='text file output matrix(include .txt in name)') + args = parser.parse_args() + return args + + +def Zscore_row(matrix): + + #Loop To Perform Z-Score normalization + for i in range(0,len(matrix)): + temp_mean = np.nanmean(matrix[i]) + temp_stdev = np.nanstd(matrix[i],ddof=1) + for j in range(0,len(matrix[0])): + matrix[i][j] = (matrix[i][j]-temp_mean)/temp_stdev + return(matrix) + +#Define Z-Score normalization Function +def Zscore_col(matrix): + + #Loop To Perform Z-Score normalization + for i in range(len(matrix[0])): +# matrix[:][i] = [scaleValue*x+offset for x in matrix[i]] + temp_mean = np.nanmean([row[i] for row in matrix]) + temp_stdev = np.nanstd([row[i] for row in matrix],ddof=1) + #Probably Should Have if statement checking if stdev equals zero, although this implies the data is already Z-score normalized + for j in range(len(matrix)): + matrix[j][i] = (matrix[j][i]-temp_mean)/temp_stdev + return(matrix) + + +#Define Mean Centered or Median centered normalization Function +def MeanMedianCenter_row(matrix,type): + + + #Loop To Perform mean or median center + for i in range(0,len(matrix)): + if type == "mean": + temp_type = np.nanmean(matrix[i][1::]) + else: + 
temp_type = np.nanmedian(matrix[i][1::]) + + for j in range(0,len(matrix[0])): + matrix[i][j] = (matrix[i][j]-temp_type) + return(matrix) + + +#Define mean or median +def MeanMedianCenter_col(matrix,type): + + #Loop To Perform mean or median center + for i in range(0,len(matrix[0])): + if type == "mean": + temp_type = np.nanmean([row[i] for row in matrix]) + else: + temp_type = np.nanmedian([row[i] for row in matrix]) + #Probably Should Have if statement checking if stdev equals zero, although this implies the data is already Z-score normalized + for j in range(0,len(matrix)): + matrix[j][i] = (matrix[j][i]-temp_type) + return(matrix) + +#Divide by sum of the Row Function +def Divide_By_Sum_row(matrix): + + #Loop To Perform mean or median center + numRow,numCol= np.shape(matrix) + + for i in range(numRow): + sumValue = sum(matrix[i][:]) + + #if equals zero + if abs(sumValue) > .0001: + for j in range(numCol): + matrix[i][j] = matrix[i][j]/sumValue + else: + print("ERROR Cannot divide by Sum almost zero", str(sumValue), " for Row ",str(i+1)) + return(matrix) + + +#Divide by sum of the Column Function +def Divide_By_Sum_col(matrix): + + #Loop To Perform mean or median center + numRow,numCol= np.shape(matrix) + + for i in range(numCol): + sumValue = sum([row[i] for row in matrix]) + + #if equals zero + if abs(sumValue) > .0001: + for j in range(numRow): + matrix[j][i] = (matrix[j][i]/sumValue) + else: + print("ERROR Cannot divide by Sum almost zero", str(sumValue), " for Column ",str(i+1)) + return(matrix) + +#scale or add offset to matrix by row +def ScaleOffset_row(matrix,scaleValue,offset): + + #Loop To Perform scale and offset do one or the other per request + if abs(scaleValue) > 0.0001: + for i in range(0,len(matrix)): + matrix[i][:] = [scaleValue*x+offset for x in matrix[i]] + else: + print (" Scale facter "+str(scaleValue)+" too small") + return(matrix) + +#scale or add offset to matrix by column +def ScaleOffset_col(matrix,scaleValue,offset): + + #Loop To Perform scale and offset do one or
the other per request + if abs(scaleValue) > 0.0001: + for i in range(0,len(matrix[0])): + for j in range(0,len(matrix)): + matrix[j][i] = scaleValue*matrix[j][i]+offset + else: + print (" Scale facter "+str(scaleValue)+" too small") + return(matrix) + +#Define Log2 normalization Method +def Convert2Logs(matrix,logValue, offset): + import warnings + warnings.filterwarnings('error') + + #Loop To Perform Z-Score normalization + for i in range(0,len(matrix)): + for j in range(0,len(matrix[0])): + try: + if logValue == "log2": + matrix[i][j] = np.log2(matrix[i][j]+offset) + else: + matrix[i][j] = np.log10(matrix[i][j]+offset) + + except RuntimeWarning: + print(logValue+" normalization Failed: Encountered elements <= 0, which are invalid inputs for a Log normalization") + break + else: + continue + break + return(matrix) + +#transpose matrix +def Transpose(in_mat): + out_mat = [] + numRow,numCol= np.shape(in_mat) + + for i in range(numCol): + temp= [] + for j in range(numRow): + temp.append(in_mat[j][i]) + out_mat.append(temp) + #print( str(out_mat)) + return out_mat + +# restores row and column labels in ouput +def labeler(matrix,og_cols,og_rows,output_file_txt): + #Define Null Sets For Col and Row Headers + with open(output_file_txt,'w') as f: + f.write("") + for k in range(0,len(og_cols)): + f.write('\t' + str(og_cols[k]) ) + f.write('\n') + for i in range(0,len(og_rows)): + f.write(str(og_rows[i]) ) + for j in range(0,len(matrix[0])): + f.write('\t' + format(matrix[i][j])) + f.write('\n') + +#Define Main Function +def main(): + + try: + args = get_args() + scaleValue = float(args.scalevalue) + offsetValue= float(args.offsetvalue) + #print(args) + #sys.stdout.write(str(args)+"\n") + + matrix,og_cols,og_rows = reader(args.input_file_txt) + if args.choice == "z_score_normalization": + if args.axes == "Row": + matrix = Zscore_row(matrix) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("zcore, row") + elif args.axes == "Column": + matrix = Zscore_col(matrix) +
labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("zscore, column") + else: + print("zscore, invalid axis") + elif args.choice == "mean_center_normalization": + if args.axes == "Row": + matrix = MeanMedianCenter_row(matrix,"mean") + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("mean-center by row") + elif args.axes == "Column": + matrix = MeanMedianCenter_col(matrix,"mean") + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("mean-center by column") + else: + print("meancenter, invalid axis") + elif args.choice == "median_center_normalization": + if args.axes == "Row": + matrix = MeanMedianCenter_row(matrix,"median") + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("median-center by row") + elif args.axes == "Column": + matrix = MeanMedianCenter_col(matrix,"median") + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("median-center by column") + else: + print("meancenter, invalid axis") + elif args.choice == "add_offset": + if args.axes == "Row": + #offset = -100 #!!!! TODO REMOVE AND ADD WHEN clause to xml to get value + matrix = ScaleOffset_row(matrix,1.0,offsetValue) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("offset of "+str(offsetValue)+" by row") + elif args.axes == "Column": + matrix = ScaleOffset_col(matrix,1.0,offsetValue) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("offset of "+str(offsetValue)+" by column") + else: + print("offset"+str(offsetValue)+" invalid axis -not row or column") + elif args.choice == "scale": + if args.axes == "Row": + #scaleValue = 1000 #!!!! 
TODO REMOVE AND ADD WHEN clause to xml to get value + matrix = ScaleOffset_row(matrix,scaleValue,0.0) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("scaling "+str(scaleValue)+" by row") + elif args.axes == "Column": + matrix = ScaleOffset_col(matrix,scaleValue,0.0) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("scaling "+str(scaleValue)+" by column") + else: + print("scaling "+str(scaleValue)+" invalid axis") + elif args.choice == "transpose": + matrix = Transpose(matrix) #issue using same matrix? + labeler(matrix,og_rows,og_cols,args.output_file_txt) #swapped row&col labels + print("transpose mxn matrix to nxm size") + elif args.choice == "ln_normalization": + matrix = Convert2Logs(matrix,"log2",offsetValue) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("log2 plus "+str(offsetValue)+" normalization for all values") + elif args.choice == "log_normalization": + matrix = Convert2Logs(matrix,"log10",offsetValue) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("log10 normalization for all values") + elif args.choice == "rank": + if args.axes == "Row": + matrix = Rankdata_ByRow(matrix) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("performed rank normalization by row") + elif args.axes == "Column": + matrix = Rankdata_ByColumn(matrix) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("performed rank normalization by column") + else: + print("rank, invalid axis") + elif args.choice == "divide_by_sum": + if args.axes == "Row": + matrix = Divide_By_Sum_row(matrix) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("performed divide row N values by row N's sum") + elif args.axes == "Column": + matrix = Divide_By_Sum_col(matrix) + labeler(matrix,og_cols,og_rows,args.output_file_txt) + print("performed divide column N values by column N's sum") + else: + print("divide_by_sum, invalid axis") + + else: + print("Invalid normalization Choice") + + except Exception as err: + 
traceback.print_exc() + sys.exit(1) + + +if __name__ == '__main__': + main() + print("Done") diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Transformations.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Transformations.xml Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,106 @@ + + + by Rows, Columns, All by method + Matrix_Transformations.py '$p_input' '$extra.choice' '$extra.axes' '$extra.scalevalue' '$extra.offsetvalue' '$output_file' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Validate_import.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Validate_import.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,179 @@ +''' +Created on Jun 7, 2017 modified Feb2018 + +@author: cjacoby and Bob Brown +''' + +import sys, traceback, argparse +import numpy as np +import os +#import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt; plt.rcdefaults() + +# Define the Reading Function Which Pulls the Data from a .txt file +def reader(input_file_txt, create_plot= False): + #Read Matrix, Preserving String Values for Headers first row and first column (both minus first cell) + #Read Matrix, Converting all values to Float for Data Processing + + f = open(input_file_txt, "rU") + + #print( 'Valid NAN identifiers are: empty cells, cells with blanks,"NA","N/A","-", and "?"') + + column_labels = [] + row_labels = [] + matrix = [] + firstLine= True + + line = f.readline() + +# "NA","N/A","-","?","NAN","NaN","Na","na","n/a","null",EMPTY/Null, SPACE (blank char) + + nanList = ["", " ","NAN", "NA", "N/A", "-","?"] + binCatDict = {"":0, " ":0, "Text":0, "NA":0, "-":0,"NAN":0, "N/A":0,"?":0} + row = 0 + nanCnt = 0 + nonNumCnt = 0 + + while line: + line = line.strip("\n") + line = line.split('\t') + + row += 1 + + if firstLine: + lengthRow = len(line) + column_labels 
= line[1:] + else: + if lengthRow != len(line): + # print("\nERROR matrix row lengths unequal for row 0 and row "+str(row)+"\n" ) + sys.exit(-1) + + temp = [] +# column= 0 + row_labels.append(str(line[0])) + + #for item in line[1:]: use enumerate + for column, item in enumerate(line[1:],1): +# column += 1 + try: + temp.append(float(item)) + except ValueError: + temp.append(np.nan) + itemUC= item.upper() + + if itemUC in nanList: + nanCnt += 1 + binCatDict[itemUC]= binCatDict[itemUC]+1 + # print( 'Legit nans= ',str(item)) + else: + if nonNumCnt == 0: sys.stderr.write("Start List of up to first 50 Invalid cell values \n") + nonNumCnt +=1 + if nonNumCnt < 50: sys.stderr.write("At row_column= "+str(row)+"_"+str(column)+' invalid data cell value '+ item+"\n") + + matrix.append(temp) + + line = f.readline() + firstLine= False + + #sys.stdout.write("\n\n") + f.close() + binCatDict["Text"]= nonNumCnt + +# plot results of NAN counts above + + binCat = ["null", "blank", 'hyphen', '?','NA','N/A' ,'NAN', 'text'] + orderDict= {0:"", 1:"", 2:'-', 3:'?',4:'NA', 5:'N/A' ,6:'NAN', 7:'Text'} +#TODO verify dict orde for data + #print("> key value =",key, str(value)) + + if create_plot: + numBins = len(binCat) + binWidth = 1 + bins = [] + binData = [] + + for key in sorted(orderDict): + value= binCatDict[orderDict[key]] # place items on chart in order and with data value for item + if value < 1: + binData.append(value+0.01) + else: + binData.append(value) + + #""" + for j in range(numBins): + bins.append(j*binWidth) + #ttps://pythonspot.com/matplotlib-bar-chart/ + y_pos = np.arange(numBins) + plt.yticks(y_pos, binCat) + plt.title("Distribution of NAN types (UPPER & lower & MiXeD case combined)") + plt.ylabel('NAN Types') + plt.xlabel('Occurrences') + #plt.legend() + plt.barh(y_pos, binData, align='center', alpha=0.5) + + fig, ax = plt.subplots(num=1, figsize=(8,3)) + ax.set_title("Data Cell Counts of Not A Number (NAN) Types") + #ax.bar(center,bins, align='center', width=width) + 
#ax.bar(center, hist, align='center', width=width) +# ax.set_xticks(bins) + # fig.savefig("/Users/bobbrown/Desktop/Matrix-tools-Test-output/NAN-plot.png") + + # fig, ax = plt.subplots(num=1, figsize=(8,3)) + # fig.savefig("/Users/bobbrown/Desktop/Matrix-tools-Test-output/hist-out.png") + + plt.show() + #""" + +#after plot error? + x,y=np.shape(matrix) + if nanCnt > 0: print("WARNING -- Found "+str(nanCnt)+" Valid Non-numbers. Their percent of total matrix data cell values = "+str((100*nanCnt)/(x*y))+"% ") + if nonNumCnt > 0: sys.exit(-1) + #print ("reader output:") + #print (matrix) + #print (column_labels) + #print(row_labels) + return matrix,column_labels,row_labels + +#---------------------------------------------------------------------- +# Verify Matrix A column_labels match Matrix B row_labels in name and order for A*B +def MatchLabels(column_labels,row_labels): + + if len(column_labels) != len(row_labels): + sys.stderr.write("ERROR 1st matrix column count "+str(len(column_labels))+" not equal 2nd Matrix number row count "+str(len(row_labels))+"\n" ) + sys.exit(-11) + else: + cnt= 0 + for k in range(0,len(column_labels)): + if column_labels[k] != row_labels[k] and cnt < 20: + cnt += 1 + #sys.stderr.write("ERROR At column & row position "+str(k)+" Matrix 1 column value "+str(column_labels)+" not equal 2nd Matrix row value "+str(row_labels)+"\n" ) + + if cnt > 0: + sys.exit(-11) +#---------------------------------------------------------------------- +# restores row and column labels in ouput +def Labeler(matrix,column_labels,row_labels,output_file_txt): + #print("matrix length: " + str(len(matrix))) + #print("row labels length: " + str(len(row_labels))) + #print("col labels length: " +str(len(column_labels))) + #Define Null Sets For Col and Row Headers + with open(output_file_txt,'w') as f: + f.write("") + for k in range(0,len(column_labels)): + f.write('\t' + str(column_labels[k]) ) + f.write('\n') + #for i in range(0,len(row_labels)): + for i in range(0,len(matrix)): +
f.write(str(row_labels[i]) ) + #print("matrix["+str(i)+"] length:" + str(len(matrix[i]))) + for j in range(0,len(matrix[0])): + f.write('\t' + format(matrix[i][j])) + f.write('\n') + + +#---------------------------------------------------------------------- +if __name__ == '__main__': + input_file_txt = str(sys.argv[1]) + + matrix,column_labels,row_labels = reader(input_file_txt) + print("Done") + diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Validations.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Validations.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,180 @@ +''' +Created on Jun 7, 2017 modified Feb2018 + +@author: Bob Brown and cjacoby +''' + +import sys, traceback, argparse +import numpy as np +import os +from Matrix_Validate_import import reader, Labeler + +#Define The Four Arguments Used in the Program +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_file_txt', help='tab delimited text file input matrix(include .txt in name)') + parser.add_argument('replacement', type=str, help='Choose Replacement for Missing Value. Valid Choices are strings: "Mean" or "Zero"') + parser.add_argument('axes', type=str, help='Choose Axes to Normalize On (Either "Row" or "Column"') + parser.add_argument('output_file_txt' ,help='tab delimited text file output name (include .txt in name)') + args = parser.parse_args() + return args + + +#Define Function to Replace Null Values with Row Mean +def nan_replacer_mean_rows(matrix): + + nonNumCnt= 0 + nanCnt = 0 #valid NANs are "NA","N/A","-","?" 
+ + #Loop Replacing all Null Values with Row Mean + for i in range(0,len(matrix)): + temp_mean = np.nanmean(matrix[i]) + for j in range(0,len(matrix[0])): + #if matrix[i][j] == "NA": #np.isnan(matrix[i][j]) == True: + if np.isnan(matrix[i][j]) == True: + matrix[i][j] = temp_mean + nanCnt = nanCnt + 1 + return matrix, nonNumCnt, nanCnt + +#Define Function to Replace Null Values with Column Mean +def nan_replacer_mean_columns(matrix): + + nonNumCnt= 0 + nanCnt = 0 #valid NANs are "NA","N/A","-","?" + + #Loop Replacing all Null Values with Column Mean + for i in range(0,len(matrix[0])): + col = [row[i] for row in matrix] + temp_mean = np.nanmean(col) + for j in range(0,len(matrix)): + #if matrix[i][j] == "NA": #elif np.isnan(matrix[j][i]) == True: + if np.isnan(matrix[j][i]) == True: + matrix[j][i] = temp_mean + nanCnt = nanCnt + 1 + + return matrix, nonNumCnt, nanCnt + +#Define Function to Replace Null Values with Zero (axis orientation is irrelevant) +def nan_replacer_zero(matrix): + + nonNumCnt= 0 + nanCnt = 0 #valid NANs are "NA","N/A","-","?" + + #Loop Replacing all Null Values with Row Range + for i in range(0,len(matrix)): + for j in range(0,len(matrix[0])): + #if matrix[i][j] =="NA": + if np.isnan(matrix[i][j]) == True: + matrix[i][j] = 0 + + return matrix, nonNumCnt, nanCnt + +#Define Function to Re-Label Output Matrix +#!!!! 
not needed no output matrix from Validate tool +def OLD_labeler(matrix, og_cols, og_rows, output_file_txt): + #Write Data to Specified Text File Output + with open(output_file_txt,'w') as f: + f.write("Use original input file for further processing\n") + f.close() +# f.write("") +# for k in range(0,len(og_cols)): +# f.write('\t' + str(og_cols[k])) +# f.write('\n') +# for i in range(0,len(og_rows)): +# f.write(og_rows[i]) +# for j in range(0,len(matrix[0])): +# f.write('\t' + format(matrix[i][j])) +# f.write('\n') + +#Main Function +def main(): + args = get_args() + #print(args) + #sys.stdout.write(str(args)) + #sys.stdout.write( '\nValid NAN identifiers are "NA","N/A","-", and "?"') + + matrix,og_cols,og_rows = reader(args.input_file_txt) + +# if nonNumCnt > 0: +# print('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '+str(nonNumCnt)+ ', %.2f' % (100.0*nonNumCnt/(1.0*len(og_cols)*len(og_rows)))+'%' ) +# #sys.stderr.write('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '+str(nonNumCnt)+ ', %.2f' % (100.0*nonNumCnt/(1.0*len(og_cols)*len(og_rows)))+'%' ) +# if nanCnt > 0: +# print('\nWARNING Matrix has '+str(nanCnt)+' that is %.2f' % (100.0*nanCnt/(1.0*len(og_cols)*len(og_rows)))+'% known NAN identifiers') +# sys.exit(-1) +# else: +# if nanCnt > 0: +# print('\nWARNING Matrix has NO unknown non-numbers in matrix, but contains '+str(nanCnt)+' that is %.2f' % (100.0*nanCnt/(1.0*len(og_cols)*len(og_rows)))+'% known NAN identifiers') +# else: +# print('Matrix is Good-to-Go -- all numbers in data area. ') + + #with open(args.output_file_txt,'w') as f: + # f.write("Use original input file for further processing\n") + #f.close() + #sys.exit(0) + +# TODO !!!!! Below if MDA decides to use it TURNED OFF FOR NOW +# TODO !!!!! 
Below if MDA decides to use it TURNED OFF FOR NOW + + if args.replacement == "Mean": + if args.axes == "Row": + matrix, nonNumCnt, nanCnt = nan_replacer_mean_rows(matrix) + Labeler(matrix,og_cols,og_rows,args.output_file_txt) + #OLD_labeler(matrix, og_cols, og_rows, args.output_file_txt) + #print('Mean,Row') + if nonNumCnt > 0: + print('ERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '+str(nonNumCnt)+ ', %.2f' % (100.0*nonNumCnt/(1.0*len(og_cols)*len(og_rows)))+'%' ) + sys.stderr.write('ERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '+str(nonNumCnt)+ ', %.2f' % (100.0*nonNumCnt/(1.0*len(og_cols)*len(og_rows)))+'%' ) + if nanCnt > 0: + print('WARNING Matrix has '+str(nanCnt)+' that is %.2f' % (100.0*nanCnt/(1.0*len(og_cols)*len(og_rows)))+'% known NAN identifiers') + sys.exit(-1) + else: + if nanCnt > 0: + print('\nWARNING Matrix has '+str(nanCnt)+' that is %.2f' % (100.0*nanCnt/(1.0*len(og_cols)*len(og_rows)))+'% known NAN identifiers') + else: + print('\nMatrix is Good-to-Go -- all numbers in matrix. ') + sys.exit(0) + elif args.axes == "Column": + matrix, nonNumCnt, nanCnt = nan_replacer_mean_columns(matrix) + Labeler(matrix,og_cols,og_rows,args.output_file_txt) + #OLD_labeler(matrix, og_cols, og_rows, args.output_file_txt) + #print('Mean,Column') + if nonNumCnt > 0: + print('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '+str(nonNumCnt)+ ', %.2f' % (100.0*nonNumCnt/(1.0*len(og_cols)*len(og_rows)))+'%' ) + sys.stderr.write('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. 
Total and percent unknown strings found = '+str(nonNumCnt)+ ', %.2f' % (100.0*nonNumCnt/(1.0*len(og_cols)*len(og_rows)))+'%' ) + if nanCnt > 0: + print('\nWARNING Matrix has '+str(nanCnt)+' that is %.2f' % (100.0*nanCnt/(1.0*len(og_cols)*len(og_rows)))+'% known NAN identifiers') + sys.exit(-1) + else: + if nanCnt > 0: + print('\nWARNING Matrix has '+str(nanCnt)+' that is %.2f' % (100.0*nanCnt/(1.0*len(og_cols)*len(og_rows)))+'% known NAN identifiers') + else: + print('\nMatrix is Good-to-Go -- all numbers in matrix. ') + sys.exit(0) + else: + print('Mean, but given Invalid Axis= '+str(args.axes)) + sys.stderr.write('Mean, but given Invalid Axis= '+str(args.axes)) + elif args.replacement == "Zero": + matrix, nonNumCnt, nanCnt = nan_replacer_zero(matrix) + Labeler(matrix,og_cols,og_rows,args.output_file_txt) + #OLD_labeler(matrix, og_cols, og_rows, args.output_file_txt) + if nonNumCnt > 0: + print('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '+str(nonNumCnt)+ ', %.2f' % (100.0*nonNumCnt/(1.0*len(og_cols)*len(og_rows)))+'%' ) + sys.stderr.write('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '+str(nonNumCnt)+ ', %.2f' % (100.0*nonNumCnt/(1.0*len(og_cols)*len(og_rows)))+'%' ) + if nanCnt > 0: + print('\nWARNING Matrix has '+str(nanCnt)+' that is %.2f' % (100.0*nanCnt/(1.0*len(og_cols)*len(og_rows)))+'% known NAN identifiers') + sys.exit(-1) + else: + if nanCnt > 0: + print('\nWARNING Matrix has '+str(nanCnt)+' that is %.2f' % (100.0*nanCnt/(1.0*len(og_cols)*len(og_rows)))+'% known NAN identifiers') + else: + print('\nMatrix is Good-to-Go -- all numbers in matrix. 
') + sys.exit(0) + else: + print('zero, but given Invalid Axis= '+str(args.axes)) + sys.stderr.write('zero, but given Invalid Axis= '+str(args.axes)) + sys.exit(-2) + + +if __name__ == '__main__': + main() + print("done") diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Validations.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Validations.xml Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,40 @@ + + + Locate and identify non-numbers + Matrix_Validations.py '$p_input' 'Zero' 'Row' '$output_file' + + + + + + + + + + + + + + + \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Split_ExcelTabs_IntoFiles.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Split_ExcelTabs_IntoFiles.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,290 @@ +import sys +import os +#import MySQLdb +#import config +import subprocess +import re +import shutil +import traceback +#import xlsxwriter +import xlrd + +#http://www.blog.pythonlibrary.org/2014/04/30/reading-excel-spreadsheets-with-python-and-xlrd/ + +def File_From_Tab(infileName, outfileName, tabName,tabNumber): + """ + Open and read an Excel file + """ + book = xlrd.open_workbook(infileName) + # print number of sheets + #print book.nsheets + + # print sheet names + tabList= book.sheet_names() + #print tabList + #print book.sheet_names() + if tabName == "" and (tabNumber <1 or tabNumber > len(tabList)): + sys.stderr.write("\n>>>ERROR illegal tab number "+str(tabNumber)+" input when no tab name was specified\n") + sys.stderr.write("\n>>>Allowed tab numbers, or tab names, for this file with "+str(len(tabList))+" total tabs are:") + + for i in range(len(tabList)): + sys.stderr.write("\n>>> tab number "+str(i+1)+" is named "+str(tabList[i])) + sys.exit(-1) + + if tabName != "": # use name instead of tab number + found = False + i = 0 + while (i < len(tabList)) and not found: + i += 1 + if tabName == str(tabList[i-1]): + tabNumber = i + found = True + if not found: + sys.stderr.write("\n>>> ERROR -- Input Tab name "+tabName+" was
not found\n") + sys.exit(-1) + # get the first worksheet + #first_sheet = book.sheet_by_index(0) + worksheet = book.sheet_by_index(tabNumber-1) + + outFile = open(outfileName+str(tabList[tabNumber-1]+".tsv"), 'w') + + #https://stackoverflow.com/questions/14944623/python-xrld-read-rows-and-columns + #workbook = xlrd.open_workbook('my_workbook.xls') + #worksheet = workbook.sheet_by_name('Sheet1') + num_rows = worksheet.nrows - 1 + num_cells = worksheet.ncols - 1 + curr_row = -1 + while curr_row < num_rows: + curr_row += 1 + row = worksheet.row(curr_row) + + if curr_row == 0: + endOfLine= False + allRowNumCols= len(row) + i= len(row)-1 + # find length of matrix and covariates using first row + # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank + while i <= len(row)-1 and not endOfLine: + cell_type = worksheet.cell_type(curr_row, i) + #temp = str(worksheet.cell_value(curr_row, i)) + #print( " pos and cell type row one ",cell_type, i) + + if cell_type == 0 or cell_type == 6: + allRowNumCols -= 1 + i -= 1 + else: + endOfLine= True + + if allRowNumCols < 5: + sys.stderr.write("\nERROR First row number of columns= "+str(allRowNumCols)+" is too short, so all rows will be ignored\n") + sys.exit(-1) + elif curr_row == 0: + sys.stdout.write("\nALL Rows must all have the same number of columns as the First row's number columns = "+ str(allRowNumCols) +"\n") + + temp= '' + rowLen= 0 + endOfLine= False + + while rowLen < allRowNumCols and not endOfLine: + temp += str(worksheet.cell_value(curr_row, rowLen))+"\t" + #temp += str(row[rowLen])+"\t" + rowLen += 1 + + temp = temp[:-1]+"\n" + #print 'Row:', curr_row, len(row), rowLen + outFile.write(temp) #TODO check if rows are all same length + + sys.stdout.write("File created with "+str(curr_row)+" rows and "+str(allRowNumCols)+" columns\n") +# curr_cell = -1 +# while curr_cell < num_cells: +# curr_cell += 1 +# # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank +# cell_type = 
worksheet.cell_type(curr_row, curr_cell) +# cell_value = worksheet.cell_value(curr_row, curr_cell) +# print ' ', cell_type, ':', cell_value + # # read a row +# print first_sheet.row_values(0) +# +# # read a cell +# cell = first_sheet.cell(0,0) +# print cell +# print cell.value +# +# # read a row slice +# print first_sheet.row_slice(rowx=0, +# start_colx=0, +# end_colx=2) + + return tabList + + +#====================== +# from RPPA callInSilicoReportWriter.py +# def write_xlsx_for_report(directory_for_reports, report_name, report_id, dict_cf2_values): +# +# +# error_write_xlsx = "" +# error_occurred = 0 +# +# try: +# path_to_dir_when_writing = os.path.join(directory_for_reports, report_name) +# header_path = os.path.join(directory_for_reports, report_name, "header.csv") +# raw_log_2_path = os.path.join(directory_for_reports, report_name, "RawLog2.csv") +# norm_linear_path = os.path.join(directory_for_reports, report_name, "NormLinear.csv") +# norm_log_2_path = os.path.join(directory_for_reports, report_name, "NormLog2.csv") +# norm_log_2_median_centered_path = os.path.join(directory_for_reports, report_name, "NormLog2_MedianCentered.csv") +# +# # put the cf2 values in the NormLinear file +# error_put_cf2_in_normLinear = write_new_normLinear_csv_file_with_cf2_values(path_to_dir_when_writing, norm_linear_path, dict_cf2_values) +# +# +# excel_workBook = xlsxwriter.Workbook(os.path.join(directory_for_reports, report_name,report_name + ".xlsx"), {'strings_to_numbers': True}) +# +# rawLog2_worksheet = excel_workBook.add_worksheet("RawLog2") +# error_rawLog2 = construct_worksheet_for_xlsx(rawLog2_worksheet, header_path, "RawLog2", raw_log_2_path) +# +# norm_linear_worksheet = excel_workBook.add_worksheet("NormLinear") +# error_norm_linear = construct_worksheet_for_xlsx(norm_linear_worksheet, header_path, "NormLinear", norm_linear_path) +# +# norm_log_2_worksheet = excel_workBook.add_worksheet("NormLog2") +# error_norm_log_2 = 
construct_worksheet_for_xlsx(norm_log_2_worksheet, header_path, "NormLog2", norm_log_2_path) +# +# norm_log_2_median_centered_worksheet = excel_workBook.add_worksheet("NormLog2_MedianCentered") +# error_norm_log_2_median_centered = construct_worksheet_for_xlsx(norm_log_2_median_centered_worksheet, header_path, "Median-Centered", norm_log_2_median_centered_path) +# +# errors_array = [error_put_cf2_in_normLinear, error_rawLog2, error_norm_linear, error_norm_log_2, error_norm_log_2_median_centered] +# for error in errors_array: +# if error != "": +# error_write_xlsx = error_write_xlsx + error +# error_occurred = 1 +# if error_occurred == 1: +# error_write_xlsx + "\nThe excel workbook for the report "+report_name+" was not written successfully.\n\n" +# +# excel_workBook.close() +# except Exception, e: +# error_occurred = 1 +# error_write_xlsx += str(repr(e)) + "\n\n" +# error_write_xlsx + "\nThe excel workbook for the report "+report_name+" was not written successfully.\n\n" +# try: +# excel_workBook.close() +# except Exception, f: +# sys.stderr.write("An unforeseen problem has occurred in write_xlsx_for_report()\n") +# sys.stderr.write(str(repr(f)) + "\n\n") +# +# +# return error_occurred, error_write_xlsx +# +# +# def write_new_normLinear_csv_file_with_cf2_values(path_to_dir, norm_linear_path, dict_cf2_values): +# errors = "" +# try: +# titles = {} +# new_lines_normLinear_with_cf2 = [] +# # read old norm linear file +# rf_normLinear = open(norm_linear_path, 'rU') +# line_num = 0 +# for line in rf_normLinear: +# line = strip_new_line_from_right_side(line) +# toks = line.split(",") +# line_num += 1 +# if line_num == 1: +# line += "1,CF2" +# new_lines_normLinear_with_cf2.append(line) +# titles = toks +# continue +# pos_rf = int(toks[titles.index('Order')]) +# line += "," + str(dict_cf2_values[pos_rf]) +# new_lines_normLinear_with_cf2.append(line) +# rf_normLinear.close() +# # rename the old normLinear file +# os.rename(norm_linear_path, os.path.join(path_to_dir, 
'before_cf2_NormLinear.csv')) +# +# # write new normLinear with cf2 +# wf_new_normLinear = open(norm_linear_path, 'w') +# for line_writing in new_lines_normLinear_with_cf2: +# wf_new_normLinear.write(line_writing + "\n") +# wf_new_normLinear.close() +# except Exception, err_write_normLinear_with_cf2_values: +# errors = str(repr(err_write_normLinear_with_cf2_values)) +# +# return errors +# +# +# # This function constructs the worksheet for each tab in the excel file for a report +# # It puts these things in this order: +# # 1. Title of the tab +# # 2. Header for the tab +# # 3. Content of the tab +# def construct_worksheet_for_xlsx(worksheet, header_path, title_top_of_tab, tab_input_path): +# +# reload(sys) +# sys.setdefaultencoding('utf8') +# errors = "" +# +# try: +# # Write the title at the top of the tab +# worksheet.write(0,0,title_top_of_tab) +# +# # Variable to keep track of the rows +# row_num = 1 +# +# # Write the header stuff +# header_file = open(header_path, 'rU') +# for head_line in header_file: +# head_line = strip_new_line_from_right_side(head_line) +# head_toks = head_line.split(",") +# col_num = 0 +# for tok in head_toks: +# worksheet.write(row_num, col_num, tok) +# col_num += 1 +# row_num += 1 +# +# # Write the content stuff +# tab_input_file = open(tab_input_path, 'rU') +# for tab_line in tab_input_file: +# tab_line = strip_new_line_from_right_side(tab_line) +# tab_toks = tab_line.split(",") +# col_num = 0 +# for tok in tab_toks: +# tok = tok.decode('iso-8859-1').encode('utf-8') +# worksheet.write(row_num, col_num, tok) +# col_num += 1 +# row_num += 1 +# +# header_file.close() +# tab_input_file.close() +# except Exception, e: +# errors = errors + "\n\nAn error occurred while constructing the "+title_top_of_tab+" tab for the excel file.\n" +# errors = errors + "The error was :\n\t" + str(e) + "\n\n" +# try: +# header_file.close() +# tab_input_file.close() +# except NameError: +# x = 5 +# + return errors + 
+#---------------------------------------------------------------------- +if __name__ == "__main__": + + #try: + if len(sys.argv) > 4: + infileName = '"'+sys.argv[1]+'"' + tabName = '"'+sys.argv[2]+'"' + tabNumber = 0 + if tabName == '': tabNumber = int(sys.argv[3]) + outfileName = '"'+sys.argv[4]+'"' #TODO Later multiple outputs one per tab + + sys.stdout.write( "\nInput parameters ",str(sys.argv[1:4]),"\n" ) + + #infileName = "/Users/bobbrown/Desktop/01_Gordon_Mills__Zhiyong_Ding.xlsx" + #outfileName= "/Users/bobbrown/Desktop/01_Gordon_Mills__Zhiyong_Ding-Tab-Out-" + #tabName ="NormLog2" + #tabName ="" + #tabNumber= 10 + + status= File_From_Tab(infileName, outfileName, tabName, tabNumber ) + #except + #sys.exit(-1) + + sys.exit(0) \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Split_ExcelTabs_IntoFiles.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Split_ExcelTabs_IntoFiles.xml Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,35 @@ + + + into separate tab delimited files + Split_ExcelTabs_IntoFiles.py '$input' '' '$extra.tabnumber' '$output_file' + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 TestOutFile.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TestOutFile.txt Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,1 @@ +output from input= TestInFile \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Test_input_into_file.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Test_input_into_file.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +#Created on Jule 23, 2018 + +# @author: Bob Brown + +import sys +import os + +def main(): + + # Grab the inputs from the Galaxy xml interface and write to a file that is passed to the program + # Not each of the parameters as separate command line variables. 
+# ab_gene_name_for_header={} +# ab_rrid_for_header={} + dir= "/Users/bobbrown/Desktop/junk/" + accepted_extensions = ["csv", "tsv"] + filenames = [fn for fn in os.listdir(dir) if fn.split(".")[-1] in accepted_extensions] + for f in filenames: + print("filename= "+f) + os.remove(dir+f) + + sys.exit(0) + + ab_gene_name_for_header={'abc':'geneName'} + ab_rrid_for_header={'abc':'rrid123'} + line= 'abc,123\n' + + pos= line.find(",") + ABname= line[0:pos] + ABnewName= ABname+ "|"+ab_gene_name_for_header[ABname]+"|"+ab_rrid_for_header[ABname] + line= ABnewName+line[pos:] + line= line.replace(',','\t') + sys.exit(0) +# try: + print(' \n starting Test program read params from file stored in tools dir. Arguments=') + print(str(sys.argv[1:])+'\n') + + if False: + infileName = sys.argv[1] + # directory = sys.argv[2] + directory = '/Users/bobbrown/Desktop/' + outfileName = sys.argv[3] #use later + # outfile = sys.argv[6] + + #sys.stdout.write + + # ifile= open(directory+"/"+infileName,'rU') + ifile= open(directory+infileName,'rU') + ofile= open(directory+outfileName,'w') + # ofile= open('/Users/bobbrown/Desktop/TestOutFileVarParams.txt','w') + + cnt= 0 + # for param in range(2,len(sys.argv)): + # cnt +=1 + # ofile.write("param "+str(cnt)+"= "+param+"\n") + + + for param in ifile: + cnt +=1 + ofile.write("param "+str(cnt)+"= "+param) + + ifile.close() + + ofile.close() + + print('Fini -- rows read = '+str(cnt)+'\n') + +# except : +# print('Error>>> ') + + return +## +## + +if __name__ == '__main__': main() + #sys.exit(0) \ No newline at end of file diff -r 7f12c81e2083 -r f1bcd79cd923 Test_input_into_file.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Test_input_into_file.xml Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,61 @@ + + + Read xml params into a file + +Test-input-into-file.py $inputfile "$__tool_dir__" $outputfile + #for $op in $test_param1 + ${op.discrete_fields1.Text_Fields1} + #end for + "$EndofVarParam1" + "$Covariate_Type" + "$EndofVarParam2" + #for $op in 
$test_param3 + ${op.discrete_fields3.Text_Fields3} + #end for + "$EndofVarParam3" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 7f12c81e2083 -r f1bcd79cd923 bar_chart_plot.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bar_chart_plot.py Tue Nov 27 14:20:40 2018 -0500 @@ -0,0 +1,140 @@ +#!/usr/bin/env python +""" +histogram_gnuplot.py <ylabel> <yrange_min> <yrange_max> <grath_file> +a generic histogram builder based on gnuplot backend + + data_file - tab delimited file with data + xtic_column - column containing labels for x ticks [integer, 0 means no ticks] + column_list - comma separated list of columns to plot + title - title for the entire histrogram + ylabel - y axis label + yrange_max - minimal value at the y axis (integer) + yrange_max - maximal value at the y_axis (integer) + to set yrange to autoscaling assign 0 to yrange_min and yrange_max + graph_file - file to write histogram image to + img_size - as X,Y pair in pixels (e.g., 800,600 or 600,800 etc.) 
+ + + This tool required gnuplot and gnuplot.py + +anton nekrutenko | anton@bx.psu.edu +""" + +import string +import sys +import tempfile + +import Gnuplot +import Gnuplot.funcutils + +assert sys.version_info[:2] >= (2, 4) + + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + + +def main(tmpFileName): + skipped_lines_count = 0 + skipped_lines_index = [] + gf = open(tmpFileName, 'w') + + try: + in_file = open(sys.argv[1], 'r') + xtic = int(sys.argv[2]) + col_list = string.split(sys.argv[3], ",") + title = 'set title "' + sys.argv[4] + '"' + ylabel = 'set ylabel "' + sys.argv[5] + '"' + ymin = sys.argv[6] + ymax = sys.argv[7] + img_file = sys.argv[8] + img_size = sys.argv[9] + except: + stop_err("Check arguments\n") + + try: + int(col_list[0]) + except: + stop_err('You forgot to set columns for plotting\n') + + for i, line in enumerate(in_file): + valid = True + line = line.rstrip('\r\n') + if line and not line.startswith('#'): + row = [] + try: + fields = line.split('\t') + for col in col_list: + row.append(str(float(fields[int(col) - 1]))) + except: + valid = False + skipped_lines_count += 1 + skipped_lines_index.append(i) + else: + valid = False + skipped_lines_count += 1 + skipped_lines_index.append(i) + + if valid and xtic > 0: + row.append(fields[xtic - 1]) + elif valid and xtic == 0: + row.append(str(i)) + + if valid: + gf.write('\t'.join(row)) + gf.write('\n') + + if skipped_lines_count < i: + # Prepare 'using' clause of plot statement + g_plot_command = ' ' + + # Set the first column + if xtic > 0: + g_plot_command = "'%s' using 1:xticlabels(%s) ti 'Column %s', " % (tmpFileName, str(len(row)), col_list[0]) + else: + g_plot_command = "'%s' using 1 ti 'Column %s', " % (tmpFileName, col_list[0]) + + # Set subsequent columns + for i in range(1, len(col_list)): + g_plot_command += "'%s' using %s t 'Column %s', " % (tmpFileName, str(i + 1), col_list[i]) + + g_plot_command = g_plot_command.rstrip(', ') + + yrange = 'set yrange [' + ymin + ":" + ymax + ']' 
+ + try: + g = Gnuplot.Gnuplot() + g('reset') + g('set boxwidth 0.9 absolute') + g('set style fill solid 1.00 border -1') + g('set style histogram clustered gap 5 title offset character 0, 0, 0') + g('set xtics border in scale 1,0.5 nomirror rotate by 90 offset character 0, 0, 0') + g('set key invert reverse Left outside') + if xtic == 0: + g('unset xtics') + g(title) + g(ylabel) + g_term = 'set terminal png tiny size ' + img_size + g(g_term) + g_out = 'set output "' + img_file + '"' + if ymin != ymax: + g(yrange) + g(g_out) + g('set style data histograms') + g.plot(g_plot_command) + except: + stop_err("Gnuplot error: Data cannot be plotted") + else: + sys.stderr.write('Column(s) %s of your dataset do not contain valid numeric data' % sys.argv[3]) + + if skipped_lines_count > 0: + sys.stdout.write('\nWARNING. You dataset contain(s) %d invalid lines starting with line #%d. These lines were skipped while building the graph.\n' % (skipped_lines_count, skipped_lines_index[0] + 1)) + + +if __name__ == "__main__": + # The tempfile initialization is here because while inside the main() it seems to create a condition + # when the file is removed before gnuplot has a chance of accessing it + gp_data_file = tempfile.NamedTemporaryFile('w') + Gnuplot.gp.GnuplotOpts.default_term = 'png' + main(gp_data_file.name)