1
+ − 1 '''
+ − 2 Created on March 6, 2018
+ − 3
+ − 4 @author: Bob Brown based on John Weinstein's algorithm
+ − 5 '''
+ − 6
+ − 7 import os
+ − 8 import re
+ − 9 import shutil
+ − 10 import traceback
+ − 11 import sys, traceback, argparse
+ − 12 import numpy as np
+ − 13 import warnings
+ − 14 #import scipy.stats as ss
+ − 15 from Matrix_Validate_import import reader, Labeler, MatchLabels
+ − 16 import math
+ − 17 warnings.filterwarnings('error')
+ − 18
+ − 19 # John Weinsteins algorithm by bob brown https://discover.nci.nih.gov/CorrelateMatrices/help.do
+ − 20 #http://www.blog.pythonlibrary.org/2014/04/30/reading-excel-spreadsheets-with-python-and-xlrd/
+ − 21
+ − 22 def get_args():
+ − 23 parser = argparse.ArgumentParser()
+ − 24 parser.add_argument('input_file1', help='text file input matrix(include .txt in name)')
+ − 25 parser.add_argument('transpose', type=str, help='transpose matrix 1?')
+ − 26 parser.add_argument('input_file2', help='text file input matrix(include .txt in name)')
+ − 27 parser.add_argument('choice', type=str, help='Choose Normalization Method: 1 = Z-score, 2 = Mean Centered, 3 = log2, 4= rank')
+ − 28 # parser.add_argument('scaleValue', help='optional scaling factor for matrix)')
+ − 29 parser.add_argument('out_fileName', help='text file output matrix(include .txt in name)')
+ − 30 args = parser.parse_args()
+ − 31 if args.transpose == "": args.transpose = 'n'
+ − 32 return args
+ − 33
+ − 34
+ − 35 def Matrix_Multiply(matrix1, matrix2):
+ − 36
+ − 37 try:
+ − 38 #TODO handle NANs
+ − 39
+ − 40 matrixOut= np.dot(matrix1, matrix2)
+ − 41
+ − 42
+ − 43 except Exception as err:
+ − 44 traceback.print_exc()
+ − 45 sys.exit(-5)
+ − 46
+ − 47 return(matrixOut)
+ − 48
+ − 49
+ − 50 #CorrelateMatrices correlation acorss 2 martices https://discover.nci.nih.gov/CorrelateMatrices/home.do
+ − 51 def Correlate_Matrices(matrix1, matrix2):
+ − 52
+ − 53 #try:
+ − 54 # Leave both matrices as size axn and bxn and treat a is column and b as row
+ − 55 #matrix1T = Transpose(matrix1)
+ − 56
+ − 57 #TODO handle NANs
+ − 58 numRows1,numColumns1= np.shape(matrix1)
+ − 59
+ − 60 numRows2,numColumns2= np.shape(matrix2)
+ − 61 matrixOut= []
+ − 62
+ − 63 if numColumns1 != numRows2:
+ − 64 print("ERROR number columns Matrix 1 ", str(numColumns1), " not equal number rows for Matrix 2 ",str(numRows2))
+ − 65 sys.exit(-1)
+ − 66 #TODO need to look for NANs??
+ − 67
+ − 68 for i in range(numRows1):
+ − 69 vectorM1 = matrix1[i][:]
+ − 70 meanVec1 = np.nanmean(vectorM1)
+ − 71 varStdDev1 = np.nanstd(vectorM1, ddof=1)
+ − 72 lowStdDev1 = False
+ − 73 #if equals zero
+ − 74 if abs(varStdDev1) < .000001:
+ − 75 print("ERROR Variance value almost zero", str(varStdDev1), " for Matrix 1 Row ",str(i+1))
+ − 76 lowStdDev1= True
+ − 77 correlationRow= []
+ − 78
+ − 79 for j in range(numColumns2):
+ − 80 vectorM2 = []
+ − 81 for t in range(numRows2):
+ − 82 vectorM2.append(matrix2[t][j])
+ − 83 meanVec2 = np.nanmean(vectorM2)
+ − 84 varStdDev2 = np.nanstd(vectorM2, ddof=1)
+ − 85 lowStdDev2= False
+ − 86 #if equals zero
+ − 87 if abs(varStdDev2) < .000001:
+ − 88 print("ERROR Variance value almost zero", str(varStdDev2), " for Matrix 2 Column ",str(j+1))
+ − 89 lowStdDev2= True
+ − 90
+ − 91 covarStdDev12= 0
+ − 92
+ − 93 if not lowStdDev1 and not lowStdDev2:
+ − 94 #try:
+ − 95 for pos in range(len(vectorM1)):
+ − 96 covarStdDev12 += ((vectorM1[pos]-meanVec1)/varStdDev1)*((vectorM2[pos]-meanVec2)/varStdDev2)
+ − 97 # bottom= (numColumns1 -1)*(varStdDev1*varStdDev2)
+ − 98 # correlationRow.append( covarStdDev12/bottom)
+ − 99 correlationRow.append( covarStdDev12/(numColumns1 -1))
+ − 100 #except: bad value because of NAN or other
+ − 101 else:
+ − 102 correlationRow.append("divide by 0") # cannot calculate correlation var too small
+ − 103
+ − 104 matrixOut.append(correlationRow)
+ − 105
+ − 106 # except Exception as err:
+ − 107 # traceback.print_exc()
+ − 108 # sys.exit(-6)
+ − 109
+ − 110 return(matrixOut)
+ − 111
+ − 112 #----------------------------------------------------------------------
+ − 113 def Transpose(in_mat):
+ − 114 out_mat = []
+ − 115 numRows,numColumns= np.shape(in_mat)
+ − 116
+ − 117 for i in range(numColumns):
+ − 118 temp= []
+ − 119 for j in range(numRows):
+ − 120 temp.append(in_mat[j][i])
+ − 121 out_mat.append(temp)
+ − 122 #print( str(out_mat))
+ − 123 return out_mat
+ − 124
+ − 125
+ − 126 #----------------------------------------------------------------------
+ − 127 if __name__ == "__main__":
+ − 128
+ − 129 # input_file1 = "/Users/bobbrown/Desktop/Gene-by-var.txt"
+ − 130 # input_file2 = "/Users/bobbrown/Desktop/var-by-sample.txt"
+ − 131 # out_fileName = "/Users/bobbrown/Desktop/MatixMult-1-2-Out.txt"
+ − 132 # selection = "MatrixMultiply"
+ − 133 #TODO address NANs ???
+ − 134
+ − 135 try:
+ − 136 args = get_args()
+ − 137 selection= args.choice
+ − 138
+ − 139 matrix1,column_labels1,row_labels1 = reader(args.input_file1) # to be transposed later
+ − 140 matrix2,column_labels2,row_labels2 = reader(args.input_file2)
+ − 141
+ − 142
+ − 143 if args.transpose == 'y' or args.input_file1 == args.input_file2:
+ − 144 matrix1 = Transpose(matrix1)
+ − 145 print("\n>>>NOTICE Transposed first matrix so matrix 1 columns = Matrix 2 number rows ")
+ − 146 temp = row_labels1 #swap labels for output matrix
+ − 147 row_labels1 = column_labels1 #swap labels for output matrix
+ − 148 column_labels1= temp #swap labels for output matrix
+ − 149
+ − 150 MatchLabels(column_labels1,row_labels2) # verfiy labels and their order match
+ − 151
+ − 152 if len(column_labels1) != len(row_labels2):
+ − 153 print("\n>>> ERROR attempting to multiple Matrices of incompatible dimensions ")
+ − 154 print("First Matrix is "+str(len(row_labels1))+" by "+str(len(column_labels1))+" where second Matrix is "+str(len(og_row2))+" by "+str(len(column_labels2))+"\n")
+ − 155 print("Matrices must have dimensions AxB and BxC. A can equal C (square matrices)")
+ − 156 sys.exit(-1)
+ − 157
+ − 158 if selection == "MatrixMultiply":
+ − 159 matrixOut= Matrix_Multiply(matrix1, matrix2 )
+ − 160
+ − 161 elif selection == "Corr2Matrices" or selection == "Corr1Matrix":
+ − 162 matrixOut = Correlate_Matrices(matrix1, matrix2)
+ − 163
+ − 164 Labeler(matrixOut,column_labels2,row_labels1,args.out_fileName)
+ − 165
+ − 166 print("Matrix Multiply "+str(len(row_labels1))+" by "+str(len(column_labels1))+" Matrix 1 by "+str(len(row_labels2))+" by "+str(len(column_labels2))+" matrix 2")
+ − 167 print("Output Matrix dimensions are "+str(len(row_labels1))+" by "+str(len(column_labels2))+"\n")
+ − 168
+ − 169 except Exception as err:
+ − 170 traceback.print_exc()
+ − 171 sys.exit(-3)
+ − 172
+ − 173 sys.exit(0)