annotate Matrix_Filters.py @ 1:f1bcd79cd923 draft default tip

Uploaded
author insilico-bob
date Tue, 27 Nov 2018 14:20:40 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
1 '''
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
2 Created on Jun 7, 2017 updated Feb2018
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
3
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
4 @author: rbrown and cjacoby
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
5 '''
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
6
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
7 import sys, traceback, argparse
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
8 import numpy as np
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
9 from Matrix_Validate_import import reader, Labeler
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
10 import math
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
11 #import matplotlib.pyplot as plt
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
12
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
13 #Define argparse Function
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def get_args():
    """Parse the command-line arguments of the matrix filter script.

    Five positional arguments are read from ``sys.argv``, in this order:
    ``input_file_txt``, ``choice``, ``thresh``, ``axes`` and
    ``output_file_txt``.  No numeric conversion happens here, so every
    value comes back as a string.

    Returns:
        argparse.Namespace: one attribute per positional argument.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_file_txt',
        help='tab delimited text file input matrix(include .txt in name)')
    parser.add_argument(
        'choice', type=str,
        help='Variance Filter Method (Variance or Range)')
    # Help text only: fixed the "Thershold" typo and the unbalanced "(".
    parser.add_argument(
        'thresh',
        help='Threshold for Variance Filtering')
    parser.add_argument(
        'axes',
        help='Axes to Filter on (Either Row or Column)')
    parser.add_argument(
        'output_file_txt',
        help='tab delimited text file output name (include .txt in name)')
    return parser.parse_args()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
23
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def Range_Filter_Row(matrix,thresh,row_header_list,column_header_list):
    """Drop every row whose value range (max - min) is at or below ``thresh``.

    Args:
        matrix: 2-D numeric data indexed as ``matrix[row][col]``.
        thresh: range threshold; passed through ``float()``, so a numeric
            string is accepted.
        row_header_list: row labels, one per row of ``matrix``.
        column_header_list: column labels; returned unchanged.

    Returns:
        tuple: ``(matrix, filter_rows, filter_cols, n_deleted, minVal,
        maxVal)`` where ``matrix`` and ``filter_rows`` have the rejected
        rows removed, ``n_deleted`` is how many rows were removed, and
        ``minVal`` / ``maxVal`` are the smallest / largest row range seen
        (before filtering).
    """
    limit = float(thresh)

    # Plain list of ints: np.append([], [i]) yields a float64 array, which
    # np.delete refuses as an index on current NumPy releases.
    deletes = []
    minVal, maxVal = float('inf'), float('-inf')

    for i, row in enumerate(matrix):
        spread = np.max(row) - np.min(row)

        # Two independent tests: with if/elif the first row (which always
        # sets a new minimum) could never register as the maximum.
        if spread < minVal:
            minVal = spread
        if spread > maxVal:
            maxVal = spread

        if spread <= limit:
            deletes.append(i)

    if minVal > maxVal:
        # No comparable range was seen (e.g. no rows): fall back to the
        # historical placeholder values.
        minVal, maxVal = 9999999, -99999

    matrix = np.delete(matrix, deletes, 0)
    filter_rows = np.delete(row_header_list, deletes, 0)
    filter_cols = column_header_list
    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
44
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def Range_Filter_Col(matrix,thresh,row_header_list,column_header_list):
    """Drop every column whose value range (max - min) is at or below ``thresh``.

    Args:
        matrix: 2-D numeric data indexed as ``matrix[row][col]``; the
            column count is taken from ``matrix[0]``.
        thresh: range threshold; passed through ``float()``, so a numeric
            string is accepted.
        row_header_list: row labels; returned unchanged.
        column_header_list: column labels, one per column of ``matrix``.

    Returns:
        tuple: ``(matrix, filter_rows, filter_cols, n_deleted, minVal,
        maxVal)`` where ``matrix`` and ``filter_cols`` have the rejected
        columns removed, ``n_deleted`` is how many columns were removed,
        and ``minVal`` / ``maxVal`` are the smallest / largest column range
        seen (before filtering).
    """
    limit = float(thresh)

    # Plain list of ints: np.append([], [i]) yields a float64 array, which
    # np.delete refuses as an index on current NumPy releases.
    deletes = []
    minVal, maxVal = float('inf'), float('-inf')

    for i in range(len(matrix[0])):
        column = [row[i] for row in matrix]  # built once, used for max and min
        spread = np.max(column) - np.min(column)

        # Two independent tests: with if/elif the first column (which
        # always sets a new minimum) could never register as the maximum.
        if spread < minVal:
            minVal = spread
        if spread > maxVal:
            maxVal = spread

        if spread <= limit:
            deletes.append(i)

    if minVal > maxVal:
        # No comparable range was seen: fall back to the historical
        # placeholder values.
        minVal, maxVal = 9999999, -99999

    matrix = np.delete(matrix, deletes, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, deletes, 0)

    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
70
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
71 #Define Function Which Deletes Sub-Threshold Rows
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def Variance_Percent_Filter_row(matrix,cutoff,row_header_list,column_header_list, create_plot= False):
    """Drop the rows whose variance falls in the lowest ``cutoff`` percent.

    The variance of each row is computed with ``np.var`` over its non-NaN
    values; a row with fewer than two such values counts as variance 0.0.
    Rows are then walked in ascending variance order and removed while the
    running row count stays below ``int(cutoff/100 * n_rows + 1)``.  Rows
    that share exactly the same variance are kept or removed together.

    Args:
        matrix: 2-D numeric data indexed as ``matrix[row][col]``.
        cutoff: percentile rank, 1 to 99; passed through ``int()``.
        row_header_list: row labels, one per row of ``matrix``.
        column_header_list: column labels; returned unchanged.
        create_plot: accepted for interface compatibility and ignored; the
            histogram code that used it was disabled together with the
            matplotlib import.

    Returns:
        tuple: ``(matrix, filter_rows, filter_cols, n_deleted, minVal,
        maxVal)`` with the low-variance rows removed from ``matrix`` and
        ``filter_rows``; ``minVal`` / ``maxVal`` are the lowest / highest
        row variance found.

    Raises:
        SystemExit: exit status -8, after a message on stderr, when
            ``cutoff`` is outside 1..99.
    """
    cutoff = int(cutoff)/100.0
    if cutoff > 0.99 or cutoff < .01:
        sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99")
        sys.exit(-8)

    n_rows = len(matrix)
    rows_by_variance = {}  # variance value -> indices of rows having it
    minVal, maxVal = float('inf'), float('-inf')

    for i, row in enumerate(matrix):
        values = [v for v in row if not math.isnan(v)]
        variance = np.var(values) if len(values) > 1 else 0.0

        # Two independent tests: with if/elif the first row (which always
        # sets a new minimum) could never register as the maximum.
        if variance < minVal:
            minVal = variance
        if variance > maxVal:
            maxVal = variance

        rows_by_variance.setdefault(variance, []).append(i)

    if minVal > maxVal:
        # No rows at all: fall back to the historical placeholder values.
        minVal, maxVal = 9999999, -99999

    # Number of lowest-variance rows at which removal stops.
    lowerLimit = int(cutoff*n_rows + 1)

    # Plain list of ints: np.append([], [j]) yields a float64 array, which
    # np.delete refuses as an index on current NumPy releases.
    deletes = []
    cnt = 0
    for variance in sorted(rows_by_variance):
        group = rows_by_variance[variance]
        cnt += len(group)
        if cnt >= lowerLimit:
            # The running count only grows, so nothing further qualifies.
            break
        deletes.extend(group)

    print("Dataset Lowest Variance= %.2f Highest Variance= %.2f and Percentile cutoff row = %s of %s rows"
          % (minVal, maxVal, lowerLimit, n_rows))

    matrix = np.delete(matrix, deletes, 0)
    filter_rows = np.delete(row_header_list, deletes, 0)
    filter_cols = column_header_list

    return matrix,filter_rows,filter_cols ,len(deletes), minVal,maxVal
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
185
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def Variance_Percent_Filter_col(matrix,cutoff,row_header_list,column_header_list, create_plot=False):
    """Drop the columns whose variance falls in the lowest ``cutoff`` percent.

    The variance of each column is computed with ``np.var`` over its
    non-NaN values; a column with fewer than two such values counts as
    variance 0.0.  Columns are then walked in ascending variance order and
    removed while the running column count stays below
    ``int(cutoff/100 * n_cols + 1)``.  Columns that share exactly the same
    variance are kept or removed together.

    Args:
        matrix: 2-D numeric data indexed as ``matrix[row][col]``; the
            column count is taken from ``matrix[0]``.
        cutoff: percentile rank, 1 to 99; passed through ``int()``.
        row_header_list: row labels; returned unchanged.
        column_header_list: column labels, one per column of ``matrix``.
        create_plot: accepted for interface compatibility and ignored; the
            histogram code that used it was disabled together with the
            matplotlib import.

    Returns:
        tuple: ``(matrix, filter_rows, filter_cols, n_deleted, minVal,
        maxVal)`` with the low-variance columns removed from ``matrix`` and
        ``filter_cols``; ``minVal`` / ``maxVal`` are the lowest / highest
        column variance found.

    Raises:
        SystemExit: exit status -8, after a message on stderr, when
            ``cutoff`` is outside 1..99.
    """
    cutoff = int(cutoff)/100.0
    if cutoff > 0.99 or cutoff < .01:
        sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99")
        sys.exit(-8)

    lenCol = len(matrix[0])
    cols_by_variance = {}  # variance value -> indices of columns having it
    minVal, maxVal = float('inf'), float('-inf')

    for i in range(lenCol):
        values = [row[i] for row in matrix if not math.isnan(row[i])]
        variance = np.var(values) if len(values) > 1 else 0.0

        # Two independent tests: with if/elif the first column (which
        # always sets a new minimum) could never register as the maximum.
        if variance < minVal:
            minVal = variance
        if variance > maxVal:
            maxVal = variance

        cols_by_variance.setdefault(variance, []).append(i)

    if minVal > maxVal:
        # No columns at all: fall back to the historical placeholder values.
        minVal, maxVal = 9999999, -99999

    # Number of lowest-variance columns at which removal stops.
    lowerLimit = int(cutoff*lenCol + 1)

    # Plain list of ints: np.append([], [j]) yields a float64 array, which
    # np.delete refuses as an index on current NumPy releases.
    deletes = []
    cnt = 0
    for variance in sorted(cols_by_variance):
        group = cols_by_variance[variance]
        cnt += len(group)
        if cnt >= lowerLimit:
            # The running count only grows, so nothing further qualifies.
            break
        deletes.extend(group)

    print("Dataset Lowest Variance= %.2f Highest Variance= %.2f and Percentile cutoff column= %s of %s columns"
          % (minVal, maxVal, lowerLimit, lenCol))

    matrix = np.delete(matrix, deletes, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, deletes, 0)

    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
295
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def UpperLowerLimit_Filter_Row(upperLower, matrix,cutoff,row_header_list,column_header_list):
    """Drop every row that contains a value beyond ``cutoff``.

    With ``upperLower == 'lower'`` a row is removed when any non-NaN value
    is ``<= cutoff``; with ``'upper'`` when any non-NaN value is
    ``>= cutoff``.  Any other ``upperLower`` value removes nothing.  NaN
    cells are ignored.

    Args:
        upperLower: ``'lower'`` or ``'upper'``.
        matrix: 2-D numeric data indexed as ``matrix[row][col]``.
        cutoff: limit value; passed through ``float()``, so a numeric
            string is accepted.
        row_header_list: row labels, one per row of ``matrix``.
        column_header_list: column labels; returned unchanged.

    Returns:
        tuple: ``(matrix, filter_rows, filter_cols, n_deleted, minVal,
        maxVal)`` with the rejected rows removed from ``matrix`` and
        ``filter_rows``.  ``minVal`` / ``maxVal`` are the smallest /
        largest non-NaN value that did not itself trigger a removal
        (values sitting in a row removed because of another cell still
        count).
    """
    # The sibling Range filters convert their threshold; doing the same
    # here keeps a string cutoff from raising on comparison.
    cutoff = float(cutoff)
    drop_low = upperLower == 'lower'
    drop_high = upperLower == 'upper'

    # Plain list of ints: np.append([], [i]) yields a float64 array, which
    # np.delete refuses as an index on current NumPy releases.
    deletes = []
    minVal, maxVal = float('inf'), float('-inf')

    for i, row in enumerate(matrix):
        removeRow = False

        for val in row:
            if math.isnan(val):
                continue
            # NOTE(review): the original indentation was lost; the min/max
            # update is taken to be the else-branch of the two cutoff
            # tests (the other reading would make it a no-op) - confirm.
            if (drop_low and val <= cutoff) or (drop_high and val >= cutoff):
                removeRow = True
            else:
                if val < minVal:
                    minVal = val
                if val > maxVal:
                    maxVal = val

        if removeRow:
            deletes.append(i)

    if minVal > maxVal:
        # Nothing was tracked: fall back to the historical placeholders.
        minVal, maxVal = 9999999, -99999

    matrix = np.delete(matrix, deletes, 0)
    filter_rows = np.delete(row_header_list, deletes, 0)
    filter_cols = column_header_list

    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
326
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def UpperLowerLimit_Filter_Col(upperLower,matrix,cutoff,row_header_list,column_header_list):
    """Drop every column that contains a value beyond ``cutoff``.

    With ``upperLower == 'lower'`` a column is removed when any non-NaN
    value is ``<= cutoff``; with ``'upper'`` when any non-NaN value is
    ``>= cutoff``.  Any other ``upperLower`` value removes nothing.  NaN
    cells are ignored.

    Args:
        upperLower: ``'lower'`` or ``'upper'``.
        matrix: 2-D numeric data indexed as ``matrix[row][col]``; the
            column count is taken from ``matrix[0]``.
        cutoff: limit value; passed through ``float()``, so a numeric
            string is accepted.
        row_header_list: row labels; returned unchanged.
        column_header_list: column labels, one per column of ``matrix``.

    Returns:
        tuple: ``(matrix, filter_rows, filter_cols, n_deleted, minVal,
        maxVal)`` with the rejected columns removed from ``matrix`` and
        ``filter_cols``.  ``minVal`` / ``maxVal`` are the smallest /
        largest non-NaN value that did not itself trigger a removal
        (values sitting in a column removed because of another cell still
        count).
    """
    # The sibling Range filters convert their threshold; doing the same
    # here keeps a string cutoff from raising on comparison.
    cutoff = float(cutoff)
    drop_low = upperLower == 'lower'
    drop_high = upperLower == 'upper'

    # Plain list of ints: np.append([], [i]) yields a float64 array, which
    # np.delete refuses as an index on current NumPy releases.
    deletes = []
    minVal, maxVal = float('inf'), float('-inf')

    for i in range(len(matrix[0])):
        removeCol = False

        for row in matrix:
            val = row[i]
            if math.isnan(val):
                continue
            # NOTE(review): the original indentation was lost; the min/max
            # update is taken to be the else-branch of the two cutoff
            # tests (the other reading would make it a no-op) - confirm.
            if (drop_low and val <= cutoff) or (drop_high and val >= cutoff):
                removeCol = True
            else:
                if val < minVal:
                    minVal = val
                if val > maxVal:
                    maxVal = val

        if removeCol:
            deletes.append(i)

    if minVal > maxVal:
        # Nothing was tracked: fall back to the historical placeholders.
        minVal, maxVal = 9999999, -99999

    matrix = np.delete(matrix, deletes, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, deletes, 0)

    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
358
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
359 #========= remove rows with too many NANs in cells
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def NAN_Filter_Row(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list):
    """Remove every row that holds maxAllowedNANs or more NAN-like cells.

    A cell counts as NAN when ``str(cell)`` is one of the strings in nanList.

    Parameters:
        matrix: 2-D matrix; iterated row by row and handed to np.delete.
        nanList: strings that mark a missing value (e.g. "NA", "nan").
        maxAllowedNANs: a row with this many NAN cells, or more, is removed.
        row_header_list: row labels; labels of removed rows are dropped.
        column_header_list: column labels; returned unchanged.

    Returns:
        Tuple (matrix, filter_rows, filter_cols, removed_row_count,
        maxFoundNANs), where maxFoundNANs is the largest NAN count seen in
        any single row (removed or not).

    Any exception raised while filtering is printed with its traceback and
    the process exits with status -4.
    """
    deletes = []
    try:
        maxFoundNANs = 0
        for i in range(len(matrix)):
            lenMatches = sum(1 for s in matrix[i] if str(s) in nanList)
            if lenMatches > maxFoundNANs:
                maxFoundNANs = lenMatches
            if lenMatches >= maxAllowedNANs:
                deletes.append(i)

        # np.delete only accepts integer indices; growing the index list with
        # np.append on an empty list silently produced a float64 array, which
        # current NumPy rejects with IndexError.
        delete_idx = np.asarray(deletes, dtype=int)
        matrix = np.delete(matrix, delete_idx, 0)
        filter_rows = np.delete(row_header_list, delete_idx, 0)
        filter_cols = column_header_list

    except Exception:
        traceback.print_exc()
        sys.exit(-4)

    return matrix, filter_rows, filter_cols, len(deletes), maxFoundNANs
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
391
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
392 #========= remove Cols with too many NANs
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
393
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def NAN_Filter_Column(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list):
    """Remove every column that holds maxAllowedNANs or more NAN-like cells.

    A cell counts as NAN when ``str(cell)`` is one of the strings in nanList.

    Parameters:
        matrix: 2-D matrix indexed as matrix[row][col]; must have at least
            one row, because the column count is read from matrix[0].
        nanList: strings that mark a missing value (e.g. "NA", "nan").
        maxAllowedNANs: a column with this many NAN cells, or more, is removed.
        row_header_list: row labels; returned unchanged.
        column_header_list: column labels; labels of removed columns are dropped.

    Returns:
        Tuple (matrix, filter_rows, filter_cols, removed_column_count,
        maxFoundNANs), where maxFoundNANs is the largest NAN count seen in
        any single column (removed or not).
    """
    maxFoundNANs = 0
    deletes = []
    row_count = len(matrix)
    for i in range(len(matrix[0])):
        lenMatches = sum(1 for j in range(row_count) if str(matrix[j][i]) in nanList)
        if lenMatches > maxFoundNANs:
            maxFoundNANs = lenMatches
        if lenMatches >= maxAllowedNANs:
            deletes.append(i)

    # np.delete only accepts integer indices; growing the index list with
    # np.append on an empty list silently produced a float64 array, which
    # current NumPy rejects with IndexError.
    delete_idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, delete_idx, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, delete_idx, 0)
    return matrix, filter_rows, filter_cols, len(deletes), maxFoundNANs
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
419
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
420
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
421 #MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def Row_Value_MAD(matrix,cutoff,row_header_list,column_header_list):
    """Remove rows whose median absolute deviation (MAD) is below cutoff.

    For each row, MAD = median(|Xi - median(row)|); using medians limits the
    influence of a single outlier. Rows with MAD < cutoff are removed, so
    cutoff is the smallest MAD a row may have and still be kept.

    Parameters:
        matrix: 2-D numeric matrix; each matrix[i] must support subtraction
            of a scalar.
        cutoff: minimum MAD required for a row to be kept.
        row_header_list: row labels; labels of removed rows are dropped.
        column_header_list: column labels; returned unchanged.

    Returns:
        Tuple (matrix, filter_rows, filter_cols, removed_row_count, maxVal),
        where maxVal is the largest MAD found in any row. The smallest and
        largest MAD are also printed as an INFO line.
    """
    deletes = []
    # Sentinels for the running min/max; they are reported unchanged when the
    # matrix has no rows.
    minVal = +9999999
    maxVal = -99999
    for i in range(len(matrix)):
        row = matrix[i]
        temp = np.median(np.abs(row - np.median(row)))
        if temp < cutoff:
            deletes.append(i)

        # NOTE(review): if a row yields a NaN MAD, every comparison here is
        # False, so that row is kept and ignored by min/max - confirm intended.
        if temp < minVal: minVal = temp
        if temp > maxVal: maxVal = temp

    # np.delete only accepts integer indices; growing the index list with
    # np.append on an empty list silently produced a float64 array, which
    # current NumPy rejects with IndexError.
    delete_idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, delete_idx, 0)
    filter_rows = np.delete(row_header_list, delete_idx, 0)
    filter_cols = column_header_list
    print( "INFO Row MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )

    return matrix, filter_rows, filter_cols, len(deletes), maxVal
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
447
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
448 #MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def Col_Value_MAD(matrix,cutoff,row_header_list,column_header_list):
    """Remove columns whose median absolute deviation (MAD) is below cutoff.

    For each column, MAD = median(|Xi - median(column)|); using medians limits
    the influence of a single outlier. Columns with MAD < cutoff are removed,
    so cutoff is the smallest MAD a column may have and still be kept.

    Parameters:
        matrix: 2-D numeric matrix indexed as matrix[row][col]; must have at
            least one row, because the column count is read from matrix[0].
        cutoff: minimum MAD required for a column to be kept.
        row_header_list: row labels; returned unchanged.
        column_header_list: column labels; labels of removed columns are dropped.

    Returns:
        Tuple (matrix, filter_rows, filter_cols, removed_column_count, maxVal),
        where maxVal is the largest MAD found in any column. The smallest and
        largest MAD are also printed as an INFO line.
    """
    deletes = []
    # Sentinels for the running min/max.
    minVal = +9999999
    maxVal = -99999
    row_count = len(matrix)
    for i in range(len(matrix[0])):
        # Gather the column into an array so the subtraction below does not
        # depend on list-minus-scalar reflected arithmetic.
        matrixCol = np.asarray([matrix[j][i] for j in range(row_count)])
        temp = np.median(np.abs(matrixCol - np.median(matrixCol)))
        if temp < cutoff:
            deletes.append(i)

        # NOTE(review): if a column yields a NaN MAD, every comparison here is
        # False, so that column is kept and ignored by min/max - confirm intended.
        if temp < minVal: minVal = temp
        if temp > maxVal: maxVal = temp

    # np.delete only accepts integer indices; growing the index list with
    # np.append on an empty list silently produced a float64 array, which
    # current NumPy rejects with IndexError.
    delete_idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, delete_idx, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, delete_idx, 0)
    print( "INFO Column MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )

    return matrix, filter_rows, filter_cols, len(deletes), maxVal
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
477
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
478
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
479 # If the covariance of the data in two columns exceeds a threshold, remove one of them and list what was removed in a separate output.
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
480 # def CoVariance_Percent_Filter_row_col(matrix,thresh,row_header_list,column_header_list):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
481 # xv= array([8., 9.5, 7.8, 4.2, -7.7, -5.4, 3.2])
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
482 # yv= array([8.9, 2.0, 4.8, -4.2, 2.7, -3.4, -5.9])
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
483 #
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
484 # def cov(x,y):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
485 # if (len(x) != len(y)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
486 # [Stop]
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
487 # x.bar = mean(x)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
488 # y.bar = mean(y)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
489 # N = len(x)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
490 # Cov = (sum((x-x.bar)*(y-y.bar))) / (N-1.0)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
491 # return(Cov)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
492
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
493 # #Create Null Set of Filtered Row(Populated Later)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
494 # deletes = []
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
495 #
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
496 # temp_mean = np.nanmean(matrix[i])
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
497 # temp_stdev = np.nanstd(matrix[i])
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
498 #
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
499 # get stddev of each row the calc xi -xj sq
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
500 #
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
501 # for i in range(0,len(matrix)):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
502 # temp_range = np.max(matrix[i][0::]) - np.min(matrix[i][0::])
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
503 # if temp_range <= float(thresh):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
504 # deletes = np.append(deletes,[i],0)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
505 #
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
506 # #Delete Rows sub-Threshold Rows
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
507 # matrix = np.delete(matrix,deletes,0)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
508 # filter_rows = np.delete(row_header_list,deletes,0)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
509 # filter_cols = column_header_list
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
510 # return(matrix,filter_rows,filter_cols)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
511 #
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
512 # #np.savetxt('testtest.txt',matrix,delimiter='\t')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
513 # return(matrix,filter_rows,filter_cols)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
514 #
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
515
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
516 #Define Function Which Labels Rows/Columns on Output
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
517 #below replace
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
518 # def labeler(matrix,filter_rows,filter_cols,output_file_txt):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
519 #
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
520 # #Write Data to Specified Text File Output
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
521 # with open(output_file_txt,'w') as f:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
522 # f.write("")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
523 # for k in range(0,len(filter_cols)):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
524 # f.write('\t' + filter_cols[k])
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
525 # f.write('\n')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
526 # for i in range(0,len(filter_rows)):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
527 # f.write(filter_rows[i])
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
528 # for j in range(0,len(matrix[0])):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
529 # f.write('\t' + format(matrix[i][j]))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
530 # f.write('\n')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
531
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
532
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
533 #Define Main Function
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
534 def main():
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
535 try:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
536 args = get_args()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
537 #sys.stdout.write(str(args)+"\n")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
538 # <option value="LowerLimit">Minimum Absolute(Cell) Values to remove row/column</option>
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
539 # <option value="UpperLimit">Maximum Absolute(Cell) Values to remove row/column</option>
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
540 # <option value="NANnumber">NAN Number Cells Limit to remove row/column</option>
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
541 # <option value="NANpercent">NAN Percent Cells Limit to remove row/column</option>
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
542 nanList= ["NAN", "NA", "N/A", "-","?","nan", "na", "n/a"]
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
543
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
544 matrix, column_header_list,row_header_list = reader(args.input_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
545 #old_reader matrix, row_header_list, column_header_list = reader(args.input_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
546 threshold = float(args.thresh)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
547 if threshold < 0.000001:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
548 print('Invalid negative or near-zero threshold chosen = '+str(args.thresh)+" choose positive value")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
549 sys.exit(-4)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
550
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
551 #VariancePercent
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
552 if args.choice == "VariancePercent" or args.choice == "VarianceCount": # > percent variance
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
553
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
554 if args.axes == "Row":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
555 if args.choice == "VarianceCount": threshold= (1-threshold/len(row_header_list))*100.0
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
556
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
557 matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_row(matrix,threshold,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
558 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
559 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
560 print('\nNO Filtering occurred for rows using variance percentile < '+str(args.thresh)+ ' by row. Matrix row minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
561 sys.stderr.write('\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
562 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
563 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
564 print('\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
565 elif args.axes == "Column":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
566 if args.choice == "VarianceCount": threshold= (1-threshold/len(column_header_list))*100.0
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
567 matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_col(matrix,threshold,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
568 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
569 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
570 print('\nNO Filtering occurred for columns using variance percentile < '+str(args.thresh)+ ' by columns. Matrix columns minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
571 sys.stderr.write('\nNO Filtering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
572 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
573 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
574 print('\nFiltering out columns using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' columns')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
575 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
576 print('Invalid Axes ='+str(args.thresh))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
577 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
578 #LowerLimit
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
579 elif args.choice == "LowerLimit": #!! todo is NOT lower or upper limit but range of values
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
580 if args.axes == "Row":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
581 matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('lower',matrix,threshold,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
582 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
583 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
584 print('\nNO Filtering occurred for rows using LowerLimit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
585 sys.stderr.write('\nNO Filtering out rows using LowerLimit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
586 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
587 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
588 print('\nFiltered out '+str(delCnt)+' rows with Lower Limit < '+str(args.thresh))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
589 elif args.axes == "Column":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
590 matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('lower', matrix,threshold,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
591 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
592 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
593 print('\nNO Filtering occurred for columns using Lower Limit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
594 sys.stderr.write('\nNO Filtering out rows using Lower Limit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
595 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
596 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
597 print('\nFiltered out '+str(delCnt)+' columns with Lower Limit < '+str(args.thresh))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
598 #UpperLimit
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
599 elif args.choice == "UpperLimit": #!! todo is NOT lower or upper limit but range of values
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
600 if args.axes == "Row":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
601 matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('upper',matrix,threshold,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
602 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
603 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
604 print('\nNO Filtering occurred for rows using Upper Limit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
605 sys.stderr.write('\nNO Filtering out rows using Upper Limit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
606 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
607 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
608 print('\nFiltered out '+str(delCnt)+' rows with UpperLimit < '+str(args.thresh))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
609 elif args.axes == "Column":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
610 matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('upper', matrix,threshold,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
611 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
612 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
613 print('\nNO Filtering occurred for columns using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
614 sys.stderr.write('\nFiltering out rows using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
615 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
616 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
617 print('\nFiltered out '+str(delCnt)+' columns with UpperLimit < '+str(args.thresh))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
618 #MADlimit
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
619 elif args.choice == "MADcount" or args.choice == "MADpercent": #!! is lowerlimit of median absolute deviation medians
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
620 threshold= threshold
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
621 if args.axes == "Row":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
622 if args.choice == "MADpercent": threshold= len(row_header_list)*threshold/100.0
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
623
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
624 matrix, filter_rows, filter_cols,delCnt,maxVal = Row_Value_MAD(matrix,threshold,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
625 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
626 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
627 print('\nNO Filtering occurred for rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
628 sys.stderr.write('\nFiltering out rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
629 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
630 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
631 print('\nFiltered out '+str(delCnt)+' rows using MAD maximum value > '+str(threshold))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
632 elif args.axes == "Column":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
633 if args.choice == "MADpercent": threshold= len(column_header_list)*threshold/100.0
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
634
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
635 matrix, filter_rows, filter_cols,delCnt,maxVal = Col_Value_MAD(matrix,threshold,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
636 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
637 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
638 print('\nNO Filtering occurred for columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
639 sys.stderr.write('\nFiltering out columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
640 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
641 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
642 print('\nFiltered out '+str(delCnt)+' columns using MAD maximum value > '+str(threshold))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
643 #NANlimit
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
644 elif args.choice == "NANlimit" or args.choice == "NANpercent":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
645 maxNANs= int(args.thresh)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
646 val= ' '
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
647 if args.choice == "NANpercent":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
648 n,m = np.shape(matrix)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
649 maxNANs= int(int(args.thresh)*n/100)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
650 val= '%'
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
651 if args.axes == "Row":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
652 matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Row(matrix,nanList,maxNANs,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
653 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
654 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
655 print('\nNO Filtering occurred for rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs ))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
656 sys.stderr.write('\nNO Filtering out rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs ))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
657 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
658 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
659 print('\nFiltered out '+str(delCnt)+' rows using NAN limit = or > '+str(args.thresh)+val)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
660 elif args.axes == "Column":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
661 matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Column(matrix, nanList, maxNANs, row_header_list, column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
662 Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
663 if delCnt < 1:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
664 print('\nNO Filtering occurred for columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
665 sys.stderr.write('\nNO Filtering out columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
666 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
667 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
668 print('\nFiltered out '+str(delCnt)+' columns using NAN limit = or > '+str(args.thresh)+val )
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
669
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
670 # elif args.choice == "covariance":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
671 # if args.axes == "Row":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
672 # matrix, filter_rows, filter_cols = CoVariance_Percent_Filter_row(matrix,args.thresh,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
673 # Labeler(matrix,filter_rows,filter_cols,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
674 # print('Covariance_Filter on row')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
675 # elif args.axes == "Column":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
676 # matrix, filter_rows, filter_cols = CoVariance_Percent_Filter_col(matrix,args.thresh,row_header_list,column_header_list)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
677 # Labeler(matrix,filter_rows,filter_cols,args.output_file_txt)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
678 # print('Covariance_Filter on column')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
679 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
680 print('Invalid Axes = '+str(args.axes))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
681 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
682 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
683 print("Invalid Filter Choice = "+str(args.choice))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
684 sys.exit(-2)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
685
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
686
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
687 except Exception as err:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
688 traceback.print_exc()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
689 sys.exit(-3)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
690
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
# Script entry point: only runs when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    # main() ends the process itself through sys.exit() with a non-zero
    # status when the axes or filter choice is invalid, when a filter
    # removes nothing, or when an unexpected exception is caught. The
    # statements below are therefore reached only after a normal return.
    main()
    sys.stdout.write("\ndone" + "\n")
    sys.exit(0)