1
|
1 '''
|
|
2 Created on Jun 7, 2017 updated Feb2018
|
|
3
|
|
4 @author: rbrown and cjacoby
|
|
5 '''
|
|
6
|
|
7 import sys, traceback, argparse
|
|
8 import numpy as np
|
|
9 from Matrix_Validate_import import reader, Labeler
|
|
10 import math
|
|
11 #import matplotlib.pyplot as plt
|
|
12
|
|
13 #Define argparse Function
|
|
def get_args(argv=None):
    """Parse the command-line arguments of the matrix filter tool.

    Args:
        argv: Optional list of argument strings to parse.  ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what the
            original zero-argument call did.

    Returns:
        argparse.Namespace with the attributes ``input_file_txt``, ``choice``,
        ``thresh``, ``axes`` and ``output_file_txt``.  All values are strings;
        ``thresh`` is converted by the caller.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_file_txt',
        help='tab delimited text file input matrix (include .txt in name)')
    # The help text lists the values main() actually dispatches on.
    parser.add_argument(
        'choice', type=str,
        help='Filter method: VariancePercent, VarianceCount, LowerLimit, '
             'UpperLimit, MADcount, MADpercent, NANlimit or NANpercent')
    parser.add_argument('thresh', help='Threshold for the chosen filter')
    parser.add_argument('axes', help='Axes to filter on (either Row or Column)')
    parser.add_argument(
        'output_file_txt',
        help='tab delimited text file output name (include .txt in name)')
    return parser.parse_args(argv)
|
|
23
|
|
def Range_Filter_Row(matrix, thresh, row_header_list, column_header_list):
    """Remove every row whose value range (max - min) is <= ``thresh``.

    Args:
        matrix: 2-D numeric data, indexed as ``matrix[row][col]``.
        thresh: Range threshold; anything accepted by ``float()``.
        row_header_list: Row labels, one per matrix row.
        column_header_list: Column labels; returned unchanged.

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, minVal, maxVal)``
        where ``minVal``/``maxVal`` are the smallest and largest row range seen
        (``inf``/``-inf`` when the matrix has no rows).

    Note:
        ``np.max``/``np.min`` propagate NaN, so a row containing NaN has a NaN
        range, fails every comparison and is therefore never removed.
    """
    limit = float(thresh)
    deletes = []  # plain ints: np.delete rejects float index arrays
    minVal = float("inf")
    maxVal = float("-inf")

    for i, row in enumerate(matrix):
        row_range = np.max(row) - np.min(row)

        # Two independent tests: with if/elif the first row only ever updated
        # minVal and maxVal could stay at its sentinel.
        if row_range < minVal:
            minVal = row_range
        if row_range > maxVal:
            maxVal = row_range

        if row_range <= limit:
            deletes.append(i)

    # Delete the sub-threshold rows and their labels.
    idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, idx, 0)
    filter_rows = np.delete(row_header_list, idx, 0)
    filter_cols = column_header_list
    return matrix, filter_rows, filter_cols, len(deletes), minVal, maxVal
|
|
44
|
|
def Range_Filter_Col(matrix, thresh, row_header_list, column_header_list):
    """Remove every column whose value range (max - min) is <= ``thresh``.

    Args:
        matrix: 2-D numeric data, indexed as ``matrix[row][col]``; the column
            count is taken from the first row.
        thresh: Range threshold; anything accepted by ``float()``.
        row_header_list: Row labels; returned unchanged.
        column_header_list: Column labels, one per matrix column.

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, minVal, maxVal)``
        where ``minVal``/``maxVal`` are the smallest and largest column range
        seen (``inf``/``-inf`` when there are no columns).

    Note:
        ``np.max``/``np.min`` propagate NaN, so a column containing NaN has a
        NaN range and is never removed.
    """
    limit = float(thresh)
    deletes = []  # plain ints: np.delete rejects float index arrays
    minVal = float("inf")
    maxVal = float("-inf")

    for i in range(len(matrix[0])):
        column = [row[i] for row in matrix]
        col_range = np.max(column) - np.min(column)

        # Independent tests so a single column updates both extremes.
        if col_range < minVal:
            minVal = col_range
        if col_range > maxVal:
            maxVal = col_range

        if col_range <= limit:
            deletes.append(i)

    # Delete the sub-threshold columns and their labels.
    idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, idx, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, idx, 0)
    return matrix, filter_rows, filter_cols, len(deletes), minVal, maxVal
|
|
70
|
|
71 #Define Function Which Deletes Sub-Threshold Rows
|
|
def Variance_Percent_Filter_row(matrix, cutoff, row_header_list, column_header_list, create_plot=False):
    """Remove the rows whose variance falls below a percentile rank.

    The variance of each row is computed over its non-NaN cells (0.0 when
    fewer than two such cells exist).  Rows are grouped by identical variance,
    the groups are visited in ascending order, and a group is removed only
    while the cumulative row count, including that group, stays below
    ``int(cutoff/100 * n_rows + 1)``.

    Args:
        matrix: 2-D numeric data, indexed as ``matrix[row][col]``.
        cutoff: Percentile rank; truncated with ``int()`` and required to be
            in 1..99, otherwise an error is written to stderr and the process
            exits with status -8.
        row_header_list: Row labels, one per matrix row.
        column_header_list: Column labels; returned unchanged.
        create_plot: Accepted for compatibility; currently ignored (the
            histogram code was disabled together with the matplotlib import).

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, minVal, maxVal)``
        where ``minVal``/``maxVal`` are the lowest and highest row variance.
    """
    cutoff = int(cutoff) / 100.0
    if cutoff > 0.99 or cutoff < .01:
        sys.stderr.write("ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99")
        sys.exit(-8)

    minVal = float("inf")
    maxVal = float("-inf")
    rows_by_variance = {}  # variance value -> indices of rows having it

    for i, row in enumerate(matrix):
        values = [v for v in row if not math.isnan(v)]
        variance = np.var(values) if len(values) > 1 else 0.0

        # Independent tests so the first row updates both extremes.
        if variance < minVal:
            minVal = variance
        if variance > maxVal:
            maxVal = variance

        rows_by_variance.setdefault(variance, []).append(i)

    # Number of rows (counted from the lowest variance) at which removal stops.
    lowerLimit = int(cutoff*len(matrix) + 1)
    deletes = []  # plain ints: np.delete rejects float index arrays
    cnt = 0
    for _, members in sorted(rows_by_variance.items(), key=lambda item: item[0]):
        cnt += len(members)
        if cnt >= lowerLimit:
            # cnt only grows, so no later group can qualify.
            break
        deletes.extend(members)

    print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff row = "+str(lowerLimit)+" of "+str(len(matrix))+" rows")

    # Delete the low-variance rows and their labels.
    idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, idx, 0)
    filter_rows = np.delete(row_header_list, idx, 0)
    filter_cols = column_header_list

    return matrix, filter_rows, filter_cols, len(deletes), minVal, maxVal
|
|
185
|
|
def Variance_Percent_Filter_col(matrix, cutoff, row_header_list, column_header_list, create_plot=False):
    """Remove the columns whose variance falls below a percentile rank.

    The variance of each column is computed over its non-NaN cells (0.0 when
    fewer than two such cells exist).  Columns are grouped by identical
    variance, the groups are visited in ascending order, and a group is
    removed only while the cumulative column count, including that group,
    stays below ``int(cutoff/100 * n_cols + 1)``.

    Args:
        matrix: 2-D numeric data, indexed as ``matrix[row][col]``; the column
            count is taken from the first row.
        cutoff: Percentile rank; truncated with ``int()`` and required to be
            in 1..99, otherwise an error is written to stderr and the process
            exits with status -8.
        row_header_list: Row labels; returned unchanged.
        column_header_list: Column labels, one per matrix column.
        create_plot: Accepted for compatibility; currently ignored (the
            histogram code was disabled together with the matplotlib import).

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, minVal, maxVal)``
        where ``minVal``/``maxVal`` are the lowest and highest column variance.
    """
    cutoff = int(cutoff) / 100.0
    if cutoff > 0.99 or cutoff < .01:
        sys.stderr.write("ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99")
        sys.exit(-8)

    minVal = float("inf")
    maxVal = float("-inf")
    lenCol = len(matrix[0])
    cols_by_variance = {}  # variance value -> indices of columns having it

    for i in range(lenCol):
        values = [row[i] for row in matrix if not math.isnan(row[i])]
        variance = np.var(values) if len(values) > 1 else 0.0

        # Independent tests so the first column updates both extremes.
        if variance < minVal:
            minVal = variance
        if variance > maxVal:
            maxVal = variance

        cols_by_variance.setdefault(variance, []).append(i)

    # Number of columns (counted from the lowest variance) at which removal stops.
    lowerLimit = int(cutoff*lenCol + 1)
    deletes = []  # plain ints: np.delete rejects float index arrays
    cnt = 0
    for _, members in sorted(cols_by_variance.items(), key=lambda item: item[0]):
        cnt += len(members)
        if cnt >= lowerLimit:
            # cnt only grows, so no later group can qualify.
            break
        deletes.extend(members)

    print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff column= "+str(lowerLimit)+" of "+str(lenCol)+" columns")

    # Delete the low-variance columns and their labels.
    idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, idx, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, idx, 0)

    return matrix, filter_rows, filter_cols, len(deletes), minVal, maxVal
|
|
295
|
|
def UpperLowerLimit_Filter_Row(upperLower, matrix, cutoff, row_header_list, column_header_list):
    """Remove every row that has at least one cell beyond ``cutoff``.

    With ``upperLower == 'lower'`` a row is removed when any non-NaN cell is
    ``<= cutoff``; with ``'upper'`` when any non-NaN cell is ``>= cutoff``.
    Any other ``upperLower`` value removes nothing.  NaN cells are ignored.

    Args:
        upperLower: ``'lower'`` or ``'upper'``.
        matrix: 2-D numeric data, indexed as ``matrix[row][col]``.
        cutoff: Numeric limit the cells are compared against.
        row_header_list: Row labels, one per matrix row.
        column_header_list: Column labels; returned unchanged.

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, minVal, maxVal)``
        where ``minVal``/``maxVal`` are the extremes of the non-NaN cells that
        did NOT trigger a removal (``inf``/``-inf`` if there were none).
    """
    deletes = []  # plain ints: np.delete rejects float index arrays
    minVal = float("inf")
    maxVal = float("-inf")

    for i, row in enumerate(matrix):
        removeRow = False

        for val in row:
            if math.isnan(val):
                continue
            beyond_lower = upperLower == 'lower' and val <= cutoff
            beyond_upper = upperLower == 'upper' and val >= cutoff
            if beyond_lower or beyond_upper:
                removeRow = True
            else:
                # Only in-limit values contribute to the reported extremes.
                if val < minVal:
                    minVal = val
                if val > maxVal:
                    maxVal = val

        if removeRow:
            deletes.append(i)

    # Delete the offending rows and their labels.
    idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, idx, 0)
    filter_rows = np.delete(row_header_list, idx, 0)
    filter_cols = column_header_list

    return matrix, filter_rows, filter_cols, len(deletes), minVal, maxVal
|
|
326
|
|
def UpperLowerLimit_Filter_Col(upperLower, matrix, cutoff, row_header_list, column_header_list):
    """Remove every column that has at least one cell beyond ``cutoff``.

    With ``upperLower == 'lower'`` a column is removed when any non-NaN cell
    is ``<= cutoff``; with ``'upper'`` when any non-NaN cell is ``>= cutoff``.
    Any other ``upperLower`` value removes nothing.  NaN cells are ignored.

    Args:
        upperLower: ``'lower'`` or ``'upper'``.
        matrix: 2-D numeric data, indexed as ``matrix[row][col]``; the column
            count is taken from the first row.
        cutoff: Numeric limit the cells are compared against.
        row_header_list: Row labels; returned unchanged.
        column_header_list: Column labels, one per matrix column.

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, minVal, maxVal)``
        where ``minVal``/``maxVal`` are the extremes of the non-NaN cells that
        did NOT trigger a removal (``inf``/``-inf`` if there were none).
    """
    deletes = []  # plain ints: np.delete rejects float index arrays
    minVal = float("inf")
    maxVal = float("-inf")

    for i in range(len(matrix[0])):
        removeCol = False

        for row in matrix:
            val = row[i]
            if math.isnan(val):
                continue
            beyond_lower = upperLower == 'lower' and val <= cutoff
            beyond_upper = upperLower == 'upper' and val >= cutoff
            if beyond_lower or beyond_upper:
                removeCol = True
            else:
                # Only in-limit values contribute to the reported extremes.
                if val < minVal:
                    minVal = val
                if val > maxVal:
                    maxVal = val

        if removeCol:
            deletes.append(i)

    # Delete the offending columns and their labels.
    idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, idx, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, idx, 0)

    return matrix, filter_rows, filter_cols, len(deletes), minVal, maxVal
|
|
358
|
|
359 #========= remove rows with too many NANs in cells
|
|
def NAN_Filter_Row(matrix, nanList, maxAllowedNANs, row_header_list, column_header_list):
    """Remove every row holding ``maxAllowedNANs`` or more missing-value cells.

    A cell counts as missing when ``str(cell)`` is one of the strings in
    ``nanList``.

    Args:
        matrix: 2-D data, indexed as ``matrix[row][col]``.
        nanList: Collection of strings that denote a missing value.
        maxAllowedNANs: A row with at least this many missing cells is removed.
        row_header_list: Row labels, one per matrix row.
        column_header_list: Column labels; returned unchanged.

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, maxFoundNANs)``
        where ``maxFoundNANs`` is the largest missing-cell count of any row.

    On any exception the traceback is printed and the process exits with
    status -4.
    """
    try:
        maxFoundNANs = 0
        deletes = []  # plain ints: np.delete rejects float index arrays

        for i, row in enumerate(matrix):
            lenMatches = sum(1 for cell in row if str(cell) in nanList)

            if lenMatches > maxFoundNANs:
                maxFoundNANs = lenMatches

            if lenMatches >= maxAllowedNANs:
                deletes.append(i)

        # Delete the rows with too many missing cells and their labels.
        idx = np.asarray(deletes, dtype=int)
        matrix = np.delete(matrix, idx, 0)
        filter_rows = np.delete(row_header_list, idx, 0)
        filter_cols = column_header_list

    except Exception:
        traceback.print_exc()
        sys.exit(-4)

    return matrix, filter_rows, filter_cols, len(deletes), maxFoundNANs
|
|
391
|
|
392 #========= remove Cols with too many NANs
|
|
393
|
|
def NAN_Filter_Column(matrix, nanList, maxAllowedNANs, row_header_list, column_header_list):
    """Remove every column holding ``maxAllowedNANs`` or more missing cells.

    A cell counts as missing when ``str(cell)`` is one of the strings in
    ``nanList``.

    Args:
        matrix: 2-D data, indexed as ``matrix[row][col]``; the column count is
            taken from the first row.
        nanList: Collection of strings that denote a missing value.
        maxAllowedNANs: A column with at least this many missing cells is
            removed.
        row_header_list: Row labels; returned unchanged.
        column_header_list: Column labels, one per matrix column.

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, maxFoundNANs)``
        where ``maxFoundNANs`` is the largest missing-cell count of any column.
    """
    maxFoundNANs = 0
    deletes = []  # plain ints: np.delete rejects float index arrays

    for i in range(len(matrix[0])):
        lenMatches = sum(1 for row in matrix if str(row[i]) in nanList)

        if lenMatches > maxFoundNANs:
            maxFoundNANs = lenMatches

        if lenMatches >= maxAllowedNANs:
            deletes.append(i)

    # Delete the columns with too many missing cells and their labels.
    idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, idx, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, idx, 0)
    return matrix, filter_rows, filter_cols, len(deletes), maxFoundNANs
|
|
419
|
|
420
|
|
421 #MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
|
|
def Row_Value_MAD(matrix, cutoff, row_header_list, column_header_list):
    """Remove every row whose median absolute deviation (MAD) is below ``cutoff``.

    The MAD of a row is ``median(|x_i - median(x)|)``; using medians limits
    the influence of a single outlier.

    Args:
        matrix: 2-D numeric data, indexed as ``matrix[row][col]``.
        cutoff: Rows with ``MAD < cutoff`` are removed.
        row_header_list: Row labels, one per matrix row.
        column_header_list: Column labels; returned unchanged.

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, maxVal)`` where
        ``maxVal`` is the largest row MAD (``-inf`` when there are no rows).
        The minimum and maximum MAD are also printed to stdout.
    """
    deletes = []  # plain ints: np.delete rejects float index arrays
    minVal = float("inf")
    maxVal = float("-inf")

    for i, row in enumerate(matrix):
        # asarray lets plain-list rows take part in the vector arithmetic.
        values = np.asarray(row)
        mad = np.median(np.abs(values - np.median(values)))

        if mad < cutoff:
            deletes.append(i)

        if mad < minVal:
            minVal = mad
        if mad > maxVal:
            maxVal = mad

    # Delete the low-MAD rows and their labels.
    idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, idx, 0)
    filter_rows = np.delete(row_header_list, idx, 0)
    filter_cols = column_header_list
    print( "INFO Row MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )

    return matrix, filter_rows, filter_cols, len(deletes), maxVal
|
|
447
|
|
448 #MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
|
|
def Col_Value_MAD(matrix, cutoff, row_header_list, column_header_list):
    """Remove every column whose median absolute deviation (MAD) is below ``cutoff``.

    The MAD of a column is ``median(|x_i - median(x)|)``; using medians limits
    the influence of a single outlier.

    Args:
        matrix: 2-D numeric data, indexed as ``matrix[row][col]``; the column
            count is taken from the first row.
        cutoff: Columns with ``MAD < cutoff`` are removed.
        row_header_list: Row labels; returned unchanged.
        column_header_list: Column labels, one per matrix column.

    Returns:
        Tuple ``(matrix, filter_rows, filter_cols, n_deleted, maxVal)`` where
        ``maxVal`` is the largest column MAD (``-inf`` when there are no
        columns).  The minimum and maximum MAD are also printed to stdout.
    """
    deletes = []  # plain ints: np.delete rejects float index arrays
    minVal = float("inf")
    maxVal = float("-inf")

    for i in range(len(matrix[0])):
        # Build the column as an array so the subtraction is explicit vector
        # arithmetic rather than relying on a reflected scalar operator.
        values = np.asarray([row[i] for row in matrix])
        mad = np.median(np.abs(values - np.median(values)))

        if mad < cutoff:
            deletes.append(i)

        if mad < minVal:
            minVal = mad
        if mad > maxVal:
            maxVal = mad

    # Delete the low-MAD columns and their labels.
    idx = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, idx, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, idx, 0)
    print( "INFO Column MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )

    return matrix, filter_rows, filter_cols, len(deletes), maxVal
|
|
477
|
|
478
|
|
479 # if covariance of the data in two columns exceeds a thresehold remove one row list the rows in a separate output
|
|
480 # def CoVariance_Percent_Filter_row_col(matrix,thresh,row_header_list,column_header_list):
|
|
481 # xv= array([8., 9.5, 7.8, 4.2, -7.7, -5.4, 3.2])
|
|
482 # yv= array([8.9, 2.0, 4.8, -4.2, 2.7, -3.4, -5.9])
|
|
483 #
|
|
484 # def cov(x,y):
|
|
485 # if (len(x) != len(y)
|
|
486 # [Stop]
|
|
487 # x.bar = mean(x)
|
|
488 # y.bar = mean(y)
|
|
489 # N = len(x)
|
|
490 # Cov = (sum((x-x.bar)*(y-y.bar))) / (N-1.0)
|
|
491 # return(Cov)
|
|
492
|
|
493 # #Create Null Set of Filtered Row(Populated Later)
|
|
494 # deletes = []
|
|
495 #
|
|
496 # temp_mean = np.nanmean(matrix[i])
|
|
497 # temp_stdev = np.nanstd(matrix[i])
|
|
498 #
|
|
499 # get stddev of each row the calc xi -xj sq
|
|
500 #
|
|
501 # for i in range(0,len(matrix)):
|
|
502 # temp_range = np.max(matrix[i][0::]) - np.min(matrix[i][0::])
|
|
503 # if temp_range <= float(thresh):
|
|
504 # deletes = np.append(deletes,[i],0)
|
|
505 #
|
|
506 # #Delete Rows sub-Threshold Rows
|
|
507 # matrix = np.delete(matrix,deletes,0)
|
|
508 # filter_rows = np.delete(row_header_list,deletes,0)
|
|
509 # filter_cols = column_header_list
|
|
510 # return(matrix,filter_rows,filter_cols)
|
|
511 #
|
|
512 # #np.savetxt('testtest.txt',matrix,delimiter='\t')
|
|
513 # return(matrix,filter_rows,filter_cols)
|
|
514 #
|
|
515
|
|
516 #Define Function Which Labels Rows/Columns on Output
|
|
517 #below replace
|
|
518 # def labeler(matrix,filter_rows,filter_cols,output_file_txt):
|
|
519 #
|
|
520 # #Write Data to Specified Text File Output
|
|
521 # with open(output_file_txt,'w') as f:
|
|
522 # f.write("")
|
|
523 # for k in range(0,len(filter_cols)):
|
|
524 # f.write('\t' + filter_cols[k])
|
|
525 # f.write('\n')
|
|
526 # for i in range(0,len(filter_rows)):
|
|
527 # f.write(filter_rows[i])
|
|
528 # for j in range(0,len(matrix[0])):
|
|
529 # f.write('\t' + format(matrix[i][j]))
|
|
530 # f.write('\n')
|
|
531
|
|
532
|
|
533 #Define Main Function
|
|
def _report_outcome(delCnt, none_stdout, none_stderr, done_stdout):
    """Print the filter summary; exit with status -1 when nothing was removed."""
    if delCnt < 1:
        print(none_stdout)
        sys.stderr.write(none_stderr)
        sys.exit(-1)
    print(done_stdout)


def main():
    """Read the input matrix, apply the requested filter and write the result.

    The filter is selected by ``args.choice`` and applied along ``args.axes``
    ("Row" or "Column").  The filtered matrix is always written with
    ``Labeler``; when the filter removed nothing, a notice goes to stdout and
    stderr and the process exits with status -1.

    Exit statuses set here: -4 for a threshold below 0.000001, -2 for an
    unknown filter choice, -1 for an unknown axis or when nothing was
    filtered, -3 for any unexpected exception (its traceback is printed).
    """
    try:
        args = get_args()
        # Strings treated as missing values by the NAN filters.
        nanList = ["NAN", "NA", "N/A", "-", "?", "nan", "na", "n/a"]

        matrix, column_header_list, row_header_list = reader(args.input_file_txt)
        threshold = float(args.thresh)
        if threshold < 0.000001:
            print('Invalid negative or near-zero threshold chosen = '+str(args.thresh)+" choose positive value")
            sys.exit(-4)

        known_choices = ("VariancePercent", "VarianceCount", "LowerLimit", "UpperLimit",
                         "MADcount", "MADpercent", "NANlimit", "NANpercent")
        if args.choice not in known_choices:
            print("Invalid Filter Choice = "+str(args.choice))
            sys.exit(-2)

        # Validate the axis once for every choice.  Previously LowerLimit,
        # UpperLimit and MAD silently did nothing for a bad axis, and the
        # variance branch reported the threshold instead of the axis.
        if args.axes not in ("Row", "Column"):
            print('Invalid Axes = '+str(args.axes))
            sys.exit(-1)
        by_row = args.axes == "Row"

        if args.choice in ("VariancePercent", "VarianceCount"):
            if args.choice == "VarianceCount":
                # Convert "keep N" into the percentile rank the filter expects.
                headers = row_header_list if by_row else column_header_list
                threshold = (1-threshold/len(headers))*100.0

            variance_filter = Variance_Percent_Filter_row if by_row else Variance_Percent_Filter_col
            matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = variance_filter(matrix, threshold, row_header_list, column_header_list)
            Labeler(matrix, filter_cols, filter_rows, args.output_file_txt)
            if by_row:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using variance percentile < '+str(args.thresh)+ ' by row. Matrix row minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal,
                    '\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows',
                    '\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
            else:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using variance percentile < '+str(args.thresh)+ ' by columns. Matrix columns minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal,
                    '\nNO Filtering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows',
                    '\nFiltering out columns using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' columns')

        elif args.choice == "LowerLimit":
            limit_filter = UpperLowerLimit_Filter_Row if by_row else UpperLowerLimit_Filter_Col
            matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = limit_filter('lower', matrix, threshold, row_header_list, column_header_list)
            Labeler(matrix, filter_cols, filter_rows, args.output_file_txt)
            if by_row:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using LowerLimit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nNO Filtering out rows using LowerLimit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows',
                    '\nFiltered out '+str(delCnt)+' rows with Lower Limit < '+str(args.thresh))
            else:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using Lower Limit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nNO Filtering out rows using Lower Limit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows',
                    '\nFiltered out '+str(delCnt)+' columns with Lower Limit < '+str(args.thresh))

        elif args.choice == "UpperLimit":
            limit_filter = UpperLowerLimit_Filter_Row if by_row else UpperLowerLimit_Filter_Col
            matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = limit_filter('upper', matrix, threshold, row_header_list, column_header_list)
            Labeler(matrix, filter_cols, filter_rows, args.output_file_txt)
            if by_row:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using Upper Limit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nNO Filtering out rows using Upper Limit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nFiltered out '+str(delCnt)+' rows with UpperLimit < '+str(args.thresh))
            else:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nFiltering out rows using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nFiltered out '+str(delCnt)+' columns with UpperLimit < '+str(args.thresh))

        elif args.choice in ("MADcount", "MADpercent"):
            if args.choice == "MADpercent":
                headers = row_header_list if by_row else column_header_list
                threshold = len(headers)*threshold/100.0

            mad_filter = Row_Value_MAD if by_row else Col_Value_MAD
            matrix, filter_rows, filter_cols, delCnt, maxVal = mad_filter(matrix, threshold, row_header_list, column_header_list)
            Labeler(matrix, filter_cols, filter_rows, args.output_file_txt)
            if by_row:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal,
                    '\nFiltering out rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal,
                    '\nFiltered out '+str(delCnt)+' rows using MAD maximum value > '+str(threshold))
            else:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal,
                    '\nFiltering out columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal,
                    '\nFiltered out '+str(delCnt)+' columns using MAD maximum value > '+str(threshold))

        else:  # "NANlimit" or "NANpercent"
            maxNANs = int(args.thresh)
            val = ' '
            if args.choice == "NANpercent":
                n, m = np.shape(matrix)
                # A row holds m cells and a column holds n cells, so the
                # percentage must be taken of the length of the vector being
                # tested.  NOTE(review): the row case previously used n.
                cells = m if by_row else n
                maxNANs = int(int(args.thresh)*cells/100)
                val = '%'

            nan_filter = NAN_Filter_Row if by_row else NAN_Filter_Column
            matrix, filter_rows, filter_cols, delCnt, maxFoundNANs = nan_filter(matrix, nanList, maxNANs, row_header_list, column_header_list)
            Labeler(matrix, filter_cols, filter_rows, args.output_file_txt)
            if by_row:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs ),
                    '\nNO Filtering out rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs ),
                    '\nFiltered out '+str(delCnt)+' rows using NAN limit = or > '+str(args.thresh)+val)
            else:
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs),
                    '\nNO Filtering out columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs),
                    '\nFiltered out '+str(delCnt)+' columns using NAN limit = or > '+str(args.thresh)+val )

    except Exception:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the deliberate exits above pass through untouched.
        traceback.print_exc()
        sys.exit(-3)
|
|
690
|
|
if __name__ == "__main__":
    # Script entry point: run the filter, announce completion on stdout and
    # terminate with a success status.
    main()
    print("\ndone")
    sys.exit(0)
|