Mercurial > repos > md-anderson-bioinformatics > matrix_manipulation
comparison Matrix_Filters.py @ 1:f1bcd79cd923 draft default tip
Uploaded
author | insilico-bob |
---|---|
date | Tue, 27 Nov 2018 14:20:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:7f12c81e2083 | 1:f1bcd79cd923 |
---|---|
1 ''' | |
2 Created on Jun 7, 2017 updated Feb2018 | |
3 | |
4 @author: rbrown and cjacoby | |
5 ''' | |
6 | |
7 import sys, traceback, argparse | |
8 import numpy as np | |
9 from Matrix_Validate_import import reader, Labeler | |
10 import math | |
11 #import matplotlib.pyplot as plt | |
12 | |
13 #Define argparse Function | |
def get_args():
    """Parse the five positional command-line arguments of the filter tool.

    Returns:
        argparse.Namespace with the string attributes ``input_file_txt``,
        ``choice``, ``thresh``, ``axes`` and ``output_file_txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file_txt', help='tab delimited text file input matrix (include .txt in name)')
    # The accepted method names are the ones main() dispatches on.
    parser.add_argument('choice', type=str, help='Filter method (VariancePercent, VarianceCount, LowerLimit, UpperLimit, MADcount, MADpercent, NANlimit or NANpercent)')
    parser.add_argument('thresh', help='Threshold for the chosen filter')
    parser.add_argument('axes', help='Axes to filter on (either Row or Column)')
    parser.add_argument('output_file_txt', help='tab delimited text file output name (include .txt in name)')
    return parser.parse_args()
23 | |
def Range_Filter_Row(matrix,thresh,row_header_list,column_header_list):
    """Remove every row whose value range (max - min) is <= ``thresh``.

    Args:
        matrix: 2-D numeric matrix indexed as ``matrix[row][col]``.
        thresh: range threshold; converted with ``float()``.
        row_header_list: one label per row.
        column_header_list: one label per column (returned unchanged).

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, minVal, maxVal)`` where
        ``minVal``/``maxVal`` are the smallest/largest row range seen. They keep
        their sentinel start values (+9999999 / -99999) if no row was examined.
    """
    limit = float(thresh)
    # Plain Python ints: np.delete rejects the float index array np.append builds.
    deletes = []
    minVal = +9999999
    maxVal = -99999

    for i in range(len(matrix)):
        temp_range = np.max(matrix[i]) - np.min(matrix[i])

        # Two independent tests: a value may be both the new min and the new max.
        if temp_range < minVal: minVal = temp_range
        if temp_range > maxVal: maxVal = temp_range

        if temp_range <= limit:
            deletes.append(i)

    drop = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, drop, 0)
    filter_rows = np.delete(row_header_list, drop, 0)
    filter_cols = column_header_list
    return matrix, filter_rows, filter_cols, len(deletes), minVal, maxVal
44 | |
def Range_Filter_Col(matrix,thresh,row_header_list,column_header_list):
    """Remove every column whose value range (max - min) is <= ``thresh``.

    Args:
        matrix: 2-D numeric matrix indexed as ``matrix[row][col]``.
        thresh: range threshold; converted with ``float()``.
        row_header_list: one label per row (returned unchanged).
        column_header_list: one label per column.

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, minVal, maxVal)`` where
        ``minVal``/``maxVal`` are the smallest/largest column range seen. They
        keep their sentinel start values (+9999999 / -99999) if no column was
        examined.
    """
    limit = float(thresh)
    # Plain Python ints: np.delete rejects the float index array np.append builds.
    deletes = []
    minVal = +9999999
    maxVal = -99999

    for i in range(len(matrix[0])):
        column = [row[i] for row in matrix]  # build the column once, use it twice
        temp_range = np.max(column) - np.min(column)

        # Two independent tests: a value may be both the new min and the new max.
        if temp_range < minVal: minVal = temp_range
        if temp_range > maxVal: maxVal = temp_range

        if temp_range <= limit:
            deletes.append(i)

    drop = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, drop, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, drop, 0)

    return matrix, filter_rows, filter_cols, len(deletes), minVal, maxVal
70 | |
71 #Define Function Which Deletes Sub-Threshold Rows | |
def Variance_Percent_Filter_row(matrix,cutoff,row_header_list,column_header_list, create_plot= False):
    """Remove the rows whose variance falls below a percentile rank.

    NaN cells are ignored when computing a row's variance; a row with fewer
    than two non-NaN values is treated as having variance 0.0. Rows are grouped
    by variance value and the groups are removed in ascending order for as long
    as the cumulative row count stays below ``int(cutoff/100 * n_rows + 1)``.

    Args:
        matrix: 2-D numeric matrix indexed as ``matrix[row][col]``.
        cutoff: percentile rank; truncated with ``int()``, must be 1..99.
        row_header_list: one label per row.
        column_header_list: one label per column (returned unchanged).
        create_plot: accepted for backward compatibility; currently ignored
            (the plotting code was disabled in the original).

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, minVal, maxVal)`` where
        ``minVal``/``maxVal`` are the lowest/highest row variance.

    Exits the process with status -8 when ``cutoff`` is outside 1..99.
    """
    #cutoff is the percentile rank of the variance values
    cutoff= int(cutoff)/100.0
    if cutoff > 0.99 or cutoff < .01:
        sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99")
        sys.exit(-8)

    # Plain Python ints: np.delete rejects the float index array np.append builds.
    deletes = []
    varianceDict = {}  # variance value -> list of row indices having it
    minVal = +9999999
    maxVal = -99999

    for i in range(len(matrix)):
        vector = [value for value in matrix[i] if not math.isnan(value)]
        temp_stdev = np.var(vector) if len(vector) > 1 else 0.0

        # Two independent tests: a value may be both the new min and the new max.
        if temp_stdev < minVal: minVal = temp_stdev
        if temp_stdev > maxVal: maxVal = temp_stdev

        varianceDict.setdefault(temp_stdev, []).append(i)

    #calc how many rows to remove
    lowerLimit = int(cutoff*len(matrix) +1)
    cnt = 0
    for variance in sorted(varianceDict):
        rows = varianceDict[variance]
        cnt += len(rows)
        if cnt >= lowerLimit:
            # cnt only grows, so no later group can fall below the limit.
            break
        deletes.extend(rows)

    print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff row = "+str(lowerLimit)+" of "+str(len(matrix))+" rows")

    drop = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, drop, 0)
    filter_rows = np.delete(row_header_list, drop, 0)
    filter_cols = column_header_list

    return matrix,filter_rows,filter_cols ,len(deletes), minVal,maxVal
185 | |
def Variance_Percent_Filter_col(matrix,cutoff,row_header_list,column_header_list, create_plot=False):
    """Remove the columns whose variance falls below a percentile rank.

    NaN cells are ignored when computing a column's variance; a column with
    fewer than two non-NaN values is treated as having variance 0.0. Columns
    are grouped by variance value and the groups are removed in ascending order
    for as long as the cumulative column count stays below
    ``int(cutoff/100 * n_cols + 1)``.

    Args:
        matrix: 2-D numeric matrix indexed as ``matrix[row][col]``.
        cutoff: percentile rank; truncated with ``int()``, must be 1..99.
        row_header_list: one label per row (returned unchanged).
        column_header_list: one label per column.
        create_plot: accepted for backward compatibility; currently ignored
            (the plotting code was disabled in the original).

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, minVal, maxVal)`` where
        ``minVal``/``maxVal`` are the lowest/highest column variance.

    Exits the process with status -8 when ``cutoff`` is outside 1..99.
    """
    #cutoff is the percentile rank of the variance values
    cutoff= int(cutoff)/100.0
    if cutoff > 0.99 or cutoff < .01:
        sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99")
        sys.exit(-8)

    # Plain Python ints: np.delete rejects the float index array np.append builds.
    deletes = []
    varianceDict = {}  # variance value -> list of column indices having it
    minVal = +9999999
    maxVal = -99999
    lenCol = len(matrix[0])

    for i in range(lenCol):
        vector = [row[i] for row in matrix if not math.isnan(row[i])]
        temp_stdev = np.var(vector) if len(vector) > 1 else 0.0

        # Two independent tests: a value may be both the new min and the new max.
        if temp_stdev < minVal: minVal = temp_stdev
        if temp_stdev > maxVal: maxVal = temp_stdev

        varianceDict.setdefault(temp_stdev, []).append(i)

    #calc how many columns to remove
    lowerLimit = int(cutoff*lenCol +1)
    cnt = 0
    for variance in sorted(varianceDict):
        cols = varianceDict[variance]
        cnt += len(cols)
        if cnt >= lowerLimit:
            # cnt only grows, so no later group can fall below the limit.
            break
        deletes.extend(cols)

    print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff column= "+str(lowerLimit)+" of "+str(lenCol)+" columns")

    drop = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, drop, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, drop, 0)

    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
295 | |
def UpperLowerLimit_Filter_Row(upperLower, matrix,cutoff,row_header_list,column_header_list):
    """Remove every row that contains a value beyond ``cutoff``.

    With ``upperLower == 'lower'`` a row is removed when any non-NaN cell is
    <= cutoff; with ``'upper'`` when any non-NaN cell is >= cutoff. NaN cells
    are skipped. Any other ``upperLower`` value removes nothing.

    Args:
        upperLower: ``'lower'`` or ``'upper'``.
        matrix: 2-D numeric matrix indexed as ``matrix[row][col]``.
        cutoff: limit value; converted with ``float()``.
        row_header_list: one label per row.
        column_header_list: one label per column (returned unchanged).

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, minVal, maxVal)``.
        ``minVal``/``maxVal`` cover only the cells that did NOT trigger a
        removal, and keep their sentinel start values (+9999999 / -99999) when
        there are none.
    """
    cutoff = float(cutoff)  # same coercion the range filters apply to their threshold
    # Plain Python ints: np.delete rejects the float index array np.append builds.
    deletes = []
    minVal = +9999999
    maxVal = -99999

    for i in range(len(matrix)):
        removeRow = False

        # No early exit: later cells of a doomed row still feed minVal/maxVal.
        for val in matrix[i]:
            if math.isnan(val):
                continue
            if upperLower == 'lower' and val <= cutoff:
                removeRow = True
            elif upperLower == 'upper' and val >= cutoff:
                removeRow = True
            else:
                if val < minVal: minVal = val
                if val > maxVal: maxVal = val

        if removeRow:
            deletes.append(i)

    drop = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, drop, 0)
    filter_rows = np.delete(row_header_list, drop, 0)
    filter_cols = column_header_list

    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
326 | |
def UpperLowerLimit_Filter_Col(upperLower,matrix,cutoff,row_header_list,column_header_list):
    """Remove every column that contains a value beyond ``cutoff``.

    With ``upperLower == 'lower'`` a column is removed when any non-NaN cell is
    <= cutoff; with ``'upper'`` when any non-NaN cell is >= cutoff. NaN cells
    are skipped. Any other ``upperLower`` value removes nothing.

    Args:
        upperLower: ``'lower'`` or ``'upper'``.
        matrix: 2-D numeric matrix indexed as ``matrix[row][col]``.
        cutoff: limit value; converted with ``float()``.
        row_header_list: one label per row (returned unchanged).
        column_header_list: one label per column.

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, minVal, maxVal)``.
        ``minVal``/``maxVal`` cover only the cells that did NOT trigger a
        removal, and keep their sentinel start values (+9999999 / -99999) when
        there are none.
    """
    cutoff = float(cutoff)  # same coercion the range filters apply to their threshold
    # Plain Python ints: np.delete rejects the float index array np.append builds.
    deletes = []
    minVal = +9999999
    maxVal = -99999

    for i in range(len(matrix[0])):
        removeCol = False

        # No early exit: later cells of a doomed column still feed minVal/maxVal.
        for j in range(len(matrix)):
            val = matrix[j][i]
            if math.isnan(val):
                continue
            if upperLower == 'lower' and val <= cutoff:
                removeCol = True
            elif upperLower == 'upper' and val >= cutoff:
                removeCol = True
            else:
                if val < minVal: minVal = val
                if val > maxVal: maxVal = val

        if removeCol:
            deletes.append(i)

    drop = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, drop, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, drop, 0)

    return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
358 | |
359 #========= remove rows with too many NANs in cells | |
def NAN_Filter_Row(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list):
    """Remove every row holding at least ``maxAllowedNANs`` missing-value cells.

    A cell counts as missing when ``str(cell)`` is one of the tokens in
    ``nanList``.

    Args:
        matrix: 2-D matrix indexed as ``matrix[row][col]``.
        nanList: strings that mark a missing value (e.g. ``"nan"``, ``"NA"``).
        maxAllowedNANs: a row with this many or more missing cells is removed.
        row_header_list: one label per row.
        column_header_list: one label per column (returned unchanged).

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, maxFoundNANs)`` where
        ``maxFoundNANs`` is the largest missing-cell count found in any row.

    On any exception the traceback is printed and the process exits with -4.
    """
    try:
        nan_tokens = set(nanList)  # hoisted: O(1) membership per cell
        maxFoundNANs = 0
        # Plain Python ints: np.delete rejects the float index array np.append builds.
        deletes = []

        for i in range(len(matrix)):
            lenMatches = sum(1 for s in matrix[i] if str(s) in nan_tokens)
            if lenMatches > maxFoundNANs: maxFoundNANs = lenMatches

            if lenMatches >= maxAllowedNANs:
                deletes.append(i)

        drop = np.asarray(deletes, dtype=int)
        matrix = np.delete(matrix, drop, 0)
        filter_rows = np.delete(row_header_list, drop, 0)
        filter_cols = column_header_list

    except Exception:
        traceback.print_exc()
        sys.exit(-4)

    return matrix, filter_rows, filter_cols,len(deletes),maxFoundNANs
391 | |
392 #========= remove Cols with too many NANs | |
393 | |
def NAN_Filter_Column(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list):
    """Remove every column holding at least ``maxAllowedNANs`` missing cells.

    A cell counts as missing when ``str(cell)`` is one of the tokens in
    ``nanList``.

    Args:
        matrix: 2-D matrix indexed as ``matrix[row][col]``.
        nanList: strings that mark a missing value (e.g. ``"nan"``, ``"NA"``).
        maxAllowedNANs: a column with this many or more missing cells is removed.
        row_header_list: one label per row (returned unchanged).
        column_header_list: one label per column.

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, maxFoundNANs)`` where
        ``maxFoundNANs`` is the largest missing-cell count found in any column.
    """
    nan_tokens = set(nanList)  # hoisted: O(1) membership per cell
    maxFoundNANs = 0
    # Plain Python ints: np.delete rejects the float index array np.append builds.
    deletes = []

    for i in range(len(matrix[0])):
        lenMatches = sum(1 for row in matrix if str(row[i]) in nan_tokens)
        if lenMatches > maxFoundNANs:
            maxFoundNANs = lenMatches

        if lenMatches >= maxAllowedNANs:
            deletes.append(i)

    #Delete cols with too many NANs
    drop = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, drop, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, drop, 0)
    return matrix, filter_rows, filter_cols,len(deletes),maxFoundNANs
419 | |
420 | |
421 #MAD Median Absolute Deviation median (|Xi - Xmedian|) > X | |
def Row_Value_MAD(matrix,cutoff,row_header_list,column_header_list):
    """Remove every row whose median absolute deviation (MAD) is < ``cutoff``.

    MAD of a row is ``median(|x_i - median(x)|)``; using medians limits the
    influence of a single outlier.

    NOTE(review): ``np.median`` is not NaN-aware, so a row containing NaN gets
    a NaN MAD and is never removed -- confirm this is intended.

    Args:
        matrix: 2-D numeric matrix indexed as ``matrix[row][col]``.
        cutoff: minimum MAD a row needs to be kept; converted with ``float()``.
        row_header_list: one label per row.
        column_header_list: one label per column (returned unchanged).

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, maxVal)`` where
        ``maxVal`` is the largest row MAD (sentinel -99999 if none was found).
    """
    cutoff = float(cutoff)
    # Plain Python ints: np.delete rejects the float index array np.append builds.
    deletes = []
    minVal = +9999999
    maxVal = -99999

    for i in range(len(matrix)):
        row = np.asarray(matrix[i])
        temp = np.median(np.abs(row - np.median(row)))

        if temp < cutoff:
            deletes.append(i)

        if temp < minVal: minVal = temp
        if temp > maxVal: maxVal = temp

    drop = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, drop, 0)
    filter_rows = np.delete(row_header_list, drop, 0)
    filter_cols = column_header_list
    print( "INFO Row MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )

    return matrix, filter_rows, filter_cols,len(deletes),maxVal
447 | |
448 #MAD Median Absolute Deviation median (|Xi - Xmedian|) > X | |
def Col_Value_MAD(matrix,cutoff,row_header_list,column_header_list):
    """Remove every column whose median absolute deviation (MAD) is < ``cutoff``.

    MAD of a column is ``median(|x_i - median(x)|)``; using medians limits the
    influence of a single outlier.

    NOTE(review): ``np.median`` is not NaN-aware, so a column containing NaN
    gets a NaN MAD and is never removed -- confirm this is intended.

    Args:
        matrix: 2-D numeric matrix indexed as ``matrix[row][col]``.
        cutoff: minimum MAD a column needs to be kept; converted with ``float()``.
        row_header_list: one label per row (returned unchanged).
        column_header_list: one label per column.

    Returns:
        ``(matrix, filter_rows, filter_cols, n_removed, maxVal)`` where
        ``maxVal`` is the largest column MAD (sentinel -99999 if none was found).
    """
    cutoff = float(cutoff)
    # Plain Python ints: np.delete rejects the float index array np.append builds.
    deletes = []
    minVal = +9999999
    maxVal = -99999

    for i in range(len(matrix[0])):
        matrixCol = np.asarray([row[i] for row in matrix])
        temp = np.median(np.abs(matrixCol - np.median(matrixCol)))

        if temp < cutoff:
            deletes.append(i)

        if temp < minVal: minVal = temp
        if temp > maxVal: maxVal = temp

    drop = np.asarray(deletes, dtype=int)
    matrix = np.delete(matrix, drop, 1)
    filter_rows = row_header_list
    filter_cols = np.delete(column_header_list, drop, 0)
    print( "INFO Column MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )

    return matrix, filter_rows, filter_cols,len(deletes),maxVal
477 | |
478 | |
479 # if covariance of the data in two columns exceeds a thresehold remove one row list the rows in a separate output | |
480 # def CoVariance_Percent_Filter_row_col(matrix,thresh,row_header_list,column_header_list): | |
481 # xv= array([8., 9.5, 7.8, 4.2, -7.7, -5.4, 3.2]) | |
482 # yv= array([8.9, 2.0, 4.8, -4.2, 2.7, -3.4, -5.9]) | |
483 # | |
484 # def cov(x,y): | |
485 # if (len(x) != len(y) | |
486 # [Stop] | |
487 # x.bar = mean(x) | |
488 # y.bar = mean(y) | |
489 # N = len(x) | |
490 # Cov = (sum((x-x.bar)*(y-y.bar))) / (N-1.0) | |
491 # return(Cov) | |
492 | |
493 # #Create Null Set of Filtered Row(Populated Later) | |
494 # deletes = [] | |
495 # | |
496 # temp_mean = np.nanmean(matrix[i]) | |
497 # temp_stdev = np.nanstd(matrix[i]) | |
498 # | |
499 # get stddev of each row the calc xi -xj sq | |
500 # | |
501 # for i in range(0,len(matrix)): | |
502 # temp_range = np.max(matrix[i][0::]) - np.min(matrix[i][0::]) | |
503 # if temp_range <= float(thresh): | |
504 # deletes = np.append(deletes,[i],0) | |
505 # | |
506 # #Delete Rows sub-Threshold Rows | |
507 # matrix = np.delete(matrix,deletes,0) | |
508 # filter_rows = np.delete(row_header_list,deletes,0) | |
509 # filter_cols = column_header_list | |
510 # return(matrix,filter_rows,filter_cols) | |
511 # | |
512 # #np.savetxt('testtest.txt',matrix,delimiter='\t') | |
513 # return(matrix,filter_rows,filter_cols) | |
514 # | |
515 | |
516 #Define Function Which Labels Rows/Columns on Output | |
517 #below replace | |
518 # def labeler(matrix,filter_rows,filter_cols,output_file_txt): | |
519 # | |
520 # #Write Data to Specified Text File Output | |
521 # with open(output_file_txt,'w') as f: | |
522 # f.write("") | |
523 # for k in range(0,len(filter_cols)): | |
524 # f.write('\t' + filter_cols[k]) | |
525 # f.write('\n') | |
526 # for i in range(0,len(filter_rows)): | |
527 # f.write(filter_rows[i]) | |
528 # for j in range(0,len(matrix[0])): | |
529 # f.write('\t' + format(matrix[i][j])) | |
530 # f.write('\n') | |
531 | |
532 | |
533 #Define Main Function | |
def _report_outcome(delCnt, none_removed_msg, none_removed_err, removed_msg):
    """Print a filter result; exit with status -1 when nothing was removed."""
    if delCnt < 1:
        print(none_removed_msg)
        sys.stderr.write(none_removed_err)
        sys.exit(-1)
    print(removed_msg)


def main():
    """Read the matrix, apply the requested filter and write the result.

    The filter is selected by ``args.choice`` and applied along ``args.axes``
    ("Row" or "Column"). The filtered matrix is always written through
    ``Labeler``; afterwards the process exits with -1 if nothing was removed.

    Exit codes: -1 invalid axes or nothing filtered, -2 invalid filter choice,
    -3 unexpected exception (traceback printed), -4 threshold below 0.000001.
    """
    try:
        args = get_args()
        nanList= ["NAN", "NA", "N/A", "-","?","nan", "na", "n/a"]

        # reader() is unpacked as (matrix, column headers, row headers) and
        # Labeler() takes the headers in that same order.
        matrix, column_header_list,row_header_list = reader(args.input_file_txt)
        threshold = float(args.thresh)
        if threshold < 0.000001:
            print('Invalid negative or near-zero threshold chosen = '+str(args.thresh)+" choose positive value")
            sys.exit(-4)

        choice = args.choice
        if choice not in ("VariancePercent", "VarianceCount", "LowerLimit", "UpperLimit",
                          "MADcount", "MADpercent", "NANlimit", "NANpercent"):
            print("Invalid Filter Choice = "+str(args.choice))
            sys.exit(-2)
        # Validated once for every filter, so an unknown axis can no longer
        # slip through a branch without writing any output.
        if args.axes not in ("Row", "Column"):
            print('Invalid Axes = '+str(args.axes))
            sys.exit(-1)

        by_row = args.axes == "Row"
        thresh_txt = str(args.thresh)

        #VariancePercent
        if choice in ("VariancePercent", "VarianceCount"): # > percent variance
            if by_row:
                if choice == "VarianceCount": threshold= (1-threshold/len(row_header_list))*100.0
                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_row(matrix,threshold,row_header_list,column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using variance percentile < '+thresh_txt+ ' by row. Matrix row minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal,
                    '\nFiltering out rows using variance percentile < '+thresh_txt+ ' removed '+str(delCnt)+' rows',
                    '\nFiltering out rows using variance percentile < '+thresh_txt+ ' removed '+str(delCnt)+' rows')
            else:
                if choice == "VarianceCount": threshold= (1-threshold/len(column_header_list))*100.0
                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_col(matrix,threshold,row_header_list,column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using variance percentile < '+thresh_txt+ ' by columns. Matrix columns minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal,
                    '\nNO Filtering out columns using variance percentile < '+thresh_txt+ ' removed '+str(delCnt)+' columns',
                    '\nFiltering out columns using variance percentile < '+thresh_txt+ ' removed '+str(delCnt)+' columns')
        #LowerLimit
        elif choice == "LowerLimit":
            if by_row:
                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('lower',matrix,threshold,row_header_list,column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using LowerLimit < '+thresh_txt+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nNO Filtering out rows using LowerLimit < '+thresh_txt+ ' removed '+str(delCnt)+' rows',
                    '\nFiltered out '+str(delCnt)+' rows with Lower Limit < '+thresh_txt)
            else:
                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('lower', matrix,threshold,row_header_list,column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using Lower Limit < '+thresh_txt+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nNO Filtering out columns using Lower Limit < '+thresh_txt+ ' removed '+str(delCnt)+' columns',
                    '\nFiltered out '+str(delCnt)+' columns with Lower Limit < '+thresh_txt)
        #UpperLimit
        elif choice == "UpperLimit":
            if by_row:
                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('upper',matrix,threshold,row_header_list,column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using Upper Limit < '+thresh_txt+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nNO Filtering out rows using Upper Limit < '+thresh_txt+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nFiltered out '+str(delCnt)+' rows with UpperLimit < '+thresh_txt)
            else:
                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('upper', matrix,threshold,row_header_list,column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using UpperLimit < '+thresh_txt+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nFiltering out columns using UpperLimit < '+thresh_txt+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal,
                    '\nFiltered out '+str(delCnt)+' columns with UpperLimit < '+thresh_txt)
        #MADlimit
        elif choice in ("MADcount", "MADpercent"): #!! is lowerlimit of median absolute deviation medians
            if by_row:
                if choice == "MADpercent": threshold= len(row_header_list)*threshold/100.0
                matrix, filter_rows, filter_cols,delCnt,maxVal = Row_Value_MAD(matrix,threshold,row_header_list,column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal,
                    '\nFiltering out rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal,
                    '\nFiltered out '+str(delCnt)+' rows using MAD maximum value > '+str(threshold))
            else:
                if choice == "MADpercent": threshold= len(column_header_list)*threshold/100.0
                matrix, filter_rows, filter_cols,delCnt,maxVal = Col_Value_MAD(matrix,threshold,row_header_list,column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal,
                    '\nFiltering out columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal,
                    '\nFiltered out '+str(delCnt)+' columns using MAD maximum value > '+str(threshold))
        #NANlimit
        else:
            # int(threshold) also accepts inputs such as "2.0", which
            # int(args.thresh) would reject.
            maxNANs= int(threshold)
            val= ' '
            if choice == "NANpercent":
                n,m = np.shape(matrix)
                # A row holds m cells and a column holds n cells, so the
                # percentage is taken of the length of the axis being scanned.
                cells = m if by_row else n
                # Never below 1: a limit of 0 would match every row/column
                # and delete the whole matrix.
                maxNANs= max(1, int(threshold*cells/100))
                val= '%'
            if by_row:
                matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Row(matrix,nanList,maxNANs,row_header_list,column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for rows using NAN limit = or > '+thresh_txt+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs ),
                    '\nNO Filtering out rows using NAN limit = or > '+thresh_txt+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs ),
                    '\nFiltered out '+str(delCnt)+' rows using NAN limit = or > '+thresh_txt+val)
            else:
                matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Column(matrix, nanList, maxNANs, row_header_list, column_header_list)
                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
                _report_outcome(
                    delCnt,
                    '\nNO Filtering occurred for columns using NAN limit = or > '+thresh_txt+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs),
                    '\nNO Filtering out columns using NAN limit = or > '+thresh_txt+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs),
                    '\nFiltered out '+str(delCnt)+' columns using NAN limit = or > '+thresh_txt+val )

    except Exception:
        # sys.exit() raises SystemExit, which is not an Exception subclass, so
        # the deliberate exits above pass through untouched.
        traceback.print_exc()
        sys.exit(-3)
690 | |
# Script entry point: run the filter, emit the completion marker, exit cleanly.
if __name__ == "__main__":
    main()
    sys.stdout.write("\ndone" + "\n")
    raise SystemExit(0)