comparison filter_kw_val.py @ 6:b4641c0f8a82 draft

planemo upload commit 77279e994f5751c6cd9aa165aa0604db3d241271-dirty
author proteore
date Mon, 11 Mar 2019 09:14:42 -0400
parents 33ca9ba2495a
children 98cb671a92eb
comparison
equal deleted inserted replaced
5:33ca9ba2495a 6:b4641c0f8a82
1 import argparse, re, csv 1 import argparse, re, csv, sys
2 2
3 def options(): 3 def options():
4 """ 4 """
5 Parse options: 5 Parse options:
6 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] 6 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"]
40 elif v.lower() in ('no', 'false', 'f', 'n', '0'): 40 elif v.lower() in ('no', 'false', 'f', 'n', '0'):
41 return False 41 return False
42 else: 42 else:
43 raise argparse.ArgumentTypeError('Boolean value expected.') 43 raise argparse.ArgumentTypeError('Boolean value expected.')
44 44
def proper_ncol(ncol, file):
    """Stop the script when column index `ncol` does not exist in `file`.

    ncol -- zero-based column index to validate.
    file -- the table being filtered, indexed as rows of cells; only the
            first row is inspected to determine the number of columns.

    Returns None when the column exists. Otherwise prints
    "Column <ncol+1> not found in input file" and exits with status 1.
    """
    # An empty table has no first row: report the column as missing
    # instead of crashing with an IndexError on file[0].
    n_columns = len(file[0]) if len(file) > 0 else 0

    if ncol not in range(n_columns):
        # The message uses the 1-based column number.
        print("Column "+str(ncol+1)+" not found in input file")
        sys.exit(1)
50
45 #Check if a variable is a float or an integer 51 #Check if a variable is a float or an integer
46 def is_number(number_format, n): 52 def is_number(number_format, n):
47 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$") 53 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$")
48 int_format = re.compile(r"^[-]?[0-9][0-9]*$") 54 int_format = re.compile(r"^[-]?[0-9][0-9]*$")
49 scientific_number = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee](?:[-+]?[\d]+)?$") 55 scientific_number = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee](?:[-+]?[\d]+)?$")
74 80
75 if args.kw_file: 81 if args.kw_file:
76 key_files = args.kw_file 82 key_files = args.kw_file
77 for kf in key_files: 83 for kf in key_files:
78 header = str_to_bool(kf[1]) 84 header = str_to_bool(kf[1])
79 ncol = column_from_txt(kf[2]) 85 ncol = column_from_txt(kf[2],csv_file)
80 keywords = read_keywords_file(kf[0],header,ncol) 86 keywords = read_keywords_file(kf[0],header,ncol)
81 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4]) 87 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4])
82 88
83 if args.value: 89 if args.value:
84 for v in args.value: 90 for v in args.value:
85 v[0] = v[0].replace(",",".") 91 v[0] = v[0].replace(",",".")
86 v[2] = operator_dict[v[2]] 92 v[2] = operator_dict[v[2]]
87 if is_number("float", v[0]): 93 if is_number("float", v[0]):
88 csv_file = comma_number_to_float(csv_file,column_from_txt(v[1]),header) 94 csv_file = comma_number_to_float(csv_file,column_from_txt(v[1],csv_file),header)
89 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) 95 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2])
90 else: 96 else:
91 raise ValueError("Please enter a number in filter by value") 97 raise ValueError("Please enter a number in filter by value")
92 98
93 if args.values_range: 99 if args.values_range:
94 for vr in args.values_range: 100 for vr in args.values_range:
95 vr[:2] = [value.replace(",",".") for value in vr[:2]] 101 vr[:2] = [value.replace(",",".") for value in vr[:2]]
96 csv_file = comma_number_to_float(csv_file,column_from_txt(vr[2]),header) 102 csv_file = comma_number_to_float(csv_file,column_from_txt(vr[2],csv_file),header)
97 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): 103 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])):
98 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) 104 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3])
99 105
100 remaining_lines=[] 106 remaining_lines=[]
101 filtered_lines=[] 107 filtered_lines=[]
122 remaining_lines.append(line) 128 remaining_lines.append(line)
123 129
124 #sort of results by column 130 #sort of results by column
125 if args.sort_col : 131 if args.sort_col :
126 sort_col=args.sort_col.split(",")[0] 132 sort_col=args.sort_col.split(",")[0]
127 sort_col=column_from_txt(sort_col) 133 sort_col=column_from_txt(sort_col,csv_file)
128 reverse=str_to_bool(args.sort_col.split(",")[1]) 134 reverse=str_to_bool(args.sort_col.split(",")[1])
129 remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header) 135 remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header)
130 filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header) 136 filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header)
131 137
132 #swap lists of lines (files) if 'keep' option selected 138 #swap lists of lines (files) if 'keep' option selected
237 return tab 243 return tab
238 244
239 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) 245 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise)
240 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): 246 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
241 match=str_to_bool(match) 247 match=str_to_bool(match)
242 ncol=column_from_txt(ncol) 248 ncol=column_from_txt(ncol,csv_file)
243 if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword 249 if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword
244 250
245 for id_line,line in enumerate(csv_file): 251 for id_line,line in enumerate(csv_file):
246 if header is True and id_line == 0 : continue 252 if header is True and id_line == 0 : continue
247 keyword_inline = line[ncol].replace('"', "").split(";") 253 keyword_inline = line[ncol].replace('"', "").split(";")
260 266
261 #filter by a determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) 267 #filter by a determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise)
262 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): 268 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt):
263 269
264 filter_value = float(filter_value) 270 filter_value = float(filter_value)
265 ncol=column_from_txt(ncol) 271 ncol=column_from_txt(ncol,csv_file)
266 nb_string=0 272 nb_string=0
267 273
268 for id_line,line in enumerate(csv_file): 274 for id_line,line in enumerate(csv_file):
269 if header is True and id_line == 0 : continue 275 if header is True and id_line == 0 : continue
270 value = line[ncol].replace('"', "").replace(",",".").strip() 276 value = line[ncol].replace('"', "").replace(",",".").strip()
295 #filter by a determined range of values in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) 301 #filter by a determined range of values in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise)
296 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): 302 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive):
297 inclusive=str_to_bool(inclusive) 303 inclusive=str_to_bool(inclusive)
298 bottom_value = float(bottom_value) 304 bottom_value = float(bottom_value)
299 top_value=float(top_value) 305 top_value=float(top_value)
300 ncol=column_from_txt(ncol) 306 ncol=column_from_txt(ncol,csv_file)
301 nb_string=0 307 nb_string=0
302 308
303 for id_line, line in enumerate(csv_file): 309 for id_line, line in enumerate(csv_file):
304 if header is True and id_line == 0 : continue 310 if header is True and id_line == 0 : continue
305 value = line[ncol].replace('"', "").replace(",",".").strip() 311 value = line[ncol].replace('"', "").replace(",",".").strip()
330 if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) 336 if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
331 else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) 337 else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
332 338
333 return results_dict 339 return results_dict
334 340
def column_from_txt(ncol, file):
    """Turn a column label such as "c3" into a zero-based column index.

    ncol -- column label as text; every "c" character is removed before
            the remainder is checked with is_number("int", ...).
    file -- the table the column must belong to; passed to proper_ncol,
            which ends the script if the column does not exist.

    Returns the zero-based index (label number minus one).
    Raises ValueError when the label is not an integer once "c" is removed.
    """
    digits = ncol.replace("c", "")

    # Guard clause: reject anything that is not an integer label.
    if not is_number("int", digits):
        raise ValueError("Please specify the column where "
                         "you would like to apply the filter "
                         "with valid format")

    # Labels are 1-based, row indexing is 0-based.
    index = int(digits) - 1
    proper_ncol(index, file)
    return index
343 352
344 #return True if value is in the determined values, false otherwise 353 #return True if value is in the determined values, false otherwise
345 def value_compare(value,filter_value,opt): 354 def value_compare(value,filter_value,opt):
346 test_value=False 355 test_value=False