Mercurial > repos > proteore > proteore_filter_keywords_values
comparison filter_kw_val.py @ 6:b4641c0f8a82 draft
planemo upload commit 77279e994f5751c6cd9aa165aa0604db3d241271-dirty
author | proteore |
---|---|
date | Mon, 11 Mar 2019 09:14:42 -0400 |
parents | 33ca9ba2495a |
children | 98cb671a92eb |
comparison
equal
deleted
inserted
replaced
5:33ca9ba2495a | 6:b4641c0f8a82 |
---|---|
1 import argparse, re, csv | 1 import argparse, re, csv, sys |
2 | 2 |
3 def options(): | 3 def options(): |
4 """ | 4 """ |
5 Parse options: | 5 Parse options: |
6 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] | 6 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] |
40 elif v.lower() in ('no', 'false', 'f', 'n', '0'): | 40 elif v.lower() in ('no', 'false', 'f', 'n', '0'): |
41 return False | 41 return False |
42 else: | 42 else: |
43 raise argparse.ArgumentTypeError('Boolean value expected.') | 43 raise argparse.ArgumentTypeError('Boolean value expected.') |
44 | 44 |
45 def proper_ncol (ncol,file): | |
46 if ncol not in range(len(file[0])): | |
47 print("Column "+str(ncol+1)+" not found in input file") | |
48 #traceback.print_exc(file=sys.stdout) | |
49 sys.exit(1) | |
50 | |
45 #Check if a variable is a float or an integer | 51 #Check if a variable is a float or an integer |
46 def is_number(number_format, n): | 52 def is_number(number_format, n): |
47 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$") | 53 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$") |
48 int_format = re.compile(r"^[-]?[0-9][0-9]*$") | 54 int_format = re.compile(r"^[-]?[0-9][0-9]*$") |
49 scientific_number = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee](?:[-+]?[\d]+)?$") | 55 scientific_number = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee](?:[-+]?[\d]+)?$") |
74 | 80 |
75 if args.kw_file: | 81 if args.kw_file: |
76 key_files = args.kw_file | 82 key_files = args.kw_file |
77 for kf in key_files: | 83 for kf in key_files: |
78 header = str_to_bool(kf[1]) | 84 header = str_to_bool(kf[1]) |
79 ncol = column_from_txt(kf[2]) | 85 ncol = column_from_txt(kf[2],csv_file) |
80 keywords = read_keywords_file(kf[0],header,ncol) | 86 keywords = read_keywords_file(kf[0],header,ncol) |
81 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4]) | 87 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4]) |
82 | 88 |
83 if args.value: | 89 if args.value: |
84 for v in args.value: | 90 for v in args.value: |
85 v[0] = v[0].replace(",",".") | 91 v[0] = v[0].replace(",",".") |
86 v[2] = operator_dict[v[2]] | 92 v[2] = operator_dict[v[2]] |
87 if is_number("float", v[0]): | 93 if is_number("float", v[0]): |
88 csv_file = comma_number_to_float(csv_file,column_from_txt(v[1]),header) | 94 csv_file = comma_number_to_float(csv_file,column_from_txt(v[1],csv_file),header) |
89 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) | 95 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) |
90 else: | 96 else: |
91 raise ValueError("Please enter a number in filter by value") | 97 raise ValueError("Please enter a number in filter by value") |
92 | 98 |
93 if args.values_range: | 99 if args.values_range: |
94 for vr in args.values_range: | 100 for vr in args.values_range: |
95 vr[:2] = [value.replace(",",".") for value in vr[:2]] | 101 vr[:2] = [value.replace(",",".") for value in vr[:2]] |
96 csv_file = comma_number_to_float(csv_file,column_from_txt(vr[2]),header) | 102 csv_file = comma_number_to_float(csv_file,column_from_txt(vr[2],csv_file),header) |
97 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): | 103 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): |
98 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) | 104 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) |
99 | 105 |
100 remaining_lines=[] | 106 remaining_lines=[] |
101 filtered_lines=[] | 107 filtered_lines=[] |
122 remaining_lines.append(line) | 128 remaining_lines.append(line) |
123 | 129 |
124 #sort of results by column | 130 #sort of results by column |
125 if args.sort_col : | 131 if args.sort_col : |
126 sort_col=args.sort_col.split(",")[0] | 132 sort_col=args.sort_col.split(",")[0] |
127 sort_col=column_from_txt(sort_col) | 133 sort_col=column_from_txt(sort_col,csv_file) |
128 reverse=str_to_bool(args.sort_col.split(",")[1]) | 134 reverse=str_to_bool(args.sort_col.split(",")[1]) |
129 remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header) | 135 remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header) |
130 filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header) | 136 filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header) |
131 | 137 |
132 #swap lists of lines (files) if 'keep' option selected | 138 #swap lists of lines (files) if 'keep' option selected |
237 return tab | 243 return tab |
238 | 244 |
239 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) | 245 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) |
240 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): | 246 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): |
241 match=str_to_bool(match) | 247 match=str_to_bool(match) |
242 ncol=column_from_txt(ncol) | 248 ncol=column_from_txt(ncol,csv_file) |
243 if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword | 249 if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword |
244 | 250 |
245 for id_line,line in enumerate(csv_file): | 251 for id_line,line in enumerate(csv_file): |
246 if header is True and id_line == 0 : continue | 252 if header is True and id_line == 0 : continue |
247 keyword_inline = line[ncol].replace('"', "").split(";") | 253 keyword_inline = line[ncol].replace('"', "").split(";") |
260 | 266 |
261 #filter by determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) | 267 #filter by determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) |
262 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): | 268 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): |
263 | 269 |
264 filter_value = float(filter_value) | 270 filter_value = float(filter_value) |
265 ncol=column_from_txt(ncol) | 271 ncol=column_from_txt(ncol,csv_file) |
266 nb_string=0 | 272 nb_string=0 |
267 | 273 |
268 for id_line,line in enumerate(csv_file): | 274 for id_line,line in enumerate(csv_file): |
269 if header is True and id_line == 0 : continue | 275 if header is True and id_line == 0 : continue |
270 value = line[ncol].replace('"', "").replace(",",".").strip() | 276 value = line[ncol].replace('"', "").replace(",",".").strip() |
295 #filter by determined range of values in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) | 301 #filter by determined range of values in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) |
296 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): | 302 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): |
297 inclusive=str_to_bool(inclusive) | 303 inclusive=str_to_bool(inclusive) |
298 bottom_value = float(bottom_value) | 304 bottom_value = float(bottom_value) |
299 top_value=float(top_value) | 305 top_value=float(top_value) |
300 ncol=column_from_txt(ncol) | 306 ncol=column_from_txt(ncol,csv_file) |
301 nb_string=0 | 307 nb_string=0 |
302 | 308 |
303 for id_line, line in enumerate(csv_file): | 309 for id_line, line in enumerate(csv_file): |
304 if header is True and id_line == 0 : continue | 310 if header is True and id_line == 0 : continue |
305 value = line[ncol].replace('"', "").replace(",",".").strip() | 311 value = line[ncol].replace('"', "").replace(",",".").strip() |
330 if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) | 336 if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) |
331 else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) | 337 else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) |
332 | 338 |
333 return results_dict | 339 return results_dict |
334 | 340 |
335 def column_from_txt(ncol): | 341 def column_from_txt(ncol,file): |
336 if is_number("int", ncol.replace("c", "")): | 342 if is_number("int", ncol.replace("c", "")): |
337 ncol = int(ncol.replace("c", "")) - 1 | 343 ncol = int(ncol.replace("c", "")) - 1 |
338 else: | 344 else: |
339 raise ValueError("Please specify the column where " | 345 raise ValueError("Please specify the column where " |
340 "you would like to apply the filter " | 346 "you would like to apply the filter " |
341 "with valid format") | 347 "with valid format") |
348 | |
349 proper_ncol (ncol,file) | |
350 | |
342 return ncol | 351 return ncol |
343 | 352 |
344 #return True if value is in the determined values, false otherwise | 353 #return True if value is in the determined values, false otherwise |
345 def value_compare(value,filter_value,opt): | 354 def value_compare(value,filter_value,opt): |
346 test_value=False | 355 test_value=False |