# HG changeset patch # User proteore # Date 1620649624 0 # Node ID 98cb671a92eb665ce7cb44c807aaa74e70eff65b # Parent 5621406a4d2fcb01a408a30670ebef20dfca1911 "planemo upload commit 80e3e50ca52b5b232f91e6dd6850da606d9c4c5f-dirty" diff -r 5621406a4d2f -r 98cb671a92eb filter_kw_val.py --- a/filter_kw_val.py Fri Jun 28 05:09:20 2019 -0400 +++ b/filter_kw_val.py Mon May 10 12:27:04 2021 +0000 @@ -1,15 +1,19 @@ -import argparse, re, csv, sys +import argparse +import csv +import re +import sys + def options(): """ Parse options: - -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] - --kw Keyword to be filtered, the column number where this filter applies, + -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] # noqa 501 + --kw Keyword to be filtered, the column number where this filter applies, boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"]. This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" - --kwfile A file that contains keywords to be filter, the column where this filter applies and + --kwfile A file that contains keywords to be filter, the column where this filter applies and boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"] - --value The value to be filtered, the column number where this filter applies and the + --value The value to be filtered, the column number where this filter applies and the operation symbol ["value,ncol,=/>/>=/" , "Equal-or-higher" : ">=" , "Lower" : "<" , "Equal-or-lower" : "<=" , "Different" : "!=" } + operator_dict = {"Equal": "=", "Higher": ">", "Equal-or-higher": ">=", "Lower": "<", "Equal-or-lower": "<=", "Different": "!="} # noqa 501 if args.kw: keywords = args.kw for k in keywords: - results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2]) + results_dict = filter_keyword(csv_file, + header, + results_dict, + k[0], + k[1], + k[2]) if args.kw_file: key_files = args.kw_file for kf in key_files: header = str_to_bool(kf[1]) - ncol = column_from_txt(kf[2],csv_file) - keywords = read_keywords_file(kf[0],header,ncol) - results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4]) + ncol = column_from_txt(kf[2], csv_file) + keywords = read_keywords_file(kf[0], header, ncol) + results_dict = filter_keyword(csv_file, header, results_dict, + keywords, kf[3], kf[4]) if args.value: for v in args.value: - v[0] = v[0].replace(",",".") + v[0] = v[0].replace(",", ".") v[2] = operator_dict[v[2]] if is_number("float", v[0]): - csv_file = comma_number_to_float(csv_file,column_from_txt(v[1],csv_file),header) - results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) + csv_file = comma_number_to_float(csv_file, + column_from_txt( + v[1], csv_file), header) + results_dict = filter_value(csv_file, header, + results_dict, v[0], v[1], v[2]) else: raise ValueError("Please enter a number in filter by value") if args.values_range: for vr in args.values_range: - vr[:2] = [value.replace(",",".") for value in vr[:2]] - csv_file = comma_number_to_float(csv_file,column_from_txt(vr[2],csv_file),header) - if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): - results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) + vr[:2] = [value.replace(",", ".") for value in vr[:2]] + csv_file = comma_number_to_float(csv_file, + column_from_txt( + vr[2], csv_file), header) + if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float", vr[1]) or is_number("int", vr[1])): # noqa 501 + results_dict = filter_values_range(csv_file, + header, results_dict, + vr[0], vr[1], vr[2], vr[3]) - remaining_lines=[] - filtered_lines=[] + remaining_lines = [] + filtered_lines = [] - if header is True : + if header is True: remaining_lines.append(csv_file[0]) filtered_lines.append(csv_file[0]) - if results_dict == {} : #no filter used + if results_dict == {}: # no filter used remaining_lines.extend(csv_file[1:]) - else : - for id_line,line in enumerate(csv_file) : - if id_line in results_dict : #skip header and empty lines - if args.operator == 'OR' : - if any(results_dict[id_line]) : + else: + for id_line, line in enumerate(csv_file): + if id_line in results_dict: # skip header and empty lines + if args.operator == 'OR': + if any(results_dict[id_line]): filtered_lines.append(line) - else : + else: remaining_lines.append(line) - elif args.operator == "AND" : - if all(results_dict[id_line]) : + elif args.operator == "AND": + if all(results_dict[id_line]): filtered_lines.append(line) - else : + else: remaining_lines.append(line) - #sort of results by column - if args.sort_col : - sort_col=args.sort_col.split(",")[0] - sort_col=column_from_txt(sort_col,csv_file) - reverse=str_to_bool(args.sort_col.split(",")[1]) - remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header) - filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header) + # sort of results by column + if args.sort_col: + sort_col = args.sort_col.split(",")[0] + sort_col = column_from_txt(sort_col, csv_file) + reverse = str_to_bool(args.sort_col.split(",")[1]) + remaining_lines = sort_by_column(remaining_lines, sort_col, + reverse, header) + filtered_lines = sort_by_column(filtered_lines, sort_col, + reverse, header) - #swap lists of lines (files) if 'keep' option selected - if args.operation == "keep" : + # swap lists of lines (files) if 'keep' option selected + if args.operation == "keep": swap = remaining_lines, filtered_lines remaining_lines = swap[1] filtered_lines = swap[0] - + # Write results to output - with open(args.output,"w") as output : - writer = csv.writer(output,delimiter="\t") + with open(args.output, "w") as output: + writer = csv.writer(output, delimiter="\t") writer.writerows(remaining_lines) # Write filtered lines to filtered_output - with open(args.discarded_lines,"w") as filtered_output : - writer = csv.writer(filtered_output,delimiter="\t") + with open(args.discarded_lines, "w") as filtered_output: + writer = csv.writer(filtered_output, delimiter="\t") writer.writerows(filtered_lines) -#function to sort the csv_file by value in a specific column -def sort_by_column(tab,sort_col,reverse,header): - - if len(tab) > 1 : #if there's more than just a header or 1 row - if header : - head=tab[0] - tab=tab[1:] +# function to sort the csv_file by value in a specific column + + +def sort_by_column(tab, sort_col, reverse, header): + + if len(tab) > 1: # if there's more than just a header or 1 row + if header: + head = tab[0] + tab = tab[1:] - #list of empty cells in the column to sort - unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] - unsorted_tab=[ tab[i] for i in unsortable_lines] - tab= [line for i,line in enumerate(tab) if i not in unsortable_lines] + # list of empty cells in the column to sort + unsortable_lines = [i for i, line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] # noqa 501 + unsorted_tab = [tab[i] for i in unsortable_lines] + tab = [line for i, line in enumerate(tab) if i not in unsortable_lines] - if only_number(tab,sort_col) and any_float(tab,sort_col) : - tab = comma_number_to_float(tab,sort_col,False) - tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse) - elif only_number(tab,sort_col): - tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) - else : + if only_number(tab, sort_col) and any_float(tab, sort_col): + tab = comma_number_to_float(tab, sort_col, False) + tab = sorted(tab, key=lambda row: float(row[sort_col]), + reverse=reverse) + elif only_number(tab, sort_col): + tab = sorted(tab, key=lambda row: int(row[sort_col]), + reverse=reverse) + else: tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) - + tab.extend(unsorted_tab) - if header is True : tab = [head]+tab + if header is True: + tab = [head]+tab return tab -#replace all blank cells to NA -def blank_to_NA(csv_file) : - - tmp=[] - for line in csv_file : - line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ] +# replace all blank cells to NA + + +def blank_to_NA(csv_file): + + tmp = [] + for line in csv_file: + line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ] # noqa 501 tmp.append(line) - + return tmp -#turn into float a column -def comma_number_to_float(csv_file,ncol,header) : - if header : - tmp=[csv_file[0]] - csv_file=csv_file[1:] - else : - tmp=[] +# turn into float a column + - for line in csv_file : - line[ncol]=line[ncol].replace(",",".") +def comma_number_to_float(csv_file, ncol, header): + if header: + tmp = [csv_file[0]] + csv_file = csv_file[1:] + else: + tmp = [] + + for line in csv_file: + line[ncol] = line[ncol].replace(",", ".") tmp.append(line) return (tmp) -#return True is there is at least one float in the column -def any_float(tab,col) : - - for line in tab : - if is_number("float",line[col].replace(",",".")) : +# return True is there is at least one float in the column + + +def any_float(tab, col): + + for line in tab: + if is_number("float", line[col].replace(",", ".")): return True return False -def only_number(tab,col) : - for line in tab : - if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) : + +def only_number(tab, col): + for line in tab: + if not (is_number("float", line[col].replace(",", ".")) or is_number("int", line[col].replace(",", "."))): # noqa 501 return False return True -#Read the keywords file to extract the list of keywords -def read_keywords_file(filename,header,ncol): - with open(filename, "r") as csv_file : - lines= csv.reader(csv_file, delimiter='\t') +# Read the keywords file to extract the list of keywords + + +def read_keywords_file(filename, header, ncol): + with open(filename, "r") as csv_file: + lines = csv.reader(csv_file, delimiter='\t') lines = blank_to_NA(lines) - if (len(lines[0])) > 1 : keywords = [line[ncol] for line in lines] - else : - keywords= ["".join(key) for key in lines] - if header : keywords = keywords[1:] + if (len(lines[0])) > 1: + keywords = [line[ncol] for line in lines] + else: + keywords = ["".join(key) for key in lines] + if header: + keywords = keywords[1:] keywords = list(set(keywords)) return keywords # Read input file + + def read_file(filename): - with open(filename,"r") as f : - reader=csv.reader(f,delimiter="\t") - tab=list(reader) + with open(filename, "r") as f: + reader = csv.reader(f, delimiter="\t") + tab = list(reader) # Remove empty lines (contain only space or new line or "") - #[tab.remove(blank) for blank in tab if blank.isspace() or blank == ""] - tab=[line for line in tab if len("".join(line).replace(" ","")) !=0 ] - + # [tab.remove(blank) for blank in tab if blank.isspace() or blank == ""] + tab = [line for line in tab if len("".join(line).replace(" ", "")) != 0] # noqa 501 + return tab -#seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) +# seek for keywords in rows of csvfile, return a dictionary of boolean +# (true if keyword found, false otherwise) + + def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): - match=str_to_bool(match) - ncol=column_from_txt(ncol,csv_file) - if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword + match = str_to_bool(match) + ncol = column_from_txt(ncol, csv_file) + if type(keywords) != list: + keywords = keywords.upper().split() # Split list of filter keyword - for id_line,line in enumerate(csv_file): - if header is True and id_line == 0 : continue + for id_line, line in enumerate(csv_file): + if header is True and id_line == 0: + continue keyword_inline = line[ncol].replace('"', "").split(";") - #Perfect match or not - if match is True : - found_in_line = any(pid.upper() in keywords for pid in keyword_inline) - else: - found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords) + # Perfect match or not + if match is True: + found_in_line = any(pid.upper() in keywords for pid in keyword_inline) # noqa 501 + else: + found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords) # noqa 501 - #if the keyword is found in line - if id_line in results_dict : results_dict[id_line].append(found_in_line) - else : results_dict[id_line]=[found_in_line] + # if the keyword is found in line + if id_line in results_dict: + results_dict[id_line].append(found_in_line) + else: + results_dict[id_line] = [found_in_line] return results_dict -#filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) +# filter ba determined value in rows of csvfile, return a dictionary +# of boolean (true if value filtered, false otherwise) + + def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): filter_value = float(filter_value) - ncol=column_from_txt(ncol,csv_file) - nb_string=0 + ncol = column_from_txt(ncol, csv_file) + nb_string = 0 - for id_line,line in enumerate(csv_file): - if header is True and id_line == 0 : continue - value = line[ncol].replace('"', "").replace(",",".").strip() + for id_line, line in enumerate(csv_file): + if header is True and id_line == 0: + continue + value = line[ncol].replace('"', "").replace(",", ".").strip() if value.replace(".", "", 1).isdigit(): - to_filter=value_compare(value,filter_value,opt) - - #adding the result to the dictionary - if id_line in results_dict : results_dict[id_line].append(to_filter) - else : results_dict[id_line]=[to_filter] + to_filter = value_compare(value, filter_value, opt) + + # adding the result to the dictionary + if id_line in results_dict: + results_dict[id_line].append(to_filter) + else: + results_dict[id_line] = [to_filter] - #impossible to treat (ex : "" instead of a number), we keep the line by default - else : - nb_string+=1 - if id_line in results_dict : results_dict[id_line].append(False) - else : results_dict[id_line]=[False] - - #number of lines in the csv file - if header : nb_lines = len(csv_file) -1 - else : nb_lines = len(csv_file) - - #if there's no numeric value in the column - if nb_string == nb_lines : - print ('No numeric values found in the column '+str(ncol+1)) - print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1)) - + # impossible to treat (ex : "" instead of a number), + # we keep the line by default + else: + nb_string += 1 + if id_line in results_dict: + results_dict[id_line].append(False) + else: + results_dict[id_line] = [False] + + # number of lines in the csv file + if header: + nb_lines = len(csv_file) - 1 + else: + nb_lines = len(csv_file) + + # if there's no numeric value in the column + if nb_string == nb_lines: + print('No numeric values found in the column '+str(ncol+1)) + print('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501 + return results_dict -#filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) -def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): - inclusive=str_to_bool(inclusive) +# filter ba determined value in rows of csvfile, return a dictionary +# of boolean (true if value filtered, false otherwise) + + +def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): # noqa 501 + inclusive = str_to_bool(inclusive) bottom_value = float(bottom_value) - top_value=float(top_value) - ncol=column_from_txt(ncol,csv_file) - nb_string=0 + top_value = float(top_value) + ncol = column_from_txt(ncol, csv_file) + nb_string = 0 for id_line, line in enumerate(csv_file): - if header is True and id_line == 0 : continue - value = line[ncol].replace('"', "").replace(",",".").strip() + if header is True and id_line == 0: + continue + value = line[ncol].replace('"', "").replace(",", ".").strip() if value.replace(".", "", 1).isdigit(): - value=float(value) + value = float(value) if inclusive is True: in_range = not (bottom_value <= value <= top_value) - else : + else: in_range = not (bottom_value < value < top_value) - #adding the result to the dictionary - if id_line in results_dict : results_dict[id_line].append(in_range) - else : results_dict[id_line]=[in_range] - - #impossible to treat (ex : "" instead of a number), we keep the line by default - else : - nb_string+=1 - if id_line in results_dict : results_dict[id_line].append(False) - else : results_dict[id_line]=[False] + # adding the result to the dictionary + if id_line in results_dict: + results_dict[id_line].append(in_range) + else: + results_dict[id_line] = [in_range] + + # impossible to treat (ex : "" instead of a number), + # we keep the line by default + else: + nb_string += 1 + if id_line in results_dict: + results_dict[id_line].append(False) + else: + results_dict[id_line] = [False] - #number of lines in the csv file - if header : nb_lines = len(csv_file) -1 - else : nb_lines = len(csv_file) - - #if there's no numeric value in the column - if nb_string == nb_lines : - print ('No numeric values found in the column '+str(ncol+1)) - if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) - else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) + # number of lines in the csv file + if header: + nb_lines = len(csv_file) - 1 + else: + nb_lines = len(csv_file) - return results_dict + # if there's no numeric value in the column + if nb_string == nb_lines: + print('No numeric values found in the column '+str(ncol+1)) + if inclusive: + print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501 + else: + print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501 -def column_from_txt(ncol,file): - if is_number("int", ncol.replace("c", "")): - ncol = int(ncol.replace("c", "")) - 1 + return results_dict + + +def column_from_txt(ncol, file): + if is_number("int", ncol.replace("c", "")): + ncol = int(ncol.replace("c", "")) - 1 else: raise ValueError("Please specify the column where " "you would like to apply the filter " "with valid format") - proper_ncol (ncol,file) - + proper_ncol(ncol, file) + return ncol -#return True if value is in the determined values, false otherwise -def value_compare(value,filter_value,opt): - test_value=False +# return True if value is in the determined values, false otherwise + + +def value_compare(value, filter_value, opt): + test_value = False if opt == "<": if float(value) < filter_value: @@ -369,11 +447,12 @@ elif opt == "=": if float(value) == filter_value: test_value = True - elif opt == "!=": + elif opt == "!=": if float(value) != filter_value: test_value = True return test_value + if __name__ == "__main__": options() diff -r 5621406a4d2f -r 98cb671a92eb filter_kw_val.xml --- a/filter_kw_val.xml Fri Jun 28 05:09:20 2019 -0400 +++ b/filter_kw_val.xml Mon May 10 12:27:04 2021 +0000 @@ -1,4 +1,4 @@ - +