comparison filter_kw_val.py @ 8:98cb671a92eb draft default tip

"planemo upload commit 80e3e50ca52b5b232f91e6dd6850da606d9c4c5f-dirty"
author proteore
date Mon, 10 May 2021 12:27:04 +0000
parents b4641c0f8a82
children
comparison
equal deleted inserted replaced
7:5621406a4d2f 8:98cb671a92eb
1 import argparse, re, csv, sys 1 import argparse
2 import csv
3 import re
4 import sys
5
2 6
3 def options(): 7 def options():
4 """ 8 """
5 Parse options: 9 Parse options:
6 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] 10 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] # noqa 501
7 --kw Keyword to be filtered, the column number where this filter applies, 11 --kw Keyword to be filtered, the column number where this filter applies,
8 boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"]. 12 boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"].
9 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" 13 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true"
10 --kwfile A file that contains keywords to be filter, the column where this filter applies and 14 --kwfile A file that contains keywords to be filter, the column where this filter applies and
11 boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"] 15 boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"]
12 --value The value to be filtered, the column number where this filter applies and the 16 --value The value to be filtered, the column number where this filter applies and the
13 operation symbol ["value,ncol,=/>/>=/</<=/!="] 17 operation symbol ["value,ncol,=/>/>=/</<=/!="]
14 --values_range range of values to be keep, example : --values_range 5 20 c1 true 18 --values_range range of values to be keep, example : --values_range 5 20 c1 true
15 --operation 'keep' or 'discard' lines concerned by filter(s) 19 --operation 'keep' or 'discard' lines concerned by filter(s)
16 --operator The operator used to filter with several keywords/values : AND or OR 20 --operator The operator used to filter with several keywords/values : AND or OR
17 --o --output The output filename 21 --o --output The output filename
22 parser.add_argument("-i", "--input", help="Input file", required=True) 26 parser.add_argument("-i", "--input", help="Input file", required=True)
23 parser.add_argument("--kw", nargs="+", action="append", help="") 27 parser.add_argument("--kw", nargs="+", action="append", help="")
24 parser.add_argument("--kw_file", nargs="+", action="append", help="") 28 parser.add_argument("--kw_file", nargs="+", action="append", help="")
25 parser.add_argument("--value", nargs="+", action="append", help="") 29 parser.add_argument("--value", nargs="+", action="append", help="")
26 parser.add_argument("--values_range", nargs="+", action="append", help="") 30 parser.add_argument("--values_range", nargs="+", action="append", help="")
27 parser.add_argument("--operation", default="keep", type=str, choices=['keep','discard'],help='') 31 parser.add_argument("--operation", default="keep", type=str, choices=['keep', 'discard'], help='') # noqa 501
28 parser.add_argument("--operator", default="OR", type=str, choices=['AND','OR'],help='') 32 parser.add_argument("--operator", default="OR", type=str, choices=['AND', 'OR'], help='') # noqa 501
29 parser.add_argument("-o", "--output", default="output.txt") 33 parser.add_argument("-o", "--output", default="output.txt")
30 parser.add_argument("--discarded_lines", default="filtered_output.txt") 34 parser.add_argument("--discarded_lines", default="filtered_output.txt")
31 parser.add_argument("-s","--sort_col", help="") 35 parser.add_argument("-s", "--sort_col", help="")
32 36
33 args = parser.parse_args() 37 args = parser.parse_args()
34 38
35 filters(args) 39 filters(args)
40
36 41
37 def str_to_bool(v): 42 def str_to_bool(v):
38 if v.lower() in ('yes', 'true', 't', 'y', '1'): 43 if v.lower() in ('yes', 'true', 't', 'y', '1'):
39 return True 44 return True
40 elif v.lower() in ('no', 'false', 'f', 'n', '0'): 45 elif v.lower() in ('no', 'false', 'f', 'n', '0'):
41 return False 46 return False
42 else: 47 else:
43 raise argparse.ArgumentTypeError('Boolean value expected.') 48 raise argparse.ArgumentTypeError('Boolean value expected.')
44 49
45 def proper_ncol (ncol,file): 50
51 def proper_ncol(ncol, file):
46 if ncol not in range(len(file[0])): 52 if ncol not in range(len(file[0])):
47 print("Column "+str(ncol+1)+" not found in input file") 53 print("Column "+str(ncol+1)+" not found in input file")
48 #traceback.print_exc(file=sys.stdout) 54 # traceback.print_exc(file=sys.stdout)
49 sys.exit(1) 55 sys.exit(1)
50 56
51 #Check if a variable is a float or an integer 57 # Check if a variable is a float or an integer
58
59
52 def is_number(number_format, n): 60 def is_number(number_format, n):
53 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$") 61 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$")
54 int_format = re.compile(r"^[-]?[0-9][0-9]*$") 62 int_format = re.compile(r"^[-]?[0-9][0-9]*$")
55 scientific_number = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee](?:[-+]?[\d]+)?$") 63 scientific_number = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee](?:[-+]?[\d]+)?$")
56 test = "" 64 test = ""
57 if number_format == "int": 65 if number_format == "int":
58 test = re.match(int_format, n) 66 test = re.match(int_format, n)
59 elif number_format == "float": 67 elif number_format == "float":
60 test = re.match(float_format, n) 68 test = re.match(float_format, n)
61 if test is None : test = re.match(scientific_number,n) 69 if test is None:
70 test = re.match(scientific_number, n)
62 71
63 if test: 72 if test:
64 return True 73 return True
65 else : 74 else:
66 return False 75 return False
67 76
68 #Filter the document 77 # Filter the document
78
79
69 def filters(args): 80 def filters(args):
70 filename = args.input.split(",")[0] 81 filename = args.input.split(",")[0]
71 header = str_to_bool(args.input.split(",")[1]) 82 header = str_to_bool(args.input.split(",")[1])
72 csv_file = blank_to_NA(read_file(filename)) 83 csv_file = blank_to_NA(read_file(filename))
73 results_dict = {} 84 results_dict = {}
74 operator_dict = { "Equal" : "=" , "Higher" : ">" , "Equal-or-higher" : ">=" , "Lower" : "<" , "Equal-or-lower" : "<=" , "Different" : "!=" } 85 operator_dict = {"Equal": "=", "Higher": ">", "Equal-or-higher": ">=", "Lower": "<", "Equal-or-lower": "<=", "Different": "!="} # noqa 501
75 86
76 if args.kw: 87 if args.kw:
77 keywords = args.kw 88 keywords = args.kw
78 for k in keywords: 89 for k in keywords:
79 results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2]) 90 results_dict = filter_keyword(csv_file,
91 header,
92 results_dict,
93 k[0],
94 k[1],
95 k[2])
80 96
81 if args.kw_file: 97 if args.kw_file:
82 key_files = args.kw_file 98 key_files = args.kw_file
83 for kf in key_files: 99 for kf in key_files:
84 header = str_to_bool(kf[1]) 100 header = str_to_bool(kf[1])
85 ncol = column_from_txt(kf[2],csv_file) 101 ncol = column_from_txt(kf[2], csv_file)
86 keywords = read_keywords_file(kf[0],header,ncol) 102 keywords = read_keywords_file(kf[0], header, ncol)
87 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4]) 103 results_dict = filter_keyword(csv_file, header, results_dict,
104 keywords, kf[3], kf[4])
88 105
89 if args.value: 106 if args.value:
90 for v in args.value: 107 for v in args.value:
91 v[0] = v[0].replace(",",".") 108 v[0] = v[0].replace(",", ".")
92 v[2] = operator_dict[v[2]] 109 v[2] = operator_dict[v[2]]
93 if is_number("float", v[0]): 110 if is_number("float", v[0]):
94 csv_file = comma_number_to_float(csv_file,column_from_txt(v[1],csv_file),header) 111 csv_file = comma_number_to_float(csv_file,
95 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) 112 column_from_txt(
113 v[1], csv_file), header)
114 results_dict = filter_value(csv_file, header,
115 results_dict, v[0], v[1], v[2])
96 else: 116 else:
97 raise ValueError("Please enter a number in filter by value") 117 raise ValueError("Please enter a number in filter by value")
98 118
99 if args.values_range: 119 if args.values_range:
100 for vr in args.values_range: 120 for vr in args.values_range:
101 vr[:2] = [value.replace(",",".") for value in vr[:2]] 121 vr[:2] = [value.replace(",", ".") for value in vr[:2]]
102 csv_file = comma_number_to_float(csv_file,column_from_txt(vr[2],csv_file),header) 122 csv_file = comma_number_to_float(csv_file,
103 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): 123 column_from_txt(
104 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) 124 vr[2], csv_file), header)
105 125 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float", vr[1]) or is_number("int", vr[1])): # noqa 501
106 remaining_lines=[] 126 results_dict = filter_values_range(csv_file,
107 filtered_lines=[] 127 header, results_dict,
108 128 vr[0], vr[1], vr[2], vr[3])
109 if header is True : 129
130 remaining_lines = []
131 filtered_lines = []
132
133 if header is True:
110 remaining_lines.append(csv_file[0]) 134 remaining_lines.append(csv_file[0])
111 filtered_lines.append(csv_file[0]) 135 filtered_lines.append(csv_file[0])
112 136
113 if results_dict == {} : #no filter used 137 if results_dict == {}: # no filter used
114 remaining_lines.extend(csv_file[1:]) 138 remaining_lines.extend(csv_file[1:])
115 else : 139 else:
116 for id_line,line in enumerate(csv_file) : 140 for id_line, line in enumerate(csv_file):
117 if id_line in results_dict : #skip header and empty lines 141 if id_line in results_dict: # skip header and empty lines
118 if args.operator == 'OR' : 142 if args.operator == 'OR':
119 if any(results_dict[id_line]) : 143 if any(results_dict[id_line]):
120 filtered_lines.append(line) 144 filtered_lines.append(line)
121 else : 145 else:
122 remaining_lines.append(line) 146 remaining_lines.append(line)
123 147
124 elif args.operator == "AND" : 148 elif args.operator == "AND":
125 if all(results_dict[id_line]) : 149 if all(results_dict[id_line]):
126 filtered_lines.append(line) 150 filtered_lines.append(line)
127 else : 151 else:
128 remaining_lines.append(line) 152 remaining_lines.append(line)
129 153
130 #sort of results by column 154 # sort of results by column
131 if args.sort_col : 155 if args.sort_col:
132 sort_col=args.sort_col.split(",")[0] 156 sort_col = args.sort_col.split(",")[0]
133 sort_col=column_from_txt(sort_col,csv_file) 157 sort_col = column_from_txt(sort_col, csv_file)
134 reverse=str_to_bool(args.sort_col.split(",")[1]) 158 reverse = str_to_bool(args.sort_col.split(",")[1])
135 remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header) 159 remaining_lines = sort_by_column(remaining_lines, sort_col,
136 filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header) 160 reverse, header)
137 161 filtered_lines = sort_by_column(filtered_lines, sort_col,
138 #swap lists of lines (files) if 'keep' option selected 162 reverse, header)
139 if args.operation == "keep" : 163
164 # swap lists of lines (files) if 'keep' option selected
165 if args.operation == "keep":
140 swap = remaining_lines, filtered_lines 166 swap = remaining_lines, filtered_lines
141 remaining_lines = swap[1] 167 remaining_lines = swap[1]
142 filtered_lines = swap[0] 168 filtered_lines = swap[0]
143 169
144 # Write results to output 170 # Write results to output
145 with open(args.output,"w") as output : 171 with open(args.output, "w") as output:
146 writer = csv.writer(output,delimiter="\t") 172 writer = csv.writer(output, delimiter="\t")
147 writer.writerows(remaining_lines) 173 writer.writerows(remaining_lines)
148 174
149 # Write filtered lines to filtered_output 175 # Write filtered lines to filtered_output
150 with open(args.discarded_lines,"w") as filtered_output : 176 with open(args.discarded_lines, "w") as filtered_output:
151 writer = csv.writer(filtered_output,delimiter="\t") 177 writer = csv.writer(filtered_output, delimiter="\t")
152 writer.writerows(filtered_lines) 178 writer.writerows(filtered_lines)
153 179
154 #function to sort the csv_file by value in a specific column 180 # function to sort the csv_file by value in a specific column
155 def sort_by_column(tab,sort_col,reverse,header): 181
156 182
157 if len(tab) > 1 : #if there's more than just a header or 1 row 183 def sort_by_column(tab, sort_col, reverse, header):
158 if header : 184
159 head=tab[0] 185 if len(tab) > 1: # if there's more than just a header or 1 row
160 tab=tab[1:] 186 if header:
161 187 head = tab[0]
162 #list of empty cells in the column to sort 188 tab = tab[1:]
163 unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] 189
164 unsorted_tab=[ tab[i] for i in unsortable_lines] 190 # list of empty cells in the column to sort
165 tab= [line for i,line in enumerate(tab) if i not in unsortable_lines] 191 unsortable_lines = [i for i, line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] # noqa 501
166 192 unsorted_tab = [tab[i] for i in unsortable_lines]
167 if only_number(tab,sort_col) and any_float(tab,sort_col) : 193 tab = [line for i, line in enumerate(tab) if i not in unsortable_lines]
168 tab = comma_number_to_float(tab,sort_col,False) 194
169 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse) 195 if only_number(tab, sort_col) and any_float(tab, sort_col):
170 elif only_number(tab,sort_col): 196 tab = comma_number_to_float(tab, sort_col, False)
171 tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) 197 tab = sorted(tab, key=lambda row: float(row[sort_col]),
172 else : 198 reverse=reverse)
199 elif only_number(tab, sort_col):
200 tab = sorted(tab, key=lambda row: int(row[sort_col]),
201 reverse=reverse)
202 else:
173 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) 203 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse)
174 204
175 tab.extend(unsorted_tab) 205 tab.extend(unsorted_tab)
176 if header is True : tab = [head]+tab 206 if header is True:
207 tab = [head]+tab
177 208
178 return tab 209 return tab
179 210
180 211
181 #replace all blank cells to NA 212 # replace all blank cells to NA
182 def blank_to_NA(csv_file) : 213
183 214
184 tmp=[] 215 def blank_to_NA(csv_file):
185 for line in csv_file : 216
186 line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ] 217 tmp = []
218 for line in csv_file:
219 line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ] # noqa 501
187 tmp.append(line) 220 tmp.append(line)
188 221
189 return tmp 222 return tmp
190 223
191 #turn into float a column 224 # turn into float a column
192 def comma_number_to_float(csv_file,ncol,header) : 225
193 if header : 226
194 tmp=[csv_file[0]] 227 def comma_number_to_float(csv_file, ncol, header):
195 csv_file=csv_file[1:] 228 if header:
196 else : 229 tmp = [csv_file[0]]
197 tmp=[] 230 csv_file = csv_file[1:]
198 231 else:
199 for line in csv_file : 232 tmp = []
200 line[ncol]=line[ncol].replace(",",".") 233
234 for line in csv_file:
235 line[ncol] = line[ncol].replace(",", ".")
201 tmp.append(line) 236 tmp.append(line)
202 237
203 return (tmp) 238 return (tmp)
204 239
205 #return True is there is at least one float in the column 240 # return True is there is at least one float in the column
206 def any_float(tab,col) : 241
207 242
208 for line in tab : 243 def any_float(tab, col):
209 if is_number("float",line[col].replace(",",".")) : 244
245 for line in tab:
246 if is_number("float", line[col].replace(",", ".")):
210 return True 247 return True
211 248
212 return False 249 return False
213 250
214 def only_number(tab,col) : 251
215 for line in tab : 252 def only_number(tab, col):
216 if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) : 253 for line in tab:
254 if not (is_number("float", line[col].replace(",", ".")) or is_number("int", line[col].replace(",", "."))): # noqa 501
217 return False 255 return False
218 return True 256 return True
219 257
220 #Read the keywords file to extract the list of keywords 258 # Read the keywords file to extract the list of keywords
221 def read_keywords_file(filename,header,ncol): 259
222 with open(filename, "r") as csv_file : 260
223 lines= csv.reader(csv_file, delimiter='\t') 261 def read_keywords_file(filename, header, ncol):
262 with open(filename, "r") as csv_file:
263 lines = csv.reader(csv_file, delimiter='\t')
224 lines = blank_to_NA(lines) 264 lines = blank_to_NA(lines)
225 if (len(lines[0])) > 1 : keywords = [line[ncol] for line in lines] 265 if (len(lines[0])) > 1:
226 else : 266 keywords = [line[ncol] for line in lines]
227 keywords= ["".join(key) for key in lines] 267 else:
228 if header : keywords = keywords[1:] 268 keywords = ["".join(key) for key in lines]
269 if header:
270 keywords = keywords[1:]
229 keywords = list(set(keywords)) 271 keywords = list(set(keywords))
230 272
231 return keywords 273 return keywords
232 274
233 # Read input file 275 # Read input file
276
277
234 def read_file(filename): 278 def read_file(filename):
235 with open(filename,"r") as f : 279 with open(filename, "r") as f:
236 reader=csv.reader(f,delimiter="\t") 280 reader = csv.reader(f, delimiter="\t")
237 tab=list(reader) 281 tab = list(reader)
238 282
239 # Remove empty lines (contain only space or new line or "") 283 # Remove empty lines (contain only space or new line or "")
240 #[tab.remove(blank) for blank in tab if blank.isspace() or blank == ""] 284 # [tab.remove(blank) for blank in tab if blank.isspace() or blank == ""]
241 tab=[line for line in tab if len("".join(line).replace(" ","")) !=0 ] 285 tab = [line for line in tab if len("".join(line).replace(" ", "")) != 0] # noqa 501
242 286
243 return tab 287 return tab
244 288
245 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) 289 # seek for keywords in rows of csvfile, return a dictionary of boolean
290 # (true if keyword found, false otherwise)
291
292
246 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): 293 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
247 match=str_to_bool(match) 294 match = str_to_bool(match)
248 ncol=column_from_txt(ncol,csv_file) 295 ncol = column_from_txt(ncol, csv_file)
249 if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword 296 if type(keywords) != list:
250 297 keywords = keywords.upper().split() # Split list of filter keyword
251 for id_line,line in enumerate(csv_file): 298
252 if header is True and id_line == 0 : continue 299 for id_line, line in enumerate(csv_file):
300 if header is True and id_line == 0:
301 continue
253 keyword_inline = line[ncol].replace('"', "").split(";") 302 keyword_inline = line[ncol].replace('"', "").split(";")
254 303
255 #Perfect match or not 304 # Perfect match or not
256 if match is True : 305 if match is True:
257 found_in_line = any(pid.upper() in keywords for pid in keyword_inline) 306 found_in_line = any(pid.upper() in keywords for pid in keyword_inline) # noqa 501
258 else: 307 else:
259 found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords) 308 found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords) # noqa 501
260 309
261 #if the keyword is found in line 310 # if the keyword is found in line
262 if id_line in results_dict : results_dict[id_line].append(found_in_line) 311 if id_line in results_dict:
263 else : results_dict[id_line]=[found_in_line] 312 results_dict[id_line].append(found_in_line)
313 else:
314 results_dict[id_line] = [found_in_line]
264 315
265 return results_dict 316 return results_dict
266 317
267 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) 318 # filter ba determined value in rows of csvfile, return a dictionary
319 # of boolean (true if value filtered, false otherwise)
320
321
268 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): 322 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt):
269 323
270 filter_value = float(filter_value) 324 filter_value = float(filter_value)
271 ncol=column_from_txt(ncol,csv_file) 325 ncol = column_from_txt(ncol, csv_file)
272 nb_string=0 326 nb_string = 0
273 327
274 for id_line,line in enumerate(csv_file): 328 for id_line, line in enumerate(csv_file):
275 if header is True and id_line == 0 : continue 329 if header is True and id_line == 0:
276 value = line[ncol].replace('"', "").replace(",",".").strip() 330 continue
331 value = line[ncol].replace('"', "").replace(",", ".").strip()
277 if value.replace(".", "", 1).isdigit(): 332 if value.replace(".", "", 1).isdigit():
278 to_filter=value_compare(value,filter_value,opt) 333 to_filter = value_compare(value, filter_value, opt)
279 334
280 #adding the result to the dictionary 335 # adding the result to the dictionary
281 if id_line in results_dict : results_dict[id_line].append(to_filter) 336 if id_line in results_dict:
282 else : results_dict[id_line]=[to_filter] 337 results_dict[id_line].append(to_filter)
283 338 else:
284 #impossible to treat (ex : "" instead of a number), we keep the line by default 339 results_dict[id_line] = [to_filter]
285 else : 340
286 nb_string+=1 341 # impossible to treat (ex : "" instead of a number),
287 if id_line in results_dict : results_dict[id_line].append(False) 342 # we keep the line by default
288 else : results_dict[id_line]=[False] 343 else:
289 344 nb_string += 1
290 #number of lines in the csv file 345 if id_line in results_dict:
291 if header : nb_lines = len(csv_file) -1 346 results_dict[id_line].append(False)
292 else : nb_lines = len(csv_file) 347 else:
293 348 results_dict[id_line] = [False]
294 #if there's no numeric value in the column 349
295 if nb_string == nb_lines : 350 # number of lines in the csv file
296 print ('No numeric values found in the column '+str(ncol+1)) 351 if header:
297 print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1)) 352 nb_lines = len(csv_file) - 1
298 353 else:
354 nb_lines = len(csv_file)
355
356 # if there's no numeric value in the column
357 if nb_string == nb_lines:
358 print('No numeric values found in the column '+str(ncol+1))
359 print('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501
360
299 return results_dict 361 return results_dict
300 362
301 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) 363 # filter ba determined value in rows of csvfile, return a dictionary
302 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): 364 # of boolean (true if value filtered, false otherwise)
303 inclusive=str_to_bool(inclusive) 365
366
367 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): # noqa 501
368 inclusive = str_to_bool(inclusive)
304 bottom_value = float(bottom_value) 369 bottom_value = float(bottom_value)
305 top_value=float(top_value) 370 top_value = float(top_value)
306 ncol=column_from_txt(ncol,csv_file) 371 ncol = column_from_txt(ncol, csv_file)
307 nb_string=0 372 nb_string = 0
308 373
309 for id_line, line in enumerate(csv_file): 374 for id_line, line in enumerate(csv_file):
310 if header is True and id_line == 0 : continue 375 if header is True and id_line == 0:
311 value = line[ncol].replace('"', "").replace(",",".").strip() 376 continue
377 value = line[ncol].replace('"', "").replace(",", ".").strip()
312 if value.replace(".", "", 1).isdigit(): 378 if value.replace(".", "", 1).isdigit():
313 value=float(value) 379 value = float(value)
314 if inclusive is True: 380 if inclusive is True:
315 in_range = not (bottom_value <= value <= top_value) 381 in_range = not (bottom_value <= value <= top_value)
316 else : 382 else:
317 in_range = not (bottom_value < value < top_value) 383 in_range = not (bottom_value < value < top_value)
318 384
319 #adding the result to the dictionary 385 # adding the result to the dictionary
320 if id_line in results_dict : results_dict[id_line].append(in_range) 386 if id_line in results_dict:
321 else : results_dict[id_line]=[in_range] 387 results_dict[id_line].append(in_range)
322 388 else:
323 #impossible to treat (ex : "" instead of a number), we keep the line by default 389 results_dict[id_line] = [in_range]
324 else : 390
325 nb_string+=1 391 # impossible to treat (ex : "" instead of a number),
326 if id_line in results_dict : results_dict[id_line].append(False) 392 # we keep the line by default
327 else : results_dict[id_line]=[False] 393 else:
328 394 nb_string += 1
329 #number of lines in the csv file 395 if id_line in results_dict:
330 if header : nb_lines = len(csv_file) -1 396 results_dict[id_line].append(False)
331 else : nb_lines = len(csv_file) 397 else:
332 398 results_dict[id_line] = [False]
333 #if there's no numeric value in the column 399
334 if nb_string == nb_lines : 400 # number of lines in the csv file
335 print ('No numeric values found in the column '+str(ncol+1)) 401 if header:
336 if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) 402 nb_lines = len(csv_file) - 1
337 else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) 403 else:
338 404 nb_lines = len(csv_file)
339 return results_dict 405
340 406 # if there's no numeric value in the column
341 def column_from_txt(ncol,file): 407 if nb_string == nb_lines:
342 if is_number("int", ncol.replace("c", "")): 408 print('No numeric values found in the column '+str(ncol+1))
343 ncol = int(ncol.replace("c", "")) - 1 409 if inclusive:
410 print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501
411 else:
412 print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501
413
414 return results_dict
415
416
417 def column_from_txt(ncol, file):
418 if is_number("int", ncol.replace("c", "")):
419 ncol = int(ncol.replace("c", "")) - 1
344 else: 420 else:
345 raise ValueError("Please specify the column where " 421 raise ValueError("Please specify the column where "
346 "you would like to apply the filter " 422 "you would like to apply the filter "
347 "with valid format") 423 "with valid format")
348 424
349 proper_ncol (ncol,file) 425 proper_ncol(ncol, file)
350 426
351 return ncol 427 return ncol
352 428
353 #return True if value is in the determined values, false otherwise 429 # return True if value is in the determined values, false otherwise
354 def value_compare(value,filter_value,opt): 430
355 test_value=False 431
432 def value_compare(value, filter_value, opt):
433 test_value = False
356 434
357 if opt == "<": 435 if opt == "<":
358 if float(value) < filter_value: 436 if float(value) < filter_value:
359 test_value = True 437 test_value = True
360 elif opt == "<=": 438 elif opt == "<=":
367 if float(value) >= filter_value: 445 if float(value) >= filter_value:
368 test_value = True 446 test_value = True
369 elif opt == "=": 447 elif opt == "=":
370 if float(value) == filter_value: 448 if float(value) == filter_value:
371 test_value = True 449 test_value = True
372 elif opt == "!=": 450 elif opt == "!=":
373 if float(value) != filter_value: 451 if float(value) != filter_value:
374 test_value = True 452 test_value = True
375 453
376 return test_value 454 return test_value
455
377 456
378 if __name__ == "__main__": 457 if __name__ == "__main__":
379 options() 458 options()