Mercurial > repos > proteore > proteore_filter_keywords_values

diff filter_kw_val.py @ 2:52a7afd01c6d draft
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
author: proteore
date: Tue, 18 Dec 2018 09:25:11 -0500
parents: a55e8b137c6b
children: 2080e2a4f209
--- a/filter_kw_val.py	Fri Sep 21 06:03:25 2018 -0400
+++ b/filter_kw_val.py	Tue Dec 18 09:25:11 2018 -0500
@@ -55,7 +55,7 @@
 def filters(args):
     filename = args.input.split(",")[0]
     header = str_to_bool(args.input.split(",")[1])
-    csv_file = read_file(filename)
+    csv_file = blank_to_NA(read_file(filename))
     results_dict = {}
 
     if args.kw:
@@ -66,18 +66,24 @@
     if args.kw_file:
         key_files = args.kw_file
         for kf in key_files:
-            keywords = read_option(kf[0])
-            results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[1], kf[2])
+            header = str_to_bool(kf[1])
+            ncol = column_from_txt(kf[2]) 
+            keywords = read_keywords_file(kf[0],header,ncol)
+            results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4])
 
     if args.value:
         for v in args.value:
+            v[0] = v[0].replace(",",".")
             if is_number("float", v[0]):
+                csv_file = comma_number_to_float(csv_file,v[1],header)
                 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2])
             else:
                 raise ValueError("Please enter a number in filter by value")
 
     if args.values_range:
         for vr in args.values_range:
+            vr[:2] = [value.replace(",",".") for value in vr[:2]]
+            csv_file = comma_number_to_float(csv_file,vr[2],header)
             if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])):
                 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3])
 
@@ -88,20 +94,23 @@
         remaining_lines.append(csv_file[0])
         filtered_lines.append(csv_file[0])
 
-    for id_line,line in enumerate(csv_file) :
-        if id_line in results_dict :   #skip header and empty lines
-            if args.operator == 'OR' :
-                if any(results_dict[id_line]) :
-                    filtered_lines.append(line)
-                else : 
-                    remaining_lines.append(line)
+    if results_dict == {} :   #no filter used
+        remaining_lines.extend(csv_file[1:])
+    else :
+        for id_line,line in enumerate(csv_file) :
+            if id_line in results_dict :   #skip header and empty lines
+                if args.operator == 'OR' :
+                    if any(results_dict[id_line]) :
+                        filtered_lines.append(line)
+                    else : 
+                        remaining_lines.append(line)
 
-            elif args.operator == "AND" :
-                if all(results_dict[id_line]) :
-                    filtered_lines.append(line)
-                else : 
-                    remaining_lines.append(line)
-    
+                elif args.operator == "AND" :
+                    if all(results_dict[id_line]) :
+                        filtered_lines.append(line)
+                    else : 
+                        remaining_lines.append(line)
+
     #sort of results by column
     if args.sort_col :
         sort_col=args.sort_col.split(",")[0]
@@ -124,29 +133,81 @@
 def sort_by_column(tab,sort_col,reverse,header):
     
     if len(tab) > 1 : #if there's more than just a header or 1 row
-        if header is True :
+        if header :
             head=tab[0]
             tab=tab[1:]
 
-        if is_number("int",tab[0][sort_col]) :
-            tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
-        elif is_number("float",tab[0][sort_col]) :
+        #indexes of the lines whose cell in the column to sort is empty or 'NA'
+        unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')]
+        unsorted_tab=[ tab[i] for i in unsortable_lines]
+        tab= [line for i,line in enumerate(tab) if i not in unsortable_lines]
+
+        if only_number(tab,sort_col) and any_float(tab,sort_col)  : 
             tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse)
+        elif only_number(tab,sort_col):
+            tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)      
         else :
             tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse)
         
+        tab.extend(unsorted_tab)
         if header is True : tab = [head]+tab
 
     return tab
 
+
+#replace all blank cells ("", " " or "NaN") with NA
+def blank_to_NA(csv_file) :
+    
+    tmp=[]
+    for line in csv_file :
+        line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ]
+        tmp.append(line)
+    
+    return tmp
+
+#replace decimal commas with dots in a column (values are kept as strings)
+def comma_number_to_float(csv_file,ncol,header) :
+    ncol = int(ncol.replace("c","")) - 1
+    if header : 
+        tmp=[csv_file[0]]
+        csv_file=csv_file[1:]
+    else : 
+        tmp=[]
+
+    for line in csv_file :
+        line[ncol]=line[ncol].replace(",",".")
+        tmp.append(line)
+
+    return (tmp)
+
+#return True if there is at least one float in the column
+def any_float(tab,col) :
+    
+    for line in tab :
+        if is_number("float",line[col].replace(",",".")) :
+            return True
+
+    return False
+
+def only_number(tab,col) :
+
+    for line in tab :
+        if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) :
+            return False
+    return True
+
 #Read the keywords file to extract the list of keywords
-def read_option(filename):
-    with open(filename, "r") as f:
-        filter_list=f.read().splitlines()
-    filter_list=[key for key in filter_list if len(key.replace(' ',''))!=0]
-    filters=";".join(filter_list)
+def read_keywords_file(filename,header,ncol):
+    with open(filename, "r") as csv_file :
+        lines= csv.reader(csv_file, delimiter='\t')
+        lines = blank_to_NA(lines)
+        if (len(lines[0])) > 1 : keywords = [line[ncol] for line in lines]
+        else : 
+            keywords= ["".join(key) for key in lines]
+    if header : keywords = keywords[1:]
+    keywords = list(set(keywords))
 
-    return filters
+    return keywords
 
 # Read input file
 def read_file(filename):
@@ -164,16 +225,11 @@
 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
     match=str_to_bool(match)
     ncol=column_from_txt(ncol)
-
-    keywords = keywords.upper().split(";")                                            # Split list of filter keyword
-    [keywords.remove(blank) for blank in keywords if blank.isspace() or blank == ""]  # Remove blank keywords
-    keywords = [k.strip() for k in keywords]        # Remove space from 2 heads of keywords
+    if type(keywords) != list : keywords = keywords.upper().split()            # Split list of filter keyword
 
     for id_line,line in enumerate(csv_file):
         if header is True and id_line == 0 : continue
-        #line = line.replace("\n", "")
         keyword_inline = line[ncol].replace('"', "").split(";")
-        #line = line + "\n"
 
         #Perfect match or not
         if match is True :
@@ -192,16 +248,32 @@
 
     filter_value = float(filter_value)
     ncol=column_from_txt(ncol)
+    nb_string=0
 
     for id_line,line in enumerate(csv_file):
         if header is True and id_line == 0 : continue
-        value = line[ncol].replace('"', "").strip()
+        value = line[ncol].replace('"', "").replace(",",".").strip()
         if value.replace(".", "", 1).isdigit():
             to_filter=value_compare(value,filter_value,opt)
             
             #adding the result to the dictionary
             if id_line in results_dict : results_dict[id_line].append(to_filter)
             else : results_dict[id_line]=[to_filter]
+
+        #value is not numeric (ex : "" instead of a number): flag it as not filtered, so the line is kept by default
+        else :
+            nb_string+=1
+            if id_line in results_dict : results_dict[id_line].append(False)
+            else : results_dict[id_line]=[False]
+    
+    #number of lines in the csv file
+    if header : nb_lines = len(csv_file) -1
+    else : nb_lines = len(csv_file)
+    
+    #if there's no numeric value in the column
+    if nb_string == nb_lines :
+        print ('No numeric values found in the column '+str(ncol+1))
+        print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1))
             
     return results_dict
 
@@ -211,10 +283,11 @@
     bottom_value = float(bottom_value)
     top_value=float(top_value)
     ncol=column_from_txt(ncol)
+    nb_string=0
 
     for id_line, line in enumerate(csv_file):
         if header is True and id_line == 0 : continue
-        value = line[ncol].replace('"', "").strip()
+        value = line[ncol].replace('"', "").replace(",",".").strip()
         if value.replace(".", "", 1).isdigit():
             value=float(value)
             if inclusive is True:
@@ -225,6 +298,22 @@
             #adding the result to the dictionary
             if id_line in results_dict : results_dict[id_line].append(in_range)
             else : results_dict[id_line]=[in_range]
+        
+        #value is not numeric (ex : "" instead of a number): flag it as not filtered, so the line is kept by default
+        else :
+            nb_string+=1
+            if id_line in results_dict : results_dict[id_line].append(False)
+            else : results_dict[id_line]=[False]
+
+    #number of lines in the csv file
+    if header : nb_lines = len(csv_file) -1
+    else : nb_lines = len(csv_file)
+    
+    #if there's no numeric value in the column
+    if nb_string == nb_lines :
+        print ('No numeric values found in the column '+str(ncol+1))
+        if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
+        else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
 
     return results_dict
author	proteore
date	Tue, 18 Dec 2018 09:25:11 -0500
parents	a55e8b137c6b
children	2080e2a4f209