Mercurial > repos > proteore > proteore_filter_keywords_values
annotate filter_kw_val.py @ 2:52a7afd01c6d draft
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
author | proteore |
---|---|
date | Tue, 18 Dec 2018 09:25:11 -0500 |
parents | a55e8b137c6b |
children | 2080e2a4f209 |
rev | line source |
---|---|
0
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
1 import argparse, re, csv |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
2 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def options():
    """
    Parse the command line and run the filters on the input file.

    Options (multi-valued options take space-separated values):
        -i, --input       Input filename and a boolean telling whether the file has a
                          header, joined by a comma: "filename,true/false"
        --kw              Keyword filter, three values: keyword, column, and a boolean
                          telling whether the keyword must match exactly.
                          Can be repeated: --kw kw1 c1 true --kw kw2 c1 false
        --kw_file         Keyword-file filter, five values: keywords filename, boolean
                          telling whether that file has a header, column of the keywords
                          in that file, then the column and exact-match boolean that are
                          handed to the keyword filter (same role as for --kw).
        --value           Value filter, three values: value, column and the operation
                          symbol (= > >= < <= !=). Can be repeated.
        --values_range    Range filter, four values, example: --values_range 5 20 c1 true
        --operator        How several filters are combined: AND or OR (default OR)
        -o, --output      The output filename
        --filtered_file   The file that receives the removed lines
        -s, --sort_col    Column used to sort the outputs followed by ",true" for reverse
                          sorting or ",false" otherwise, example: c1,false
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True,
                        help='Input file and header flag, e.g. "filename,true"')
    parser.add_argument("--kw", nargs="+", action="append",
                        help="Keyword filter: KEYWORD COLUMN EXACT_MATCH(true/false); repeatable")
    parser.add_argument("--kw_file", nargs="+", action="append",
                        help="Keyword-file filter: FILE FILE_HAS_HEADER(true/false) FILE_COLUMN "
                             "COLUMN EXACT_MATCH(true/false); repeatable")
    parser.add_argument("--value", nargs="+", action="append",
                        help="Value filter: VALUE COLUMN OPERATOR; repeatable")
    parser.add_argument("--values_range", nargs="+", action="append",
                        help="Range filter, e.g. --values_range 5 20 c1 true; repeatable")
    parser.add_argument("--operator", default="OR", type=str, choices=['AND', 'OR'],
                        help="How several filters are combined (default: OR)")
    parser.add_argument("-o", "--output", default="output.txt",
                        help="Output file (default: output.txt)")
    parser.add_argument("--filtered_file", default="filtered_output.txt",
                        help="File receiving the removed lines (default: filtered_output.txt)")
    parser.add_argument("-s", "--sort_col",
                        help='Sort column and reverse flag, e.g. "c1,false"')

    args = parser.parse_args()
    filters(args)
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
33 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def str_to_bool(v):
    """
    Convert a textual flag to a boolean (case-insensitive).

    'yes', 'true', 't', 'y', '1' give True; 'no', 'false', 'f', 'n', '0' give False.
    Any other text raises argparse.ArgumentTypeError.
    """
    lowered = v.lower()
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')

    if lowered in truthy:
        return True
    if lowered in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
41 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
42 #Check if a variable is a float or an integer |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def is_number(number_format, n):
    """
    Tell whether the string n is written in the requested number format.

    number_format -- "int" or "float"; any other value gives False.
    n             -- the string to test.

    "int" accepts an optional minus sign followed by digits only.
    "float" accepts an optional minus sign, digits, an optional decimal point and
    digits. Because the decimal point is optional, integers of two or more digits
    also match "float" while single-digit integers do not; callers that accept
    any number therefore test both formats.

    Returns True or False.
    """
    patterns = {
        "int": r"^-?[0-9]+$",
        # The decimal point is escaped: an unescaped "." would accept any
        # character between the digits (e.g. "1a5" or "1-5").
        "float": r"^-?[0-9]+\.?[0-9]+$",
    }
    pattern = patterns.get(number_format)
    if pattern is None:
        return False
    return re.match(pattern, n) is not None
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
53 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
54 #Filter the document |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def filters(args):
    """
    Apply every requested filter to the input table and write the two output files.

    args -- the namespace produced by the argument parser; the attributes read here
            are input, kw, kw_file, value, values_range, operator, sort_col,
            output and filtered_file.

    Each filter helper receives and returns results_dict, which is then read as a
    mapping from line index to a collection of per-filter results. A line whose
    results satisfy the operator (any result for OR, all results for AND) is written
    to args.filtered_file; the other lines present in results_dict are written to
    args.output. When no filter produced a result, every data line goes to
    args.output. Both files are tab-separated and start with the header row when
    the input has one.

    Raises ValueError when a --value filter is not given a number.
    """
    input_fields = args.input.split(",")
    filename = input_fields[0]
    header = str_to_bool(input_fields[1])
    csv_file = blank_to_NA(read_file(filename))
    results_dict = {}

    if args.kw:
        for k in args.kw:
            results_dict = filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2])

    if args.kw_file:
        for kf in args.kw_file:
            # The keyword file has its own header flag; keep it separate so the
            # input table's header flag is not overwritten.
            kw_header = str_to_bool(kf[1])
            ncol = column_from_txt(kf[2])
            keywords = read_keywords_file(kf[0], kw_header, ncol)
            results_dict = filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4])

    if args.value:
        for v in args.value:
            v[0] = v[0].replace(",", ".")
            # Both formats are tested: the "float" format alone rejects
            # single-digit integers such as "5".
            if is_number("float", v[0]) or is_number("int", v[0]):
                csv_file = comma_number_to_float(csv_file, v[1], header)
                results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2])
            else:
                raise ValueError("Please enter a number in filter by value")

    if args.values_range:
        for vr in args.values_range:
            vr[:2] = [value.replace(",", ".") for value in vr[:2]]
            csv_file = comma_number_to_float(csv_file, vr[2], header)
            bottom_ok = is_number("float", vr[0]) or is_number("int", vr[0])
            top_ok = is_number("float", vr[1]) or is_number("int", vr[1])
            # NOTE(review): a range with a non-numeric bound is silently ignored,
            # unlike --value which raises; kept as in the original.
            if bottom_ok and top_ok:
                results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3])

    remaining_lines = []
    filtered_lines = []

    if header and csv_file:
        remaining_lines.append(csv_file[0])
        filtered_lines.append(csv_file[0])

    if not results_dict:  # no filter used
        # Only skip the first row when it really is a header.
        remaining_lines.extend(csv_file[1:] if header else csv_file)
    else:
        combine = all if args.operator == "AND" else any
        for id_line, line in enumerate(csv_file):
            if id_line not in results_dict:  # skip header and empty lines
                continue
            if combine(results_dict[id_line]):
                filtered_lines.append(line)
            else:
                remaining_lines.append(line)

    # sort of results by column
    if args.sort_col:
        sort_fields = args.sort_col.split(",")
        sort_col = column_from_txt(sort_fields[0])
        reverse = str_to_bool(sort_fields[1])
        remaining_lines = sort_by_column(remaining_lines, sort_col, reverse, header)
        filtered_lines = sort_by_column(filtered_lines, sort_col, reverse, header)

    # Write results to output
    with open(args.output, "w") as output:
        writer = csv.writer(output, delimiter="\t")
        writer.writerows(remaining_lines)

    # Write filtered lines to filtered_output
    with open(args.filtered_file, "w") as filtered_output:
        writer = csv.writer(filtered_output, delimiter="\t")
        writer.writerows(filtered_lines)
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
131 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
132 #function to sort the csv_file by value in a specific column |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def sort_by_column(tab, sort_col, reverse, header):
    """
    Sort the rows of a table by the value found in one column.

    tab      -- list of rows (lists of strings); the first row is the header when
                header is true.
    sort_col -- zero-based index of the column used as sort key.
    reverse  -- True for descending order.
    header   -- whether tab starts with a header row, which stays first.

    Rows whose sort cell is '' or 'NA' are not sorted: they are appended after the
    sorted rows in their original order. A column holding only numbers is sorted
    numerically, any other column is sorted as text. A table of at most one row
    is returned unchanged.
    """
    if len(tab) <= 1:  # nothing to sort: empty, header only or a single row
        return tab

    head = None
    if header:
        head = tab[0]
        tab = tab[1:]

    # Split the rows in one pass instead of testing list membership per row.
    sortable = []
    unsortable = []
    for line in tab:
        if line[sort_col] == '' or line[sort_col] == 'NA':
            unsortable.append(line)
        else:
            sortable.append(line)

    def float_key(row):
        # only_number/any_float accept a decimal comma, so convert it here too;
        # float("1,5") would raise ValueError.
        return float(row[sort_col].replace(",", "."))

    def int_key(row):
        return int(row[sort_col])

    def text_key(row):
        return row[sort_col]

    if only_number(sortable, sort_col):
        sort_key = float_key if any_float(sortable, sort_col) else int_key
    else:
        sort_key = text_key

    ordered = sorted(sortable, key=sort_key, reverse=reverse)
    ordered.extend(unsortable)

    # Same truthiness test as when the header was removed, so it is never lost.
    if header:
        ordered = [head] + ordered

    return ordered
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
156 |
2
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
157 |
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
158 #replace all blank cells to NA |
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
def blank_to_NA(csv_file):
    """
    Return a copy of the table in which every blank cell is replaced by "NA".

    csv_file -- iterable of rows, each row being an iterable of string cells.

    A cell is blank when it is empty, made only of whitespace (one or several
    spaces, tabs, ...) or equal to "NaN". The input rows are not modified.
    """
    return [
        ["NA" if cell.strip() == "" or cell == "NaN" else cell for cell in line]
        for line in csv_file
    ]
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
167 |
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
168 #turn into float a column |
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
def comma_number_to_float(csv_file, ncol, header):
    """
    Replace the decimal comma by a decimal point in one column of the table.

    csv_file -- list of rows (lists of strings).
    ncol     -- column written as "c<number>" with a 1-based number, e.g. "c2".
    header   -- whether the first row is a header, which is then left untouched.

    The cells are updated in place in the row lists and a new list holding the
    same rows is returned. Rows that do not have the requested column (for
    example empty rows) are kept unchanged, and an empty table gives an empty list.
    """
    ncol = int(ncol.replace("c", "")) - 1
    rows = list(csv_file)
    converted = []

    if header and rows:
        converted.append(rows[0])
        rows = rows[1:]

    for line in rows:
        # Guard against rows too short to hold the column.
        if -len(line) <= ncol < len(line):
            line[ncol] = line[ncol].replace(",", ".")
        converted.append(line)

    return converted
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
182 |
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
183 #return True if there is at least one float in the column |
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
def any_float(tab, col):
    """
    Tell whether at least one row of tab holds a float-formatted value in column col.

    A decimal comma is read as a decimal point before the cell is tested.
    """
    return any(is_number("float", row[col].replace(",", ".")) for row in tab)
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
191 |
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
def only_number(tab, col):
    """
    Tell whether every row of tab holds a number (float or int format) in column col.

    A decimal comma is read as a decimal point before the cell is tested.
    An empty table gives True.
    """
    return all(
        is_number("float", row[col].replace(",", "."))
        or is_number("int", row[col].replace(",", "."))
        for row in tab
    )
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
198 |
0
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
199 #Read the keywords file to extract the list of keywords |
2
52a7afd01c6d
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents:
0
diff
changeset
|
def read_keywords_file(filename, header, ncol):
    """
    Read the list of keywords stored in a tab-separated file.

    filename -- path of the keywords file.
    header   -- whether the first line of the file is a header to skip.
    ncol     -- zero-based index of the column holding the keywords; only used
                when the first line of the file has more than one column.

    Blank cells are turned into "NA" by blank_to_NA before the keywords are
    extracted. Returns the distinct keywords in their order of first appearance;
    an empty file gives an empty list. Empty rows and rows too short to hold
    the column are ignored.
    """
    with open(filename, "r") as csv_file:
        lines = blank_to_NA(csv.reader(csv_file, delimiter='\t'))

    if not lines:
        return []

    # The layout is decided from the first line, header included.
    multi_column = len(lines[0]) > 1
    rows = lines[1:] if header else lines

    if multi_column:
        keywords = [line[ncol] for line in rows if len(line) > ncol]
    else:
        # An empty row would otherwise produce an empty-string keyword.
        keywords = ["".join(key) for key in rows if key]

    # Remove duplicates while keeping a deterministic order.
    seen = set()
    unique_keywords = []
    for keyword in keywords:
        if keyword not in seen:
            seen.add(keyword)
            unique_keywords.append(keyword)

    return unique_keywords
0
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
211 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
212 # Read input file |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def read_file(filename):
    """
    Load a tab-separated file as a list of rows (each row a list of cells).

    Rows that are empty once all their cells are joined and spaces removed
    are discarded.
    """
    kept_rows = []
    with open(filename, "r") as handle:
        for row in csv.reader(handle, delimiter="\t"):
            # Skip rows made only of empty cells and/or spaces
            if "".join(row).replace(" ", "") != "":
                kept_rows.append(row)
    return kept_rows
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
223 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
224 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
    """
    Flag every row of csv_file whose cell in column ncol contains a keyword.

    csv_file:     sequence of rows (lists of cell strings).
    header:       when it is exactly True, the first row is skipped.
    results_dict: dict mapping row index -> list of booleans; one boolean is
                  appended per row by this call (the dict is updated in place).
    keywords:     either a list of keywords or a whitespace-separated string.
    ncol:         column designation, converted by column_from_txt.
    match:        converted by str_to_bool (defined elsewhere in this file);
                  exact match when the result is True, substring match otherwise.

    The comparison is case-insensitive: both the keywords and the cell
    entries (the cell is split on ";" after removing double quotes) are
    upper-cased.

    Returns results_dict.
    """
    match = str_to_bool(match)
    ncol = column_from_txt(ncol)

    if isinstance(keywords, list):
        # Cell entries are upper-cased below, so keywords given as a list
        # must be upper-cased too or mixed-case keywords can never match.
        keywords = [keyword.upper() for keyword in keywords]
    else:
        # Split list of filter keyword
        keywords = keywords.upper().split()

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue

        cell_entries = [entry.upper() for entry in line[ncol].replace('"', "").split(";")]

        # Perfect match or not
        if match is True:
            found_in_line = any(entry in keywords for entry in cell_entries)
        else:
            found_in_line = any(keyword in entry
                                for entry in cell_entries
                                for keyword in keywords)

        # Record whether a keyword was found in this line
        results_dict.setdefault(id_line, []).append(found_in_line)

    return results_dict
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
245 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
246 #filter by a determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def filter_value(csv_file, header, results_dict, filter_value, ncol, opt):
    """
    Compare the numeric cells of column ncol against filter_value.

    csv_file:     list of rows (lists of cell strings); len() is taken on it.
    header:       when it is exactly True, the first row is skipped.
    results_dict: dict mapping row index -> list of booleans; one boolean is
                  appended per row by this call (the dict is updated in place).
    filter_value: reference value, converted with float().
    ncol:         column designation, converted by column_from_txt.
    opt:          comparison operator, passed unchanged to value_compare.

    Double quotes are stripped from each cell and "," is read as a decimal
    separator. Cells that are not finite numbers get False (the line is kept
    by default). When no cell of the column is numeric, a message is printed.

    Returns results_dict.
    """
    filter_value = float(filter_value)
    ncol = column_from_txt(ncol)
    infinity = float("inf")
    nb_string = 0

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue

        value = line[ncol].replace('"', "").replace(",", ".").strip()

        # float() also recognises signed values ("-1.5") and scientific
        # notation ("1e-5"), which a digits-only check rejects.
        try:
            number = float(value)
        except ValueError:
            number = None

        # NaN (the only value different from itself) and +/-inf are handled
        # like text: impossible to treat, we keep the line by default.
        if number is None or number != number or abs(number) == infinity:
            nb_string += 1
            to_filter = False
        else:
            to_filter = value_compare(value, filter_value, opt)

        # adding the result to the dictionary
        results_dict.setdefault(id_line, []).append(to_filter)

    # number of lines in the csv file
    if header:
        nb_lines = len(csv_file) - 1
    else:
        nb_lines = len(csv_file)

    # if there's no numeric value in the column
    if nb_string == nb_lines:
        print('No numeric values found in the column ' + str(ncol + 1))
        print('The filter "' + str(opt) + ' ' + str(filter_value) + '" can not be applied on the column ' + str(ncol + 1))

    return results_dict
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
279 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
280 #filter by a range of values in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive):
    """
    Flag the rows whose numeric cell in column ncol lies outside a range.

    csv_file:     list of rows (lists of cell strings); len() is taken on it.
    header:       when it is exactly True, the first row is skipped.
    results_dict: dict mapping row index -> list of booleans; one boolean is
                  appended per row by this call (the dict is updated in place).
    bottom_value: lower bound, converted with float().
    top_value:    upper bound, converted with float().
    ncol:         column designation, converted by column_from_txt.
    inclusive:    converted by str_to_bool (defined elsewhere in this file);
                  the bounds belong to the range when the result is True.

    The appended boolean is True when the value is NOT within the range.
    Double quotes are stripped from each cell and "," is read as a decimal
    separator. Cells that are not finite numbers get False (the line is kept
    by default). When no cell of the column is numeric, a message is printed.

    Returns results_dict.
    """
    inclusive = str_to_bool(inclusive)
    bottom_value = float(bottom_value)
    top_value = float(top_value)
    ncol = column_from_txt(ncol)
    infinity = float("inf")
    nb_string = 0

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue

        value = line[ncol].replace('"', "").replace(",", ".").strip()

        # float() also recognises signed values ("-1.5") and scientific
        # notation ("1e-5"), which a digits-only check rejects.
        try:
            number = float(value)
        except ValueError:
            number = None

        # NaN (the only value different from itself) and +/-inf are handled
        # like text: impossible to treat, we keep the line by default.
        if number is None or number != number or abs(number) == infinity:
            nb_string += 1
            out_of_range = False
        elif inclusive is True:
            out_of_range = not (bottom_value <= number <= top_value)
        else:
            out_of_range = not (bottom_value < number < top_value)

        # adding the result to the dictionary
        results_dict.setdefault(id_line, []).append(out_of_range)

    # number of lines in the csv file
    if header:
        nb_lines = len(csv_file) - 1
    else:
        nb_lines = len(csv_file)

    # if there's no numeric value in the column
    if nb_string == nb_lines:
        print('No numeric values found in the column ' + str(ncol + 1))
        if inclusive:
            print('The filter "' + str(bottom_value) + ' <= x <= ' + str(top_value) + '" can not be applied on the column ' + str(ncol + 1))
        else:
            print('The filter "' + str(bottom_value) + ' < x < ' + str(top_value) + '" can not be applied on the column ' + str(ncol + 1))

    return results_dict
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
319 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def column_from_txt(ncol):
    """
    Convert a 1-based column designation such as "c3" or "3" to a 0-based index.

    Every "c" character is removed from ncol before the remainder is checked
    with is_number("int", ...) (defined elsewhere in this file).

    Raises ValueError when the remainder is not accepted as an integer or
    when it designates a column before the first one (e.g. "c0"), which would
    otherwise become a negative index and silently address a column counted
    from the end of the row.
    """
    error_message = ("Please specify the column where "
                     "you would like to apply the filter "
                     "with valid format")

    digits = ncol.replace("c", "")
    if not is_number("int", digits):
        raise ValueError(error_message)

    index = int(digits) - 1
    if index < 0:
        raise ValueError(error_message)
    return index
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
328 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
329 #return True if value is in the determined values, false otherwise |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
def value_compare(value, filter_value, opt):
    """
    Evaluate "float(value) <opt> filter_value" and return the boolean result.

    Supported operators are "<", "<=", ">", ">=", "=" and "!=". Any other
    operator returns False, and value is not converted in that case.
    """
    comparators = (
        ("<", lambda left, right: left < right),
        ("<=", lambda left, right: left <= right),
        (">", lambda left, right: left > right),
        (">=", lambda left, right: left >= right),
        ("=", lambda left, right: left == right),
        ("!=", lambda left, right: left != right),
    )

    for symbol, compare in comparators:
        if opt == symbol:
            # Convert only once an operator is recognised, so an unknown
            # operator never triggers a conversion error.
            return True if compare(float(value), filter_value) else False

    return False
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
353 |
a55e8b137c6b
planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff
changeset
|
# Script entry point: options() is only called when the file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    options()