Mercurial > repos > proteore > filter_keywords_values
annotate filter_kw_val.py @ 5:1e9911190142 draft
planemo upload commit 08f1831e097df5d74bf60ff5955e7e9c8e524cc8-dirty
author | proteore |
---|---|
date | Wed, 14 Mar 2018 10:24:54 -0400 |
parents | d29e469b6b20 |
children | c6ba1e6f6869 |
rev | line source |
---|---|
0
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
1 import argparse |
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
2 import re |
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
3 |
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
4 |
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
def options(argv=None):
    """
    Parse the command line and run the filters.

    Options:
        -i, --input     Input filename and a flag telling whether the file
                        starts with a header line, joined by a comma
                        ["filename,true/false"]
        --kw            Three values: the keywords to filter (separated by
                        ";"), the column where this filter applies and
                        "true"/"false" for exact / partial matching.
                        This option can be repeated:
                        --kw "kw1;kw2" c1 true --kw "kw3" c2 false
        --kw_file       Three values: a file with one keyword per line, the
                        column where this filter applies and "true"/"false"
                        for exact / partial matching. Can be repeated.
        --value         Three values: the number to compare with, the column
                        where this filter applies and the operator
                        (= > >= < <=). Can be repeated.
        -o, --output    The output filename
        --trash_file    The file that receives the removed lines

    argv: optional list of argument strings; when None (the default)
          argparse reads sys.argv[1:].
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True,
                        help='Input file and header flag: "filename,true/false"')
    # nargs="+" with action="append" yields one list of tokens per occurrence;
    # filters() reads the first three tokens of each list.
    parser.add_argument("--kw", nargs="+", action="append",
                        help='Keyword filter: "kw1;kw2" column true/false '
                             "(exact match). Can be repeated.")
    parser.add_argument("--kw_file", nargs="+", action="append",
                        help="Keyword file filter: filename column true/false "
                             "(exact match). Can be repeated.")
    parser.add_argument("--value", nargs="+", action="append",
                        help="Value filter: number column operator "
                             "(= > >= < <=). Can be repeated.")
    parser.add_argument("-o", "--output", default="output.txt",
                        help="Output file with the lines that were kept")
    parser.add_argument("--trash_file", default="trash_MQfilter.txt",
                        help="Output file with the lines that were removed")

    args = parser.parse_args(argv)

    filters(args)
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
31 |
1
d29e469b6b20
planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
proteore
parents:
0
diff
changeset
|
def isnumber(number_format, n):
    """
    Check whether the string n is written as an integer or as a float.

    number_format: "int" or "float"; any other value gives False.
    n: the string to check.

    "int" accepts an optional minus sign followed by digits that do not
    start with 0 (so "0" and "007" are rejected, as before).
    "float" accepts an optional minus sign, digits, an optional fractional
    part and an optional exponent ("5", "0.5", "-12.25", "1e5").

    Returns True or False.
    """
    if number_format == "int":
        pattern = r"^-?[1-9][0-9]*$"
    elif number_format == "float":
        # The dot is escaped so that only a real decimal point is accepted;
        # the previous pattern matched any character there (e.g. "1a5").
        pattern = r"^-?[0-9]+(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?$"
    else:
        return False
    return re.match(pattern, n) is not None
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
45 |
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
def filters(args):
    """
    Filter the document

    Reads the input file named in args.input ("filename,true/false"), applies
    the --kw, --kw_file and --value filters in that order, then writes the
    remaining lines to args.output and the removed lines to args.trash_file.

    Raises ValueError when a --value filter is not given a number.
    """
    input_parts = args.input.split(",")
    MQfilename = input_parts[0]
    # A missing header flag is treated as "no header" instead of failing
    # with an IndexError.
    header = input_parts[1] if len(input_parts) > 1 else "false"

    kept = readMQ(MQfilename)
    removed = None

    for k in args.kw or []:
        kept, removed = filter_keyword(kept, header, removed, k[0], k[1], k[2])

    for kf in args.kw_file or []:
        ids = readOption(kf[0])
        kept, removed = filter_keyword(kept, header, removed, ids, kf[1], kf[2])

    for v in args.value or []:
        if not isnumber("float", v[0]):
            raise ValueError("Please enter a number in filter by value")
        kept, removed = filter_value(kept, header, removed, v[0], v[1], v[2])

    # Write results to output
    with open(args.output, "w") as output:
        output.write("".join(kept))

    # Write deleted lines to trash_file; removed is still None when no
    # filter was requested, in which case the trash file is left empty.
    with open(args.trash_file, "w") as trash:
        trash.write("".join(removed or []))
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
80 |
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
def readOption(filename):
    """
    Read the keywords file and return its lines joined with ";".

    The file content is split on newlines and every piece (empty ones
    included) is joined with ";", which is the keyword list format that
    filter_keyword() expects.
    """
    # The context manager closes the file even if reading fails.
    with open(filename, "r") as f:
        file_content = f.read()
    return ";".join(file_content.split("\n"))
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
91 |
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
def readMQ(MQfilename):
    """
    Read the input file and return its lines without the blank ones.

    Lines keep their line ending. A line is blank when it is empty or
    contains only whitespace.
    """
    with open(MQfilename, "r") as mqfile:
        # Build a new list instead of removing items from the list being
        # iterated: removing in place skips the element that follows each
        # removal, so runs of consecutive blank lines were not fully removed.
        return [line for line in mqfile if line != "" and not line.isspace()]
1
d29e469b6b20
planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
proteore
parents:
0
diff
changeset
|
99 |
0
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
    """
    Remove the lines whose cell in column ncol holds one of the keywords.

    MQfile: list of the lines of the input (header line included, if any).
    header: "true" when the first line is a header that is never filtered.
    filtered_lines: lines removed by previous filters, or None / empty.
    ids: keywords separated by ";"; matching is case-insensitive.
    ncol: 1-based column, written "c3" or "3".
    match: "false" removes a line when a keyword is contained in one of the
           ";"-separated values of the cell; any other value requires one
           of those values to be equal to a keyword.

    Returns (kept_lines, removed_lines) as two new lists; the arguments are
    not modified. The header line, when present, is the first element of
    both lists.

    Raises ValueError when ncol is not a positive column number and
    IndexError when a line has fewer than ncol columns.
    """
    try:
        id_index = int((ncol or "").replace("c", "")) - 1
    except ValueError:
        id_index = -1
    # A negative index would silently count columns from the end of the line.
    if id_index < 0:
        raise ValueError("Please specify the column where "
                         "you would like to apply the filter "
                         "with valid format")

    # Split list of filter IDs, trim them and drop the blank ones. A blank
    # keyword must never survive: in partial mode "" is contained in every
    # string and would remove every line.
    keywords = [keyword.strip() for keyword in ids.upper().split(";")]
    keywords = [keyword for keyword in keywords if keyword]
    keyword_set = frozenset(keywords)

    if header == "true":
        kept = list(MQfile[:1])
        content = MQfile[1:]
    else:
        kept = []
        content = MQfile

    removed = list(filtered_lines) if filtered_lines else []
    if not removed:
        # First filter of the run: the removed lines start with the header.
        removed.extend(kept)

    exact = match != "false"
    for line in content:
        bare_line = line.replace("\n", "")
        cell = bare_line.split("\t")[id_index].replace('"', "")
        ids_inline = [pid.upper() for pid in cell.split(";")]

        if exact:
            found = any(pid in keyword_set for pid in ids_inline)
        else:
            found = any(keyword in pid
                        for pid in ids_inline for keyword in keywords)

        if found:
            removed.append(bare_line + "\n")
        else:
            kept.append(line)

    return kept, removed
1
d29e469b6b20
planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
proteore
parents:
0
diff
changeset
|
150 |
0
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
    """
    Keep the lines whose number in column ncol satisfies the comparison.

    MQfile: list of the lines of the input (header line included, if any).
    header: "true" when the first line is a header that is never filtered.
    filtered_prots: lines removed by previous filters, or None / empty.
    filter_value: the number to compare with (string or number).
    ncol: 1-based column, written "c3" or "3".
    opt: "<", "<=", ">" or ">="; any other value means equality. A line is
         kept when "cell opt filter_value" holds and removed otherwise.

    Lines whose cell is not a plain number (text, empty, NaN, ...) are always
    kept. Negative numbers and exponent notation are recognised as numbers.

    Returns (kept_lines, removed_lines) as two new lists; the arguments are
    not modified. The header line, when present, is the first element of
    both lists.

    Raises ValueError when ncol is not a positive column number or when
    filter_value is not a number, and IndexError when a line has fewer than
    ncol columns.
    """
    try:
        index = int((ncol or "").replace("c", "")) - 1
    except ValueError:
        index = -1
    # A negative index would silently count columns from the end of the line.
    if index < 0:
        raise ValueError("Please specify the column where "
                         "you would like to apply the filter "
                         "with valid format")

    # Converted once, not once per line.
    threshold = float(filter_value)
    numeric = re.compile(r"^[-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?$")

    comparisons = {
        "<": lambda number: number < threshold,
        "<=": lambda number: number <= threshold,
        ">": lambda number: number > threshold,
        ">=": lambda number: number >= threshold,
    }
    # Any other operator falls back to equality, as it always did.
    satisfies = comparisons.get(opt, lambda number: number == threshold)

    if header == "true":
        kept = list(MQfile[:1])
        content = MQfile[1:]
    else:
        kept = []
        content = MQfile

    removed = list(filtered_prots) if filtered_prots else []
    if not removed:
        # First filter of the run: the removed lines start with the header.
        removed.extend(kept)

    for line in content:
        cell = line.replace("\n", "").split("\t")[index].replace('"', "").strip()
        if numeric.match(cell) and not satisfies(float(cell)):
            # Make sure a removed last line cannot be glued to the next one
            # when the removed lines are written out.
            removed.append(line if line.endswith("\n") else line + "\n")
        else:
            kept.append(line)

    return kept, removed
0
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
197 |
6a45ccfc0e4c
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
# Run the command-line entry point only when the file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    options()