Mercurial > repos > proteore > filter_keywords_values
comparison filter_kw_val.py @ 5:1e9911190142 draft
planemo upload commit 08f1831e097df5d74bf60ff5955e7e9c8e524cc8-dirty
author | proteore |
---|---|
date | Wed, 14 Mar 2018 10:24:54 -0400 |
parents | d29e469b6b20 |
children | c6ba1e6f6869 |
comparison
equal
deleted
inserted
replaced
4:2c1012e0a628 | 5:1e9911190142 |
---|---|
2 import re | 2 import re |
3 | 3 |
4 | 4 |
5 def options(): | 5 def options(): |
6 """ | 6 """ |
7 Parse options | 7 Parse options: |
8 -i, --input Input filename and a boolean value indicating whether the file contains a header ["filename,true/false"] | |
9 -m, --match Whether the keywords should be matched exactly | |
10 --kw Keyword to be filtered, the column number where this filter applies, | |
11 and a boolean value indicating whether the keyword should be matched exactly ["keyword,ncol,true/false"]. | |
12 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" | |
13 --kw_file A file that contains the keywords to be filtered, the column where this filter applies and | |
14 a boolean value indicating whether the keywords should be matched exactly ["filename,ncol,true/false"] | |
15 --value The value to be filtered, the column number where this filter applies and the | |
16 operation symbol ["value,ncol,=/>/>=/</<="] | |
17 -o, --output The output filename | |
18 --trash_file The file that receives the removed lines | |
8 """ | 19 """ |
9 parser = argparse.ArgumentParser() | 20 parser = argparse.ArgumentParser() |
10 parser.add_argument("-i", "--input", help="Input file", required=True) | 21 parser.add_argument("-i", "--input", help="Input file", required=True) |
11 parser.add_argument("-m", "--match", help="Exact macth") | 22 parser.add_argument("--kw", nargs="+", action="append", help="") |
12 parser.add_argument("--kw", nargs="+", action="append", help="") # | |
13 parser.add_argument("--kw_file", nargs="+", action="append", help="") | 23 parser.add_argument("--kw_file", nargs="+", action="append", help="") |
14 parser.add_argument("--value", nargs="+", action="append", help="") | 24 parser.add_argument("--value", nargs="+", action="append", help="") |
15 parser.add_argument("-o", "--output", default="output.txt") | 25 parser.add_argument("-o", "--output", default="output.txt") |
16 parser.add_argument("--trash_file", default="trash_MQfilter.txt") | 26 parser.add_argument("--trash_file", default="trash_MQfilter.txt") |
17 | 27 |
18 args = parser.parse_args() | 28 args = parser.parse_args() |
19 | 29 |
20 filters(args) | 30 filters(args) |
21 | 31 |
22 # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" | |
23 # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt" | |
24 | |
25 | |
26 def isnumber(number_format, n): | 32 def isnumber(number_format, n): |
27 """ | 33 """ |
28 Check if a variable is a float or an integer | 34 Check if a variable is a float or an integer |
29 """ | 35 """ |
30 float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$") | 36 float_format = re.compile(r"^[-]?[1-9][0-9]*.?[0-9]+$") |
31 int_format = re.compile("^[\-]?[1-9][0-9]*$") | 37 int_format = re.compile(r"^[-]?[1-9][0-9]*$") |
32 test = "" | 38 test = "" |
33 if number_format == "int": | 39 if number_format == "int": |
34 test = re.match(int_format, n) | 40 test = re.match(int_format, n) |
35 elif number_format == "float": | 41 elif number_format == "float": |
36 test = re.match(float_format, n) | 42 test = re.match(float_format, n) |
37 if test: | 43 if test: |
38 return True | 44 return True |
39 # else: | |
40 # return False | |
41 | 45 |
42 def filters(args): | 46 def filters(args): |
43 """ | 47 """ |
44 Filter the document | 48 Filter the document |
45 """ | 49 """ |
64 else: | 68 else: |
65 raise ValueError("Please enter a number in filter by value") | 69 raise ValueError("Please enter a number in filter by value") |
66 | 70 |
67 # Write results to output | 71 # Write results to output |
68 output = open(args.output, "w") | 72 output = open(args.output, "w") |
69 output.write("\n".join(results[0])) | 73 output.write("".join(results[0])) |
70 output.close() | 74 output.close() |
71 | 75 |
72 # Write deleted lines to trash_file | 76 # Write deleted lines to trash_file |
73 trash = open(args.trash_file, "w") | 77 trash = open(args.trash_file, "w") |
74 trash.write("\n".join(results[1])) | 78 trash.write("".join(results[1])) |
75 trash.close() | 79 trash.close() |
76 | 80 |
77 def readOption(filename): | 81 def readOption(filename): |
82 # Read the keywords file to extract the list of keywords | |
78 f = open(filename, "r") | 83 f = open(filename, "r") |
79 file_content = f.read() | 84 file_content = f.read() |
80 filter_list = file_content.split("\n") | 85 filter_list = file_content.split("\n") |
81 filters = "" | 86 filters = "" |
82 for i in filter_list: | 87 for i in filter_list: |
83 filters += i + ";" | 88 filters += i + ";" |
84 filters = filters[:-1] | 89 filters = filters[:-1] |
85 return filters | 90 return filters |
86 | 91 |
87 def readMQ(MQfilename): | 92 def readMQ(MQfilename): |
88 # Read MQ file | 93 # Read input file |
89 mqfile = open(MQfilename, "r") | 94 mqfile = open(MQfilename, "r") |
90 mq = mqfile.readlines() | 95 mq = mqfile.readlines() |
91 # Remove empty lines (contain only space or new line or "") | 96 # Remove empty lines (contain only space or new line or "") |
92 [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""] | 97 [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""] |
93 return mq | 98 return mq |
94 | 99 |
95 def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match): | 100 def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match): |
96 mq = MQfile | 101 mq = MQfile |
97 if isnumber("int", ncol.replace("c", "")): | 102 if isnumber("int", ncol.replace("c", "")): |
98 id_index = int(ncol.replace("c", "")) - 1 #columns.index("Majority protein IDs") | 103 id_index = int(ncol.replace("c", "")) - 1 |
99 else: | 104 else: |
100 raise ValueError("Please specify the column where " | 105 raise ValueError("Please specify the column where " |
101 "you would like to apply the filter " | 106 "you would like to apply the filter " |
102 "with valid format") | 107 "with valid format") |
103 | 108 |
122 filtered_lines.append(header) | 127 filtered_lines.append(header) |
123 | 128 |
124 for line in content: | 129 for line in content: |
125 line = line.replace("\n", "") | 130 line = line.replace("\n", "") |
126 id_inline = line.split("\t")[id_index].replace('"', "").split(";") | 131 id_inline = line.split("\t")[id_index].replace('"', "").split(";") |
127 one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) # Take only first IDs | 132 # Take only first IDs |
133 #one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) | |
128 line = line + "\n" | 134 line = line + "\n" |
129 | 135 |
130 if match != "false": | 136 if match != "false": |
131 # Filter protein IDs | 137 # Filter protein IDs |
132 if any(pid.upper() in ids for pid in id_inline): | 138 if any(pid.upper() in ids for pid in id_inline): |
133 filtered_lines.append(one_id_line) | 139 filtered_lines.append(line) |
134 mq.remove(line) | 140 mq.remove(line) |
135 else: | 141 #else: |
136 mq[mq.index(line)] = one_id_line | 142 # mq[mq.index(line)] = one_id_line |
137 else: | 143 else: |
138 if any(ft in pid.upper() for pid in id_inline for ft in ids): | 144 if any(ft in pid.upper() for pid in id_inline for ft in ids): |
139 filtered_lines.append(one_id_line) | 145 filtered_lines.append(line) |
140 mq.remove(line) | 146 mq.remove(line) |
141 else: | 147 #else: |
142 mq[mq.index(line)] = one_id_line | 148 # mq[mq.index(line)] = one_id_line |
143 return mq, filtered_lines | 149 return mq, filtered_lines |
144 | 150 |
145 def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt): | 151 def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt): |
146 mq = MQfile | 152 mq = MQfile |
147 if ncol and isnumber("int", ncol.replace("c", "")): #"Gene names" in columns: | 153 if ncol and isnumber("int", ncol.replace("c", "")): |
148 index = int(ncol.replace("c", "")) - 1 #columns.index("Gene names") | 154 index = int(ncol.replace("c", "")) - 1 |
149 else: | 155 else: |
150 raise ValueError("Please specify the column where " | 156 raise ValueError("Please specify the column where " |
151 "you would like to apply the filter " | 157 "you would like to apply the filter " |
152 "with valid format") | 158 "with valid format") |
153 if header == "true": | 159 if header == "true": |
185 mq.remove(line) | 191 mq.remove(line) |
186 else: | 192 else: |
187 if float(pep) != filter_value: | 193 if float(pep) != filter_value: |
188 filtered_prots.append(line) | 194 filtered_prots.append(line) |
189 mq.remove(line) | 195 mq.remove(line) |
190 return mq, filtered_prots #output, trash_file | 196 return mq, filtered_prots |
191 | 197 |
192 if __name__ == "__main__": | 198 if __name__ == "__main__": |
193 options() | 199 options() |