comparison filter_kw_val.py @ 5:1e9911190142 draft

planemo upload commit 08f1831e097df5d74bf60ff5955e7e9c8e524cc8-dirty
author proteore
date Wed, 14 Mar 2018 10:24:54 -0400
parents d29e469b6b20
children c6ba1e6f6869
comparison
equal deleted inserted replaced
4:2c1012e0a628 5:1e9911190142
2 import re 2 import re
3 3
4 4
5 def options(): 5 def options():
6 """ 6 """
7 Parse options 7 Parse options:
8 -i, --input Input filename and a boolean value indicating whether the file contains a header ["filename,true/false"]
9 -m, --match Whether the keywords should be filtered by exact match
10 --kw Keyword to be filtered, the column number where this filter applies, and a
11 boolean value indicating whether the keyword should be matched exactly ["keyword,ncol,true/false"].
12 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true"
13 --kw_file A file that contains keywords to be filtered, the column where this filter applies and
14 a boolean value indicating whether the keywords should be matched exactly ["filename,ncol,true/false"]
15 --value The value to be filtered, the column number where this filter applies and the
16 operation symbol ["value,ncol,=/>/>=/</<="]
17 -o, --output The output filename
18 --trash_file The file that contains the removed lines
8 """ 19 """
9 parser = argparse.ArgumentParser() 20 parser = argparse.ArgumentParser()
10 parser.add_argument("-i", "--input", help="Input file", required=True) 21 parser.add_argument("-i", "--input", help="Input file", required=True)
11 parser.add_argument("-m", "--match", help="Exact macth") 22 parser.add_argument("--kw", nargs="+", action="append", help="")
12 parser.add_argument("--kw", nargs="+", action="append", help="") #
13 parser.add_argument("--kw_file", nargs="+", action="append", help="") 23 parser.add_argument("--kw_file", nargs="+", action="append", help="")
14 parser.add_argument("--value", nargs="+", action="append", help="") 24 parser.add_argument("--value", nargs="+", action="append", help="")
15 parser.add_argument("-o", "--output", default="output.txt") 25 parser.add_argument("-o", "--output", default="output.txt")
16 parser.add_argument("--trash_file", default="trash_MQfilter.txt") 26 parser.add_argument("--trash_file", default="trash_MQfilter.txt")
17 27
18 args = parser.parse_args() 28 args = parser.parse_args()
19 29
20 filters(args) 30 filters(args)
21 31
22 # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt"
23 # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt"
24
25
26 def isnumber(number_format, n): 32 def isnumber(number_format, n):
27 """ 33 """
28 Check if a variable is a float or an integer 34 Check if a variable is a float or an integer
29 """ 35 """
30 float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$") 36 float_format = re.compile(r"^[-]?[1-9][0-9]*.?[0-9]+$")
31 int_format = re.compile("^[\-]?[1-9][0-9]*$") 37 int_format = re.compile(r"^[-]?[1-9][0-9]*$")
32 test = "" 38 test = ""
33 if number_format == "int": 39 if number_format == "int":
34 test = re.match(int_format, n) 40 test = re.match(int_format, n)
35 elif number_format == "float": 41 elif number_format == "float":
36 test = re.match(float_format, n) 42 test = re.match(float_format, n)
37 if test: 43 if test:
38 return True 44 return True
39 # else:
40 # return False
41 45
42 def filters(args): 46 def filters(args):
43 """ 47 """
44 Filter the document 48 Filter the document
45 """ 49 """
64 else: 68 else:
65 raise ValueError("Please enter a number in filter by value") 69 raise ValueError("Please enter a number in filter by value")
66 70
67 # Write results to output 71 # Write results to output
68 output = open(args.output, "w") 72 output = open(args.output, "w")
69 output.write("\n".join(results[0])) 73 output.write("".join(results[0]))
70 output.close() 74 output.close()
71 75
72 # Write deleted lines to trash_file 76 # Write deleted lines to trash_file
73 trash = open(args.trash_file, "w") 77 trash = open(args.trash_file, "w")
74 trash.write("\n".join(results[1])) 78 trash.write("".join(results[1]))
75 trash.close() 79 trash.close()
76 80
77 def readOption(filename): 81 def readOption(filename):
82 # Read the keywords file to extract the list of keywords
78 f = open(filename, "r") 83 f = open(filename, "r")
79 file_content = f.read() 84 file_content = f.read()
80 filter_list = file_content.split("\n") 85 filter_list = file_content.split("\n")
81 filters = "" 86 filters = ""
82 for i in filter_list: 87 for i in filter_list:
83 filters += i + ";" 88 filters += i + ";"
84 filters = filters[:-1] 89 filters = filters[:-1]
85 return filters 90 return filters
86 91
87 def readMQ(MQfilename): 92 def readMQ(MQfilename):
88 # Read MQ file 93 # Read input file
89 mqfile = open(MQfilename, "r") 94 mqfile = open(MQfilename, "r")
90 mq = mqfile.readlines() 95 mq = mqfile.readlines()
91 # Remove empty lines (contain only space or new line or "") 96 # Remove empty lines (contain only space or new line or "")
92 [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""] 97 [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""]
93 return mq 98 return mq
94 99
95 def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match): 100 def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
96 mq = MQfile 101 mq = MQfile
97 if isnumber("int", ncol.replace("c", "")): 102 if isnumber("int", ncol.replace("c", "")):
98 id_index = int(ncol.replace("c", "")) - 1 #columns.index("Majority protein IDs") 103 id_index = int(ncol.replace("c", "")) - 1
99 else: 104 else:
100 raise ValueError("Please specify the column where " 105 raise ValueError("Please specify the column where "
101 "you would like to apply the filter " 106 "you would like to apply the filter "
102 "with valid format") 107 "with valid format")
103 108
122 filtered_lines.append(header) 127 filtered_lines.append(header)
123 128
124 for line in content: 129 for line in content:
125 line = line.replace("\n", "") 130 line = line.replace("\n", "")
126 id_inline = line.split("\t")[id_index].replace('"', "").split(";") 131 id_inline = line.split("\t")[id_index].replace('"', "").split(";")
127 one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) # Take only first IDs 132 # Take only first IDs
133 #one_id_line = line.replace(line.split("\t")[id_index], id_inline[0])
128 line = line + "\n" 134 line = line + "\n"
129 135
130 if match != "false": 136 if match != "false":
131 # Filter protein IDs 137 # Filter protein IDs
132 if any(pid.upper() in ids for pid in id_inline): 138 if any(pid.upper() in ids for pid in id_inline):
133 filtered_lines.append(one_id_line) 139 filtered_lines.append(line)
134 mq.remove(line) 140 mq.remove(line)
135 else: 141 #else:
136 mq[mq.index(line)] = one_id_line 142 # mq[mq.index(line)] = one_id_line
137 else: 143 else:
138 if any(ft in pid.upper() for pid in id_inline for ft in ids): 144 if any(ft in pid.upper() for pid in id_inline for ft in ids):
139 filtered_lines.append(one_id_line) 145 filtered_lines.append(line)
140 mq.remove(line) 146 mq.remove(line)
141 else: 147 #else:
142 mq[mq.index(line)] = one_id_line 148 # mq[mq.index(line)] = one_id_line
143 return mq, filtered_lines 149 return mq, filtered_lines
144 150
145 def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt): 151 def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
146 mq = MQfile 152 mq = MQfile
147 if ncol and isnumber("int", ncol.replace("c", "")): #"Gene names" in columns: 153 if ncol and isnumber("int", ncol.replace("c", "")):
148 index = int(ncol.replace("c", "")) - 1 #columns.index("Gene names") 154 index = int(ncol.replace("c", "")) - 1
149 else: 155 else:
150 raise ValueError("Please specify the column where " 156 raise ValueError("Please specify the column where "
151 "you would like to apply the filter " 157 "you would like to apply the filter "
152 "with valid format") 158 "with valid format")
153 if header == "true": 159 if header == "true":
185 mq.remove(line) 191 mq.remove(line)
186 else: 192 else:
187 if float(pep) != filter_value: 193 if float(pep) != filter_value:
188 filtered_prots.append(line) 194 filtered_prots.append(line)
189 mq.remove(line) 195 mq.remove(line)
190 return mq, filtered_prots #output, trash_file 196 return mq, filtered_prots
191 197
192 if __name__ == "__main__": 198 if __name__ == "__main__":
193 options() 199 options()