comparison prepare_data.py @ 6:e94dc7945639 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
author bgruening
date Sun, 16 Oct 2022 11:52:10 +0000
parents 4f7e6612906b
children
comparing 5:4f7e6612906b with 6:e94dc7945639
@@ -3,20 +3,19 @@
 machine learning algorithm. The paths are divided
 into the test and training sets
 """
 
 import collections
-import os
 import random
 
 import numpy as np
 import predict_tool_usage
-
-main_path = os.getcwd()
+from sklearn.model_selection import train_test_split
 
 
 class PrepareData:
+
     def __init__(self, max_seq_length, test_data_share):
         """ Init method. """
         self.max_tool_sequence_len = max_seq_length
         self.test_share = test_data_share
 
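The new sklearn import backs the split performed later in pad_paths_one_tool_target. Its return order is worth noting (inputs first, then labels), since the method reorders the tuple before returning; a minimal sketch with toy matrices standing in for the real padded data (shapes here are hypothetical):

import numpy as np
from sklearn.model_selection import train_test_split

# Toy stand-ins for the padded input matrix and multi-hot target matrix.
input_mat = np.zeros([10, 25])
target_mat = np.zeros([10, 101])

# Returns inputs first, then labels: (X_train, X_test, y_train, y_test).
train_data, test_data, train_labels, test_labels = train_test_split(
    input_mat, target_mat, test_size=0.2, random_state=42
)
print(train_data.shape, test_data.shape)  # (8, 25) (2, 25)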
24 """ 23 """
25 Get all the tools and complete set of individual paths for each workflow 24 Get all the tools and complete set of individual paths for each workflow
26 """ 25 """
27 tokens = list() 26 tokens = list()
28 raw_paths = workflow_paths 27 raw_paths = workflow_paths
29 raw_paths = [x.replace("\n", "") for x in raw_paths] 28 raw_paths = [x.replace("\n", '') for x in raw_paths]
30 for item in raw_paths: 29 for item in raw_paths:
31 split_items = item.split(",") 30 split_items = item.split(",")
32 for token in split_items: 31 for token in split_items:
33 if token != "": 32 if token != "":
34 tokens.append(token) 33 tokens.append(token)
35 tokens = list(set(tokens)) 34 tokens = list(set(tokens))
36 tokens = np.array(tokens) 35 tokens = np.array(tokens)
37 tokens = np.reshape( 36 tokens = np.reshape(tokens, [-1, ])
38 tokens,
39 [
40 -1,
41 ],
42 )
43 return tokens, raw_paths 37 return tokens, raw_paths
44 38
45 def create_new_dict(self, new_data_dict): 39 def create_new_dict(self, new_data_dict):
46 """ 40 """
47 Create new data dictionary 41 Create new data dictionary
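Aside from the quote-style tweak and the collapsed np.reshape call, the token extraction in the hunk above is unchanged on both sides. A minimal sketch of what it computes, with hypothetical tool names:

import numpy as np

workflow_paths = ["cutadapt,bowtie2,samtools_sort\n", "cutadapt,multiqc\n"]
raw_paths = [x.replace("\n", "") for x in workflow_paths]
# Collect the unique, non-empty tool names across all paths.
tokens = list({t for path in raw_paths for t in path.split(",") if t != ""})
tokens = np.reshape(np.array(tokens), [-1, ])
print(tokens)  # the unique tool names, e.g. ['multiqc' 'cutadapt' ...]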
60 """ 54 """
61 Create two dictionaries having tools names and their indexes 55 Create two dictionaries having tools names and their indexes
62 """ 56 """
63 count = collections.Counter(words).most_common() 57 count = collections.Counter(words).most_common()
64 dictionary = dict() 58 dictionary = dict()
65 for word, _ in count: 59 for index, (word, _) in enumerate(count):
60 word = word.lstrip()
61 word = word.rstrip()
66 dictionary[word] = len(dictionary) + 1 62 dictionary[word] = len(dictionary) + 1
67 word = word.strip() 63 dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary)
68 dictionary, reverse_dictionary = self.assemble_dictionary(
69 dictionary, old_data_dictionary
70 )
71 return dictionary, reverse_dictionary 64 return dictionary, reverse_dictionary
72 65
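Indices still start at 1 (len(dictionary) + 1 on an empty dict), which keeps 0 free as the padding value used by the matrices later in the file; the rewrite moves the whitespace stripping before insertion, where the old code stripped only after the key had already been stored. A sketch with hypothetical tool names:

import collections

words = ["cutadapt", "cutadapt", "bowtie2 ", " multiqc"]
count = collections.Counter(words).most_common()
dictionary = dict()
for index, (word, _) in enumerate(count):
    # Strip before inserting, so padded and clean spellings share one key.
    word = word.lstrip().rstrip()
    dictionary[word] = len(dictionary) + 1
reverse_dictionary = {v: k for k, v in dictionary.items()}
print(dictionary)  # {'cutadapt': 1, 'bowtie2': 2, 'multiqc': 3}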
@@ -73,19 +66,20 @@
     def decompose_paths(self, paths, dictionary):
         """
         Decompose the paths to variable length sub-paths keeping the first tool fixed
         """
+        max_len = 0
         sub_paths_pos = list()
         for index, item in enumerate(paths):
             tools = item.split(",")
             len_tools = len(tools)
-            if len_tools <= self.max_tool_sequence_len:
-                for window in range(1, len_tools):
-                    sequence = tools[0: window + 1]
-                    tools_pos = [
-                        str(dictionary[str(tool_item)]) for tool_item in sequence
-                    ]
-                    if len(tools_pos) > 1:
-                        sub_paths_pos.append(",".join(tools_pos))
+            if len_tools > max_len:
+                max_len = len_tools
+            if len_tools < self.max_tool_sequence_len:
+                sequence = tools[0: len_tools]
+                tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence]
+                if len(tools_pos) > 1:
+                    sub_paths_pos.append(",".join(tools_pos))
         sub_paths_pos = list(set(sub_paths_pos))
+        print("Max length of tools: ", max_len)
         return sub_paths_pos
 
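A behavior change worth noting in this hunk: the old method expanded every path into all of its prefix windows, while the rewritten one keeps each sufficiently short path whole (the prefix expansion moves into prepare_input_target_paths below). A sketch of the new behavior under a toy dictionary:

dictionary = {"cutadapt": 1, "bowtie2": 2, "samtools_sort": 3}
paths = ["cutadapt,bowtie2,samtools_sort"]
max_tool_sequence_len = 25

sub_paths_pos = []
for item in paths:
    tools = item.split(",")
    if len(tools) < max_tool_sequence_len:
        tools_pos = [str(dictionary[str(t)]) for t in tools]
        if len(tools_pos) > 1:
            sub_paths_pos.append(",".join(tools_pos))
print(sub_paths_pos)  # ['1,2,3'] -- the old windowing would also have emitted '1,2'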
@@ -92,88 +86,82 @@
-    def prepare_paths_labels_dictionary(
-        self, dictionary, reverse_dictionary, paths, compatible_next_tools
-    ):
-        """
-        Create a dictionary of sequences with their labels for training and test paths
-        """
-        paths_labels = dict()
-        random.shuffle(paths)
-        for item in paths:
-            if item and item not in "":
-                tools = item.split(",")
-                label = tools[-1]
-                train_tools = tools[: len(tools) - 1]
-                last_but_one_name = reverse_dictionary[int(train_tools[-1])]
-                try:
-                    compatible_tools = compatible_next_tools[last_but_one_name].split(
-                        ","
-                    )
-                except Exception:
-                    continue
-                if len(compatible_tools) > 0:
-                    compatible_tools_ids = [
-                        str(dictionary[x]) for x in compatible_tools
-                    ]
-                    compatible_tools_ids.append(label)
-                    composite_labels = ",".join(compatible_tools_ids)
-                train_tools = ",".join(train_tools)
-                if train_tools in paths_labels:
-                    paths_labels[train_tools] += "," + composite_labels
-                else:
-                    paths_labels[train_tools] = composite_labels
-        for item in paths_labels:
-            paths_labels[item] = ",".join(list(set(paths_labels[item].split(","))))
-        return paths_labels
-
-    def pad_test_paths(self, paths_dictionary, num_classes):
-        """
-        Add padding to the tools sequences and create multi-hot encoded labels
-        """
-        size_data = len(paths_dictionary)
-        data_mat = np.zeros([size_data, self.max_tool_sequence_len])
-        label_mat = np.zeros([size_data, num_classes + 1])
-        train_counter = 0
-        for train_seq, train_label in list(paths_dictionary.items()):
-            positions = train_seq.split(",")
-            start_pos = self.max_tool_sequence_len - len(positions)
-            for id_pos, pos in enumerate(positions):
-                data_mat[train_counter][start_pos + id_pos] = int(pos)
-            for label_item in train_label.split(","):
-                label_mat[train_counter][int(label_item)] = 1.0
-            train_counter += 1
-        return data_mat, label_mat
-
-    def pad_paths(
-        self, paths_dictionary, num_classes, standard_connections, reverse_dictionary
-    ):
-        """
-        Add padding to the tools sequences and create multi-hot encoded labels
-        """
-        size_data = len(paths_dictionary)
-        data_mat = np.zeros([size_data, self.max_tool_sequence_len])
-        label_mat = np.zeros([size_data, 2 * (num_classes + 1)])
-        pos_flag = 1.0
-        train_counter = 0
-        for train_seq, train_label in list(paths_dictionary.items()):
-            pub_connections = list()
-            positions = train_seq.split(",")
-            last_tool_id = positions[-1]
-            last_tool_name = reverse_dictionary[int(last_tool_id)]
-            start_pos = self.max_tool_sequence_len - len(positions)
-            for id_pos, pos in enumerate(positions):
-                data_mat[train_counter][start_pos + id_pos] = int(pos)
-            if last_tool_name in standard_connections:
-                pub_connections = standard_connections[last_tool_name]
-            for label_item in train_label.split(","):
-                label_pos = int(label_item)
-                label_row = label_mat[train_counter]
-                if reverse_dictionary[label_pos] in pub_connections:
-                    label_row[label_pos] = pos_flag
-                else:
-                    label_row[label_pos + num_classes + 1] = pos_flag
-            train_counter += 1
-        return data_mat, label_mat
+    def prepare_input_one_target_paths(self, dictionary, reverse_dictionary, paths):
+        input_target_paths = dict()
+        compatible_tools = dict()
+        d_size = 0
+        for i, item in enumerate(paths):
+            input_tools = item.split(",")
+            tool_seq = input_tools
+            i_tools = ",".join(tool_seq[0:-1])
+            last_i_tool = i_tools.split(",")[-1]
+            if last_i_tool not in compatible_tools:
+                compatible_tools[last_i_tool] = list()
+            t_tools = tool_seq[-1]
+            if t_tools not in compatible_tools[last_i_tool]:
+                compatible_tools[last_i_tool].append(t_tools)
+            if i_tools not in input_target_paths:
+                input_target_paths[i_tools] = list()
+            if t_tools not in input_target_paths[i_tools]:
+                input_target_paths[i_tools].append(t_tools)
+            if i_tools not in input_target_paths:
+                input_target_paths[i_tools] = list()
+            if t_tools not in input_target_paths[i_tools]:
+                input_target_paths[i_tools].append(t_tools)
+        for item in input_target_paths:
+            d_size += len(input_target_paths[item])
+        print("Dataset size:", d_size)
+        return input_target_paths, compatible_tools, d_size
+
+    def prepare_input_target_paths(self, dictionary, reverse_dictionary, paths):
+        input_target_paths = dict()
+        compatible_tools = dict()
+        d_size = 0
+        for i, item in enumerate(paths):
+            input_tools = item.split(",")
+            ctr = 0
+            for ctr in range(len(input_tools) - 1):
+                # uncomment this for one token target idea
+                tool_seq = input_tools[0: ctr + 2]
+                i_tools = ",".join(tool_seq[0:-1])
+                last_i_tool = i_tools.split(",")[-1]
+                if last_i_tool not in compatible_tools:
+                    compatible_tools[last_i_tool] = list()
+                t_tools = tool_seq[-1]
+                if t_tools not in compatible_tools[last_i_tool]:
+                    compatible_tools[last_i_tool].append(t_tools)
+                if i_tools not in input_target_paths:
+                    input_target_paths[i_tools] = list()
+                if t_tools not in input_target_paths[i_tools]:
+                    input_target_paths[i_tools].append(t_tools)
+                if i_tools not in input_target_paths:
+                    input_target_paths[i_tools] = list()
+                if t_tools not in input_target_paths[i_tools]:
+                    input_target_paths[i_tools].append(t_tools)
+        for item in input_target_paths:
+            d_size += len(input_target_paths[item])
+        print("Dataset size:", d_size)
+        return input_target_paths, compatible_tools, d_size
+
+    def pad_paths_one_tool_target(self, multi_paths, compatible_tools, d_size, rev_dict, dictionary):
+        d_size = len(multi_paths)
+        input_mat = np.zeros([d_size, self.max_tool_sequence_len])
+        target_mat = np.zeros([d_size, len(dictionary) + 1])
+        train_counter = 0
+        for input_seq, target_seq_tools in list(multi_paths.items()):
+            input_seq_tools = input_seq.split(",")
+            last_i_tool = input_seq_tools[-1]
+            for id_pos, pos in enumerate(input_seq_tools):
+                input_mat[train_counter][id_pos] = int(pos)
+            if last_i_tool in compatible_tools:
+                compatible_targets = compatible_tools[last_i_tool]
+            for k, t_label in enumerate(target_seq_tools):
+                target_mat[train_counter][int(t_label)] = 1
+            for c_tool in compatible_targets:
+                target_mat[train_counter][int(c_tool)] = 1
+            train_counter += 1
+        print("Final data size: ", input_mat.shape, target_mat.shape)
+        train_data, test_data, train_labels, test_labels = train_test_split(input_mat, target_mat, test_size=self.test_share, random_state=42)
+        return train_data, train_labels, test_data, test_labels
 
     def split_test_train_data(self, multilabels_paths):
         """
         Split into test and train data randomly for each run
         """
@@ -219,21 +207,46 @@
             if u_score < 1.0:
                 u_score += 1.0
             class_weights[key] = np.round(np.log(u_score), 6)
         return class_weights
 
-    def get_train_last_tool_freq(self, train_paths, reverse_dictionary):
+    def get_train_tool_labels_freq(self, train_paths, reverse_dictionary):
         """
         Get the frequency of last tool of each tool sequence
         to estimate the frequency of tool sequences
         """
         last_tool_freq = dict()
         freq_dict_names = dict()
         for path in train_paths:
-            last_tool = path.split(",")[-1]
+            tools_pos = np.where(path > 0)[0]
+            path_pos = tools_pos
+            path_pos = [str(int(item)) for item in path_pos]
+
+            for tool_pos in path_pos:
+                if tool_pos not in last_tool_freq:
+                    last_tool_freq[tool_pos] = 0
+                    freq_dict_names[reverse_dictionary[int(tool_pos)]] = 0
+                last_tool_freq[tool_pos] += 1
+                freq_dict_names[reverse_dictionary[int(tool_pos)]] += 1
+        sorted_dict = dict(sorted(last_tool_freq.items(), key=lambda kv: kv[1], reverse=True))
+        return sorted_dict
+
+    def get_train_last_tool_freq(self, train_paths, reverse_dictionary):
+        """
+        Get the frequency of last tool of each tool sequence
+        to estimate the frequency of tool sequences
+        """
+        last_tool_freq = dict()
+        freq_dict_names = dict()
+        for path in train_paths:
+            tools_pos = np.where(path > 0)[0]
+            path_pos = path[tools_pos]
+            path_pos = [str(int(item)) for item in path_pos]
+            last_tool = path_pos[-1]
             if last_tool not in last_tool_freq:
                 last_tool_freq[last_tool] = 0
                 freq_dict_names[reverse_dictionary[int(last_tool)]] = 0
             last_tool_freq[last_tool] += 1
             freq_dict_names[reverse_dictionary[int(last_tool)]] += 1
-        return last_tool_freq
+        sorted_dict = dict(sorted(last_tool_freq.items(), key=lambda kv: kv[1], reverse=True))
+        return sorted_dict
 
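Both frequency counters now receive numpy rows instead of comma-separated strings, hence the np.where(path > 0) decoding. Note the difference between them: get_train_tool_labels_freq keeps the non-zero column positions (for a multi-hot label row the column index is itself the tool id), while get_train_last_tool_freq reads the values out of a padded input row. A sketch of the second case:

import numpy as np

# A padded input row: tool ids 4, 7, 2 followed by trailing zeros.
path = np.array([4.0, 7.0, 2.0, 0.0, 0.0])
tools_pos = np.where(path > 0)[0]            # indices of the non-zero entries
path_pos = [str(int(v)) for v in path[tools_pos]]
print(path_pos, path_pos[-1])                # ['4', '7', '2'] '2'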
@@ -240,4 +253,4 @@
     def get_toolid_samples(self, train_data, l_tool_freq):
         l_tool_tr_samples = dict()
         for tool_id in l_tool_freq:
             for index, tr_sample in enumerate(train_data):
@@ -246,78 +259,45 @@
                 if last_tool_id not in l_tool_tr_samples:
                     l_tool_tr_samples[last_tool_id] = list()
                 l_tool_tr_samples[last_tool_id].append(index)
         return l_tool_tr_samples
 
-    def get_data_labels_matrices(
-        self,
-        workflow_paths,
-        tool_usage_path,
-        cutoff_date,
-        compatible_next_tools,
-        standard_connections,
-        old_data_dictionary={},
-    ):
+    def get_data_labels_matrices(self, workflow_paths, usage_df, cutoff_date, standard_connections, old_data_dictionary={}):
         """
         Convert the training and test paths into corresponding numpy matrices
         """
         processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
-        dictionary, rev_dict = self.create_data_dictionary(
-            processed_data, old_data_dictionary
-        )
+        dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary)
+
         num_classes = len(dictionary)
 
         print("Raw paths: %d" % len(raw_paths))
         random.shuffle(raw_paths)
 
         print("Decomposing paths...")
         all_unique_paths = self.decompose_paths(raw_paths, dictionary)
         random.shuffle(all_unique_paths)
 
         print("Creating dictionaries...")
-        multilabels_paths = self.prepare_paths_labels_dictionary(
-            dictionary, rev_dict, all_unique_paths, compatible_next_tools
-        )
-
-        print("Complete data: %d" % len(multilabels_paths))
-        train_paths_dict, test_paths_dict = self.split_test_train_data(
-            multilabels_paths
-        )
-
-        print("Train data: %d" % len(train_paths_dict))
-        print("Test data: %d" % len(test_paths_dict))
+        multilabels_paths, compatible_tools, d_size = self.prepare_input_target_paths(dictionary, rev_dict, all_unique_paths)
+
+        print("Complete data: %d" % d_size)
 
         print("Padding train and test data...")
-        # pad training and test data with leading zeros
-        test_data, test_labels = self.pad_paths(
-            test_paths_dict, num_classes, standard_connections, rev_dict
-        )
-        train_data, train_labels = self.pad_paths(
-            train_paths_dict, num_classes, standard_connections, rev_dict
-        )
+        # pad training and test data with trailing zeros
+        train_data, train_labels, test_data, test_labels = self.pad_paths_one_tool_target(multilabels_paths, compatible_tools, d_size, rev_dict, dictionary)
+
+        print("Train data: ", train_data.shape)
+        print("Test data: ", test_data.shape)
 
         print("Estimating sample frequency...")
-        l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
-        l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq)
+        tr_tool_freq = self.get_train_tool_labels_freq(train_labels, rev_dict)
 
         # Predict tools usage
        print("Predicting tools' usage...")
         usage_pred = predict_tool_usage.ToolPopularity()
-        usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
+        usage = usage_pred.extract_tool_usage(usage_df, cutoff_date, dictionary)
         tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
         t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)
-
         # get class weights using the predicted usage for each tool
         class_weights = self.assign_class_weights(num_classes, t_pred_usage)
-
-        return (
-            train_data,
-            train_labels,
-            test_data,
-            test_labels,
-            dictionary,
-            rev_dict,
-            class_weights,
-            t_pred_usage,
-            l_tool_freq,
-            l_tool_tr_samples,
-        )
+        return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, compatible_tools, tr_tool_freq
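Taken together, and leaving out the usage-prediction step (which needs the sibling predict_tool_usage module), the rewritten data flow can be exercised end to end. A self-contained sketch with a toy dictionary and toy encoded paths; all names and sizes are hypothetical:

import numpy as np
from sklearn.model_selection import train_test_split

max_len = 25
dictionary = {"cutadapt": 1, "bowtie2": 2, "samtools_sort": 3, "multiqc": 4}
paths = ["1,2,3", "1,4"]  # encoded sub-paths, as produced by decompose_paths

# prepare_input_target_paths: every prefix -> next tool, plus compatible targets.
input_target, compatible = {}, {}
for item in paths:
    tools = item.split(",")
    for ctr in range(len(tools) - 1):
        seq = tools[0: ctr + 2]
        i_tools, t_tool = ",".join(seq[:-1]), seq[-1]
        last = i_tools.split(",")[-1]
        compatible.setdefault(last, [])
        if t_tool not in compatible[last]:
            compatible[last].append(t_tool)
        input_target.setdefault(i_tools, [])
        if t_tool not in input_target[i_tools]:
            input_target[i_tools].append(t_tool)

# pad_paths_one_tool_target: trailing-zero padding, multi-hot targets.
input_mat = np.zeros([len(input_target), max_len])
target_mat = np.zeros([len(input_target), len(dictionary) + 1])
for row, (i_tools, targets) in enumerate(input_target.items()):
    ids = i_tools.split(",")
    for pos, tid in enumerate(ids):
        input_mat[row][pos] = int(tid)
    for t in targets + compatible.get(ids[-1], []):
        target_mat[row][int(t)] = 1

train_data, test_data, train_labels, test_labels = train_test_split(
    input_mat, target_mat, test_size=0.2, random_state=42
)
print(train_data.shape, train_labels.shape)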