comparison prepare_data.py @ 3:5b3c08710e47 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
author bgruening
date Sat, 09 May 2020 05:38:23 -0400
parents 9bf25dbe00ad
children afec8c595124
comparison
equal deleted inserted replaced
2:76251d1ccdcc 3:5b3c08710e47
8 import collections 8 import collections
9 import numpy as np 9 import numpy as np
10 import random 10 import random
11 11
12 import predict_tool_usage 12 import predict_tool_usage
13 import utils
13 14
14 main_path = os.getcwd() 15 main_path = os.getcwd()
15 16
16 17
17 class PrepareData: 18 class PrepareData:
18 19
19 @classmethod
20 def __init__(self, max_seq_length, test_data_share): 20 def __init__(self, max_seq_length, test_data_share):
21 """ Init method. """ 21 """ Init method. """
22 self.max_tool_sequence_len = max_seq_length 22 self.max_tool_sequence_len = max_seq_length
23 self.test_share = test_data_share 23 self.test_share = test_data_share
24 24
25 @classmethod
26 def process_workflow_paths(self, workflow_paths): 25 def process_workflow_paths(self, workflow_paths):
27 """ 26 """
28 Get all the tools and complete set of individual paths for each workflow 27 Get all the tools and complete set of individual paths for each workflow
29 """ 28 """
30 tokens = list() 29 tokens = list()
38 tokens = list(set(tokens)) 37 tokens = list(set(tokens))
39 tokens = np.array(tokens) 38 tokens = np.array(tokens)
40 tokens = np.reshape(tokens, [-1, ]) 39 tokens = np.reshape(tokens, [-1, ])
41 return tokens, raw_paths 40 return tokens, raw_paths
42 41
43 @classmethod
44 def create_new_dict(self, new_data_dict): 42 def create_new_dict(self, new_data_dict):
45 """ 43 """
46 Create new data dictionary 44 Create new data dictionary
47 """ 45 """
48 reverse_dict = dict((v, k) for k, v in new_data_dict.items()) 46 reverse_dict = dict((v, k) for k, v in new_data_dict.items())
49 return new_data_dict, reverse_dict 47 return new_data_dict, reverse_dict
50 48
51 @classmethod
52 def assemble_dictionary(self, new_data_dict, old_data_dictionary={}): 49 def assemble_dictionary(self, new_data_dict, old_data_dictionary={}):
53 """ 50 """
54 Create/update tools indices in the forward and backward dictionary 51 Create/update tools indices in the forward and backward dictionary
55 """ 52 """
56 new_data_dict, reverse_dict = self.create_new_dict(new_data_dict) 53 new_data_dict, reverse_dict = self.create_new_dict(new_data_dict)
57 return new_data_dict, reverse_dict 54 return new_data_dict, reverse_dict
58 55
59 @classmethod
60 def create_data_dictionary(self, words, old_data_dictionary={}): 56 def create_data_dictionary(self, words, old_data_dictionary={}):
61 """ 57 """
62 Create two dictionaries having tools names and their indexes 58 Create two dictionaries having tools names and their indexes
63 """ 59 """
64 count = collections.Counter(words).most_common() 60 count = collections.Counter(words).most_common()
66 for word, _ in count: 62 for word, _ in count:
67 dictionary[word] = len(dictionary) + 1 63 dictionary[word] = len(dictionary) + 1
68 dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary) 64 dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary)
69 return dictionary, reverse_dictionary 65 return dictionary, reverse_dictionary
70 66
71 @classmethod
72 def decompose_paths(self, paths, dictionary): 67 def decompose_paths(self, paths, dictionary):
73 """ 68 """
74 Decompose the paths to variable length sub-paths keeping the first tool fixed 69 Decompose the paths to variable length sub-paths keeping the first tool fixed
75 """ 70 """
76 sub_paths_pos = list() 71 sub_paths_pos = list()
84 if len(tools_pos) > 1: 79 if len(tools_pos) > 1:
85 sub_paths_pos.append(",".join(tools_pos)) 80 sub_paths_pos.append(",".join(tools_pos))
86 sub_paths_pos = list(set(sub_paths_pos)) 81 sub_paths_pos = list(set(sub_paths_pos))
87 return sub_paths_pos 82 return sub_paths_pos
88 83
89 @classmethod
90 def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools): 84 def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools):
91 """ 85 """
92 Create a dictionary of sequences with their labels for training and test paths 86 Create a dictionary of sequences with their labels for training and test paths
93 """ 87 """
94 paths_labels = dict() 88 paths_labels = dict()
114 paths_labels[train_tools] = composite_labels 108 paths_labels[train_tools] = composite_labels
115 for item in paths_labels: 109 for item in paths_labels:
116 paths_labels[item] = ",".join(list(set(paths_labels[item].split(",")))) 110 paths_labels[item] = ",".join(list(set(paths_labels[item].split(","))))
117 return paths_labels 111 return paths_labels
118 112
119 @classmethod 113 def pad_test_paths(self, paths_dictionary, num_classes):
120 def pad_paths(self, paths_dictionary, num_classes):
121 """ 114 """
122 Add padding to the tools sequences and create multi-hot encoded labels 115 Add padding to the tools sequences and create multi-hot encoded labels
123 """ 116 """
124 size_data = len(paths_dictionary) 117 size_data = len(paths_dictionary)
125 data_mat = np.zeros([size_data, self.max_tool_sequence_len]) 118 data_mat = np.zeros([size_data, self.max_tool_sequence_len])
133 for label_item in train_label.split(","): 126 for label_item in train_label.split(","):
134 label_mat[train_counter][int(label_item)] = 1.0 127 label_mat[train_counter][int(label_item)] = 1.0
135 train_counter += 1 128 train_counter += 1
136 return data_mat, label_mat 129 return data_mat, label_mat
137 130
138 @classmethod 131 def pad_paths(self, paths_dictionary, num_classes, standard_connections, reverse_dictionary):
132 """
133 Add padding to the tools sequences and create multi-hot encoded labels
134 """
135 size_data = len(paths_dictionary)
136 data_mat = np.zeros([size_data, self.max_tool_sequence_len])
137 label_mat = np.zeros([size_data, 2 * (num_classes + 1)])
138 pos_flag = 1.0
139 train_counter = 0
140 for train_seq, train_label in list(paths_dictionary.items()):
141 pub_connections = list()
142 positions = train_seq.split(",")
143 last_tool_id = positions[-1]
144 last_tool_name = reverse_dictionary[int(last_tool_id)]
145 start_pos = self.max_tool_sequence_len - len(positions)
146 for id_pos, pos in enumerate(positions):
147 data_mat[train_counter][start_pos + id_pos] = int(pos)
148 if last_tool_name in standard_connections:
149 pub_connections = standard_connections[last_tool_name]
150 for label_item in train_label.split(","):
151 label_pos = int(label_item)
152 label_row = label_mat[train_counter]
153 if reverse_dictionary[label_pos] in pub_connections:
154 label_row[label_pos] = pos_flag
155 else:
156 label_row[label_pos + num_classes + 1] = pos_flag
157 train_counter += 1
158 return data_mat, label_mat
159
139 def split_test_train_data(self, multilabels_paths): 160 def split_test_train_data(self, multilabels_paths):
140 """ 161 """
141 Split into test and train data randomly for each run 162 Split into test and train data randomly for each run
142 """ 163 """
143 train_dict = dict() 164 train_dict = dict()
150 test_dict[path] = multilabels_paths[path] 171 test_dict[path] = multilabels_paths[path]
151 else: 172 else:
152 train_dict[path] = multilabels_paths[path] 173 train_dict[path] = multilabels_paths[path]
153 return train_dict, test_dict 174 return train_dict, test_dict
154 175
155 @classmethod
156 def verify_overlap(self, train_paths, test_paths):
157 """
158 Verify the overlapping of samples in train and test data
159 """
160 intersection = list(set(train_paths).intersection(set(test_paths)))
161 print("Overlap in train and test: %d" % len(intersection))
162
163 @classmethod
164 def get_predicted_usage(self, data_dictionary, predicted_usage): 176 def get_predicted_usage(self, data_dictionary, predicted_usage):
165 """ 177 """
166 Get predicted usage for tools 178 Get predicted usage for tools
167 """ 179 """
168 usage = dict() 180 usage = dict()
178 except Exception: 190 except Exception:
179 usage[v] = epsilon 191 usage[v] = epsilon
180 continue 192 continue
181 return usage 193 return usage
182 194
183 @classmethod
184 def assign_class_weights(self, n_classes, predicted_usage): 195 def assign_class_weights(self, n_classes, predicted_usage):
185 """ 196 """
186 Compute class weights using usage 197 Compute class weights using usage
187 """ 198 """
188 class_weights = dict() 199 class_weights = dict()
189 class_weights[str(0)] = 0.0 200 class_weights[str(0)] = 0.0
190 for key in range(1, n_classes): 201 for key in range(1, n_classes + 1):
191 u_score = predicted_usage[key] 202 u_score = predicted_usage[key]
192 if u_score < 1.0: 203 if u_score < 1.0:
193 u_score += 1.0 204 u_score += 1.0
194 class_weights[key] = np.log(u_score) 205 class_weights[key] = np.round(np.log(u_score), 6)
195 return class_weights 206 return class_weights
196 207
197 @classmethod 208 def get_train_last_tool_freq(self, train_paths, reverse_dictionary):
198 def get_sample_weights(self, train_data, reverse_dictionary, paths_frequency): 209 """
199 """ 210 Get the frequency of last tool of each tool sequence
200 Compute the frequency of paths in training data 211 to estimate the frequency of tool sequences
201 """ 212 """
202 path_weights = np.zeros(len(train_data)) 213 last_tool_freq = dict()
203 for path_index, path in enumerate(train_data): 214 inv_freq = dict()
204 sample_pos = np.where(path > 0)[0] 215 for path in train_paths:
205 sample_tool_pos = path[sample_pos[0]:] 216 last_tool = path.split(",")[-1]
206 path_name = ",".join([reverse_dictionary[int(tool_pos)] for tool_pos in sample_tool_pos]) 217 if last_tool not in last_tool_freq:
207 try: 218 last_tool_freq[last_tool] = 0
208 path_weights[path_index] = int(paths_frequency[path_name]) 219 last_tool_freq[last_tool] += 1
209 except Exception: 220 max_freq = max(last_tool_freq.values())
210 path_weights[path_index] = 1 221 for t in last_tool_freq:
211 return path_weights 222 inv_freq[t] = int(np.round(max_freq / float(last_tool_freq[t]), 0))
212 223 return last_tool_freq, inv_freq
213 @classmethod 224
214 def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, old_data_dictionary={}): 225 def get_toolid_samples(self, train_data, l_tool_freq):
226 l_tool_tr_samples = dict()
227 for tool_id in l_tool_freq:
228 for index, tr_sample in enumerate(train_data):
229 last_tool_id = str(int(tr_sample[-1]))
230 if last_tool_id == tool_id:
231 if last_tool_id not in l_tool_tr_samples:
232 l_tool_tr_samples[last_tool_id] = list()
233 l_tool_tr_samples[last_tool_id].append(index)
234 return l_tool_tr_samples
235
236 def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections, old_data_dictionary={}):
215 """ 237 """
216 Convert the training and test paths into corresponding numpy matrices 238 Convert the training and test paths into corresponding numpy matrices
217 """ 239 """
218 processed_data, raw_paths = self.process_workflow_paths(workflow_paths) 240 processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
219 dictionary, reverse_dictionary = self.create_data_dictionary(processed_data, old_data_dictionary) 241 dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary)
220 num_classes = len(dictionary) 242 num_classes = len(dictionary)
221 243
222 print("Raw paths: %d" % len(raw_paths)) 244 print("Raw paths: %d" % len(raw_paths))
223 random.shuffle(raw_paths) 245 random.shuffle(raw_paths)
224 246
225 print("Decomposing paths...") 247 print("Decomposing paths...")
226 all_unique_paths = self.decompose_paths(raw_paths, dictionary) 248 all_unique_paths = self.decompose_paths(raw_paths, dictionary)
227 random.shuffle(all_unique_paths) 249 random.shuffle(all_unique_paths)
228 250
229 print("Creating dictionaries...") 251 print("Creating dictionaries...")
230 multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, reverse_dictionary, all_unique_paths, compatible_next_tools) 252 multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, rev_dict, all_unique_paths, compatible_next_tools)
231 253
232 print("Complete data: %d" % len(multilabels_paths)) 254 print("Complete data: %d" % len(multilabels_paths))
233 train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths) 255 train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths)
234 256
257 # get sample frequency
258 l_tool_freq, inv_last_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
259
235 print("Train data: %d" % len(train_paths_dict)) 260 print("Train data: %d" % len(train_paths_dict))
236 print("Test data: %d" % len(test_paths_dict)) 261 print("Test data: %d" % len(test_paths_dict))
237 262
238 test_data, test_labels = self.pad_paths(test_paths_dict, num_classes) 263 print("Padding train and test data...")
239 train_data, train_labels = self.pad_paths(train_paths_dict, num_classes) 264 # pad training and test data with leading zeros
265 test_data, test_labels = self.pad_paths(test_paths_dict, num_classes, standard_connections, rev_dict)
266 train_data, train_labels = self.pad_paths(train_paths_dict, num_classes, standard_connections, rev_dict)
267
268 l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq)
240 269
241 # Predict tools usage 270 # Predict tools usage
242 print("Predicting tools' usage...") 271 print("Predicting tools' usage...")
243 usage_pred = predict_tool_usage.ToolPopularity() 272 usage_pred = predict_tool_usage.ToolPopularity()
244 usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary) 273 usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
245 tool_usage_prediction = usage_pred.get_pupularity_prediction(usage) 274 tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
246 tool_predicted_usage = self.get_predicted_usage(dictionary, tool_usage_prediction) 275 t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)
247 276
248 # get class weights using the predicted usage for each tool 277 # get class weights using the predicted usage for each tool
249 class_weights = self.assign_class_weights(train_labels.shape[1], tool_predicted_usage) 278 class_weights = self.assign_class_weights(num_classes, t_pred_usage)
250 279
251 return train_data, train_labels, test_data, test_labels, dictionary, reverse_dictionary, class_weights, tool_predicted_usage 280 return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, t_pred_usage, l_tool_freq, l_tool_tr_samples