comparison prepare_data.py @ 0:9bf25dbe00ad draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
author bgruening
date Wed, 28 Aug 2019 07:19:38 -0400
parents
children 5b3c08710e47
1 """
2 Prepare the workflow paths to be used by downstream
3 machine learning algorithm. The paths are divided
4 into the test and training sets
5 """

import os
import collections
import numpy as np
import random

import predict_tool_usage

main_path = os.getcwd()


class PrepareData:

    @classmethod
    def __init__(self, max_seq_length, test_data_share):
        """ Init method. """
        self.max_tool_sequence_len = max_seq_length
        self.test_share = test_data_share

    @classmethod
    def process_workflow_paths(self, workflow_paths):
        """
        Get all the tools and the complete set of individual paths for each workflow
        """
        tokens = list()
        raw_paths = workflow_paths
        raw_paths = [x.replace("\n", '') for x in raw_paths]
        for item in raw_paths:
            split_items = item.split(",")
            for token in split_items:
                if token != "":
                    tokens.append(token)
        tokens = list(set(tokens))
        tokens = np.array(tokens)
        tokens = np.reshape(tokens, [-1, ])
        return tokens, raw_paths

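    # Illustrative example (toy tool names, not from a real Galaxy workflow):
    #   >>> pd = PrepareData(max_seq_length=25, test_data_share=0.2)
    #   >>> tokens, raw_paths = pd.process_workflow_paths(["toolA,toolB\n", "toolA,toolC\n"])
    #   >>> sorted(tokens.tolist())
    #   ['toolA', 'toolB', 'toolC']
    #   >>> raw_paths
    #   ['toolA,toolB', 'toolA,toolC']
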
    @classmethod
    def create_new_dict(self, new_data_dict):
        """
        Create new data dictionary
        """
        reverse_dict = dict((v, k) for k, v in new_data_dict.items())
        return new_data_dict, reverse_dict

    @classmethod
    def assemble_dictionary(self, new_data_dict, old_data_dictionary={}):
        """
        Create/update tool indices in the forward and backward dictionaries
        """
        new_data_dict, reverse_dict = self.create_new_dict(new_data_dict)
        return new_data_dict, reverse_dict

    @classmethod
    def create_data_dictionary(self, words, old_data_dictionary={}):
        """
        Create two dictionaries mapping tool names to indices and back
        """
        count = collections.Counter(words).most_common()
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary) + 1
        dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary)
        return dictionary, reverse_dictionary

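    # Illustrative example (toy tool names): indices start at 1 because 0 is
    # reserved for padding, and more frequent tools get lower indices.
    #   >>> pd = PrepareData(max_seq_length=25, test_data_share=0.2)
    #   >>> pd.create_data_dictionary(["toolA", "toolB", "toolA"])
    #   ({'toolA': 1, 'toolB': 2}, {1: 'toolA', 2: 'toolB'})
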
    @classmethod
    def decompose_paths(self, paths, dictionary):
        """
        Decompose the paths to variable length sub-paths keeping the first tool fixed
        """
        sub_paths_pos = list()
        for index, item in enumerate(paths):
            tools = item.split(",")
            len_tools = len(tools)
            if len_tools <= self.max_tool_sequence_len:
                for window in range(1, len_tools):
                    sequence = tools[0: window + 1]
                    tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence]
                    if len(tools_pos) > 1:
                        sub_paths_pos.append(",".join(tools_pos))
        sub_paths_pos = list(set(sub_paths_pos))
        return sub_paths_pos

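    # Illustrative example (toy values): every prefix of length >= 2 of a
    # workflow path is kept, encoded as tool indices.
    #   >>> pd = PrepareData(max_seq_length=25, test_data_share=0.2)
    #   >>> sorted(pd.decompose_paths(["toolA,toolB,toolC"], {"toolA": 1, "toolB": 2, "toolC": 3}))
    #   ['1,2', '1,2,3']
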
    @classmethod
    def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools):
        """
        Create a dictionary of sequences with their labels for training and test paths
        """
        paths_labels = dict()
        random.shuffle(paths)
        for item in paths:
            if item:
                tools = item.split(",")
                label = tools[-1]
                train_tools = tools[:len(tools) - 1]
                last_but_one_name = reverse_dictionary[int(train_tools[-1])]
                try:
                    compatible_tools = compatible_next_tools[last_but_one_name].split(",")
                except Exception:
                    continue
                if len(compatible_tools) > 0:
                    compatible_tools_ids = [str(dictionary[x]) for x in compatible_tools]
                    compatible_tools_ids.append(label)
                    composite_labels = ",".join(compatible_tools_ids)
                train_tools = ",".join(train_tools)
                if train_tools in paths_labels:
                    paths_labels[train_tools] += "," + composite_labels
                else:
                    paths_labels[train_tools] = composite_labels
        for item in paths_labels:
            paths_labels[item] = ",".join(list(set(paths_labels[item].split(","))))
        return paths_labels

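    # Illustrative example (toy values): for a decomposed path "1,2,3" the last
    # tool ("3") becomes a label, and the tools listed as compatible successors
    # of tool 2 (say "toolC,toolD", i.e. ids 3 and 4) are added as further
    # labels, so the entry becomes roughly {"1,2": "3,4"} (label order may vary
    # because duplicates are removed with set()).
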
    @classmethod
    def pad_paths(self, paths_dictionary, num_classes):
        """
        Add padding to the tools sequences and create multi-hot encoded labels
        """
        size_data = len(paths_dictionary)
        data_mat = np.zeros([size_data, self.max_tool_sequence_len])
        label_mat = np.zeros([size_data, num_classes + 1])
        train_counter = 0
        for train_seq, train_label in list(paths_dictionary.items()):
            positions = train_seq.split(",")
            start_pos = self.max_tool_sequence_len - len(positions)
            for id_pos, pos in enumerate(positions):
                data_mat[train_counter][start_pos + id_pos] = int(pos)
            for label_item in train_label.split(","):
                label_mat[train_counter][int(label_item)] = 1.0
            train_counter += 1
        return data_mat, label_mat

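    # Illustrative example (toy values): with max_tool_sequence_len = 5 and
    # num_classes = 4, the entry {"1,2": "3,4"} is padded from the left and its
    # labels are multi-hot encoded:
    #   data_mat row:  [0., 0., 0., 1., 2.]
    #   label_mat row: [0., 0., 0., 1., 1.]   (columns 3 and 4 set to 1.0)
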
    @classmethod
    def split_test_train_data(self, multilabels_paths):
        """
        Split into test and train data randomly for each run
        """
        train_dict = dict()
        test_dict = dict()
        all_paths = list(multilabels_paths.keys())
        random.shuffle(all_paths)
        split_number = int(self.test_share * len(all_paths))
        for index, path in enumerate(all_paths):
            if index < split_number:
                test_dict[path] = multilabels_paths[path]
            else:
                train_dict[path] = multilabels_paths[path]
        return train_dict, test_dict

    @classmethod
    def verify_overlap(self, train_paths, test_paths):
        """
        Verify the overlap of samples between train and test data
        """
        intersection = list(set(train_paths).intersection(set(test_paths)))
        print("Overlap in train and test: %d" % len(intersection))

    @classmethod
    def get_predicted_usage(self, data_dictionary, predicted_usage):
        """
        Get predicted usage for tools
        """
        usage = dict()
        epsilon = 0.0
        # index 0 does not belong to any tool
        usage[0] = epsilon
        for k, v in data_dictionary.items():
            try:
                usg = predicted_usage[k]
                if usg < epsilon:
                    usg = epsilon
                usage[v] = usg
            except Exception:
                usage[v] = epsilon
                continue
        return usage

    @classmethod
    def assign_class_weights(self, n_classes, predicted_usage):
        """
        Compute class weights using usage
        """
        class_weights = dict()
        class_weights[str(0)] = 0.0
        for key in range(1, n_classes):
            u_score = predicted_usage[key]
            if u_score < 1.0:
                u_score += 1.0
            class_weights[key] = np.log(u_score)
        return class_weights

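    # Illustrative example (toy values): a tool with a predicted usage of 4.0
    # gets a class weight of log(4.0) ~= 1.39, while a rarely used tool with
    # usage 0.25 (< 1.0) is shifted to 1.25 first and gets log(1.25) ~= 0.22,
    # so frequently used tools receive larger weights.
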
    @classmethod
    def get_sample_weights(self, train_data, reverse_dictionary, paths_frequency):
        """
        Compute the frequency of paths in training data
        """
        path_weights = np.zeros(len(train_data))
        for path_index, path in enumerate(train_data):
            sample_pos = np.where(path > 0)[0]
            sample_tool_pos = path[sample_pos[0]:]
            path_name = ",".join([reverse_dictionary[int(tool_pos)] for tool_pos in sample_tool_pos])
            try:
                path_weights[path_index] = int(paths_frequency[path_name])
            except Exception:
                path_weights[path_index] = 1
        return path_weights

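    # Illustrative example (toy values): a padded row [0., 0., 0., 1., 2.] is
    # mapped back to the path name "toolA,toolB" via reverse_dictionary; its
    # weight is the frequency recorded for that path in paths_frequency, or 1
    # when the path is not found there.
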
    @classmethod
    def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, old_data_dictionary={}):
        """
        Convert the training and test paths into corresponding numpy matrices
        """
        processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
        dictionary, reverse_dictionary = self.create_data_dictionary(processed_data, old_data_dictionary)
        num_classes = len(dictionary)

        print("Raw paths: %d" % len(raw_paths))
        random.shuffle(raw_paths)

        print("Decomposing paths...")
        all_unique_paths = self.decompose_paths(raw_paths, dictionary)
        random.shuffle(all_unique_paths)

        print("Creating dictionaries...")
        multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, reverse_dictionary, all_unique_paths, compatible_next_tools)

        print("Complete data: %d" % len(multilabels_paths))
        train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths)

        print("Train data: %d" % len(train_paths_dict))
        print("Test data: %d" % len(test_paths_dict))

        test_data, test_labels = self.pad_paths(test_paths_dict, num_classes)
        train_data, train_labels = self.pad_paths(train_paths_dict, num_classes)

        # Predict tools usage
        print("Predicting tools' usage...")
        usage_pred = predict_tool_usage.ToolPopularity()
        usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
        tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
        tool_predicted_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)

        # get class weights using the predicted usage for each tool
        class_weights = self.assign_class_weights(train_labels.shape[1], tool_predicted_usage)

        return train_data, train_labels, test_data, test_labels, dictionary, reverse_dictionary, class_weights, tool_predicted_usage
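
# Usage sketch (hypothetical caller; the argument values and input objects
# below are placeholders, the real driver lives elsewhere in this tool):
#
#   data = PrepareData(max_seq_length=25, test_data_share=0.2)
#   train_data, train_labels, test_data, test_labels, dictionary, \
#       reverse_dictionary, class_weights, usage = data.get_data_labels_matrices(
#           workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools)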