comparison extract_workflow_connections.py @ 6:e94dc7945639 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
author bgruening
date Sun, 16 Oct 2022 11:52:10 +0000
parents 4f7e6612906b
children
comparison
equal deleted inserted replaced
5:4f7e6612906b 6:e94dc7945639
1 """ 1 """
2 Extract workflow paths from the tabular file containing 2 Extract workflow paths from the tabular file containing
3 input and output tools 3 input and output tools
4 """ 4 """
5
6 import csv
7 import random 5 import random
8 6
9 import utils 7 import utils
10 8
11 9
12 class ExtractWorkflowConnections: 10 class ExtractWorkflowConnections:
11
13 def __init__(self): 12 def __init__(self):
14 """ Init method. """ 13 """ Init method. """
15 14
16 def collect_standard_connections(self, row): 15 def process_raw_files(self, wf_path, tool_popu_path, config):
17 published = row[8] 16 """
18 deleted = row[9] 17 Remove pipe from workflows and popularity tabular files
19 has_errors = row[10] 18 """
20 if published == "t" and deleted == "f" and has_errors == "f": 19 print("Removing pipe from tabular datasets...")
21 return True 20 wf_frame = utils.remove_pipe(wf_path)
22 return False 21 tool_popu_frame = utils.remove_pipe(tool_popu_path)
22 return wf_frame, tool_popu_frame
23 23
24 def read_tabular_file(self, raw_file_path): 24 def read_tabular_file(self, wf_dataframe, config):
25 """ 25 """
26 Read tabular file and extract workflow connections 26 Read tabular file and extract workflow connections
27 """ 27 """
28 print("Reading workflows...") 28 print("Reading workflows...")
29 workflows = {} 29 workflows = {}
30 workflow_paths_dup = "" 30 workflow_paths_dup = ""
31 workflow_parents = dict() 31 workflow_parents = dict()
32 workflow_paths = list() 32 workflow_paths = list()
33 unique_paths = dict() 33 unique_paths = dict()
34 standard_connections = dict() 34 standard_connections = dict()
35 with open(raw_file_path, "rt") as workflow_connections_file: 35 for index, row in wf_dataframe.iterrows():
36 workflow_connections = csv.reader(workflow_connections_file, delimiter="\t") 36 row = row.tolist()
37 for index, row in enumerate(workflow_connections): 37 row = [str(item).strip() for item in row]
38 wf_id = str(row[0]) 38 wf_id = str(row[0])
39 in_tool = row[3].strip() 39 if row[1] > config["cutoff_date"]:
40 out_tool = row[6].strip() 40 in_tool = row[3]
41 out_tool = row[6]
41 if wf_id not in workflows: 42 if wf_id not in workflows:
42 workflows[wf_id] = list() 43 workflows[wf_id] = list()
43 if out_tool and in_tool and out_tool != in_tool: 44 if out_tool and in_tool and out_tool != in_tool:
44 workflows[wf_id].append((out_tool, in_tool)) 45 workflows[wf_id].append((out_tool, in_tool))
45 qc = self.collect_standard_connections(row) 46 qc = self.__collect_standard_connections(row)
46 if qc: 47 if qc:
47 i_t = utils.format_tool_id(in_tool) 48 i_t = utils.format_tool_id(in_tool)
48 o_t = utils.format_tool_id(out_tool) 49 o_t = utils.format_tool_id(out_tool)
49 if i_t not in standard_connections: 50 if i_t not in standard_connections:
50 standard_connections[i_t] = list() 51 standard_connections[i_t] = list()
52 standard_connections[i_t].append(o_t) 53 standard_connections[i_t].append(o_t)
53 print("Processing workflows...") 54 print("Processing workflows...")
54 wf_ctr = 0 55 wf_ctr = 0
55 for wf_id in workflows: 56 for wf_id in workflows:
56 wf_ctr += 1 57 wf_ctr += 1
57 workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id]) 58 workflow_parents[wf_id] = self.__read_workflow(wf_id, workflows[wf_id])
58 59
59 for wf_id in workflow_parents: 60 for wf_id in workflow_parents:
60 flow_paths = list() 61 flow_paths = list()
61 parents_graph = workflow_parents[wf_id] 62 parents_graph = workflow_parents[wf_id]
62 roots, leaves = self.get_roots_leaves(parents_graph) 63 roots, leaves = self.__get_roots_leaves(parents_graph)
63 for root in roots: 64 for root in roots:
64 for leaf in leaves: 65 for leaf in leaves:
65 paths = self.find_tool_paths_workflow(parents_graph, root, leaf) 66 paths = self.__find_tool_paths_workflow(parents_graph, root, leaf)
66 # reverse the paths as they are computed from leaves to roots leaf 67 # reverse the paths as they are computed from leaves to roots leaf
67 paths = [tool_path for tool_path in paths] 68 paths = [tool_path for tool_path in paths]
68 if len(paths) > 0: 69 if len(paths) > 0:
69 flow_paths.extend(paths) 70 flow_paths.extend(paths)
70 workflow_paths.extend(flow_paths) 71 workflow_paths.extend(flow_paths)
82 83
83 # collect unique paths 84 # collect unique paths
84 unique_paths = list(workflow_paths_dup.split("\n")) 85 unique_paths = list(workflow_paths_dup.split("\n"))
85 unique_paths = list(filter(None, unique_paths)) 86 unique_paths = list(filter(None, unique_paths))
86 random.shuffle(unique_paths) 87 random.shuffle(unique_paths)
88 print("unique_paths: {}".format(len(unique_paths)))
87 no_dup_paths = list(set(unique_paths)) 89 no_dup_paths = list(set(unique_paths))
90 print("no_dup_paths: {}".format(len(no_dup_paths)))
91 return no_dup_paths, standard_connections
88 92
89 print("Finding compatible next tools...") 93 def __collect_standard_connections(self, row):
90 compatible_next_tools = self.set_compatible_next_tools(no_dup_paths) 94 published = row[8].strip()
91 return unique_paths, compatible_next_tools, standard_connections 95 deleted = row[9].strip()
96 has_errors = row[10].strip()
97 if published == "t" and deleted == "f" and has_errors == "f":
98 return True
99 return False
92 100
93 def set_compatible_next_tools(self, workflow_paths): 101 def __set_compatible_next_tools(self, workflow_paths):
94 """ 102 """
95 Find next tools for each tool 103 Find next tools for each tool
96 """ 104 """
97 next_tools = dict() 105 next_tools = dict()
98 for path in workflow_paths: 106 for path in workflow_paths:
107 next_tools[current_tool] = next_tool 115 next_tools[current_tool] = next_tool
108 for tool in next_tools: 116 for tool in next_tools:
109 next_tools[tool] = ",".join(list(set(next_tools[tool].split(",")))) 117 next_tools[tool] = ",".join(list(set(next_tools[tool].split(","))))
110 return next_tools 118 return next_tools
111 119
112 def read_workflow(self, wf_id, workflow_rows): 120 def __read_workflow(self, wf_id, workflow_rows):
113 """ 121 """
114 Read all connections for a workflow 122 Read all connections for a workflow
115 """ 123 """
116 tool_parents = dict() 124 tool_parents = dict()
117 for connection in workflow_rows: 125 for connection in workflow_rows:
121 tool_parents[out_tool] = list() 129 tool_parents[out_tool] = list()
122 if in_tool not in tool_parents[out_tool]: 130 if in_tool not in tool_parents[out_tool]:
123 tool_parents[out_tool].append(in_tool) 131 tool_parents[out_tool].append(in_tool)
124 return tool_parents 132 return tool_parents
125 133
126 def get_roots_leaves(self, graph): 134 def __get_roots_leaves(self, graph):
127 roots = list() 135 roots = list()
128 leaves = list() 136 leaves = list()
129 all_parents = list() 137 all_parents = list()
130 for item in graph: 138 for item in graph:
131 all_parents.extend(graph[item]) 139 all_parents.extend(graph[item])
133 children = graph.keys() 141 children = graph.keys()
134 roots = list(set(all_parents).difference(set(children))) 142 roots = list(set(all_parents).difference(set(children)))
135 leaves = list(set(children).difference(set(all_parents))) 143 leaves = list(set(children).difference(set(all_parents)))
136 return roots, leaves 144 return roots, leaves
137 145
138 def find_tool_paths_workflow(self, graph, start, end, path=[]): 146 def __find_tool_paths_workflow(self, graph, start, end, path=[]):
139 path = path + [end] 147 path = path + [end]
140 if start == end: 148 if start == end:
141 return [path] 149 return [path]
142 path_list = list() 150 path_list = list()
143 if end in graph: 151 if end in graph:
144 for node in graph[end]: 152 for node in graph[end]:
145 if node not in path: 153 if node not in path:
146 new_tools_paths = self.find_tool_paths_workflow( 154 new_tools_paths = self.__find_tool_paths_workflow(graph, start, node, path)
147 graph, start, node, path
148 )
149 for tool_path in new_tools_paths: 155 for tool_path in new_tools_paths:
150 path_list.append(tool_path) 156 path_list.append(tool_path)
151 return path_list 157 return path_list