comparison extract_workflow_connections.py @ 0:9bf25dbe00ad draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
author bgruening
date Wed, 28 Aug 2019 07:19:38 -0400
parents
children 5b3c08710e47
comparison
equal deleted inserted replaced
-1:000000000000 0:9bf25dbe00ad
1 """
2 Extract workflow paths from the tabular file containing
3 input and output tools
4 """
5
6 import csv
7 import random
8
9 import utils
10
11
class ExtractWorkflowConnections:
    """
    Extract tool paths and compatible next tools from workflow connections.

    Every public method is a classmethod that keeps no state, so each one
    can be called either on the class or on an instance.
    """

    def __init__(self):
        """ Init method. """

    @classmethod
    def read_tabular_file(cls, raw_file_path):
        """
        Read tabular file and extract workflow connections

        Each non-empty row of the tab-separated file is read as one
        connection: column 0 is the workflow id, column 3 the input tool
        and column 6 the output tool. Connections with an empty tool or
        with identical input and output tools are ignored.

        Parameters:
            raw_file_path: path of the tab-separated connections file.

        Returns:
            A tuple ``(paths, compatible_next_tools)``:
            ``paths`` is a shuffled list of comma-joined tool paths
            (duplicates are kept), ``compatible_next_tools`` maps a tool
            to a comma-joined string of the tools that directly follow it
            in at least one path.

        Raises:
            IndexError: if a non-empty row has fewer than 7 columns.
        """
        print("Reading workflows...")
        workflows = {}
        # newline='' lets the csv module interpret line endings itself
        with open(raw_file_path, 'rt', newline='') as workflow_connections_file:
            workflow_connections = csv.reader(workflow_connections_file, delimiter='\t')
            for row in workflow_connections:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to read
                    continue
                wf_id = str(row[0])
                in_tool = row[3]
                out_tool = row[6]
                # register the workflow even if this connection is rejected
                connections = workflows.setdefault(wf_id, [])
                if out_tool and in_tool and out_tool != in_tool:
                    connections.append((in_tool, out_tool))

        print("Processing workflows...")
        workflow_parents = {
            wf_id: cls.read_workflow(wf_id, connections)
            for wf_id, connections in workflows.items()
        }

        # NOTE(review): paths are used exactly as find_tool_paths_workflow
        # returns them (leaf first, root last). An older comment mentioned
        # reversing them, but no reversal was performed - confirm that this
        # orientation is the intended one.
        workflow_paths = []
        for parents_graph in workflow_parents.values():
            roots, leaves = cls.get_roots_leaves(parents_graph)
            for root in roots:
                for leaf in leaves:
                    workflow_paths.extend(
                        cls.find_tool_paths_workflow(parents_graph, root, leaf)
                    )

        print("Workflows processed: %d" % len(workflows))

        # normalise every tool id with utils.format_tool_id and serialise
        # each path as one comma-joined string (duplicates are kept)
        all_paths = []
        for path in workflow_paths:
            joined_path = ",".join(utils.format_tool_id(tool_id) for tool_id in path)
            if joined_path:
                all_paths.append(joined_path)
        random.shuffle(all_paths)

        # next tools only need each distinct path once
        no_dup_paths = sorted(set(all_paths))

        print("Finding compatible next tools...")
        compatible_next_tools = cls.set_compatible_next_tools(no_dup_paths)
        return all_paths, compatible_next_tools

    @classmethod
    def set_compatible_next_tools(cls, workflow_paths):
        """
        Find next tools for each tool

        Parameters:
            workflow_paths: iterable of comma-joined tool paths.

        Returns:
            dict mapping each tool that has a successor in some path to a
            comma-joined string of its distinct direct successors, sorted
            alphabetically so the result is reproducible.
        """
        followers = {}
        for path in workflow_paths:
            tools = path.split(",")
            # every adjacent pair is (current tool, next tool)
            for current_tool, next_tool in zip(tools, tools[1:]):
                followers.setdefault(current_tool, set()).add(next_tool)
        return {
            tool: ",".join(sorted(tool_followers))
            for tool, tool_followers in followers.items()
        }

    @classmethod
    def read_workflow(cls, wf_id, workflow_rows):
        """
        Read all connections for a workflow

        Parameters:
            wf_id: workflow identifier (not used by the computation).
            workflow_rows: iterable of connections; item 0 of a connection
                is the input tool and item 1 the output tool.

        Returns:
            dict mapping each output tool to the list of its distinct
            input tools, in order of first appearance.
        """
        tool_parents = {}
        for connection in workflow_rows:
            in_tool = connection[0]
            out_tool = connection[1]
            parents = tool_parents.setdefault(out_tool, [])
            if in_tool not in parents:
                parents.append(in_tool)
        return tool_parents

    @classmethod
    def get_roots_leaves(cls, graph):
        """
        Find the roots and leaves of a parents graph

        Parameters:
            graph: dict mapping a tool to the list of its parents.

        Returns:
            A tuple ``(roots, leaves)`` of lists: roots are tools that
            appear as a parent but never as a key, leaves are keys that
            never appear as a parent. Both keep first-appearance order.
        """
        # a dict serves as an insertion-ordered set of every parent
        all_parents = {}
        for parents in graph.values():
            all_parents.update(dict.fromkeys(parents))
        roots = [tool for tool in all_parents if tool not in graph]
        leaves = [tool for tool in graph if tool not in all_parents]
        return roots, leaves

    @classmethod
    def find_tool_paths_workflow(cls, graph, start, end, path=None):
        """
        Find all cycle-free paths between two tools of a parents graph

        The search starts at ``end`` and repeatedly steps to the parents
        listed in ``graph`` until ``start`` is reached.

        Parameters:
            graph: dict mapping a tool to the list of its parents.
            start: tool at which a path is complete.
            end: tool from which the search starts.
            path: tools already visited; used by the recursion.

        Returns:
            list of paths, each a list of tools that begins with ``end``
            and finishes with ``start``; empty when no path exists.
        """
        # build a new list so the caller's path is never modified
        path = [end] if path is None else path + [end]
        if start == end:
            return [path]
        path_list = []
        for node in graph.get(end, ()):
            # skip tools already on the path to avoid looping on cycles
            if node not in path:
                path_list.extend(
                    cls.find_tool_paths_workflow(graph, start, node, path)
                )
        return path_list