comparison env/lib/python3.9/site-packages/galaxy/tool_util/cwl/representation.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """ This module is responsible for converting between Galaxy's tool
2 input description and the CWL description for a job json. """
3
4 import json
5 import logging
6 import os
7 from enum import Enum
8 from typing import Any, NamedTuple, Optional
9
10 from galaxy.exceptions import RequestParameterInvalidException
11 from galaxy.util import safe_makedirs, string_as_bool
12 from .util import set_basename_and_derived_properties
13
14
# Module-level logger.
log = logging.getLogger(__name__)

# Sentinel used by to_galaxy_parameters() as the lookup default, so that a
# key missing from the request can be told apart from an explicit None.
NOT_PRESENT = object()

# Sentinel stored as ``galaxy_param_type`` on a TypeRepresentation that has
# no Galaxy parameter behind it (the "null" representation).
NO_GALAXY_INPUT = object()
20
21
class INPUT_TYPE(str, Enum):
    """Galaxy tool parameter type names used by the CWL type representations.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value (e.g. ``INPUT_TYPE.DATA == "data"``).
    """

    DATA = "data"
    INTEGER = "integer"
    FLOAT = "float"
    TEXT = "text"
    BOOLEAN = "boolean"
    SELECT = "select"
    FIELD = "field"
    CONDITIONAL = "conditional"
    # Historical misspelling. It stays the canonical member (defined first)
    # so existing references and the member's ``name`` are unchanged.
    DATA_COLLECTON = "data_collection"
    # Correctly spelled alias: same value, therefore the very same member.
    DATA_COLLECTION = "data_collection"
32
33
# There are two approaches to mapping CWL tool state to Galaxy tool state:
# one is to map CWL types to compound combinations of Galaxy tool parameters
# using conditionals, and the other is to use a new Galaxy parameter type that
# allows unions, optional specifications, etc. The problem with the former
# is that it doesn't work with workflow parameters, for instance, and is
# very complex on the backend. The problem with the latter is that the GUI
# for this parameter type is currently undefined.
USE_FIELD_TYPES = True
42
# There are two approaches to mapping CWL workflow inputs to Galaxy workflow
# steps. The first is to simply map everything to expressions, stick them into
# files and use data inputs; the second is to use parameter_input steps with
# field types. For now the choice follows USE_FIELD_TYPES, but the two
# settings may diverge later.
# There are open issues with each approach:
# - Mapping everything to files makes the GUI harder to imagine but the backend
#   easier to manage in some ways.
USE_STEP_PARAMETERS = USE_FIELD_TYPES
52
53
class TypeRepresentation(NamedTuple):
    """One way a CWL-typed value can be represented as a Galaxy input."""

    # Identifier of the representation (e.g. "integer", "file", "field").
    name: str
    # An INPUT_TYPE member, or NO_GALAXY_INPUT when there is no parameter.
    galaxy_param_type: Any
    # Human-readable description of the representation.
    label: str
    # Collection type string for collection-backed representations, else None.
    collection_type: Optional[str]

    @property
    def uses_param(self):
        """True unless ``galaxy_param_type`` is the NO_GALAXY_INPUT sentinel."""
        lacks_param = self.galaxy_param_type is NO_GALAXY_INPUT
        return not lacks_param
63
64
# All known representations, as
# TypeRepresentation(name, galaxy_param_type, label, collection_type).
# type_descriptions_for_field_types() returns its matches in this list's order.
TYPE_REPRESENTATIONS = [
    TypeRepresentation("null", NO_GALAXY_INPUT, "no input", None),
    TypeRepresentation("integer", INPUT_TYPE.INTEGER, "an integer", None),
    TypeRepresentation("float", INPUT_TYPE.FLOAT, "a decimal number", None),
    TypeRepresentation("double", INPUT_TYPE.FLOAT, "a decimal number", None),
    TypeRepresentation("file", INPUT_TYPE.DATA, "a dataset", None),
    TypeRepresentation("directory", INPUT_TYPE.DATA, "a directory", None),
    TypeRepresentation("boolean", INPUT_TYPE.BOOLEAN, "a boolean", None),
    TypeRepresentation("text", INPUT_TYPE.TEXT, "a simple text field", None),
    TypeRepresentation("record", INPUT_TYPE.DATA_COLLECTON, "record as a dataset collection", "record"),
    TypeRepresentation("json", INPUT_TYPE.TEXT, "arbitrary JSON structure", None),
    TypeRepresentation("array", INPUT_TYPE.DATA_COLLECTON, "as a dataset list", "list"),
    TypeRepresentation("enum", INPUT_TYPE.TEXT, "enum value", None),  # TODO: make this a select...
    TypeRepresentation("field", INPUT_TYPE.FIELD, "arbitrary JSON structure", None),
]
# The "field" representation. This relies on "field" remaining the LAST entry
# of TYPE_REPRESENTATIONS - keep it last when adding new entries.
FIELD_TYPE_REPRESENTATION = TYPE_REPRESENTATIONS[-1]
81
# Maps a CWL type name to the names of the TypeRepresentation entries that can
# hold a value of that type. Which table is built depends on USE_FIELD_TYPES.
if not USE_FIELD_TYPES:
    # Without field types, "Any" is expanded into several concrete
    # representations.
    # NOTE(review): unlike the table below, this one has no "enum" or "double"
    # keys - confirm whether that is intentional.
    CWL_TYPE_TO_REPRESENTATIONS = {
        "Any": ["integer", "float", "file", "boolean", "text", "record", "json"],
        "array": ["array"],
        "string": ["text"],
        "boolean": ["boolean"],
        "int": ["integer"],
        "float": ["float"],
        "File": ["file"],
        "Directory": ["directory"],
        "null": ["null"],
        "record": ["record"],
    }
else:
    # With field types, "Any" maps to the single "field" representation.
    CWL_TYPE_TO_REPRESENTATIONS = {
        "Any": ["field"],
        "array": ["array"],
        "string": ["text"],
        "boolean": ["boolean"],
        "int": ["integer"],
        "float": ["float"],
        "File": ["file"],
        "Directory": ["directory"],
        "null": ["null"],
        "record": ["record"],
        "enum": ["enum"],
        "double": ["double"],
    }
110
111
def type_representation_from_name(type_representation_name):
    """Return the entry of TYPE_REPRESENTATIONS with the given ``name``.

    Raises:
        AssertionError: if no representation has that name.
    """
    for type_representation in TYPE_REPRESENTATIONS:
        if type_representation.name == type_representation_name:
            return type_representation

    # Raised explicitly instead of ``assert False``: assert statements are
    # removed under ``python -O``, which would make an unknown name silently
    # return None. The exception type is unchanged for existing callers.
    raise AssertionError(f"Unknown type representation name [{type_representation_name}]")
118
119
def type_descriptions_for_field_types(field_types):
    """Return the TypeRepresentation entries applicable to ``field_types``.

    Each element is a CWL type name, or a dict whose truthy ``"type"`` entry
    is used in its place. The result follows the order of
    TYPE_REPRESENTATIONS, without duplicates.

    Raises:
        Exception: if a type is unhashable or has no entry in
            CWL_TYPE_TO_REPRESENTATIONS.
    """
    wanted_names = set()
    for field_type in field_types:
        # Unwrap ``{"type": ...}`` style declarations to the inner type.
        inner_type = field_type.get("type") if isinstance(field_type, dict) else None
        if inner_type:
            field_type = inner_type

        try:
            names_for_type = CWL_TYPE_TO_REPRESENTATIONS.get(field_type)
        except TypeError:
            # An unhashable type (e.g. a list or dict) cannot be looked up.
            raise Exception("Failed to convert field_type %s" % field_type)
        if names_for_type is None:
            raise Exception("Failed to convert type %s" % field_type)
        wanted_names.update(names_for_type)

    return [
        representation
        for representation in TYPE_REPRESENTATIONS
        if representation.name in wanted_names
    ]
138
139
def dataset_wrapper_to_file_json(inputs_dir, dataset_wrapper):
    """Build the CWL job value for a single wrapped dataset.

    * ``expression.json`` datasets yield the parsed JSON content of the file.
    * ``directory`` datasets are delegated to
      dataset_wrapper_to_directory_json().
    * Anything else yields a ``{"class": "File", ...}`` object. If the
      dataset's extra files path contains ``__secondary_files__``, the dataset
      and every entry of that directory are symlinked into ``inputs_dir`` and
      the entries are listed under ``secondaryFiles``.
    """
    if dataset_wrapper.ext == "expression.json":
        with open(dataset_wrapper.file_name) as handle:
            return json.load(handle)

    if dataset_wrapper.ext == "directory":
        return dataset_wrapper_to_directory_json(inputs_dir, dataset_wrapper)

    secondary_files_path = os.path.join(dataset_wrapper.extra_files_path, "__secondary_files__")
    location = str(dataset_wrapper)
    raw_file_object = {"class": "File"}

    if os.path.exists(secondary_files_path):
        # Stage the primary file next to its secondary files.
        safe_makedirs(inputs_dir)
        staged_location = os.path.join(inputs_dir, os.path.basename(location))
        os.symlink(location, staged_location)

        secondary_files = []
        for secondary_file_name in os.listdir(secondary_files_path):
            secondary_file_path = os.path.join(secondary_files_path, secondary_file_name)
            target = os.path.join(inputs_dir, secondary_file_name)
            log.info(f"linking [{secondary_file_path}] to [{target}]")
            os.symlink(secondary_file_path, target)
            if os.path.isdir(os.path.realpath(secondary_file_path)):
                entry_class = "Directory"
            else:
                entry_class = "File"
            secondary_files.append({"class": entry_class, "location": target})

        raw_file_object["secondaryFiles"] = secondary_files
        location = staged_location

    raw_file_object["location"] = location

    # Verify it isn't a NoneDataset
    if dataset_wrapper.unsanitized:
        raw_file_object["size"] = int(dataset_wrapper.get_size())

    basename = str(dataset_wrapper.created_from_basename or dataset_wrapper.name)
    set_basename_and_derived_properties(raw_file_object, basename)
    return raw_file_object
178
179
def dataset_wrapper_to_directory_json(inputs_dir, dataset_wrapper):
    """Build the ``{"class": "Directory", ...}`` job value for a dataset.

    The dataset must have the ``directory`` extension. ``inputs_dir`` is
    accepted for symmetry with dataset_wrapper_to_file_json() and is not used.
    """
    assert dataset_wrapper.ext == "directory"

    # The directory name is assumed to be the archive's file name without
    # its extension.
    archive_name = str(dataset_wrapper.created_from_basename or dataset_wrapper.name)
    root, extension = os.path.splitext(archive_name)

    # Best effort: any failure to resolve the archive leaves the location unset.
    try:
        archive_location = dataset_wrapper.unsanitized.file_name
    except Exception:
        archive_location = None

    return {
        "location": dataset_wrapper.extra_files_path,
        "class": "Directory",
        "name": root,
        "archive_location": archive_location,
        "archive_nameext": extension,
        "archive_nameroot": root,
    }
202
203
def collection_wrapper_to_array(inputs_dir, wrapped_value):
    """Convert each element of ``wrapped_value`` with
    dataset_wrapper_to_file_json() and return the results as a list."""
    return [dataset_wrapper_to_file_json(inputs_dir, element) for element in wrapped_value]
209
210
def collection_wrapper_to_record(inputs_dir, wrapped_value):
    """Convert each ``(key, element)`` pair of ``wrapped_value.items()`` with
    dataset_wrapper_to_file_json() and return the results as a dict."""
    return {
        key: dataset_wrapper_to_file_json(inputs_dir, element)
        for key, element in wrapped_value.items()
    }
216
217
def to_cwl_job(tool, param_dict, local_working_directory):
    """Convert Galaxy tool state into a CWL job input dictionary.

    ``tool`` is Galaxy's representation of the tool and ``param_dict`` is the
    parameter dictionary with wrapped values. Inputs that need staging are
    linked under ``<local_working_directory>/_inputs``.

    Returns a dict keyed by CWL input name.
    """
    tool_proxy = tool._cwl_tool_proxy
    input_fields = tool_proxy.input_fields()
    inputs = tool.inputs
    input_json = {}

    inputs_dir = os.path.join(local_working_directory, "_inputs")

    def simple_value(input, param_dict_value, type_representation_name=None):
        # Convert one wrapped Galaxy value into its CWL job value, dispatching
        # on the name of the type representation.
        type_representation = type_representation_from_name(type_representation_name)
        # Hmm... cwl_type isn't really the cwl type in every case,
        # like in the case of json for instance.

        if type_representation.galaxy_param_type == NO_GALAXY_INPUT:
            assert param_dict_value is None
            return None

        if type_representation.name == "file":
            dataset_wrapper = param_dict_value
            return dataset_wrapper_to_file_json(inputs_dir, dataset_wrapper)
        elif type_representation.name == "directory":
            dataset_wrapper = param_dict_value
            return dataset_wrapper_to_directory_json(inputs_dir, dataset_wrapper)
        elif type_representation.name == "integer":
            return int(str(param_dict_value))
        elif type_representation.name == "long":
            # NOTE(review): TYPE_REPRESENTATIONS has no entry named "long", so
            # this branch looks unreachable - confirm.
            return int(str(param_dict_value))
        elif type_representation.name in ["float", "double"]:
            return float(str(param_dict_value))
        elif type_representation.name == "boolean":
            return string_as_bool(param_dict_value)
        elif type_representation.name == "text":
            return str(param_dict_value)
        elif type_representation.name == "enum":
            return str(param_dict_value)
        elif type_representation.name == "json":
            raw_value = param_dict_value.value
            return json.loads(raw_value)
        elif type_representation.name == "field":
            if param_dict_value is None:
                return None
            if hasattr(param_dict_value, "value"):
                # Is InputValueWrapper
                rval = param_dict_value.value
                if isinstance(rval, dict) and "src" in rval and rval["src"] == "json":
                    # needed for wf_step_connect_undeclared_param, so non-file defaults?
                    return rval["value"]
                return rval
            elif not param_dict_value.is_collection:
                # Is DatasetFilenameWrapper
                return dataset_wrapper_to_file_json(inputs_dir, param_dict_value)
            else:
                # Is DatasetCollectionWrapper
                hdca_wrapper = param_dict_value
                # NOTE(review): "list" is compared against
                # ``collection_type`` but "record" against
                # ``collection_type.collection_type`` - confirm both are
                # right. Any other collection type falls through and
                # returns None implicitly.
                if hdca_wrapper.collection_type == "list":
                    # TODO: generalize to lists of lists and lists of non-files...
                    return collection_wrapper_to_array(inputs_dir, hdca_wrapper)
                elif hdca_wrapper.collection_type.collection_type == "record":
                    return collection_wrapper_to_record(inputs_dir, hdca_wrapper)

        elif type_representation.name == "array":
            # TODO: generalize to lists of lists and lists of non-files...
            return collection_wrapper_to_array(inputs_dir, param_dict_value)
        elif type_representation.name == "record":
            return collection_wrapper_to_record(inputs_dir, param_dict_value)
        else:
            return str(param_dict_value)

    for input_name, input in inputs.items():
        if input.type == "repeat":
            # The repeat's name minus its "_repeat" suffix is used both as the
            # key inside each instance and as the CWL input name.
            only_input = next(iter(input.inputs.values()))
            array_value = []
            for instance in param_dict[input_name]:
                # NOTE(review): no type_representation_name is passed, so
                # simple_value() looks up the name None, which is not in
                # TYPE_REPRESENTATIONS - confirm this branch is still used.
                array_value.append(simple_value(only_input, instance[input_name[:-len("_repeat")]]))
            input_json[input_name[:-len("_repeat")]] = array_value
        elif input.type == "conditional":
            assert input_name in param_dict, f"No value for {input_name} in {param_dict}"
            current_case = param_dict[input_name]["_cwl__type_"]
            # A "null" case contributes no key to the job at all.
            if str(current_case) != "null":  # str because it is a wrapped...
                case_index = input.get_current_case(current_case)
                case_input = input.cases[case_index].inputs["_cwl__value_"]
                case_value = param_dict[input_name]["_cwl__value_"]
                input_json[input_name] = simple_value(case_input, case_value, current_case)
        else:
            # Find the CWL field declaration with the same name as the input.
            # NOTE(review): if none matches, matched_field stays None and
            # field_to_field_type() fails on ``None["type"]``.
            matched_field = None
            for field in input_fields:
                if field["name"] == input_name:
                    matched_field = field
            field_type = field_to_field_type(matched_field)
            if isinstance(field_type, list):
                # A list of several types is represented as a "field" input.
                assert USE_FIELD_TYPES
                type_descriptions = [FIELD_TYPE_REPRESENTATION]
            else:
                type_descriptions = type_descriptions_for_field_types([field_type])
            assert len(type_descriptions) == 1
            type_description_name = type_descriptions[0].name
            input_json[input_name] = simple_value(input, param_dict[input_name], type_description_name)

    log.debug("Galaxy Tool State is CWL State is %s" % input_json)
    return input_json
321
322
def to_galaxy_parameters(tool, as_dict):
    """Convert a CWL job input dictionary into a flat Galaxy tool request.

    ``tool`` is Galaxy's representation of the tool and ``as_dict`` is a
    Galaxified representation of the input json (no paths, HDA references
    for instance).

    Returns a dict of Galaxy request keys. Repeat inputs produce one
    ``<name>_repeat_<index>|<child>`` key per element, and conditional inputs
    produce ``<name>|_cwl__type_`` plus, for non-null cases,
    ``<name>|_cwl__value_``.

    Raises:
        RequestParameterInvalidException: if a conditional input has no
            usable value or no case matching the value's type.
    """
    inputs = tool.inputs
    galaxy_request = {}

    def from_simple_value(input, param_dict_value, type_representation_name=None):
        # Only "json" values need encoding; everything else passes through.
        if type_representation_name == "json":
            return json.dumps(param_dict_value)
        else:
            return param_dict_value

    for input_name, input in inputs.items():
        as_dict_value = as_dict.get(input_name, NOT_PRESENT)
        galaxy_input_type = input.type

        if galaxy_input_type == "repeat":
            if input_name not in as_dict:
                continue

            only_input = next(iter(input.inputs.values()))
            for index, value in enumerate(as_dict_value):
                # Use the element's own index. The key used to hard-code
                # ``_repeat_0``, so every element overwrote the same key and
                # only the last value of the array survived.
                key = f"{input_name}_repeat_{index}|{only_input.name}"
                galaxy_value = from_simple_value(only_input, value)
                galaxy_request[key] = galaxy_value
        elif galaxy_input_type == "conditional":
            case_strings = input.case_strings
            # Pick the conditional case from the Python type of the value.
            # Order matters: bool is tested before int because bool is a
            # subclass of int.
            # TODO: less crazy handling of defaults...
            if (as_dict_value is NOT_PRESENT or as_dict_value is None) and "null" in case_strings:
                type_representation_name = "null"
            elif (as_dict_value is NOT_PRESENT or as_dict_value is None):
                raise RequestParameterInvalidException(
                    "Cannot translate CWL datatype - value [{}] of type [{}] with case_strings [{}]. Non-null property must be set.".format(
                        as_dict_value, type(as_dict_value), case_strings
                    )
                )
            elif isinstance(as_dict_value, bool) and "boolean" in case_strings:
                type_representation_name = "boolean"
            elif isinstance(as_dict_value, int) and "integer" in case_strings:
                type_representation_name = "integer"
            elif isinstance(as_dict_value, int) and "long" in case_strings:
                type_representation_name = "long"
            elif isinstance(as_dict_value, (int, float)) and "float" in case_strings:
                type_representation_name = "float"
            elif isinstance(as_dict_value, (int, float)) and "double" in case_strings:
                type_representation_name = "double"
            elif isinstance(as_dict_value, str) and "string" in case_strings:
                # NOTE(review): TYPE_REPRESENTATIONS names this representation
                # "text", not "string" - confirm which the cases use.
                type_representation_name = "string"
            elif isinstance(as_dict_value, dict) and "src" in as_dict_value and "id" in as_dict_value and "file" in case_strings:
                type_representation_name = "file"
            elif isinstance(as_dict_value, dict) and "src" in as_dict_value and "id" in as_dict_value and "directory" in case_strings:
                # TODO: can't disambiguate with above if both are available...
                type_representation_name = "directory"
            elif "field" in case_strings:
                type_representation_name = "field"
            elif "json" in case_strings and as_dict_value is not None:
                type_representation_name = "json"
            else:
                raise RequestParameterInvalidException(
                    "Cannot translate CWL datatype - value [{}] of type [{}] with case_strings [{}].".format(
                        as_dict_value, type(as_dict_value), case_strings
                    )
                )
            galaxy_request["%s|_cwl__type_" % input_name] = type_representation_name
            if type_representation_name != "null":
                current_case_index = input.get_current_case(type_representation_name)
                current_case_inputs = input.cases[current_case_index].inputs
                current_case_input = current_case_inputs["_cwl__value_"]
                galaxy_value = from_simple_value(current_case_input, as_dict_value, type_representation_name)
                galaxy_request["%s|_cwl__value_" % input_name] = galaxy_value
        elif as_dict_value is NOT_PRESENT:
            # Inputs missing from the request are left out of the result.
            continue
        else:
            galaxy_value = from_simple_value(input, as_dict_value)
            galaxy_request[input_name] = galaxy_value

    log.info("Converted galaxy_request is %s" % galaxy_request)
    return galaxy_request
402
403
def field_to_field_type(field):
    """Return the normalized type of a CWL input field declaration.

    A dict-valued ``field["type"]`` is unwrapped to its own ``"type"`` entry.
    A one-element list collapses to that element, while longer lists and all
    other values are returned unchanged.

    Raises:
        Exception: if the type is an empty list.
    """
    declared = field["type"]
    if isinstance(declared, dict):
        declared = declared["type"]

    if not isinstance(declared, list):
        return declared

    option_count = len(declared)
    if option_count == 0:
        raise Exception("Zero-length type list encountered, invalid CWL?")
    return declared[0] if option_count == 1 else declared