Mercurial > repos > genouest > baric_archive_toulouse
diff baric_archive.py @ 0:1ae6b80f1e03 draft default tip
"planemo upload for repository https://github.com/genouest/galaxy-tools/tree/master/tools/baric_archive commit 6419e960f00c0e1c3950bad500487d3681797b40"
author | genouest |
---|---|
date | Fri, 04 Mar 2022 11:16:34 +0000 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
# Retrieves data from external data source applications and stores it in a
# dataset file.  Data source application parameters are temporarily stored in
# the dataset file itself and erased after being read.
import os
import socket
import sys
import urllib
from json import dumps, loads
from urllib.parse import urlparse
from urllib.request import urlopen

from galaxy.datatypes import sniff
from galaxy.datatypes.registry import Registry
from galaxy.jobs import TOOL_PROVIDED_JOB_METADATA_FILE
from galaxy.util import get_charset_from_http_headers

GALAXY_PARAM_PREFIX = 'GALAXY'
GALAXY_ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
GALAXY_DATATYPES_CONF_FILE = os.path.join(GALAXY_ROOT_DIR, 'datatypes_conf.xml')


def stop_err(msg, json_file=None):
    """Write *msg* to stderr and exit with status 1.

    If *json_file* is given, valid (but empty) json is written to it first:
    without it, metadata collection fails and the job dies with an
    "unable to finish job" error and no logs.
    """
    sys.stderr.write(msg)
    if json_file is not None:
        json_file.write("%s\n" % dumps({}))
    sys.exit(1)


def load_input_parameters(filename, erase_file=True):
    """Read data source parameters from *filename*.

    The file contains either a JSON dict with a 'param_dict' key (newer
    style) or tab-separated key/value lines (older style).  Returns a tuple
    ``(json_params, datasource_params)`` where *json_params* is None for the
    older tabular format.  When *erase_file* is true the file is truncated
    afterwards, removing the parameters from the dataset file.
    """
    datasource_params = {}
    try:
        with open(filename, 'r') as fh:
            json_params = loads(fh.read())
        datasource_params = json_params.get('param_dict')
    except Exception:
        json_params = None
        with open(filename, 'r') as fh:
            for line in fh:
                try:
                    line = line.strip()
                    fields = line.split('\t')
                    datasource_params[fields[0]] = fields[1]
                except Exception:
                    continue
    if erase_file:
        # open file for writing, then close: removes params from file
        open(filename, 'w').close()
    return json_params, datasource_params


def __main__():
    """Entry point: fetch data from the remote data source into the dataset.

    argv: [1] dataset/params filename, [2] user email, [3] user id,
    [4] optional max file size in bytes (0 = unlimited).
    """
    filename = sys.argv[1]

    user_email = sys.argv[2]
    user_id = sys.argv[3]

    try:
        max_file_size = int(sys.argv[4])
    except Exception:
        max_file_size = 0

    # BUG FIX: json_file was only bound in the enhanced-handling branch, so
    # stop_err(..., json_file) raised NameError on the older tabular path.
    json_file = None
    job_params, params = load_input_parameters(filename)
    if job_params is None:  # using an older tabular file
        enhanced_handling = False
        job_params = dict(param_dict=params)
        job_params['output_data'] = [dict(out_data_name='output',
                                          ext='auto',
                                          file_name=filename,
                                          extra_files_path=None)]
        job_params['job_config'] = dict(GALAXY_ROOT_DIR=GALAXY_ROOT_DIR,
                                        GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE,
                                        TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE)
    else:
        enhanced_handling = True
        # specially named file for output junk to pass onto set metadata
        json_file = open(job_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'w')

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(root_dir=job_params['job_config']['GALAXY_ROOT_DIR'],
                                      config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

    URL = params.get('URL', None)  # using exactly URL indicates that only one dataset is being downloaded
    export = params.get('export', None)
    userkey = params.get('userkey', 'none')
    URL_method = params.get('URL_method', None)

    # BUG FIX: the original appended unconditionally, raising TypeError when
    # no URL parameter was sent back; the missing-URL case is now reported
    # properly by the per-dataset check below.
    if URL:
        URL = URL + "&userkey=" + userkey + "&user_email=" + user_email + "&user_id=" + user_id

    # urlopen() has no timeout by default and can hang forever.  The socket
    # timeout is not exposed at the urllib level, so set the default timeout
    # (in seconds) globally for all sockets.
    socket.setdefaulttimeout(600)

    for data_dict in job_params['output_data']:
        cur_filename = data_dict.get('file_name', filename)
        cur_URL = params.get('%s|%s|URL' % (GALAXY_PARAM_PREFIX, data_dict['out_data_name']), URL)
        if not cur_URL or urlparse(cur_URL).scheme not in ('http', 'https', 'ftp'):
            open(cur_filename, 'w').write("")
            stop_err('The remote data source application has not sent back a URL parameter in the request.', json_file)

        # The following calls to urlopen() will use the above default timeout
        try:
            if not URL_method or URL_method == 'get':
                page = urlopen(cur_URL)
            elif URL_method == 'post':
                page = urlopen(cur_URL, urllib.parse.urlencode(params).encode("utf-8"))
        except Exception as e:
            stop_err('The remote data source application may be off line, please try again later. Error: %s' % str(e), json_file)
        if max_file_size:
            file_size = int(page.info().get('Content-Length', 0))
            if file_size > max_file_size:
                stop_err('The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % (file_size, max_file_size), json_file)
        # handle files available locally: the response body names a path
        # relative to the export prefix, which is symlinked into place
        if export:
            # pre-bind so the except message below cannot hit a NameError
            # when the failure happens during page.read()
            local_file = export
            try:
                # BUG FIX: page.read() returns bytes under Python 3; decode
                # before concatenating with the (str) export prefix.
                local_file = export + page.read().decode('utf-8')
                os.remove(cur_filename)
                os.symlink(local_file, cur_filename)
            except Exception as e:
                stop_err('Unable to symlink %s to %s:\n%s' % (local_file, cur_filename, e), json_file)
        else:
            try:
                cur_filename = sniff.stream_to_open_named_file(page, os.open(cur_filename, os.O_WRONLY | os.O_CREAT), cur_filename, source_encoding=get_charset_from_http_headers(page.headers))
            except Exception as e:
                stop_err('Unable to fetch %s:\n%s' % (cur_URL, e), json_file)

        # here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file(filename, datatypes_registry, ext=data_dict['ext'])
            except Exception as e:
                stop_err(str(e), json_file)
            info = dict(type='dataset',
                        dataset_id=data_dict['dataset_id'],
                        ext=ext)

            json_file.write("%s\n" % dumps(info))


if __name__ == "__main__":
    __main__()