changeset 1:8ff92bd7e2a3 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_rsync_g2 commit 8652f36a3a3838dca989426961561e81432acf4f
author      iuc
date        Tue, 04 Apr 2017 18:13:26 -0400
parents     0a3a6f862104
children    e0329ab30f6d
files       data_manager/data_manager_rsync.py
            data_manager/data_manager_rsync.xml
diffstat    2 files changed, 91 insertions(+), 78 deletions(-)
--- a/data_manager/data_manager_rsync.py Wed Oct 14 13:48:12 2015 -0400
+++ b/data_manager/data_manager_rsync.py Tue Apr 04 18:13:26 2017 -0400
@@ -1,19 +1,27 @@
 #!/usr/bin/env python
-#Dan Blankenberg
+# Dan Blankenberg
+from __future__ import print_function
 
-import sys
-import os
-import tempfile
-import shutil
+import datetime
+import logging
 import optparse
-import urllib2
+import os
+import shutil
 import subprocess
-import datetime
+import tempfile
+from json import (
+    dumps,
+    loads
+)
 from os.path import basename
-from json import loads, dumps
 from xml.etree.ElementTree import tostring
+try:
+    # For Python 3.0 and later
+    from urllib.request import urlopen
+except ImportError:
+    # Fall back to Python 2 imports
+    from urllib2 import urlopen
 
-import logging
 _log_name = __name__
 if _log_name == '__builtin__':
     _log_name = 'toolshed.installed.g2.rsync.data.manager'
@@ -27,8 +35,8 @@
 # Pull the Tool Data Table files from github
 # FIXME: These files should be accessible from the rsync server directly.
-TOOL_DATA_TABLE_CONF_XML_URLS = { 'main':"https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/usegalaxy.org/config/tool_data_table_conf.xml",
-                                  'test':"https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/test.galaxyproject.org/config/tool_data_table_conf.xml" }
+TOOL_DATA_TABLE_CONF_XML_URLS = {'main': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/usegalaxy.org/config/tool_data_table_conf.xml",
+                                 'test': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/test.galaxyproject.org/config/tool_data_table_conf.xml" }
 
 # Replace data table source entries with local temporary location
 GALAXY_DATA_CANONICAL_PATH = "/galaxy/data/"
@@ -50,8 +58,8 @@
 # e.g. mafs. Although this maf table is goofy and doesn't have path defined in <table> def,
 # but it does exit in the .loc.
+
 # --- These methods are called by/within the Galaxy Application
-
 def exec_before_job( app, inp_data, out_data, param_dict, tool=None, **kwd ):
     # Look for any data tables that haven't been defined for this data manager before and dynamically add them to Galaxy
     param_dict = dict( **param_dict )
@@ -67,8 +75,8 @@
     data_manager = app.data_managers.get_manager( tool.data_manager_id, None )
     data_table_entries = get_data_table_entries( param_dict )
     data_tables = load_data_tables_from_url( data_table_class=app.tool_data_tables.__class__ ).get( 'data_tables' )
-    for data_table_name, entries in data_table_entries.iteritems():
-        #get data table managed by this data Manager
+    for data_table_name, entries in data_table_entries.items():
+        # get data table managed by this data Manager
         has_data_table = app.tool_data_tables.get_tables().get( data_table_name )
         if has_data_table:
             has_data_table = bool( has_data_table.get_filename_for_source( data_manager, None ) )
@@ -77,7 +85,7 @@
             from tool_shed.tools import data_table_manager
             tdtm = data_table_manager.ToolDataTableManager( app )
             target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir( tool_shed_repository )
-            #Dynamically add this data table
+            # Dynamically add this data table
             log.debug( "Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name )
             data_table = data_tables[data_table_name]
             repo_info = tdtm.generate_repository_info_elem_from_repository( tool_shed_repository, parent_elem=None )
@@ -89,21 +97,23 @@
             app.tool_data_tables.add_new_entries_from_config_file( tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True )
             tmp_file.close()
 
+
 def galaxy_code_get_available_data_tables( trans ):
-    #list of data tables
+    # list of data tables
     found_tables = get_available_tables( trans )
-    rval = map( lambda x: ( ( x, x, DEFAULT_SELECTED ) ), found_tables )
+    rval = [ ( x, x, DEFAULT_SELECTED ) for x in found_tables]
     return rval
 
+
 def galaxy_code_get_available_data_tables_entries( trans, dbkey, data_table_names ):
-    #available entries, optionally filtered by dbkey and table names
+    # available entries, optionally filtered by dbkey and table names
     if dbkey in [ None, '', '?' ]:
         dbkey = None
     if data_table_names in [ None, '', '?' ]:
         data_table_names = None
     found_tables = get_available_tables_for_dbkey( trans, dbkey, data_table_names )
     dbkey_text = '(%s) ' % ( dbkey ) if dbkey else ''
-    rval = map( lambda x: ( "%s%s" % ( dbkey_text, x[0] ), dumps( dict( name=x[0].split( ': ' )[0], entry=x[1] ) ).encode( 'base64' ).rstrip(), DEFAULT_SELECTED ), found_tables.items() )
+    rval = [( "%s%s" % ( dbkey_text, x[0] ), dumps( dict( name=x[0].split( ': ' )[0], entry=x[1] ) ).encode( 'base64' ).rstrip(), DEFAULT_SELECTED ) for x in found_tables.items()]
     return rval
 
 # --- End Galaxy called Methods ---
@@ -116,8 +126,9 @@
     url = url.lstrip( '/' )
     return "%s/%s" % ( base, url )
 
+
 def rsync_list_dir( server, dir=None, skip_names=[] ):
-    #drwxr-xr-x 50 2014/05/16 20:58:11 .
+    # drwxr-xr-x 50 2014/05/16 20:58:11 .
     if dir:
         dir = rsync_urljoin( server, dir )
     else:
@@ -153,6 +164,7 @@
     rsync_response.close()
     return rval
 
+
 def rsync_sync_to_dir( source, target ):
     rsync_response = tempfile.NamedTemporaryFile()
     rsync_stderr = tempfile.NamedTemporaryFile()
@@ -176,31 +188,31 @@
     if cached_data_table is None:
         return True, {}
     if datetime.datetime.now() - cached_data_table.get( 'time_loaded' ) > CACHE_TIME:
-        data_table_text = urllib2.urlopen( url ).read()
+        data_table_text = urlopen( url ).read()
         if cached_data_table.get( 'data_table_text', None ) != data_table_text:
-            return True, {'data_table_text':data_table_text}
+            return True, {'data_table_text': data_table_text}
         loc_file_attrs = rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )
-        if cached_data_table.get( 'loc_file_attrs', None ) != loc_file_attrs: 
-            return True, {'loc_file_attrs':loc_file_attrs}
+        if cached_data_table.get( 'loc_file_attrs', None ) != loc_file_attrs:
+            return True, {'loc_file_attrs': loc_file_attrs}
     return False, {}
 
+
 def load_data_tables_from_url( url=None, site='main', data_table_class=None ):
     if not url:
         url = TOOL_DATA_TABLE_CONF_XML_URLS.get( site, None )
     assert url, ValueError( 'You must provide either a URL or a site=name.' )
-    
+
     cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get( url, None )
     refresh, attribs = data_table_needs_refresh( cached_data_table, url )
     if refresh:
-        data_table_text = attribs.get( 'data_table_text' )or urllib2.urlopen( url ).read()
+        data_table_text = attribs.get( 'data_table_text' )or urlopen( url ).read()
         loc_file_attrs = attribs.get( 'loc_file_attrs' ) or rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )
-        
+
         tmp_dir = tempfile.mkdtemp( prefix='rsync_g2_' )
         tmp_loc_dir = os.path.join( tmp_dir, 'location' )
        os.mkdir( tmp_loc_dir )
         rsync_sync_to_dir( rsync_urljoin( RSYNC_SERVER, LOCATION_DIR ), os.path.abspath( tmp_loc_dir ) )
-        
-        
+
         new_data_table_text = data_table_text.replace( TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % ( tmp_loc_dir ) )
         data_table_fh = tempfile.NamedTemporaryFile( dir=tmp_dir, prefix='rysnc_data_manager_data_table_conf_' )
         data_table_fh.write( new_data_table_text )
@@ -208,17 +220,18 @@
         tmp_data_dir = os.path.join( tmp_dir, 'tool-data' )
         os.mkdir( tmp_data_dir )
         data_tables = data_table_class( tmp_data_dir, config_filename=data_table_fh.name )
-        for name, data_table in data_tables.data_tables.items():
+        for name, data_table in list(data_tables.data_tables.items()):
             if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column( data_table ):
                 log.debug( 'Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name )
                 del data_tables.data_tables[name]
         cached_data_table = { 'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now() }
         TOOL_DATA_TABLES_LOADED_BY_URL[ url ] = cached_data_table
-        #delete the files
+        # delete the files
         data_table_fh.close()
         cleanup_before_exit( tmp_dir )
     return cached_data_table
 
+
 def data_table_has_path_column( data_table ):
     col_names = data_table.get_column_name_list()
     for name in PATH_COLUMN_NAMES:
@@ -226,10 +239,12 @@
             return True
     return False
 
+
 def get_available_tables( trans ):
-    #list of data tables
+    # list of data tables
     data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
-    return data_tables.get( 'data_tables' ).get_tables().keys()
+    return list(data_tables.get( 'data_tables' ).get_tables().keys())
 
+
 def get_new_xml_definition( app, data_table, data_manager, repo_info=None, location_file_dir=None ):
     sub_dict = { 'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': '' }
@@ -242,7 +257,7 @@
     location_file_dir = location_file_dir or app.config.galaxy_data_manager_data_path
     for filename in data_table.filenames.keys():
         sub_dict['file_path'] = basename( filename )
-        sub_dict['file_path'] = os.path.join( location_file_dir, sub_dict['file_path'] ) #os.path.abspath?
+        sub_dict['file_path'] = os.path.join( location_file_dir, sub_dict['file_path'] )  # os.path.abspath?
         if not os.path.exists( sub_dict['file_path'] ):
             # Create empty file
             open( sub_dict['file_path'], 'wb+' ).close()
@@ -256,22 +271,23 @@
 </table></tables>
 """ % sub_dict
 
+
 def get_available_tables_for_dbkey( trans, dbkey, data_table_names ):
-    my_data_tables = trans.app.tool_data_tables.get_tables()
     data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
     rval = {}
-    for name, data_table in data_tables.get( 'data_tables' ).get_tables().iteritems():
-        if ( not data_table_names or name in data_table_names ): #name in my_data_tables.keys() and
-            #TODO: check that columns are similiar
+    for name, data_table in data_tables.get( 'data_tables' ).get_tables().items():
+        if ( not data_table_names or name in data_table_names ):
+            # TODO: check that columns are similiar
             if not dbkey:
                 entry_getter = data_table.get_named_fields_list()
             else:
                 entry_getter = data_table.get_entries( 'dbkey', dbkey, None, default=[] )
             for entry in entry_getter:
-                name = "%s: %s" % ( data_table.name, dumps( entry ) ) 
-                rval[name] = entry 
+                name = "%s: %s" % ( data_table.name, dumps( entry ) )
+                rval[name] = entry
     return rval
 
+
 def split_path_all( path ):
     rval = []
     path = path.rstrip( '/' )
@@ -286,12 +302,13 @@
         else:
             break
     rval.reverse()
-    return rval 
+    return rval
 
+
 def get_data_for_path( path, data_root_dir ):
     # We list dir with a /, but copy data without
     # listing with / gives a . entry when its a dir
-    # cloning without the / will copy that whole directory into the target, 
+    # cloning without the / will copy that whole directory into the target,
     # instead of just that target's contents
     if path.startswith( GALAXY_DATA_CANONICAL_PATH ):
         path = path[ len( GALAXY_DATA_CANONICAL_PATH ):]
@@ -301,14 +318,14 @@
         rsync_source = rsync_source[:-1]
     try:
         dir_list = rsync_list_dir( rsync_source + "/" )
-    except Exception, e:
+    except Exception:
         dir_list = None
     while not dir_list or '.' not in dir_list:
         head, tail = os.path.split( make_path )
         if not head:
             head = tail
         make_path = head
-        rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), head ) #if we error here, likely due to a connection issue
+        rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), head )  # if we error here, likely due to a connection issue
         if rsync_source.endswith( '/' ):
             rsync_source = rsync_source[:-1]
         dir_list = rsync_list_dir( rsync_source + "/" )
@@ -321,43 +338,45 @@
         rsync_sync_to_dir( rsync_source, target_path )
     return path
 
+
 def get_data_and_munge_path( data_table_name, data_table_entry, data_root_dir ):
     path_cols = []
-    for key, value in data_table_entry.iteritems():
+    for key, value in data_table_entry.items():
         if key in PATH_COLUMN_NAMES:
             path_cols.append( ( key, value ) )
-    found_data = False
     if path_cols:
         for col_name, value in path_cols:
-            #GALAXY_DATA_CANONICAL_PATH
             if value.startswith( GALAXY_DATA_CANONICAL_PATH ):
                 data_table_entry[col_name] = get_data_for_path( value, data_root_dir )
-                found_data = True
             else:
-                print 'unable to determine location of rsync data for', data_table_name, data_table_entry
+                print('unable to determine location of rsync data for', data_table_name, data_table_entry)
     return data_table_entry
 
+
 def fulfill_data_table_entries( data_table_entries, data_manager_dict, data_root_dir ):
-    for data_table_name, entries in data_table_entries.iteritems():
+    for data_table_name, entries in data_table_entries.items():
         for entry in entries:
             entry = get_data_and_munge_path( data_table_name, entry, data_root_dir )
             _add_data_table_entry( data_manager_dict, data_table_name, entry )
     return data_manager_dict
 
+
 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
     data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
     data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( data_table_name, [] )
     data_manager_dict['data_tables'][data_table_name].append( data_table_entry )
     return data_manager_dict
 
+
 def cleanup_before_exit( tmp_dir ):
     if tmp_dir and os.path.exists( tmp_dir ):
         shutil.rmtree( tmp_dir )
 
+
 def get_data_table_entries( params ):
     rval = {}
     data_table_entries = params.get( 'data_table_entries', None )
-    if data_table_entries :
+    if data_table_entries:
         for entry_text in data_table_entries.split( ',' ):
             entry_text = entry_text.strip().decode( 'base64' )
             entry_dict = loads( entry_text )
@@ -367,24 +386,26 @@
         rval[ data_table_name ].append( data_table_entry )
     return rval
 
+
 def main():
-    #Parse Command Line
     parser = optparse.OptionParser()
     (options, args) = parser.parse_args()
-    
+
     filename = args[0]
-    
+
     params = loads( open( filename ).read() )
     target_directory = params[ 'output_data' ][0]['extra_files_path']
     os.mkdir( target_directory )
     data_manager_dict = {}
-    
+
     data_table_entries = get_data_table_entries( params['param_dict'] )
-    
+
     # Populate the data Tables
     data_manager_dict = fulfill_data_table_entries( data_table_entries, data_manager_dict, target_directory )
-    
-    #save info to json file
+
+    # save info to json file
     open( filename, 'wb' ).write( dumps( data_manager_dict ) )
-
-if __name__ == "__main__": main()
+
+
+if __name__ == "__main__":
+    main()
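Most of the Python changes above are routine Python 3 housekeeping: print statements become print() calls, dict.iteritems() becomes dict.items(), map() calls become list comprehensions, and "except Exception, e" becomes "except Exception". The one structural addition is the guarded urlopen import at the top of the file. A minimal standalone sketch of that compatibility pattern follows; the URL is illustrative only and not part of this commit.

    try:
        # Python 3: urlopen lives in urllib.request
        from urllib.request import urlopen
    except ImportError:
        # Python 2 fallback: the old urllib2 location
        from urllib2 import urlopen

    # Call sites are then version-independent. On Python 3, read() returns
    # bytes, so code that compares the result against cached text (as
    # data_table_needs_refresh() does) may need an explicit decode.
    data_table_text = urlopen("https://example.org/tool_data_table_conf.xml").read()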
--- a/data_manager/data_manager_rsync.xml Wed Oct 14 13:48:12 2015 -0400
+++ b/data_manager/data_manager_rsync.xml Tue Apr 04 18:13:26 2017 -0400
@@ -1,23 +1,18 @@
 <tool id="data_manager_rsync_g2" name="Rsync with g2" version="0.0.1" tool_type="manage_data">
     <options sanitize="False" />
     <description>fetching</description>
-    <command interpreter="python">data_manager_rsync.py "${out_file}"</command>
-    <stdio>
-        <exit_code range="1:" err_level="fatal" />
-        <exit_code range=":-1" err_level="fatal" />
-    </stdio>
+    <command detect_errors="exit_code"><![CDATA[
+        python '$__tool_directory__/data_manager_rsync.py'
+        '${out_file}'
+    ]]></command>
     <inputs>
-        <param name="dbkey" type="genomebuild" label="dbkey to search for Reference Data" help="Specify ? to show all"/>
-        <param name="data_table_names" type="select" display="checkboxes" multiple="True" optional="True"
-               label="Choose Desired Data Tables" dynamic_options="galaxy_code_get_available_data_tables( __trans__ )"
-               refresh_on_change="dbkey"/>
-
-
-        <param name="data_table_entries" type="select" display="checkboxes" multiple="True" optional="False"
-               label="Choose Desired Data Tables Entries" dynamic_options="galaxy_code_get_available_data_tables_entries( __trans__, dbkey, data_table_names )"
-               refresh_on_change="dbkey"/>
+        <param name="data_table_names" type="select" display="checkboxes" multiple="True" optional="True" refresh_on_change="dbkey"
+               label="Choose Desired Data Tables" dynamic_options="galaxy_code_get_available_data_tables( __trans__ )" />
+
+        <param name="data_table_entries" type="select" display="checkboxes" multiple="true" optional="false" refresh_on_change="dbkey"
+               label="Choose Desired Data Tables Entries" dynamic_options="galaxy_code_get_available_data_tables_entries( __trans__, dbkey, data_table_names )" />
     </inputs>
     <outputs>
         <data name="out_file" format="data_manager_json" dbkey="dbkey"/>
     </outputs>
@@ -37,12 +32,9 @@
 
 ------
 
-
-
 .. class:: infomark
 
-**Notice:** If you do not have a particular data table defined, then it will be created and persisted dynamically. 
-
+**Notice:** If you do not have a particular data table defined, then it will be created and persisted dynamically.
 </help>
     <code file="data_manager_rsync.py" />
 </tool>
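One caveat the changeset does not address: galaxy_code_get_available_data_tables_entries() and get_data_table_entries() still round-trip entries through the Python 2-only base64 string codec (str.encode( 'base64' ) / str.decode( 'base64' )), which Python 3 removed for str. A sketch of the equivalent round trip using the base64 module; the table name and entry values here are invented for illustration, and this code is not part of the commit.

    import base64
    from json import dumps, loads

    # Encode one data table entry, as the dynamic_options callback does.
    entry = {'name': 'all_fasta', 'entry': {'dbkey': 'hg19', 'path': '/galaxy/data/hg19/seq/hg19.fa'}}
    encoded = base64.b64encode(dumps(entry).encode('utf-8'))

    # Decode it again, as get_data_table_entries() does for each
    # comma-separated value of the 'data_table_entries' parameter; the
    # base64 alphabet contains no commas, so splitting on ',' is safe.
    decoded = loads(base64.b64decode(encoded).decode('utf-8'))
    assert decoded == entry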