comparison data_manager/data_manager_rsync.py @ 2:e0329ab30f6d draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_rsync_g2 commit 57f71aa633a43ab02bbf05acd0c6d7f406e01f1e"
author iuc
date Thu, 28 Nov 2019 15:47:47 -0500
parents 8ff92bd7e2a3
children
--- data_manager/data_manager_rsync.py	1:8ff92bd7e2a3
+++ data_manager/data_manager_rsync.py	2:e0329ab30f6d
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 # Dan Blankenberg
 from __future__ import print_function
 
+import base64
 import datetime
 import logging
 import optparse
 import os
 import shutil
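
Note: the new import base64 line backs the encode/decode changes further down in this changeset. Python 3 removed the 'base64' string codec that the old code used via str.encode('base64'), while the base64 module's functions work on bytes under both interpreters. A minimal sketch of the old and new idioms (the payload value is illustrative):

    import base64

    payload = '{"name": "all_fasta"}'  # illustrative JSON blob

    # Python 2 only: the 'base64' str codec was removed in Python 3.
    # encoded = payload.encode('base64').rstrip()

    # Python 2 and 3: the module-level functions operate on bytes.
    encoded = base64.b64encode(payload.encode('utf-8')).decode('utf-8')
    assert base64.b64decode(encoded.encode('utf-8')).decode('utf-8') == payload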
@@ -23,30 +24,30 @@
     from urllib2 import urlopen
 
 _log_name = __name__
 if _log_name == '__builtin__':
     _log_name = 'toolshed.installed.g2.rsync.data.manager'
-log = logging.getLogger( _log_name )
+log = logging.getLogger(_log_name)
 
 # Get the Data from the Galaxy Project rsync server
 RSYNC_CMD = 'rsync'
 RSYNC_SERVER = "rsync://datacache.g2.bx.psu.edu/"
 LOCATION_DIR = "location"
 INDEX_DIR = "indexes"
 
 # Pull the Tool Data Table files from github
 # FIXME: These files should be accessible from the rsync server directly.
-TOOL_DATA_TABLE_CONF_XML_URLS = {'main': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/usegalaxy.org/config/tool_data_table_conf.xml",
-                                 'test': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/test.galaxyproject.org/config/tool_data_table_conf.xml" }
+TOOL_DATA_TABLE_CONF_XML_URLS = {'main': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/env/main/files/galaxy/config/tool_data_table_conf.xml",
+                                 'test': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/env/test/files/galaxy/config/tool_data_table_conf.xml"}
 
 # Replace data table source entries with local temporary location
 GALAXY_DATA_CANONICAL_PATH = "/galaxy/data/"
-TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE = '<file path="%slocation/' % ( GALAXY_DATA_CANONICAL_PATH )
+TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE = '<file path="%slocation/' % (GALAXY_DATA_CANONICAL_PATH)
 TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET = '<file path="%s/'
 
 # Some basic Caching, so we don't have to reload and download everything every time
-CACHE_TIME = datetime.timedelta( minutes=10 )
+CACHE_TIME = datetime.timedelta(minutes=10)
 TOOL_DATA_TABLES_LOADED_BY_URL = {}
 
 # Entries will not be selected by default
 DEFAULT_SELECTED = False
 
@@ -58,42 +59,42 @@
 # e.g. mafs. Although this maf table is goofy and doesn't have path defined in <table> def,
 # but it does exit in the .loc.
 
 
 # --- These methods are called by/within the Galaxy Application
-def exec_before_job( app, inp_data, out_data, param_dict, tool=None, **kwd ):
+def exec_before_job(app, inp_data, out_data, param_dict, tool=None, **kwd):
     # Look for any data tables that haven't been defined for this data manager before and dynamically add them to Galaxy
-    param_dict = dict( **param_dict )
-    param_dict['data_table_entries'] = param_dict.get( 'data_table_entries', [] )
-    if not isinstance( param_dict['data_table_entries'], list ):
+    param_dict = dict(**param_dict)
+    param_dict['data_table_entries'] = param_dict.get('data_table_entries', [])
+    if not isinstance(param_dict['data_table_entries'], list):
         param_dict['data_table_entries'] = [param_dict['data_table_entries']]
-    param_dict['data_table_entries'] = ",".join( param_dict['data_table_entries'] )
+    param_dict['data_table_entries'] = ",".join(param_dict['data_table_entries'])
     if tool:
         tool_shed_repository = tool.tool_shed_repository
     else:
         tool_shed_repository = None
     tdtm = None
-    data_manager = app.data_managers.get_manager( tool.data_manager_id, None )
-    data_table_entries = get_data_table_entries( param_dict )
-    data_tables = load_data_tables_from_url( data_table_class=app.tool_data_tables.__class__ ).get( 'data_tables' )
+    data_manager = app.data_managers.get_manager(tool.data_manager_id, None)
+    data_table_entries = get_data_table_entries(param_dict)
+    data_tables = load_data_tables_from_url(data_table_class=app.tool_data_tables.__class__).get('data_tables')
     for data_table_name, entries in data_table_entries.items():
         # get data table managed by this data Manager
-        has_data_table = app.tool_data_tables.get_tables().get( data_table_name )
+        has_data_table = app.tool_data_tables.get_tables().get(str(data_table_name))
         if has_data_table:
-            has_data_table = bool( has_data_table.get_filename_for_source( data_manager, None ) )
+            has_data_table = bool(has_data_table.get_filename_for_source(data_manager, None))
         if not has_data_table:
             if tdtm is None:
                 from tool_shed.tools import data_table_manager
-                tdtm = data_table_manager.ToolDataTableManager( app )
-            target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir( tool_shed_repository )
+                tdtm = data_table_manager.ToolDataTableManager(app)
+            target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir(tool_shed_repository)
             # Dynamically add this data table
-            log.debug( "Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name )
+            log.debug("Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name)
             data_table = data_tables[data_table_name]
-            repo_info = tdtm.generate_repository_info_elem_from_repository( tool_shed_repository, parent_elem=None )
+            repo_info = tdtm.generate_repository_info_elem_from_repository(tool_shed_repository, parent_elem=None)
             if repo_info is not None:
-                repo_info = tostring( repo_info )
-            tmp_file = tempfile.NamedTemporaryFile()
-            tmp_file.write( get_new_xml_definition( app, data_table, data_manager, repo_info, target_dir ) )
+                repo_info = tostring(repo_info)
+            tmp_file = tempfile.NamedTemporaryFile(mode="w")
+            tmp_file.write(get_new_xml_definition(app, data_table, data_manager, repo_info, target_dir))
             tmp_file.flush()
-            app.tool_data_tables.add_new_entries_from_config_file( tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True )
+            app.tool_data_tables.add_new_entries_from_config_file(tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True)
             tmp_file.close()
 
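
Note: the mode="w" added to tempfile.NamedTemporaryFile() above is what keeps this working on Python 3: the default mode is 'w+b', so writing the str returned by get_new_xml_definition() would raise TypeError. A minimal sketch of the pattern, assuming only the standard library:

    import tempfile

    # Default mode is 'w+b' (binary); tmp.write() then requires bytes on Python 3.
    with tempfile.NamedTemporaryFile(mode="w") as tmp:
        tmp.write("<tables></tables>")  # str is accepted in text mode
        tmp.flush()  # flush so another reader can open tmp.name before close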
@@ -100,19 +101,19 @@
 
-def galaxy_code_get_available_data_tables( trans ):
+def galaxy_code_get_available_data_tables(trans):
     # list of data tables
-    found_tables = get_available_tables( trans )
-    rval = [ ( x, x, DEFAULT_SELECTED ) for x in found_tables]
+    found_tables = get_available_tables(trans)
+    rval = [(x, x, DEFAULT_SELECTED) for x in found_tables]
     return rval
 
 
-def galaxy_code_get_available_data_tables_entries( trans, dbkey, data_table_names ):
+def galaxy_code_get_available_data_tables_entries(trans, dbkey, data_table_names):
     # available entries, optionally filtered by dbkey and table names
-    if dbkey in [ None, '', '?' ]:
+    if dbkey in [None, '', '?']:
         dbkey = None
-    if data_table_names in [ None, '', '?' ]:
+    if data_table_names in [None, '', '?']:
         data_table_names = None
-    found_tables = get_available_tables_for_dbkey( trans, dbkey, data_table_names )
-    dbkey_text = '(%s) ' % ( dbkey ) if dbkey else ''
-    rval = [( "%s%s" % ( dbkey_text, x[0] ), dumps( dict( name=x[0].split( ': ' )[0], entry=x[1] ) ).encode( 'base64' ).rstrip(), DEFAULT_SELECTED ) for x in found_tables.items()]
+    found_tables = get_available_tables_for_dbkey(trans, dbkey, data_table_names)
+    dbkey_text = '(%s) ' % (dbkey) if dbkey else ''
+    rval = [("%s%s" % (dbkey_text, x[0]), base64.b64encode(dumps(dict(name=x[0].split(': ')[0], entry=x[1]), sort_keys=True).rstrip().encode('utf-8')).decode('utf-8'), DEFAULT_SELECTED) for x in found_tables.items()]
     return rval
 
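
Note: the rewritten comprehension above keeps the same wire format, one base64-encoded JSON dict per select-list option, but builds it with base64.b64encode() instead of the removed str codec, and adds sort_keys=True so the serialized entry is stable across runs. Roughly, per entry (the example row is hypothetical):

    import base64
    from json import dumps

    entry = {'value': 'hg19', 'dbkey': 'hg19', 'path': '/galaxy/data/hg19'}  # hypothetical row
    blob = dumps(dict(name='all_fasta', entry=entry), sort_keys=True)
    option_value = base64.b64encode(blob.rstrip().encode('utf-8')).decode('utf-8')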
@@ -119,48 +120,48 @@
 # --- End Galaxy called Methods ---
 
 
-def rsync_urljoin( base, url ):
+def rsync_urljoin(base, url):
     # urlparse.urljoin doesn't work correctly for our use-case
     # probably because it doesn't recognize the rsync scheme
-    base = base.rstrip( '/' )
-    url = url.lstrip( '/' )
-    return "%s/%s" % ( base, url )
+    base = base.rstrip('/')
+    url = url.lstrip('/')
+    return "%s/%s" % (base, url)
 
 
-def rsync_list_dir( server, dir=None, skip_names=[] ):
+def rsync_list_dir(server, dir=None, skip_names=[]):
     # drwxr-xr-x 50 2014/05/16 20:58:11 .
     if dir:
-        dir = rsync_urljoin( server, dir )
+        dir = rsync_urljoin(server, dir)
     else:
         dir = server
     rsync_response = tempfile.NamedTemporaryFile()
     rsync_stderr = tempfile.NamedTemporaryFile()
-    rsync_cmd = [ RSYNC_CMD, '--list-only', dir ]
-    return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr )
+    rsync_cmd = [RSYNC_CMD, '--list-only', dir]
+    return_code = subprocess.call(rsync_cmd, stdout=rsync_response, stderr=rsync_stderr)
     rsync_response.flush()
     rsync_response.seek(0)
     rsync_stderr.flush()
     rsync_stderr.seek(0)
     if return_code:
-        msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() )
+        msg = "stdout:\n%s\nstderr:\n%s" % (rsync_response.read(), rsync_stderr.read())
         rsync_response.close()
         rsync_stderr.close()
-        raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) )
+        raise Exception('Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % (rsync_cmd, return_code, msg))
     rsync_stderr.close()
     rval = {}
     for line in rsync_response:
-        perms, line = line.split( None, 1 )
+        perms, line = line.split(None, 1)
         line = line.strip()
-        size, line = line.split( None, 1 )
+        size, line = line.split(None, 1)
         line = line.strip()
-        date, line = line.split( None, 1 )
+        date, line = line.split(None, 1)
         line = line.strip()
-        time, line = line.split( None, 1 )
+        time, line = line.split(None, 1)
         name = line.strip()
         if name in skip_names:
             continue
         size = line.strip()
-        rval[ name ] = dict( name=name, permissions=perms, bytes=size, date=date, time=time )
+        rval[name] = dict(name=name, permissions=perms, bytes=size, date=date, time=time)
     rsync_response.close()
     return rval
 
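
Note: rsync_list_dir() screen-scrapes rsync --list-only output, whose rows look like the example in the comment above (permissions, size, date, time, name). A compact sketch of the same field split, assuming names never start with whitespace (parse_list_line is illustrative and not part of this file):

    # One row of `rsync --list-only` output:
    # drwxr-xr-x             50 2014/05/16 20:58:11 .
    def parse_list_line(line):
        # maxsplit=4 keeps any internal spaces in the trailing name field
        perms, size, date, time, name = line.split(None, 4)
        return dict(name=name, permissions=perms, bytes=size, date=date, time=time)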
@@ -167,32 +168,32 @@
 
-def rsync_sync_to_dir( source, target ):
+def rsync_sync_to_dir(source, target):
     rsync_response = tempfile.NamedTemporaryFile()
     rsync_stderr = tempfile.NamedTemporaryFile()
-    rsync_cmd = [ RSYNC_CMD, '-avzP', source, target ]
-    return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr )
+    rsync_cmd = [RSYNC_CMD, '-avzP', source, target]
+    return_code = subprocess.call(rsync_cmd, stdout=rsync_response, stderr=rsync_stderr)
     rsync_response.flush()
     rsync_response.seek(0)
     rsync_stderr.flush()
     rsync_stderr.seek(0)
     if return_code:
-        msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() )
+        msg = "stdout:\n%s\nstderr:\n%s" % (rsync_response.read(), rsync_stderr.read())
         rsync_response.close()
         rsync_stderr.close()
-        raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) )
+        raise Exception('Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % (rsync_cmd, return_code, msg))
     rsync_response.close()
     rsync_stderr.close()
     return return_code
 
 
-def data_table_needs_refresh( cached_data_table, url ):
+def data_table_needs_refresh(cached_data_table, url):
     if cached_data_table is None:
         return True, {}
-    if datetime.datetime.now() - cached_data_table.get( 'time_loaded' ) > CACHE_TIME:
-        data_table_text = urlopen( url ).read()
-        if cached_data_table.get( 'data_table_text', None ) != data_table_text:
+    if datetime.datetime.now() - cached_data_table.get('time_loaded') > CACHE_TIME:
+        data_table_text = urlopen(url).read().decode('utf-8')
+        if cached_data_table.get('data_table_text', None) != data_table_text:
             return True, {'data_table_text': data_table_text}
-        loc_file_attrs = rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )
-        if cached_data_table.get( 'loc_file_attrs', None ) != loc_file_attrs:
+        loc_file_attrs = rsync_list_dir(RSYNC_SERVER, LOCATION_DIR)
+        if cached_data_table.get('loc_file_attrs', None) != loc_file_attrs:
             return True, {'loc_file_attrs': loc_file_attrs}
     return False, {}
 
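
Note: the .decode('utf-8') appended to urlopen(url).read() matters on Python 3, where read() returns bytes; decoding keeps the cached data_table_text comparison str-to-str and lets the later .replace() operate on text. The pattern in isolation (Python 3 import shown; Python 2 pulled urlopen from urllib2):

    from urllib.request import urlopen

    url = "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/env/main/files/galaxy/config/tool_data_table_conf.xml"
    data_table_text = urlopen(url).read().decode('utf-8')  # bytes -> str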
@@ -199,190 +200,190 @@
 
-def load_data_tables_from_url( url=None, site='main', data_table_class=None ):
+def load_data_tables_from_url(url=None, site='main', data_table_class=None):
     if not url:
-        url = TOOL_DATA_TABLE_CONF_XML_URLS.get( site, None )
-    assert url, ValueError( 'You must provide either a URL or a site=name.' )
+        url = TOOL_DATA_TABLE_CONF_XML_URLS.get(site, None)
+    assert url, ValueError('You must provide either a URL or a site=name.')
 
-    cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get( url, None )
-    refresh, attribs = data_table_needs_refresh( cached_data_table, url )
+    cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get(url, None)
+    refresh, attribs = data_table_needs_refresh(cached_data_table, url)
     if refresh:
-        data_table_text = attribs.get( 'data_table_text' ) or urlopen( url ).read()
-        loc_file_attrs = attribs.get( 'loc_file_attrs' ) or rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )
+        data_table_text = attribs.get('data_table_text') or urlopen(url).read().decode('utf-8')
+        loc_file_attrs = attribs.get('loc_file_attrs') or rsync_list_dir(RSYNC_SERVER, LOCATION_DIR)
 
-        tmp_dir = tempfile.mkdtemp( prefix='rsync_g2_' )
-        tmp_loc_dir = os.path.join( tmp_dir, 'location' )
-        os.mkdir( tmp_loc_dir )
-        rsync_sync_to_dir( rsync_urljoin( RSYNC_SERVER, LOCATION_DIR ), os.path.abspath( tmp_loc_dir ) )
+        tmp_dir = tempfile.mkdtemp(prefix='rsync_g2_')
+        tmp_loc_dir = os.path.join(tmp_dir, 'location')
+        os.mkdir(tmp_loc_dir)
+        rsync_sync_to_dir(rsync_urljoin(RSYNC_SERVER, LOCATION_DIR), os.path.abspath(tmp_loc_dir))
 
-        new_data_table_text = data_table_text.replace( TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % ( tmp_loc_dir ) )
-        data_table_fh = tempfile.NamedTemporaryFile( dir=tmp_dir, prefix='rysnc_data_manager_data_table_conf_' )
-        data_table_fh.write( new_data_table_text )
+        new_data_table_text = data_table_text.replace(TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % (tmp_loc_dir))
+        data_table_fh = tempfile.NamedTemporaryFile(dir=tmp_dir, prefix='rysnc_data_manager_data_table_conf_', mode="w")
+        data_table_fh.write(new_data_table_text)
         data_table_fh.flush()
-        tmp_data_dir = os.path.join( tmp_dir, 'tool-data' )
-        os.mkdir( tmp_data_dir )
-        data_tables = data_table_class( tmp_data_dir, config_filename=data_table_fh.name )
+        tmp_data_dir = os.path.join(tmp_dir, 'tool-data')
+        os.mkdir(tmp_data_dir)
+        data_tables = data_table_class(tmp_data_dir, config_filename=data_table_fh.name)
         for name, data_table in list(data_tables.data_tables.items()):
-            if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column( data_table ):
-                log.debug( 'Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name )
+            if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column(data_table):
+                log.debug('Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name)
                 del data_tables.data_tables[name]
-        cached_data_table = { 'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now() }
-        TOOL_DATA_TABLES_LOADED_BY_URL[ url ] = cached_data_table
+        cached_data_table = {'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now()}
+        TOOL_DATA_TABLES_LOADED_BY_URL[url] = cached_data_table
         # delete the files
         data_table_fh.close()
-        cleanup_before_exit( tmp_dir )
+        cleanup_before_exit(tmp_dir)
     return cached_data_table
 
 
-def data_table_has_path_column( data_table ):
+def data_table_has_path_column(data_table):
     col_names = data_table.get_column_name_list()
     for name in PATH_COLUMN_NAMES:
         if name in col_names:
             return True
     return False
 
 
-def get_available_tables( trans ):
+def get_available_tables(trans):
     # list of data tables
-    data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
-    return list(data_tables.get( 'data_tables' ).get_tables().keys())
+    data_tables = load_data_tables_from_url(data_table_class=trans.app.tool_data_tables.__class__)
+    return data_tables.get('data_tables').get_tables().keys()
 
 
-def get_new_xml_definition( app, data_table, data_manager, repo_info=None, location_file_dir=None ):
-    sub_dict = { 'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': '' }
-    sub_dict.update( data_manager.get_tool_shed_repository_info_dict() )
+def get_new_xml_definition(app, data_table, data_manager, repo_info=None, location_file_dir=None):
+    sub_dict = {'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': ''}
+    sub_dict.update(data_manager.get_tool_shed_repository_info_dict())
     if data_table.comment_char:
-        sub_dict['comment_char'] = 'comment_char="%s"' % ( data_table.comment_char )
-    for i, name in enumerate( data_table.get_column_name_list() ):
+        sub_dict['comment_char'] = 'comment_char="%s"' % (data_table.comment_char)
+    for i, name in enumerate(data_table.get_column_name_list()):
         if name is not None:
-            sub_dict['columns'] = "%s\n%s" % ( sub_dict['columns'], '<column name="%s" index="%s" />' % ( name, i ) )
+            sub_dict['columns'] = "%s\n%s" % (sub_dict['columns'], '<column name="%s" index="%s" />' % (name, i))
     location_file_dir = location_file_dir or app.config.galaxy_data_manager_data_path
     for filename in data_table.filenames.keys():
-        sub_dict['file_path'] = basename( filename )
-        sub_dict['file_path'] = os.path.join( location_file_dir, sub_dict['file_path'] ) # os.path.abspath?
-        if not os.path.exists( sub_dict['file_path'] ):
+        sub_dict['file_path'] = basename(filename)
+        sub_dict['file_path'] = os.path.join(location_file_dir, sub_dict['file_path'])  # os.path.abspath?
+        if not os.path.exists(sub_dict['file_path']):
             # Create empty file
-            open( sub_dict['file_path'], 'wb+' ).close()
+            open(sub_dict['file_path'], 'wb+').close()
         break
-    sub_dict[ 'repo_info' ] = repo_info or ''
+    sub_dict['repo_info'] = repo_info or ''
     return """
 <tables><table name="%(table_name)s" %(comment_char)s>
     %(columns)s
     <file path="%(file_path)s" />
     %(repo_info)s
 </table></tables>
 """ % sub_dict
 
 
-def get_available_tables_for_dbkey( trans, dbkey, data_table_names ):
-    data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
+def get_available_tables_for_dbkey(trans, dbkey, data_table_names):
+    data_tables = load_data_tables_from_url(data_table_class=trans.app.tool_data_tables.__class__)
     rval = {}
-    for name, data_table in data_tables.get( 'data_tables' ).get_tables().items():
-        if ( not data_table_names or name in data_table_names ):
+    for name, data_table in data_tables.get('data_tables').get_tables().items():
+        if (not data_table_names or name in data_table_names):
             # TODO: check that columns are similiar
             if not dbkey:
                 entry_getter = data_table.get_named_fields_list()
             else:
-                entry_getter = data_table.get_entries( 'dbkey', dbkey, None, default=[] )
+                entry_getter = data_table.get_entries('dbkey', dbkey, None, default=[])
             for entry in entry_getter:
-                name = "%s: %s" % ( data_table.name, dumps( entry ) )
+                name = "%s: %s" % (data_table.name, dumps(entry))
                 rval[name] = entry
     return rval
 
 
-def split_path_all( path ):
+def split_path_all(path):
     rval = []
-    path = path.rstrip( '/' )
+    path = path.rstrip('/')
     while True:
-        head, tail = os.path.split( path )
+        head, tail = os.path.split(path)
         if tail:
-            rval.append( tail )
+            rval.append(tail)
             path = head
         elif head:
-            rval.append( head )
+            rval.append(head)
            break
         else:
            break
     rval.reverse()
    return rval
 
 
-def get_data_for_path( path, data_root_dir ):
+def get_data_for_path(path, data_root_dir):
     # We list dir with a /, but copy data without
     # listing with / gives a . entry when its a dir
     # cloning without the / will copy that whole directory into the target,
     # instead of just that target's contents
-    if path.startswith( GALAXY_DATA_CANONICAL_PATH ):
-        path = path[ len( GALAXY_DATA_CANONICAL_PATH ):]
+    if path.startswith(GALAXY_DATA_CANONICAL_PATH):
+        path = path[len(GALAXY_DATA_CANONICAL_PATH):]
     make_path = path
-    rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), path )
-    if rsync_source.endswith( '/' ):
+    rsync_source = rsync_urljoin(rsync_urljoin(RSYNC_SERVER, INDEX_DIR), path)
+    if rsync_source.endswith('/'):
         rsync_source = rsync_source[:-1]
     try:
-        dir_list = rsync_list_dir( rsync_source + "/" )
+        dir_list = rsync_list_dir(rsync_source + "/")
     except Exception:
         dir_list = None
     while not dir_list or '.' not in dir_list:
-        head, tail = os.path.split( make_path )
+        head, tail = os.path.split(make_path)
         if not head:
             head = tail
         make_path = head
-        rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), head ) # if we error here, likely due to a connection issue
-        if rsync_source.endswith( '/' ):
+        rsync_source = rsync_urljoin(rsync_urljoin(RSYNC_SERVER, INDEX_DIR), head)  # if we error here, likely due to a connection issue
+        if rsync_source.endswith('/'):
             rsync_source = rsync_source[:-1]
-        dir_list = rsync_list_dir( rsync_source + "/" )
-    split_path = split_path_all( make_path )
+        dir_list = rsync_list_dir(rsync_source + "/")
+    split_path = split_path_all(make_path)
     target_path = data_root_dir
     for p in split_path[:-1]:
-        target_path = os.path.join( target_path, p )
-        if not os.path.exists( target_path ):
-            os.mkdir( target_path )
-    rsync_sync_to_dir( rsync_source, target_path )
+        target_path = os.path.join(target_path, p)
+        if not os.path.exists(target_path):
+            os.mkdir(target_path)
+    rsync_sync_to_dir(rsync_source, target_path)
     return path
 
 
-def get_data_and_munge_path( data_table_name, data_table_entry, data_root_dir ):
+def get_data_and_munge_path(data_table_name, data_table_entry, data_root_dir):
     path_cols = []
     for key, value in data_table_entry.items():
         if key in PATH_COLUMN_NAMES:
-            path_cols.append( ( key, value ) )
+            path_cols.append((key, value))
     if path_cols:
         for col_name, value in path_cols:
-            if value.startswith( GALAXY_DATA_CANONICAL_PATH ):
-                data_table_entry[col_name] = get_data_for_path( value, data_root_dir )
+            if value.startswith(GALAXY_DATA_CANONICAL_PATH):
+                data_table_entry[col_name] = get_data_for_path(value, data_root_dir)
             else:
                 print('unable to determine location of rsync data for', data_table_name, data_table_entry)
     return data_table_entry
 
 
-def fulfill_data_table_entries( data_table_entries, data_manager_dict, data_root_dir ):
+def fulfill_data_table_entries(data_table_entries, data_manager_dict, data_root_dir):
     for data_table_name, entries in data_table_entries.items():
         for entry in entries:
-            entry = get_data_and_munge_path( data_table_name, entry, data_root_dir )
-            _add_data_table_entry( data_manager_dict, data_table_name, entry )
+            entry = get_data_and_munge_path(data_table_name, entry, data_root_dir)
+            _add_data_table_entry(data_manager_dict, data_table_name, entry)
     return data_manager_dict
 
 
-def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
-    data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
-    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( data_table_name, [] )
-    data_manager_dict['data_tables'][data_table_name].append( data_table_entry )
+def _add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get(data_table_name, [])
+    data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
     return data_manager_dict
 
 
-def cleanup_before_exit( tmp_dir ):
-    if tmp_dir and os.path.exists( tmp_dir ):
-        shutil.rmtree( tmp_dir )
+def cleanup_before_exit(tmp_dir):
+    if tmp_dir and os.path.exists(tmp_dir):
+        shutil.rmtree(tmp_dir)
 
 
-def get_data_table_entries( params ):
+def get_data_table_entries(params):
     rval = {}
-    data_table_entries = params.get( 'data_table_entries', None )
+    data_table_entries = params.get('data_table_entries', None)
     if data_table_entries:
-        for entry_text in data_table_entries.split( ',' ):
-            entry_text = entry_text.strip().decode( 'base64' )
-            entry_dict = loads( entry_text )
+        for entry_text in data_table_entries.split(','):
+            entry_text = base64.b64decode(entry_text.strip().encode('utf-8'))
+            entry_dict = loads(entry_text)
             data_table_name = entry_dict['name']
             data_table_entry = entry_dict['entry']
-            rval[ data_table_name ] = rval.get( data_table_name, [] )
-            rval[ data_table_name ].append( data_table_entry )
+            rval[data_table_name] = rval.get(data_table_name, [])
+            rval[data_table_name].append(data_table_entry)
     return rval
 
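
Note: get_data_table_entries() above is the decode side of the option-value encoding shown earlier: base64.b64decode() returns bytes, which json's loads() accepts on Python 3.6+ (and str on Python 2). An equivalent standalone helper (decode_entry is illustrative, not part of this file):

    import base64
    from json import loads

    def decode_entry(entry_text):
        # b64decode yields bytes; json.loads accepts bytes on Python 3.6+
        return loads(base64.b64decode(entry_text.strip().encode('utf-8')))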
@@ -389,23 +390,23 @@
 
 def main():
     parser = optparse.OptionParser()
     (options, args) = parser.parse_args()
 
     filename = args[0]
 
-    params = loads( open( filename ).read() )
-    target_directory = params[ 'output_data' ][0]['extra_files_path']
-    os.mkdir( target_directory )
+    params = loads(open(filename).read())
+    target_directory = params['output_data'][0]['extra_files_path']
+    os.mkdir(target_directory)
     data_manager_dict = {}
 
-    data_table_entries = get_data_table_entries( params['param_dict'] )
+    data_table_entries = get_data_table_entries(params['param_dict'])
 
     # Populate the data Tables
-    data_manager_dict = fulfill_data_table_entries( data_table_entries, data_manager_dict, target_directory )
+    data_manager_dict = fulfill_data_table_entries(data_table_entries, data_manager_dict, target_directory)
 
     # save info to json file
-    open( filename, 'wb' ).write( dumps( data_manager_dict ) )
+    open(filename, 'w').write(dumps(data_manager_dict, sort_keys=True))
 
 
 if __name__ == "__main__":
     main()
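
Note: the final hunk switches the results file from binary ('wb') to text ('w') mode, since dumps() returns str on Python 3, and again passes sort_keys=True for deterministic output. The write in isolation (filename and the dict contents are placeholders):

    from json import dumps

    filename = 'galaxy.json'  # placeholder for the JSON params/results path from argv
    data_manager_dict = {'data_tables': {}}
    with open(filename, 'w') as fh:  # text mode: dumps() returns str on Python 3
        fh.write(dumps(data_manager_dict, sort_keys=True))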