data_manager/data_manager_rsync.py @ 0:0a3a6f862104 draft

planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/data_managers/data_manager_rsync_g2 commit 704060ebdf7399ecce9e0e8bd7262727fe750c27-dirty
author devteam
date Wed, 14 Oct 2015 13:48:12 -0400
parents
children 8ff92bd7e2a3
#!/usr/bin/env python
# Dan Blankenberg

import sys
import os
import tempfile
import shutil
import optparse
import urllib2
import subprocess
import datetime
from os.path import basename
from json import loads, dumps
from xml.etree.ElementTree import tostring

import logging
_log_name = __name__
if _log_name == '__builtin__':
    _log_name = 'toolshed.installed.g2.rsync.data.manager'
log = logging.getLogger( _log_name )

# Get the Data from the Galaxy Project rsync server
RSYNC_CMD = 'rsync'
RSYNC_SERVER = "rsync://datacache.g2.bx.psu.edu/"
LOCATION_DIR = "location"
INDEX_DIR = "indexes"

# Pull the Tool Data Table files from github
# FIXME: These files should be accessible from the rsync server directly.
TOOL_DATA_TABLE_CONF_XML_URLS = { 'main': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/usegalaxy.org/config/tool_data_table_conf.xml",
                                  'test': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/test.galaxyproject.org/config/tool_data_table_conf.xml" }

# Replace data table source entries with local temporary location
GALAXY_DATA_CANONICAL_PATH = "/galaxy/data/"
TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE = '<file path="%slocation/' % ( GALAXY_DATA_CANONICAL_PATH )
TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET = '<file path="%s/'

# Some basic caching, so we don't have to reload and download everything every time
CACHE_TIME = datetime.timedelta( minutes=10 )
TOOL_DATA_TABLES_LOADED_BY_URL = {}

# Entries will not be selected by default
DEFAULT_SELECTED = False

# Exclude data tables without a 'path' column or that are in the manual exclude list
PATH_COLUMN_NAMES = ['path']
EXCLUDE_DATA_TABLES = []
# TODO: Make additional handler actions available for tables that can't fit into the basic
# "take the value of path as a dir and copy its contents" model,
# e.g. mafs. The maf table is goofy and doesn't have a path defined in its <table> def,
# but it does exist in the .loc file.

# --- These methods are called by/within the Galaxy Application

def exec_before_job( app, inp_data, out_data, param_dict, tool=None, **kwd ):
    # Look for any data tables that haven't been defined for this data manager before and dynamically add them to Galaxy
    param_dict = dict( **param_dict )
    param_dict['data_table_entries'] = param_dict.get( 'data_table_entries', [] )
    if not isinstance( param_dict['data_table_entries'], list ):
        param_dict['data_table_entries'] = [param_dict['data_table_entries']]
    param_dict['data_table_entries'] = ",".join( param_dict['data_table_entries'] )
    if tool:
        tool_shed_repository = tool.tool_shed_repository
    else:
        tool_shed_repository = None
    tdtm = None
    data_manager = app.data_managers.get_manager( tool.data_manager_id, None )
    data_table_entries = get_data_table_entries( param_dict )
    data_tables = load_data_tables_from_url( data_table_class=app.tool_data_tables.__class__ ).get( 'data_tables' )
    for data_table_name, entries in data_table_entries.iteritems():
        # Get the data table managed by this data manager
        has_data_table = app.tool_data_tables.get_tables().get( data_table_name )
        if has_data_table:
            has_data_table = bool( has_data_table.get_filename_for_source( data_manager, None ) )
        if not has_data_table:
            if tdtm is None:
                from tool_shed.tools import data_table_manager
                tdtm = data_table_manager.ToolDataTableManager( app )
                target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir( tool_shed_repository )
            # Dynamically add this data table
            log.debug( "Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name )
            data_table = data_tables[data_table_name]
            repo_info = tdtm.generate_repository_info_elem_from_repository( tool_shed_repository, parent_elem=None )
            if repo_info is not None:
                repo_info = tostring( repo_info )
            tmp_file = tempfile.NamedTemporaryFile()
            tmp_file.write( get_new_xml_definition( app, data_table, data_manager, repo_info, target_dir ) )
            tmp_file.flush()
            app.tool_data_tables.add_new_entries_from_config_file( tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True )
            tmp_file.close()

def galaxy_code_get_available_data_tables( trans ):
    # list of data tables
    found_tables = get_available_tables( trans )
    rval = map( lambda x: ( x, x, DEFAULT_SELECTED ), found_tables )
    return rval

def galaxy_code_get_available_data_tables_entries( trans, dbkey, data_table_names ):
    # available entries, optionally filtered by dbkey and table names
    if dbkey in [ None, '', '?' ]:
        dbkey = None
    if data_table_names in [ None, '', '?' ]:
        data_table_names = None
    found_tables = get_available_tables_for_dbkey( trans, dbkey, data_table_names )
    dbkey_text = '(%s) ' % ( dbkey ) if dbkey else ''
    rval = map( lambda x: ( "%s%s" % ( dbkey_text, x[0] ), dumps( dict( name=x[0].split( ': ' )[0], entry=x[1] ) ).encode( 'base64' ).rstrip(), DEFAULT_SELECTED ), found_tables.items() )
    return rval

# --- End Galaxy called Methods ---


def rsync_urljoin( base, url ):
    # urlparse.urljoin doesn't work correctly for our use-case,
    # probably because it doesn't recognize the rsync scheme
    base = base.rstrip( '/' )
    url = url.lstrip( '/' )
    return "%s/%s" % ( base, url )
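
# Illustrative usage (not in the original): rsync_urljoin( RSYNC_SERVER, INDEX_DIR )
# yields "rsync://datacache.g2.bx.psu.edu/indexes", which urlparse.urljoin would not
# build correctly because it does not recognize the rsync:// scheme.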

def rsync_list_dir( server, dir=None, skip_names=[] ):
    # Example `rsync --list-only` output line:
    # drwxr-xr-x          50 2014/05/16 20:58:11 .
    if dir:
        dir = rsync_urljoin( server, dir )
    else:
        dir = server
    rsync_response = tempfile.NamedTemporaryFile()
    rsync_stderr = tempfile.NamedTemporaryFile()
    rsync_cmd = [ RSYNC_CMD, '--list-only', dir ]
    return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr )
    rsync_response.flush()
    rsync_response.seek(0)
    rsync_stderr.flush()
    rsync_stderr.seek(0)
    if return_code:
        msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() )
        rsync_response.close()
        rsync_stderr.close()
        raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) )
    rsync_stderr.close()
    rval = {}
    for line in rsync_response:
        # Each listing line is "<permissions> <size> <date> <time> <name>"
        perms, line = line.split( None, 1 )
        line = line.strip()
        size, line = line.split( None, 1 )
        line = line.strip()
        date, line = line.split( None, 1 )
        line = line.strip()
        time, line = line.split( None, 1 )
        name = line.strip()
        if name in skip_names:
            continue
        rval[ name ] = dict( name=name, permissions=perms, bytes=size, date=date, time=time )
    rsync_response.close()
    return rval
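
# Illustrative result (not in the original): for the sample listing line shown above,
# rsync_list_dir( RSYNC_SERVER, LOCATION_DIR ) would include an entry such as
#   { '.': { 'name': '.', 'permissions': 'drwxr-xr-x', 'bytes': '50',
#            'date': '2014/05/16', 'time': '20:58:11' } }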

def rsync_sync_to_dir( source, target ):
    rsync_response = tempfile.NamedTemporaryFile()
    rsync_stderr = tempfile.NamedTemporaryFile()
    rsync_cmd = [ RSYNC_CMD, '-avzP', source, target ]
    return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr )
    rsync_response.flush()
    rsync_response.seek(0)
    rsync_stderr.flush()
    rsync_stderr.seek(0)
    if return_code:
        msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() )
        rsync_response.close()
        rsync_stderr.close()
        raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) )
    rsync_response.close()
    rsync_stderr.close()
    return return_code


def data_table_needs_refresh( cached_data_table, url ):
    if cached_data_table is None:
        return True, {}
    if datetime.datetime.now() - cached_data_table.get( 'time_loaded' ) > CACHE_TIME:
        data_table_text = urllib2.urlopen( url ).read()
        if cached_data_table.get( 'data_table_text', None ) != data_table_text:
            return True, { 'data_table_text': data_table_text }
        loc_file_attrs = rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )
        if cached_data_table.get( 'loc_file_attrs', None ) != loc_file_attrs:
            return True, { 'loc_file_attrs': loc_file_attrs }
    return False, {}

def load_data_tables_from_url( url=None, site='main', data_table_class=None ):
    if not url:
        url = TOOL_DATA_TABLE_CONF_XML_URLS.get( site, None )
    assert url, ValueError( 'You must provide either a URL or a site=name.' )

    cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get( url, None )
    refresh, attribs = data_table_needs_refresh( cached_data_table, url )
    if refresh:
        data_table_text = attribs.get( 'data_table_text' ) or urllib2.urlopen( url ).read()
        loc_file_attrs = attribs.get( 'loc_file_attrs' ) or rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )

        tmp_dir = tempfile.mkdtemp( prefix='rsync_g2_' )
        tmp_loc_dir = os.path.join( tmp_dir, 'location' )
        os.mkdir( tmp_loc_dir )
        rsync_sync_to_dir( rsync_urljoin( RSYNC_SERVER, LOCATION_DIR ), os.path.abspath( tmp_loc_dir ) )

        new_data_table_text = data_table_text.replace( TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % ( tmp_loc_dir ) )
        data_table_fh = tempfile.NamedTemporaryFile( dir=tmp_dir, prefix='rsync_data_manager_data_table_conf_' )
        data_table_fh.write( new_data_table_text )
        data_table_fh.flush()
        tmp_data_dir = os.path.join( tmp_dir, 'tool-data' )
        os.mkdir( tmp_data_dir )
        data_tables = data_table_class( tmp_data_dir, config_filename=data_table_fh.name )
        for name, data_table in data_tables.data_tables.items():
            if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column( data_table ):
                log.debug( 'Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name )
                del data_tables.data_tables[name]
        cached_data_table = { 'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now() }
        TOOL_DATA_TABLES_LOADED_BY_URL[ url ] = cached_data_table
        # delete the temporary files
        data_table_fh.close()
        cleanup_before_exit( tmp_dir )
    return cached_data_table

def data_table_has_path_column( data_table ):
    col_names = data_table.get_column_name_list()
    for name in PATH_COLUMN_NAMES:
        if name in col_names:
            return True
    return False

def get_available_tables( trans ):
    # list of data tables
    data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
    return data_tables.get( 'data_tables' ).get_tables().keys()

def get_new_xml_definition( app, data_table, data_manager, repo_info=None, location_file_dir=None ):
    sub_dict = { 'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': '' }
    sub_dict.update( data_manager.get_tool_shed_repository_info_dict() )
    if data_table.comment_char:
        sub_dict['comment_char'] = 'comment_char="%s"' % ( data_table.comment_char )
    for i, name in enumerate( data_table.get_column_name_list() ):
        if name is not None:
            sub_dict['columns'] = "%s\n%s" % ( sub_dict['columns'], '<column name="%s" index="%s" />' % ( name, i ) )
    location_file_dir = location_file_dir or app.config.galaxy_data_manager_data_path
    for filename in data_table.filenames.keys():
        sub_dict['file_path'] = basename( filename )
        sub_dict['file_path'] = os.path.join( location_file_dir, sub_dict['file_path'] )  # os.path.abspath?
        if not os.path.exists( sub_dict['file_path'] ):
            # Create empty file
            open( sub_dict['file_path'], 'wb+' ).close()
        break
    sub_dict[ 'repo_info' ] = repo_info or ''
    return """
           <tables><table name="%(table_name)s" %(comment_char)s>
               %(columns)s
               <file path="%(file_path)s" />
               %(repo_info)s
           </table></tables>
           """ % sub_dict

def get_available_tables_for_dbkey( trans, dbkey, data_table_names ):
    my_data_tables = trans.app.tool_data_tables.get_tables()
    data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
    rval = {}
    for name, data_table in data_tables.get( 'data_tables' ).get_tables().iteritems():
        if not data_table_names or name in data_table_names:  # name in my_data_tables.keys() and
            # TODO: check that columns are similar
            if not dbkey:
                entry_getter = data_table.get_named_fields_list()
            else:
                entry_getter = data_table.get_entries( 'dbkey', dbkey, None, default=[] )
            for entry in entry_getter:
                name = "%s: %s" % ( data_table.name, dumps( entry ) )
                rval[name] = entry
    return rval

def split_path_all( path ):
    rval = []
    path = path.rstrip( '/' )
    while True:
        head, tail = os.path.split( path )
        if tail:
            rval.append( tail )
            path = head
        elif head:
            rval.append( head )
            break
        else:
            break
    rval.reverse()
    return rval
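
# Illustrative usage (not in the original; 'hg19/sam_index' is a made-up relative path):
#   split_path_all( 'hg19/sam_index' )    -> [ 'hg19', 'sam_index' ]
#   split_path_all( '/galaxy/data/hg19' ) -> [ '/', 'galaxy', 'data', 'hg19' ]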

def get_data_for_path( path, data_root_dir ):
    # We list the remote dir with a trailing "/", but copy the data without it:
    # listing with "/" yields a "." entry when the path is a directory, while
    # syncing without the trailing "/" copies the whole directory into the target
    # instead of just the directory's contents.
    if path.startswith( GALAXY_DATA_CANONICAL_PATH ):
        path = path[ len( GALAXY_DATA_CANONICAL_PATH ): ]
    make_path = path
    rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), path )
    if rsync_source.endswith( '/' ):
        rsync_source = rsync_source[:-1]
    try:
        dir_list = rsync_list_dir( rsync_source + "/" )
    except Exception, e:
        dir_list = None
    while not dir_list or '.' not in dir_list:
        head, tail = os.path.split( make_path )
        if not head:
            head = tail
        make_path = head
        rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), head )  # if we error here, it is likely due to a connection issue
        if rsync_source.endswith( '/' ):
            rsync_source = rsync_source[:-1]
        dir_list = rsync_list_dir( rsync_source + "/" )
    split_path = split_path_all( make_path )
    target_path = data_root_dir
    for p in split_path[:-1]:
        target_path = os.path.join( target_path, p )
        if not os.path.exists( target_path ):
            os.mkdir( target_path )
    rsync_sync_to_dir( rsync_source, target_path )
    return path

def get_data_and_munge_path( data_table_name, data_table_entry, data_root_dir ):
    path_cols = []
    for key, value in data_table_entry.iteritems():
        if key in PATH_COLUMN_NAMES:
            path_cols.append( ( key, value ) )
    found_data = False
    if path_cols:
        for col_name, value in path_cols:
            if value.startswith( GALAXY_DATA_CANONICAL_PATH ):
                data_table_entry[col_name] = get_data_for_path( value, data_root_dir )
                found_data = True
            else:
                print 'unable to determine location of rsync data for', data_table_name, data_table_entry
    return data_table_entry

def fulfill_data_table_entries( data_table_entries, data_manager_dict, data_root_dir ):
    for data_table_name, entries in data_table_entries.iteritems():
        for entry in entries:
            entry = get_data_and_munge_path( data_table_name, entry, data_root_dir )
            _add_data_table_entry( data_manager_dict, data_table_name, entry )
    return data_manager_dict

def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
    data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( data_table_name, [] )
    data_manager_dict['data_tables'][data_table_name].append( data_table_entry )
    return data_manager_dict
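
# Illustrative shape of the accumulated dict (table name and columns are examples only):
#   { 'data_tables': { 'all_fasta': [ { 'value': ..., 'dbkey': ..., 'name': ..., 'path': ... } ] } }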

def cleanup_before_exit( tmp_dir ):
    if tmp_dir and os.path.exists( tmp_dir ):
        shutil.rmtree( tmp_dir )

def get_data_table_entries( params ):
    rval = {}
    data_table_entries = params.get( 'data_table_entries', None )
    if data_table_entries:
        for entry_text in data_table_entries.split( ',' ):
            entry_text = entry_text.strip().decode( 'base64' )
            entry_dict = loads( entry_text )
            data_table_name = entry_dict['name']
            data_table_entry = entry_dict['entry']
            rval[ data_table_name ] = rval.get( data_table_name, [] )
            rval[ data_table_name ].append( data_table_entry )
    return rval
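
# Illustrative round trip (not in the original): galaxy_code_get_available_data_tables_entries()
# above base64-encodes dumps( { 'name': <table name>, 'entry': <column dict> } ) for each
# selected entry, and this function decodes those strings back into { table_name: [ entry, ... ] }.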

def main():
    # Parse command line
    parser = optparse.OptionParser()
    (options, args) = parser.parse_args()

    filename = args[0]

    params = loads( open( filename ).read() )
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir( target_directory )
    data_manager_dict = {}

    data_table_entries = get_data_table_entries( params['param_dict'] )

    # Populate the data tables
    data_manager_dict = fulfill_data_table_entries( data_table_entries, data_manager_dict, target_directory )

    # Save info to the JSON file
    open( filename, 'wb' ).write( dumps( data_manager_dict ) )


if __name__ == "__main__":
    main()
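
# Illustrative invocation (assumed from the Galaxy Data Manager framework, not defined here):
#   python data_manager_rsync.py "${output_json}"
# where the JSON file provides params['output_data'][0]['extra_files_path'] and
# params['param_dict']['data_table_entries']; the script rewrites the same file with the
# resulting data manager JSON.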