#!/usr/bin/python

"""
****************************** versioned_data_cache_clear.py ******************************
Call this script directly to clear out all but the latest cached folder versions,
both in the Galaxy "Versioned Data" data library and in the server data store
folders.

Suggest running this as the galaxy user or a less privileged user, but the
versioneddata_api_key file does need to be readable by that user.
"""
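
# Typical invocation (an assumption - adjust the user to your install):
#
#   sudo -u galaxy python versioned_data_cache_clear.py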

import os

import vdb_common
import vdb_retrieval

# Note that globals from vdb_retrieval can be referenced by prefixing with vdb_retrieval.XYZ
# Note that this script uses the admin_api established in vdb_retrieval.py

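# Connect to Galaxy: set_admin_api() establishes the admin connection (using
# the versioneddata_api_key file noted above), which this offline script also
# reuses as its user connection; set_datastores() then loads the list of data
# store spec files found in the Versioned Data library.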
retrieval_obj = vdb_retrieval.VDBRetrieval()
retrieval_obj.set_admin_api()
retrieval_obj.user_api = retrieval_obj.admin_api
retrieval_obj.set_datastores()

# Dataset ids of the latest cached version in each data store.  A workflow
# cache folder is kept only if every dataset id in its underscore-delimited
# name is found in this list; otherwise the folder is removed below.
workflow_keepers = []
library_folder_deletes = []
library_dataset_deletes = []

# Cycle through the data stores, listing the subfolders under each, sorted;
# permanently delete all but the latest subfolder.
for data_store in retrieval_obj.data_stores:
    spec_file_id = data_store['id']

    # STEP 1: Determine the data store's type and location.
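    # show_folder() returns the spec file's metadata; test_data_store_type()
    # infers the store's type (e.g. "folder" or "biomaj", tested just below)
    # from that file's name.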
    data_store_spec = retrieval_obj.admin_api.libraries.show_folder(retrieval_obj.library_id, spec_file_id)
    data_store_type = retrieval_obj.test_data_store_type(data_store_spec['name'])

    # Folder and biomaj data stores are static - they don't do caching.
    if data_store_type not in ('folder', 'biomaj'):

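        # The gateway object for this data store's type exposes both the data
        # library path of its cached versions (library_label_path) and its
        # server-side location (data_store_path), both used below.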
        base_folder_id = data_store_spec['folder_id']
        ds_obj = retrieval_obj.get_data_store_gateway(data_store_type, spec_file_id)

        print

        # Cycle through the library tree; we have to look at the whole thing
        # since there's no /[string]/* wildcard search:
        folders = retrieval_obj.get_library_folders(ds_obj.library_label_path)
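        # Assumption: the data store's own folder comes first, and its version
        # subfolders follow in ascending version order, so the last entry is
        # the newest cache.  The keep/drop logic below relies on that ordering.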
        for ptr, folder in enumerate(folders):

            # Ignore the folder that represents the data store itself:
            if ptr == 0:
                print 'Data Store ::' + folder['name']

            # Keep the most recent cache item:
            elif ptr == len(folders) - 1:
                print 'Cached Version ::' + folder['name']
                # Keep the ids for the workflow cache membership test below.
                workflow_keepers.extend([item['id'] for item in folder['files']])

            # Drop version caches that are further in the past:
            else:
                print 'Clearing version cache:' + folder['name']
                # append() the folder itself (extend() would add the characters
                # of its id string one by one); its 'id' is read when deleting.
                library_folder_deletes.append(folder)
                library_dataset_deletes.extend(folder['files'])


        # Now auto-clean the server's versioned/ data store folders too:
        print "Server loc: " + ds_obj.data_store_path

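        # vdb_common.natural_sort_key orders folder names so numeric parts
        # compare as numbers ("v10" after "v9"); reverse=True puts the newest
        # first.  A minimal sketch of such a key function (an illustration
        # only - see vdb_common.py for the real implementation):
        #
        #   def natural_sort_key(text):
        #       return [int(chunk) if chunk.isdigit() else chunk.lower()
        #               for chunk in re.split(r'(\d+)', text)]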
        items = os.listdir(ds_obj.data_store_path)
        items = sorted(items, key=lambda el: vdb_common.natural_sort_key(el), reverse=True)
        count = 0
        for name in items:

            # If it is a directory and it isn't the master or symlinked "current" one:
            # (To do: add ability to skip other symlinked folders too?)
            version_folder = os.path.join(ds_obj.data_store_path, name)
            if not name == 'master' \
            and os.path.isdir(version_folder) \
            and not os.path.islink(version_folder):

                count += 1
                if count == 1:
                    print "Keeping cache:" + name
                else:
                    print "Dropping cache:" + name
                    for root2, dirs2, files2 in os.walk(version_folder):
                        for version_file in files2:
                            full_path = os.path.join(root2, version_file)
                            print "Removing " + full_path
                            os.remove(full_path)
                    # Not expecting any subfolders here.
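                    # (If one ever appears, os.rmdir() below will raise OSError
                    # on the non-empty directory; shutil.rmtree(version_folder)
                    # would be the way to remove a whole tree.)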
                    os.rmdir(version_folder)


# Permanently delete the cleared data library datasets:
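# (BioBlend's delete_library_dataset() takes purged=True to have each dataset
# permanently purged rather than just marked deleted.)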
for item in library_dataset_deletes:
    retrieval_obj.admin_api.libraries.delete_library_dataset(retrieval_obj.library_id, item['id'], purged=True)


# Newer BioBlend releases provide an API method for deleting Galaxy library
# folders; without it, the old way is still possible: an HTTP DELETE request
# to {{url}}/api/folders/{{encoded_folder_id}}?key={{key}}
if 'folders' in dir(retrieval_obj.admin_api):
    for folder in library_folder_deletes:
        retrieval_obj.admin_api.folders.delete(folder['id'])


print workflow_keepers

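# Clean out stale workflow caches.  Each workflow cache folder is named by
# joining its input dataset ids with underscores, so a folder is kept only
# when every one of those ids is still in workflow_keepers.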
workflow_cache_folders = retrieval_obj.get_library_folders('/' + vdb_retrieval.VDB_WORKFLOW_CACHE_FOLDER_NAME + '/')

for folder in workflow_cache_folders:
    dataset_ids = folder['name'].split('_')  # Input dataset ids, separated by underscores.
    count = 0
    for id in dataset_ids:
        if id in workflow_keepers:
            count += 1

    # If every input dataset of this workflow cache still exists in the
    # library cache, keep it:
    if count == len(dataset_ids):
        continue
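    # (Equivalent test: if set(dataset_ids) <= set(workflow_keepers): continue)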

    # Otherwise we have one or more stale cached datasets to drop.
    print "Dropping workflow cache: " + folder['name']
    for id in [item['id'] for item in folder['files']]:
        print id
        retrieval_obj.admin_api.libraries.delete_library_dataset(retrieval_obj.library_id, id, purged=True)

    # Now delete the workflow cache folder itself.
    if 'folders' in dir(retrieval_obj.admin_api):
        retrieval_obj.admin_api.folders.delete(folder['id'])
+ − 130