diff versioned_data_cache_clear.py @ 1:5c5027485f7d draft

Uploaded correct file
author damion
date Sun, 09 Aug 2015 16:07:50 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/versioned_data_cache_clear.py	Sun Aug 09 16:07:50 2015 -0400
@@ -0,0 +1,144 @@
+#!/usr/bin/python
+
+"""
+****************************** versioned_data_cache_clear.py ******************************
+ Call this script directly to clear out all but the latest Galaxy "Versioned Data"
+ data library and server data store cached folder versions.
+
+ Suggest running this as the Galaxy user or another less privileged user; the
+ versioneddata_api_key file does need to be readable by that user.
+"""
+import vdb_retrieval
+import vdb_common
+import glob
+import os
+
+# Note that globals from vdb_retrieval can be referenced by prefixing with vdb_retrieval.XYZ
+# Note that this script uses the admin_api established in vdb_retrieval.py
+
+retrieval_obj = vdb_retrieval.VDBRetrieval()
+retrieval_obj.set_admin_api()
+retrieval_obj.user_api = retrieval_obj.admin_api
+retrieval_obj.set_datastores()
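+# data_stores is expected to hold one record per data store spec file in the
+# Versioned Data library, each carrying that spec file's library 'id'.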
+
+workflow_keepers = [] # Versioned Data library dataset ids; a workflow cache folder is kept only if every id in its folder name appears here, otherwise the folder is removed.
+library_folder_deletes = []
+library_dataset_deletes = []
+
+# Cycle through the data stores, listing the subfolders under each, sorted by version.
+# Permanently delete all but the latest subfolder.
+for data_store in retrieval_obj.data_stores:
+	spec_file_id = data_store['id']
+	# STEP 1:  Determine data store type and location
+	data_store_spec = retrieval_obj.admin_api.libraries.show_folder(retrieval_obj.library_id, spec_file_id)
+	data_store_type = retrieval_obj.test_data_store_type(data_store_spec['name'])
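+	# test_data_store_type() is assumed to derive the store type (e.g. 'folder'
+	# or 'biomaj') from the spec file's name.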
+
+	if data_store_type not in ('folder', 'biomaj'): # Folder and BioMAJ data stores are static - they don't do caching.
+
+		base_folder_id = data_store_spec['folder_id']
+		ds_obj = retrieval_obj.get_data_store_gateway(data_store_type, spec_file_id)
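+		# The gateway object is assumed to expose library_label_path (the store's
+		# folder path within the data library) and data_store_path (its cache
+		# directory on the server), both used below.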
+
+		print
+
+		# Cycle through the library tree; we have to scan the whole thing since there's no /[string]/* wildcard search:
+		folders = retrieval_obj.get_library_folders(ds_obj.library_label_path)
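+		# get_library_folders() is assumed to return version subfolders in
+		# ascending order, so the last entry is the newest cache.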
+		for ptr, folder in enumerate(folders):
+	
+			# Ignore the folder that represents the data store itself:
+			if ptr == 0: 
+				print 'Data Store ::' + folder['name']
+
+			# Keep most recent cache item
+			elif ptr == len(folders)-1:
+				print 'Cached Version ::' + folder['name']
+				workflow_keepers.extend([item['id'] for item in folder['files']])
+				
+			# Drop version caches that are further in the past:
+			else:
+				print 'Clearing version cache:' + folder['name']
+				library_folder_deletes.append(folder)
+				library_dataset_deletes.extend(folder['files'])
+
+	
+		# Also clean out the old cache folders under the server's data store path:
+		print "Server loc: " + ds_obj.data_store_path
+
+		items = os.listdir(ds_obj.data_store_path)
+		items = sorted(items, key=lambda el: vdb_common.natural_sort_key(el), reverse=True)
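+		# natural_sort_key is assumed to sort digit runs numerically (e.g. "v10"
+		# after "v9"); with reverse=True the newest version comes first, so the
+		# count == 1 case below keeps only the newest cache.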
+		count = 0
+		for name in items:
+
+			# If it is a directory and it isn't the master folder or a symlinked (e.g. "current") one:
+			version_folder = os.path.join(ds_obj.data_store_path, name)
+			if name != 'master' \
+				and os.path.isdir(version_folder) \
+				and not os.path.islink(version_folder):
+
+				count += 1
+				if count == 1:
+					print "Keeping cache:" + name
+				else:
+					print "Dropping cache:" + name
+					for root2, dirs2, files2 in os.walk(version_folder):
+						for version_file in files2:
+							full_path = os.path.join(root2, version_file)
+							print "Removing " +	full_path						
+							os.remove(full_path)
+						# Not expecting any subfolders here, so the directory is now empty.
+
+					os.rmdir(version_folder)
+
+
+# Permanently delete specific data library datasets:
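+# (purged=True asks Galaxy to remove each dataset permanently, not just mark it deleted.)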
+for item in library_dataset_deletes:
+	retrieval_obj.admin_api.libraries.delete_library_dataset(retrieval_obj.library_id, item['id'], purged=True)
+
+
+# Newer BioBlend API method for deleting Galaxy library folders.
+# OLD Galaxy way possible: http DELETE request to {{url}}/api/folders/{{encoded_folder_id}}?key={{key}}
+if hasattr(retrieval_obj.admin_api, 'folders'):
+	for folder in library_folder_deletes:
+		retrieval_obj.admin_api.folders.delete(folder['id'])
+
+
+print "Workflow input dataset ids to keep: " + str(workflow_keepers)
+
+workflow_cache_folders = retrieval_obj.get_library_folders('/' + vdb_retrieval.VDB_WORKFLOW_CACHE_FOLDER_NAME + '/')
+
+for folder in workflow_cache_folders:
+	dataset_ids = folder['name'].split('_') # input dataset ids separated by underscores
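+	# e.g. a (hypothetical) folder named "123_456" caches a workflow run whose
+	# inputs were library datasets 123 and 456.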
+	count = 0
+	for id in dataset_ids:
+		if id in workflow_keepers:
+			count += 1
+
+	# If every input dataset in the workflow cache still exists in the library cache, keep it.
+	if count == len(dataset_ids):
+		continue
+
+	# We have one or more cached datasets to drop.
+	print "Dropping workflow cache: " + folder['name']
+	for id in [item['id'] for item in folder['files']]:
+		print id
+		retrieval_obj.admin_api.libraries.delete_library_dataset(retrieval_obj.library_id, id, purged=True)
+
+	# Now delete the workflow cache folder itself.
+	if hasattr(retrieval_obj.admin_api, 'folders'):
+		retrieval_obj.admin_api.folders.delete(folder['id'])
+
+