Changeset 34:d65de900967e (2017-06-13)
Commit message:
Uploaded 20170613

modified:
json_collect_data_source.py

added:
._gdcwebapp.xml
._json_collect_data_source.py
diff -r 228038cd0683 -r d65de900967e ._gdcwebapp.xml
Binary file ._gdcwebapp.xml has changed

diff -r 228038cd0683 -r d65de900967e ._json_collect_data_source.py
Binary file ._json_collect_data_source.py has changed
diff -r 228038cd0683 -r d65de900967e json_collect_data_source.py
--- a/json_collect_data_source.py	Wed Jun 07 18:02:01 2017 -0400
+++ b/json_collect_data_source.py	Tue Jun 13 16:39:40 2017 -0400
@@ -6,6 +6,7 @@
 import os
 from operator import itemgetter
 import tarfile
+import zipfile
 
 __version__ = "1.0.0"
 CHUNK_SIZE = 2**20 #1mb
@@ -13,6 +14,7 @@
 
 
 def splitext(path):
+    # extract the folder path and extension of a file from its path
     for ext in ['.tar.gz', '.tar.bz2']:
         if path.endswith(ext):
             path, ext = path[:-len(ext)], path[-len(ext):]
@@ -57,16 +59,10 @@
     query_stream.close()
     output_stream.close()
 
-def store_file_from_archive( file_object, target_output_filename, isString=False ):
-    """ Store file after extracting from archive and organize them as a collection using the structure 
-    (collection-name)_(file-name).ext as file name
-    """
+def store_file_from_tarfile( file_object, target_output_filename, isString=False ):
+    # store the file_object (from tarfile) on the filesystem
     output_stream = open( target_output_filename, 'wb' )
-    #chunk_write( file_object.read(), output_stream )
-    if not isString:
-        output_stream.write(file_object.read())
-    else:
-        output_stream.write(file_object)
+    output_stream.write(file_object.read())
     output_stream.close()
 
 
@@ -105,23 +101,47 @@
     return "%s\n" % json.dumps( meta_dict )
 
 
-def walk_on_archive(target_output_filename, check_ext, archive_name, appdata_path, db_key="?"):
+def walk_on_archive(target_output_filename, check_ext, archive_library, archive_name, appdata_path, db_key="?"):
+    # fix archive name using valid chars only
+    archive_name = ''.join(e for e in archive_name if e in VALID_CHARS)
     archive_name = archive_name.replace("_", "-").replace(".", "-")
-    with tarfile.open( target_output_filename, check_ext ) as tf:
-        for entry in tf:
-            if entry.isfile():
-                fileobj = tf.extractfile( entry )
-                # reserve the underscore for the collection searator
-                filename = os.path.basename( entry.name ).replace("_", "-")
-                extension = splitext( filename )[1]
-                # pattern: (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)_(?P<ext>[^_]+)_(?P<dbkey>[^_]+)
-                if (len(extension) > 0):
-                    filename = (filename[0:len(filename)-(len(extension)+1)]).replace(".", "-") + "." + extension + "_" + extension
-                else:
-                    extension = "auto"
-                filename_with_collection_prefix = archive_name + "_" + filename + "_" + db_key
-                target_entry_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
-                store_file_from_archive( fileobj, target_entry_output_filename )
+    if archive_library is "zipfile":
+        # iterate over entries inside the archive [zip]
+        with zipfile.ZipFile( target_output_filename, check_ext ) as zf:
+            for entry in zf.namelist():
+                # if entry is file
+                if entry.startswith("%s/" % entry.rstrip("/")) is False:
+                    # retrieve file name
+                    # the underscore character is reserved
+                    filename = os.path.basename( entry.split("/")[-1] ).replace("_", "-")
+                    # retrieve file extension
+                    extension = splitext( filename )[1]
+                    # if no extension use 'auto'
+                    if (len(extension) == 0):
+                        extension = "auto"
+                    # pattern: (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)_(?P<ext>[^_]+)_(?P<dbkey>[^_]+)
+                    filename_with_collection_prefix = archive_name + "_" + filename + "_" + extension + "_" + db_key
+                    # store current entry on filesystem
+                    zf.extract( filename_with_collection_prefix, appdata_path )
+    elif archive_library is "tarfile":
+        # iterate over entries inside the archive [gz, bz2, tar]
+        with tarfile.open( target_output_filename, check_ext ) as tf:
+            for entry in tf:
[...]
                                                          ...primary_dataset',
                                                          primary=primary) )
     else:
         target_output_filename = output_filename
-        if isArchive is False:
+        if (isArchive is False) or ((isArchive is True) and (organize is False)):
             metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
                                                              target_output_filename,
                                                              ds_type='dataset',
                                                              primary=primary) )
 
-    if isArchive is False:
+    if (isArchive is False) or ((isArchive is True) and (organize is False)):
         download_from_query( query_item, target_output_filename )
     else:
+        # if the current entry is an archive download it inside appdata folder
         target_output_path = os.path.join(appdata_path, filename)
         download_from_query( query_item, target_output_path )
     if extra_data:
+        # just download extra data
         extra_files_path = ''.join( [ target_output_filename, 'files' ] )
         download_extra_data( extra_data, extra_files_path )
 
-    """ the following code handles archives and decompress them in a collection """
-    if ( isArchive ):
+    # if the current file is an archive and want to organize the content
+    # -> decompress the archive and populate the collection (has to be defined in the tool xml schema)
+    if isArchive and organize:
+        # set the same db_key for each file inside the archive
+        # use the db_key associated to the archive (if it exists)
         db_key = "?"
         archive_metadata = query_item.get( 'metadata', None )
         if archive_metadata is not None:
@@ -182,7 +229,11 @@
                 db_key = archive_metadata.get( 'db_key' )
             except:
                 pass
-        walk_on_archive(target_output_path, check_ext, filename, appdata_path, db_key)
+        archive_name = query_item.get( 'name', None )
+        if archive_name is None:
+            archive_name = filename
+        # iterate over the archive content
+        walk_on_archive(target_output_path, check_ext, archive_library, archive_name, appdata_path, db_key)
 
     return True
 
@@ -214,7 +265,7 @@
 
     # read tool job configuration file and parse parameters we need
     json_params = json.loads( open( options.json_param_file, 'r' ).read() )
-    print("json_params: "+str(json_params))
+    #print("json_params: "+str(json_params))
 
     dataset_url, output_filename, \
     extra_files_path, file_name, \
@@ -250,9 +301,10 @@
     ------
 
     [ {"url":"http://url_of_file",
-       "name":"encode WigData",
-       "extension":"wig",
-       "metadata":{"db_key":"hg19"},
+       "name":"My Archive",
+       "extension":"tar.gz",
+       "organize":"true",
+       "metadata":{"db_key":"hg38"},
        "extra_data":[ {"url":"http://url_of_ext_file",
                        "path":"rel/path/to/ext_file"}
                      ]
@@ -261,12 +313,13 @@
 
     """
     # Parse the command line options
-    usage = "Usage: json_data_source_mod.py max_size --json_param_file filename [options]"
+    usage = "Usage: json_collect_data_source.py max_size --json_param_file filename [options]"
     parser = optparse.OptionParser(usage = usage)
     parser.add_option("-j", "--json_param_file", type="string",
                       action="store", dest="json_param_file", help="json schema return data")
     parser.add_option("-p", "--path", type="string",
                       action="store", dest="path", help="new file path")
+    # set appdata: temporary directory in which the archives will be decompressed
     parser.add_option("-a", "--appdata", type="string",
                       action="store", dest="appdata", help="appdata folder name")
     parser.add_option("-v", "--version", action="store_true", dest="version",
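The reworked walk_on_archive dispatches on the archive_library value ("zipfile" or "tarfile") and flattens every archive member into the appdata folder under a (archive)_(file)_(extension)_(dbkey) name, matching the identifier_0/identifier_1/ext/dbkey pattern quoted in the diff, so the extracted files can be grouped into a dataset collection. The following is a minimal sketch of that naming and dispatch idea only, not the committed code: the helper names collection_entry_name and unpack_archive are invented for this example, and details of the actual script (VALID_CHARS filtering, multi-part extensions such as .tar.gz, the check_ext open mode) are omitted.

import os
import tarfile
import zipfile


def collection_entry_name(archive_name, member_name, db_key="?"):
    # Flatten an archive member name into the collection pattern
    # (archive)_(file)_(extension)_(dbkey). The underscore is reserved as
    # the separator, so underscores/dots inside the parts become dashes.
    archive_name = archive_name.replace("_", "-").replace(".", "-")
    filename = os.path.basename(member_name).replace("_", "-")
    extension = os.path.splitext(filename)[1].lstrip(".") or "auto"
    return "%s_%s_%s_%s" % (archive_name, filename, extension, db_key)


def unpack_archive(archive_path, archive_library, archive_name, appdata_path, db_key="?"):
    # Dispatch on the archive type and copy every regular file into
    # appdata_path under its collection-style name.
    os.makedirs(appdata_path, exist_ok=True)
    if archive_library == "zipfile":
        with zipfile.ZipFile(archive_path) as zf:
            for info in zf.infolist():
                if info.filename.endswith("/"):  # skip directory entries
                    continue
                target = os.path.join(appdata_path,
                                      collection_entry_name(archive_name, info.filename, db_key))
                with zf.open(info) as src, open(target, "wb") as dst:
                    dst.write(src.read())
    elif archive_library == "tarfile":
        with tarfile.open(archive_path) as tf:  # default mode auto-detects gz/bz2
            for member in tf:
                if not member.isfile():
                    continue
                target = os.path.join(appdata_path,
                                      collection_entry_name(archive_name, member.name, db_key))
                with open(target, "wb") as dst:
                    dst.write(tf.extractfile(member).read())

For example, unpack_archive("archive.tar.gz", "tarfile", "myarchive", "appdata", "hg38") would fill appdata/ with files named like myarchive_sample1.bed_bed_hg38, with the db_key taken from the archive's metadata entry as in the diff above.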