Previous changeset 31:9cb5e4f12ce5 (2017-06-06) | Next changeset 33:228038cd0683 (2017-06-07)

Commit message:
    Uploaded 20170607

modified:
    gdcwebapp.xml
added:
    ._gdcwebapp.xml
    ._json_collect_data_source.py
    json_collect_data_source.py
removed:
    json_data_source_mod.py

diff -r 9cb5e4f12ce5 -r 1edc869cd008 ._gdcwebapp.xml
Binary file ._gdcwebapp.xml has changed

diff -r 9cb5e4f12ce5 -r 1edc869cd008 ._json_collect_data_source.py
Binary file ._json_collect_data_source.py has changed

diff -r 9cb5e4f12ce5 -r 1edc869cd008 gdcwebapp.xml
--- a/gdcwebapp.xml	Tue Jun 06 23:04:41 2017 -0400
+++ b/gdcwebapp.xml	Wed Jun 07 18:01:57 2017 -0400
@@ -11,7 +11,7 @@
     <command>
         <![CDATA[
         mkdir -p tmp &&
-        python ${__tool_directory__}/json_data_source_mod.py "${__app__.config.output_size_limit}" --json_param_file "${output1}" --path "." --appdata "tmp"
+        python ${__tool_directory__}/json_collect_data_source.py "${__app__.config.output_size_limit}" --json_param_file "${output1}" --path "." --appdata "tmp"
         ]]>
     </command>
     <inputs check_values="False" action="http://bioinf.iasi.cnr.it/gdcwebapp/app.php">
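
The only functional change in this hunk is the script name inside the tool's <command> template. As a rough sketch (not part of the changeset), the rendered command corresponds to an invocation along the following lines; every concrete value is a made-up stand-in for the Cheetah placeholders ${__tool_directory__}, ${__app__.config.output_size_limit} and ${output1}.

import subprocess

# Hypothetical stand-ins for the Cheetah placeholders in the <command> block above.
tool_directory = "/shed_tools/gdcwebapp"   # ${__tool_directory__}
output_size_limit = "1073741824"           # ${__app__.config.output_size_limit}, e.g. 1 GB
output1 = "params.json"                    # ${output1}: JSON parameter file written by Galaxy

subprocess.check_call([
    "python", tool_directory + "/json_collect_data_source.py",
    output_size_limit,                     # positional max_size argument
    "--json_param_file", output1,
    "--path", ".",
    "--appdata", "tmp",
])
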
diff -r 9cb5e4f12ce5 -r 1edc869cd008 json_collect_data_source.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/json_collect_data_source.py	Wed Jun 07 18:01:57 2017 -0400
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+import json
+import optparse
+import urllib
+import os.path
+import os
+from operator import itemgetter
+import tarfile
+
+__version__ = "1.0.0"
+CHUNK_SIZE = 2**20 #1mb
+VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
+
+
+def splitext(path):
+    for ext in ['.tar.gz', '.tar.bz2']:
+        if path.endswith(ext):
+            path, ext = path[:-len(ext)], path[-len(ext):]
+            break
+    else:
+        path, ext = os.path.splitext(path)
+    return path, ext[1:]
+
+
+def chunk_write( source_stream, target_stream, source_method = "read", target_method="write" ):
+    source_method = getattr( source_stream, source_method )
+    target_method = getattr( target_stream, target_method )
+    while True:
+        chunk = source_method( CHUNK_SIZE )
+        if chunk:
+            target_method( chunk )
+        else:
+            break
+
+
+def deconstruct_multi_filename( multi_filename ):
+    keys = [ 'primary', 'id', 'name', 'visible', 'file_type' ]
+    return ( dict( zip( keys, multi_filename.split('_') ) ) )
+
+
+def construct_multi_filename( id, name, file_type ):
+    """ Implementation of *Number of Output datasets cannot be determined until tool run* from documentation_.
+    .. _documentation: http://wiki.galaxyproject.org/Admin/Tools/Multiple%20Output%20Files
+    """
+    filename = "%s_%s_%s_%s_%s" % ( 'primary', id, name, 'visible', file_type )
+    return filename
+
+
+def download_from_query( query_data, target_output_filename ):
+    """ Download file from the json data and write it to target_output_filename.
+    """
+    query_url = query_data.get( 'url' )
+    query_file_type = query_data.get( 'extension' )
+    query_stream = urllib.urlopen( query_url )
+    output_stream = open( target_output_filename, 'wb' )
+    chunk_write( query_stream, output_stream )
+    query_stream.close()
+    output_stream.close()
+
+def store_file_from_archive( file_object, target_output_filename, isString=False ):
+    """ Store file after extracting from archive and organize them as a collection using the structure
+    (collection-name)_(file-name).ext as file name
+    """
+    output_stream = open( target_output_filename, 'wb' )
+    #chunk_write( file_object.read(), output_stream )
+    if not isString:
+        output_stream.write(file_object.read())
+    else:
+        output_stream.write(file_object)
+    output_stream.close()
+
+
+def download_extra_data( query_ext_data, base_path ):
+    """ Download any extra data defined in the JSON.
+    NOTE: the "path" value is a relative path to the file on our
+    file system. This is slightly dangerous and we should make every effort
+    to avoid a malicious absolute path to write the file elsewhere on the
+    filesystem.
+    """
+    for ext_data in query_ext_data:
+        if not os.path.exists( base_path ):
+            os.mkdir( base_path )
+        query_stream = urllib.urlopen( ext_data.get( 'url' ) )
+        ext_path = ext_data.get( 'path' )
+        os.makedirs( os.path.normpath( '/'.join( [ base_path, os.path.dirname( ext_path ) ] ) ) )
+        output_stream = open( os.path.normpath( '/'.join( [ base_path, ext_path ] ) ), 'wb' )
+        chunk_write( query_stream, output_stream )
+        query_stream.close()
+        output_stream.close()
+
+
+def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
+    """ Return line separated JSON """
+    meta_dict = dict( type = ds_type,
+                      ext = metadata.get( 'extension' ),
+                      filename = filename,
+                      name = metadata.get( 'name' ),
+                      metadata = metadata.get( 'metadata', {} ) )
+    if metadata.get( 'extra_data', None ):
+        meta_dict[ 'extra_files' ] = '_'.join( [ filename, 'files' ] )
+    if primary:
+        meta_dict[ 'base_dataset_id' ] = dataset_id
+    else:
+        meta_dict[ 'dataset_id' ] = dataset_id
[...]
+def set_up_config_values(json_params):
+    """ Parse json_params file and return a tuple of necessary configuration
+    values.
+    """
+    datasource_params = json_params.get( 'param_dict' )
+    dataset_url = datasource_params.get( 'URL' )
+    output_filename = datasource_params.get( 'output1', None )
+    output_data = json_params.get( 'output_data' )
+    extra_files_path, file_name, ext, out_data_name, hda_id, dataset_id = \
+        itemgetter('extra_files_path', 'file_name', 'ext', 'out_data_name', 'hda_id', 'dataset_id')(output_data[0])
+    return (dataset_url, output_filename,
+            extra_files_path, file_name,
+            ext, out_data_name,
+            hda_id, dataset_id)
+
+
+def download_from_json_data( options, args ):
+    """ Parse the returned JSON data and download files. Write metadata
+    to flat JSON file.
+    """
+    output_base_path = options.path
+    appdata_path = options.appdata
+    if not os.path.exists(appdata_path):
+        os.makedirs(appdata_path)
+
+    # read tool job configuration file and parse parameters we need
+    json_params = json.loads( open( options.json_param_file, 'r' ).read() )
+    print("json_params: "+str(json_params))
+
+    dataset_url, output_filename, \
+        extra_files_path, file_name, \
+        ext, out_data_name, \
+        hda_id, dataset_id = set_up_config_values(json_params)
+    # line separated JSON file to contain all dataset metadata
+    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
+
+    # get JSON response from data source
+    # TODO: make sure response is not enormous
+    query_params = json.loads(urllib.urlopen( dataset_url ).read())
+    # download and write files
+    primary = False
+    #primary = True
+    # query_item, hda_id, output_base_path, dataset_id
+    for query_item in query_params:
+        if isinstance( query_item, list ):
+            # TODO: do something with the nested list as a collection
+            for query_subitem in query_item:
+                primary = download_files_and_write_metadata(query_subitem, json_params, output_base_path,
+                                                            metadata_parameter_file, primary, appdata_path, options, args)
+
+        elif isinstance( query_item, dict ):
+            primary = download_files_and_write_metadata(query_item, json_params, output_base_path,
+                                                        metadata_parameter_file, primary, appdata_path, options, args)
+    metadata_parameter_file.close()
+
+def __main__():
+    """ Read the JSON return from a data source. Parse each line and request
+    the data, download to "newfilepath", and write metadata.
+
+    Schema
+    ------
+
+        [ {"url":"http://url_of_file",
+           "name":"encode WigData",
+           "extension":"wig",
+           "metadata":{"db_key":"hg19"},
+           "extra_data":[ {"url":"http://url_of_ext_file",
+                           "path":"rel/path/to/ext_file"}
+                        ]
+          }
+        ]
+
+    """
+    # Parse the command line options
+    usage = "Usage: json_data_source_mod.py max_size --json_param_file filename [options]"
+    parser = optparse.OptionParser(usage = usage)
+    parser.add_option("-j", "--json_param_file", type="string",
+                      action="store", dest="json_param_file", help="json schema return data")
+    parser.add_option("-p", "--path", type="string",
+                      action="store", dest="path", help="new file path")
+    parser.add_option("-a", "--appdata", type="string",
+                      action="store", dest="appdata", help="appdata folder name")
+    parser.add_option("-v", "--version", action="store_true", dest="version",
+                      default=False, help="display version and exit")
+
+    (options, args) = parser.parse_args()
+    if options.version:
+        print __version__
+    else:
+        download_from_json_data( options, args )
+
+
+if __name__ == "__main__": __main__()
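
For orientation, the sketch below shows the kind of --json_param_file payload that set_up_config_values() in the new script reads. The key names (param_dict.URL, output_data[0].extra_files_path, file_name, ext, out_data_name, hda_id, dataset_id, and job_config.TOOL_PROVIDED_JOB_METADATA_FILE) come from the diff above; every concrete value is a hypothetical placeholder, not taken from the changeset.

import json

# Hypothetical example of the JSON parameter file Galaxy would pass via --json_param_file.
json_params = {
    "param_dict": {
        "URL": "http://example.org/gdcwebapp/results.json",  # data-source URL the script queries
        "output1": "/galaxy/files/dataset_42.dat",
    },
    "output_data": [{
        "extra_files_path": "/galaxy/files/dataset_42_files",
        "file_name": "/galaxy/files/dataset_42.dat",
        "ext": "auto",
        "out_data_name": "output1",
        "hda_id": 42,
        "dataset_id": 42,
    }],
    "job_config": {
        "TOOL_PROVIDED_JOB_METADATA_FILE": "galaxy.json",
    },
}

# Write the file the way the sketch after the gdcwebapp.xml hunk expects it.
with open("params.json", "w") as handle:
    json.dump(json_params, handle)
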
diff -r 9cb5e4f12ce5 -r 1edc869cd008 json_data_source_mod.py
--- a/json_data_source_mod.py	Tue Jun 06 23:04:41 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,281 +0,0 @@
-#!/usr/bin/env python
-import json
-import optparse
-import urllib
-import os.path
-import os
-from operator import itemgetter
-import tarfile
-
-__version__ = "1.0.0"
-CHUNK_SIZE = 2**20 #1mb
-VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
-
-
-def splitext(path):
-    for ext in ['.tar.gz', '.tar.bz2']:
-        if path.endswith(ext):
-            path, ext = path[:-len(ext)], path[-len(ext):]
-            break
-    else:
-        path, ext = os.path.splitext(path)
-    return path, ext[1:]
-
-
-def chunk_write( source_stream, target_stream, source_method = "read", target_method="write" ):
-    source_method = getattr( source_stream, source_method )
-    target_method = getattr( target_stream, target_method )
-    while True:
-        chunk = source_method( CHUNK_SIZE )
-        if chunk:
-            target_method( chunk )
-        else:
-            break
-
-
-def deconstruct_multi_filename( multi_filename ):
-    keys = [ 'primary', 'id', 'name', 'visible', 'file_type' ]
-    return ( dict( zip( keys, multi_filename.split('_') ) ) )
-
-
-def construct_multi_filename( id, name, file_type ):
-    """ Implementation of *Number of Output datasets cannot be determined until tool run* from documentation_.
-    .. _documentation: http://wiki.galaxyproject.org/Admin/Tools/Multiple%20Output%20Files
-    """
-    filename = "%s_%s_%s_%s_%s" % ( 'primary', id, name, 'visible', file_type )
-    return filename
-
-
-def download_from_query( query_data, target_output_filename ):
-    """ Download file from the json data and write it to target_output_filename.
-    """
-    query_url = query_data.get( 'url' )
-    query_file_type = query_data.get( 'extension' )
-    query_stream = urllib.urlopen( query_url )
-    output_stream = open( target_output_filename, 'wb' )
-    chunk_write( query_stream, output_stream )
-    query_stream.close()
-    output_stream.close()
-
-def store_file_from_archive( file_object, target_output_filename, isString=False ):
-    """ Store file after extracting from archive and organize them as a collection using the structure
-    (collection-name)_(file-name).ext as file name
-    """
-    output_stream = open( target_output_filename, 'wb' )
-    #chunk_write( file_object.read(), output_stream )
-    if not isString:
-        output_stream.write(file_object.read())
-    else:
-        output_stream.write(file_object)
-    output_stream.close()
-
-
-def download_extra_data( query_ext_data, base_path ):
-    """ Download any extra data defined in the JSON.
-    NOTE: the "path" value is a relative path to the file on our
-    file system. This is slightly dangerous and we should make every effort
-    to avoid a malicious absolute path to write the file elsewhere on the
-    filesystem.
-    """
-    for ext_data in query_ext_data:
-        if not os.path.exists( base_path ):
-            os.mkdir( base_path )
-        query_stream = urllib.urlopen( ext_data.get( 'url' ) )
-        ext_path = ext_data.get( 'path' )
-        os.makedirs( os.path.normpath( '/'.join( [ base_path, os.path.dirname( ext_path ) ] ) ) )
-        output_stream = open( os.path.normpath( '/'.join( [ base_path, ext_path ] ) ), 'wb' )
-        chunk_write( query_stream, output_stream )
-        query_stream.close()
-        output_stream.close()
-
-
-def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
-    """ Return line separated JSON """
-    meta_dict = dict( type = ds_type,
-                      ext = metadata.get( 'extension' ),
-                      filename = filename,
-                      name = metadata.get( 'name' ),
-                      metadata = metadata.get( 'metadata', {} ) )
-    if metadata.get( 'extra_data', None ):
-        meta_dict[ 'extra_files' ] = '_'.join( [ filename, 'files' ] )
-    if primary:
-        meta_dict[ 'base_dataset_id' ] = dataset_id
-    else:
-        meta_dict[ 'dataset_id' ] = dataset_id
[...]
-def set_up_config_values(json_params):
-    """ Parse json_params file and return a tuple of necessary configuration
-    values.
-    """
-    datasource_params = json_params.get( 'param_dict' )
-    dataset_url = datasource_params.get( 'URL' )
-    output_filename = datasource_params.get( 'output1', None )
-    output_data = json_params.get( 'output_data' )
-    extra_files_path, file_name, ext, out_data_name, hda_id, dataset_id = \
-        itemgetter('extra_files_path', 'file_name', 'ext', 'out_data_name', 'hda_id', 'dataset_id')(output_data[0])
-    return (dataset_url, output_filename,
-            extra_files_path, file_name,
-            ext, out_data_name,
-            hda_id, dataset_id)
-
-
-def download_from_json_data( options, args ):
-    """ Parse the returned JSON data and download files. Write metadata
-    to flat JSON file.
-    """
-    output_base_path = options.path
-    appdata_path = options.appdata
-    if not os.path.exists(appdata_path):
-        os.makedirs(appdata_path)
-
-    # read tool job configuration file and parse parameters we need
-    json_params = json.loads( open( options.json_param_file, 'r' ).read() )
-
-    dataset_url, output_filename, \
-        extra_files_path, file_name, \
-        ext, out_data_name, \
-        hda_id, dataset_id = set_up_config_values(json_params)
-    # line separated JSON file to contain all dataset metadata
-    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
-
-    # get JSON response from data source
-    # TODO: make sure response is not enormous
-    query_params = json.loads(urllib.urlopen( dataset_url ).read())
-    # download and write files
-    primary = False
-    #primary = True
-    # query_item, hda_id, output_base_path, dataset_id
-    for query_item in query_params:
-        if isinstance( query_item, list ):
-            # TODO: do something with the nested list as a collection
-            for query_subitem in query_item:
-                primary = download_files_and_write_metadata(query_subitem, json_params, output_base_path,
-                                                            metadata_parameter_file, primary, appdata_path, options, args)
-
-        elif isinstance( query_item, dict ):
-            primary = download_files_and_write_metadata(query_item, json_params, output_base_path,
-                                                        metadata_parameter_file, primary, appdata_path, options, args)
-    metadata_parameter_file.close()
-
-def __main__():
-    """ Read the JSON return from a data source. Parse each line and request
-    the data, download to "newfilepath", and write metadata.
-
-    Schema
-    ------
-
-        [ {"url":"http://url_of_file",
-           "name":"encode WigData",
-           "extension":"wig",
-           "metadata":{"db_key":"hg19"},
-           "extra_data":[ {"url":"http://url_of_ext_file",
-                           "path":"rel/path/to/ext_file"}
-                        ]
-          }
-        ]
-
-    """
-    # Parse the command line options
-    usage = "Usage: json_data_source_mod.py max_size --json_param_file filename [options]"
-    parser = optparse.OptionParser(usage = usage)
-    parser.add_option("-j", "--json_param_file", type="string",
-                      action="store", dest="json_param_file", help="json schema return data")
-    parser.add_option("-p", "--path", type="string",
-                      action="store", dest="path", help="new file path")
-    parser.add_option("-a", "--appdata", type="string",
-                      action="store", dest="appdata", help="appdata folder name")
-    parser.add_option("-v", "--version", action="store_true", dest="version",
-                      default=False, help="display version and exit")
-
-    (options, args) = parser.parse_args()
-    if options.version:
-        print __version__
-    else:
-        download_from_json_data( options, args )
-
-
-if __name__ == "__main__": __main__()