Previous changeset 11:9d24947d4335 (2017-05-25) | Next changeset 13:39c4f4528c6e (2017-05-30)
Commit message:
Uploaded
modified:
gdcwebapp.xml json_data_source_mod.py
added:
._gdcwebapp.xml ._json_data_source_mod.py
diff -r 9d24947d4335 -r 80593f75d74a ._gdcwebapp.xml |
Binary file ._gdcwebapp.xml has changed |
diff -r 9d24947d4335 -r 80593f75d74a ._json_data_source_mod.py |
Binary file ._json_data_source_mod.py has changed |
diff -r 9d24947d4335 -r 80593f75d74a gdcwebapp.xml
--- a/gdcwebapp.xml	Thu May 25 17:58:23 2017 -0400
+++ b/gdcwebapp.xml	Tue May 30 12:26:32 2017 -0400
@@ -1,14 +1,17 @@
 <?xml version="1.0"?>
 <tool name="GDCWebApp" id="data_source_gdcwebapp" tool_type="data_source" hidden="False" display_interface="False" version="1.0.0" force_history_refresh="True">
     <description>an intuitive interface to filter, extract, and convert Genomic Data Commons experiments</description>
+    <requirements>
+        <requirement type="package" version="2.7.10">python</requirement>
+    </requirements>
     <stdio>
         <exit_code range="1:" />
         <exit_code range=":-1" />
     </stdio>
     <command>
         <![CDATA[
-            mkdir -p ${tool.name}_tmp &&
-            python ${__tool_directory__}/json_data_source_mod.py "${__app__.config.output_size_limit}" --json_param_file "${output1}" --path "." --appdata "${tool.name}_tmp"
+            mkdir -p tmp &&
+            python ${__tool_directory__}/json_data_source_mod.py "${__app__.config.output_size_limit}" --json_param_file "${output1}" --path "." --appdata "tmp"
         ]]>
     </command>
     <inputs check_values="False" action="http://bioinf.iasi.cnr.it/gdcwebapp/app.php" >
@@ -16,9 +19,9 @@
         <param name="URL" type="hidden" value="" />
     </inputs>
     <outputs>
-        <data name="output1" format="auto" visible="False" />
+        <data name="output1" format="auto" label="${tool.name} Output Data" />
         <collection name="list_output" type="list:list" label="${tool.name} Output Collection">
-            <discover_datasets pattern="(?P<archive_name>.*)_(?P<file_name>.*)\..*" ext="auto" visible="True" directory="${tool.name}_tmp" />
+            <discover_datasets pattern="(?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)" ext="auto" visible="False" directory="tmp" />
         </collection>
     </outputs>
     <options sanitize="False" refresh="True" />
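The revised discover_datasets pattern is what ties the extracted archive entries to the list:list collection: Galaxy splits each discovered filename on the first underscore, so identifier_0 names the outer list element (the archive) and identifier_1 the inner element (the file). A minimal sketch of how the regex carves up a filename (the sample names below are hypothetical, not files from this commit):

import re

# Discovery pattern from the updated tool XML.
PATTERN = r"(?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)"

for name in ["TCGA-BRCA_sample1.tsv", "exp-1-tar-gz_counts.txt"]:
    m = re.match(PATTERN, name)
    print(name, "->", m.group("identifier_0"), "/", m.group("identifier_1"))
# TCGA-BRCA_sample1.tsv   -> TCGA-BRCA    / sample1.tsv
# exp-1-tar-gz_counts.txt -> exp-1-tar-gz / counts.txt

Because [^_]+ cannot cross an underscore, any underscore other than the separator would break discovery; that is why the script changes below rewrite stray underscores to dashes before writing files into the tmp directory.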
diff -r 9d24947d4335 -r 80593f75d74a json_data_source_mod.py
--- a/json_data_source_mod.py	Thu May 25 17:58:23 2017 -0400
+++ b/json_data_source_mod.py	Tue May 30 12:26:32 2017 -0400
@@ -57,12 +57,16 @@
     query_stream.close()
     output_stream.close()

-def store_file_from_archive( file_object, target_output_filename ):
+def store_file_from_archive( file_object, target_output_filename, isString=False ):
     """ Store file after extracting from archive and organize them as a collection using the structure
     (collection-name)_(file-name).ext as file name
     """
     output_stream = open( target_output_filename, 'wb' )
-    chunk_write( file_object.read(), output_stream )
+    #chunk_write( file_object.read(), output_stream )
+    if not isString:
+        output_stream.write(file_object.read())
+    else:
+        output_stream.write(file_object)
     output_stream.close()


@@ -85,20 +89,6 @@
     output_stream.close()


-def metadata_to_json_for_archive_entry( dataset_id, extension, metaname, filename, ds_type='dataset', primary=False ):
-    """ Return line separated JSON """
-    meta_dict = dict( type = ds_type,
-                      ext = extension,
-                      filename = filename,
-                      name = metaname,
-                      metadata = {} )
-    if primary:
-        meta_dict[ 'base_dataset_id' ] = dataset_id
-    else:
-        meta_dict[ 'dataset_id' ] = dataset_id
-    return "%s\n" % json.dumps( meta_dict )
-
-
 def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
     """ Return line separated JSON """
     meta_dict = dict( type = ds_type,
@@ -115,7 +105,27 @@
     return "%s\n" % json.dumps( meta_dict )


-def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary, appdata_path):
+def walk_on_archive(target_output_filename, check_ext, archive_name, appdata_path):
+    archive_name = archive_name.replace("_", "-").replace(".", "-")
+    with tarfile.open( target_output_filename, check_ext ) as tf:
+        for entry in tf:
+            if entry.isfile():
+                fileobj = tf.extractfile( entry )
+                # reserve the underscore for the collection separator
+                filename = os.path.basename( entry.name ).replace("_", "-")
+                extension = splitext( filename )[1]
+                # pattern: (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)
+                if (len(extension) > 0):
+                    filename = (filename[0:len(filename)-(len(extension)+1)]).replace(".", "-") + "." + extension
+                else:
+                    extension = "auto"
+                filename_with_collection_prefix = archive_name + "_" + filename
+                target_entry_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
+                store_file_from_archive( fileobj, target_entry_output_filename )
+    return True
+
+
+def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary, appdata_path, options, args):
     """ Main work function that operates on the JSON representation of
     one dataset and its metadata. Returns True.
     """
@@ -124,68 +134,48 @@
     ext, out_data_name, \
     hda_id, dataset_id = set_up_config_values(json_params)
     extension = query_item.get( 'extension' )
-    filename = query_item.get( 'url' )
+    #filename = query_item.get( 'url' )
+    filename = query_item.get( 'name' )
+
+    check_ext = ""
+    if ( filename.endswith( "gz" ) ):
+        check_ext = "r:gz"
+    elif ( filename.endswith( "bz2" ) ):
+        check_ext = "r:bz2"
+    elif ( filename.endswith( "tar" ) ):
+        check_ext = "r:"
+    isArchive = bool( check_ext and check_ext.strip() )
+
     extra_data = query_item.get( 'extra_data', None )
     if primary:
         filename = ''.join( c in VALID_CHARS and c or '-' for c in filename )
         name = construct_multi_filename( hda_id, filename, extension )
         target_output_filename = os.path.normpath( '/'.join( [ output_base_path, name ] ) )
-        metadata_param
[... hunk truncated in the changeset view ...]
-        for entry in tf:
-            fileobj = tf.extractfile( entry )
-            if entry.isfile():
-
-                #dataset_url, output_filename, \
-                #    extra_files_path, file_name, \
-                #    ext, out_data_name, \
-                #    hda_id, dataset_id = set_up_config_values(json_params)
-
-                filename = os.path.basename( entry.name )
-                extension = splitext( filename )
-                extra_data = None
-                #target_output_filename = output_filename
-                # (?P<archive_name>.*)_(?P<file_name>.*)\..*
-                filename_with_collection_prefix = query_item.get( 'name' ) + "_" + filename
-                target_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
-
-                #metadata_parameter_file.write( metadata_to_json_for_archive_entry( dataset_id, extension,
-                #    filename, target_output_filename,
-                #    ds_type='dataset',
-                #    primary=primary) )
-
-                store_file_from_archive( fileobj, target_output_filename )
-
+    if ( isArchive ):
+        walk_on_archive(target_output_filename, check_ext, query_item.get( 'name' ), appdata_path)
+
     return True


-def set_up_config_values():
-    extra_files_path, file_name, ext, out_data_name, hda_id, dataset_id = \
-        itemgetter('extra_files_path', 'file_name', 'ext', 'out_data_name', 'hda_id', 'dataset_id')(output_data[0])
-
 def set_up_config_values(json_params):
     """ Parse json_params file and return a tuple of necessary configuration
     values.
@@ -202,7 +192,7 @@
     hda_id, dataset_id)


-def download_from_json_data( options, args ):
+def download_from_json_data( options, args, json_params=None, json_dataset_url=None ):
     """ Parse the returned JSON data and download files. Write metadata
     to flat JSON file.
     """
@@ -212,7 +202,9 @@
     os.makedirs(appdata_path)

     # read tool job configuration file and parse parameters we need
-    json_params = json.loads( open( options.json_param_file, 'r' ).read() )
+    if json_params is None:
+        json_params = json.loads( open( options.json_param_file, 'r' ).read() )
+
     dataset_url, output_filename, \
     extra_files_path, file_name, \
     ext, out_data_name, \
@@ -222,7 +214,10 @@

     # get JSON response from data source
     # TODO: make sure response is not enormous
-    query_params = json.loads(urllib.urlopen( dataset_url ).read())
+    if json_dataset_url is None:
+        query_params = json.loads(urllib.urlopen( dataset_url ).read())
+    else:
+        query_params = json.loads(urllib.urlopen( json_dataset_url ).read())
     # download and write files
     primary = False
     # query_item, hda_id, output_base_path, dataset_id
@@ -231,11 +226,11 @@
     # TODO: do something with the nested list as a collection
     for query_subitem in query_item:
         primary = download_files_and_write_metadata(query_subitem, json_params, output_base_path,
-                                                    metadata_parameter_file, primary, appdata_path)
+                                                    metadata_parameter_file, primary, appdata_path, options, args)

     elif isinstance( query_item, dict ):
         primary = download_files_and_write_metadata(query_item, json_params, output_base_path,
-                                                    metadata_parameter_file, primary, appdata_path)
+                                                    metadata_parameter_file, primary, appdata_path, options, args)
     metadata_parameter_file.close()

 def __main__():