Repository 'gdcwebapp'
hg clone https://toolshed.g2.bx.psu.edu/repos/fabio/gdcwebapp

Changeset 12:80593f75d74a (2017-05-30)
Previous changeset 11:9d24947d4335 (2017-05-25) Next changeset 13:39c4f4528c6e (2017-05-30)
Commit message:
Uploaded
modified:
gdcwebapp.xml
json_data_source_mod.py
added:
._gdcwebapp.xml
._json_data_source_mod.py
diff -r 9d24947d4335 -r 80593f75d74a ._gdcwebapp.xml
Binary file ._gdcwebapp.xml has changed
diff -r 9d24947d4335 -r 80593f75d74a ._json_data_source_mod.py
Binary file ._json_data_source_mod.py has changed
diff -r 9d24947d4335 -r 80593f75d74a gdcwebapp.xml
--- a/gdcwebapp.xml Thu May 25 17:58:23 2017 -0400
+++ b/gdcwebapp.xml Tue May 30 12:26:32 2017 -0400
@@ -1,14 +1,17 @@
 <?xml version="1.0"?>
 <tool name="GDCWebApp" id="data_source_gdcwebapp" tool_type="data_source" hidden="False" display_interface="False" version="1.0.0" force_history_refresh="True">
     <description>an intuitive interface to filter, extract, and convert Genomic Data Commons experiments</description>
+    <requirements>
+        <requirement type="package" version="2.7.10">python</requirement>
+    </requirements>
     <stdio>
         <exit_code range="1:" />
         <exit_code range=":-1" />
     </stdio>
     <command>
         <![CDATA[
-            mkdir -p ${tool.name}_tmp && 
-            python ${__tool_directory__}/json_data_source_mod.py "${__app__.config.output_size_limit}" --json_param_file "${output1}" --path "." --appdata "${tool.name}_tmp"
+            mkdir -p tmp && 
+            python ${__tool_directory__}/json_data_source_mod.py "${__app__.config.output_size_limit}" --json_param_file "${output1}" --path "." --appdata "tmp"
         ]]>
     </command>
     <inputs check_values="False" action="http://bioinf.iasi.cnr.it/gdcwebapp/app.php" >
@@ -16,9 +19,9 @@
         <param name="URL" type="hidden" value="" />
     </inputs>
     <outputs>
-        <data name="output1" format="auto" visible="False" />
+        <data name="output1" format="auto" label="${tool.name} Output Data" />
         <collection name="list_output" type="list:list" label="${tool.name} Output Collection">
-            <discover_datasets pattern="(?P&lt;archive_name&gt;.*)_(?P&lt;file_name&gt;.*)\..*" ext="auto" visible="True" directory="${tool.name}_tmp" />
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)" ext="auto" visible="False" directory="tmp" />
         </collection>
     </outputs>
     <options sanitize="False" refresh="True" />
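
For context, a minimal sketch of how the new discover_datasets pattern above splits the file names written into the "tmp" working directory into the two identifiers of the list:list collection. The file name used here is hypothetical and only illustrates the naming convention; the real names are produced by walk_on_archive() in json_data_source_mod.py (see the next diff).

import re

# Pattern from the <discover_datasets> tag above: the text before the first
# underscore becomes the outer list identifier, the text after it the inner one.
pattern = re.compile(r"(?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)")

# Hypothetical file name discovered in the "tmp" working directory.
match = pattern.match("my-archive_reads-1.fastq")
if match:
    print(match.group("identifier_0"))  # "my-archive"    -> outer list element
    print(match.group("identifier_1"))  # "reads-1.fastq" -> inner list element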
diff -r 9d24947d4335 -r 80593f75d74a json_data_source_mod.py
--- a/json_data_source_mod.py Thu May 25 17:58:23 2017 -0400
+++ b/json_data_source_mod.py Tue May 30 12:26:32 2017 -0400
@@ -57,12 +57,16 @@
     query_stream.close()
     output_stream.close()
 
-def store_file_from_archive( file_object, target_output_filename ):
+def store_file_from_archive( file_object, target_output_filename, isString=False ):
     """ Store file after extracting from archive and organize them as a collection using the structure 
     (collection-name)_(file-name).ext as file name
     """
     output_stream = open( target_output_filename, 'wb' )
-    chunk_write( file_object.read(), output_stream )
+    #chunk_write( file_object.read(), output_stream )
+    if not isString:
+        output_stream.write(file_object.read())
+    else:
+        output_stream.write(file_object)
     output_stream.close()
 
 
@@ -85,20 +89,6 @@
         output_stream.close()
 
 
-def metadata_to_json_for_archive_entry( dataset_id, extension, metaname, filename, ds_type='dataset', primary=False ):
-    """ Return line separated JSON """
-    meta_dict = dict( type = ds_type,
-                      ext = extension,
-                      filename = filename,
-                      name = metaname,
-                      metadata = {} )
-    if primary:
-        meta_dict[ 'base_dataset_id' ] = dataset_id
-    else:
-        meta_dict[ 'dataset_id' ] = dataset_id
-    return "%s\n" % json.dumps( meta_dict )
-
-
 def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
     """ Return line separated JSON """
     meta_dict = dict( type = ds_type,
@@ -115,7 +105,27 @@
     return "%s\n" % json.dumps( meta_dict )
 
 
-def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary, appdata_path):
+def walk_on_archive(target_output_filename, check_ext, archive_name, appdata_path):
+    archive_name = archive_name.replace("_", "-").replace(".", "-")
+    with tarfile.open( target_output_filename, check_ext ) as tf:
+        for entry in tf:
+            if entry.isfile():
+                fileobj = tf.extractfile( entry )
+                # reserve the underscore for the collection searator
+                filename = os.path.basename( entry.name ).replace("_", "-")
+                extension = splitext( filename )[1]
+                # pattern: (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)
+                if (len(extension) > 0):
+                    filename = (filename[0:len(filename)-(len(extension)+1)]).replace(".", "-") + "." + extension
+                else:
+                    extension = "auto"
+                filename_with_collection_prefix = archive_name + "_" + filename
+                target_entry_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
+                store_file_from_archive( fileobj, target_entry_output_filename )
+    return True
+
+
+def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary, appdata_path, options, args):
     """ Main work function that operates on the JSON representation of
     one dataset and its metadata. Returns True.
     """
@@ -124,68 +134,48 @@
         ext, out_data_name, \
         hda_id, dataset_id = set_up_config_values(json_params)
     extension = query_item.get( 'extension' )
-    filename = query_item.get( 'url' )
+    #filename = query_item.get( 'url' )
+    filename = query_item.get( 'name' )
+
+    check_ext = ""
+    if ( filename.endswith( "gz" ) ):
+        check_ext = "r:gz"
+    elif ( filename.endswith( "bz2" ) ):
+        check_ext = "r:bz2"
+    elif ( filename.endswith( "tar" ) ):
+        check_ext = "r:"
+    isArchive = bool( check_ext and check_ext.strip() )
+
     extra_data = query_item.get( 'extra_data', None )
     if primary:
         filename = ''.join( c in VALID_CHARS and c or '-' for c in filename )
         name = construct_multi_filename( hda_id, filename, extension )
         target_output_filename = os.path.normpath( '/'.join( [ output_base_path, name ] ) )
-        metadata_param
[... truncated in the changeset view ...]
for entry in tf:
-                fileobj = tf.extractfile( entry )
-                if entry.isfile():
-                    
-                    #dataset_url, output_filename, \
-                    #    extra_files_path, file_name, \
-                    #    ext, out_data_name, \
-                    #    hda_id, dataset_id = set_up_config_values(json_params)
-                    
-                    filename = os.path.basename( entry.name )
-                    extension = splitext( filename )
-                    extra_data = None
-                    #target_output_filename = output_filename
-                    # (?P<archive_name>.*)_(?P<file_name>.*)\..*
-                    filename_with_collection_prefix = query_item.get( 'name' ) + "_" + filename
-                    target_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
-                    
-                    #metadata_parameter_file.write( metadata_to_json_for_archive_entry( dataset_id, extension,
-                    #                                     filename, target_output_filename,
-                    #                                     ds_type='dataset',
-                    #                                     primary=primary) )
-                    
-                    store_file_from_archive( fileobj, target_output_filename )
-    
+    if ( isArchive ):
+        walk_on_archive(target_output_filename, check_ext, query_item.get( 'name' ), appdata_path)
+
     return True
 
 
-def set_up_config_values():
-    extra_files_path, file_name, ext, out_data_name, hda_id, dataset_id = \
-      itemgetter('extra_files_path', 'file_name', 'ext', 'out_data_name', 'hda_id', 'dataset_id')(output_data[0])
-
 def set_up_config_values(json_params):
     """ Parse json_params file and return a tuple of necessary configuration
     values.
@@ -202,7 +192,7 @@
             hda_id, dataset_id)
 
 
-def download_from_json_data( options, args ):
+def download_from_json_data( options, args, json_params=None, json_dataset_url=None ):
     """ Parse the returned JSON data and download files. Write metadata
     to flat JSON file.
     """
@@ -212,7 +202,9 @@
         os.makedirs(appdata_path)
 
     # read tool job configuration file and parse parameters we need
-    json_params = json.loads( open( options.json_param_file, 'r' ).read() )
+    if json_params is None:
+        json_params = json.loads( open( options.json_param_file, 'r' ).read() )
+    
     dataset_url, output_filename, \
         extra_files_path, file_name, \
         ext, out_data_name, \
@@ -222,7 +214,10 @@
 
     # get JSON response from data source
     # TODO: make sure response is not enormous
-    query_params = json.loads(urllib.urlopen( dataset_url ).read())
+    if json_dataset_url is None:
+        query_params = json.loads(urllib.urlopen( dataset_url ).read())
+    else:
+        query_params = json.loads(urllib.urlopen( json_dataset_url ).read())
     # download and write files
     primary = False
     # query_item, hda_id, output_base_path, dataset_id
@@ -231,11 +226,11 @@
             # TODO: do something with the nested list as a collection
             for query_subitem in query_item:
                 primary = download_files_and_write_metadata(query_subitem, json_params, output_base_path,
-                                                            metadata_parameter_file, primary, appdata_path)
+                                                            metadata_parameter_file, primary, appdata_path, options, args)
 
         elif isinstance( query_item, dict ):
             primary = download_files_and_write_metadata(query_item, json_params, output_base_path,
-                                                        metadata_parameter_file, primary, appdata_path)
+                                                        metadata_parameter_file, primary, appdata_path, options, args)
     metadata_parameter_file.close()
 
 def __main__():
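
As a rough illustration of the renaming convention that walk_on_archive() introduces above: underscores and dots are turned into dashes so that the single remaining underscore separates the archive name from the member name, matching the discover_datasets pattern in gdcwebapp.xml. This is a sketch only; it uses os.path.splitext instead of the script's own splitext helper, and the archive and member names are hypothetical.

import os


def collection_prefixed_name(archive_name, member_path):
    # The underscore is reserved as the collection separator, so it is
    # removed from both the archive name and the member's base name.
    archive_name = archive_name.replace("_", "-").replace(".", "-")
    filename = os.path.basename(member_path).replace("_", "-")
    root, extension = os.path.splitext(filename)
    if extension:
        # Keep the final extension, turn any remaining dots into dashes.
        filename = root.replace(".", "-") + extension
    return archive_name + "_" + filename


# Hypothetical archive and member names, for illustration only.
print(collection_prefixed_name("case_01.tar.gz", "data/expr_counts.v2.txt"))
# -> case-01-tar-gz_expr-counts-v2.txt, which the discover_datasets pattern
#    splits into identifiers "case-01-tar-gz" and "expr-counts-v2.txt"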