Repository 'gdcwebapp'
hg clone https://toolshed.g2.bx.psu.edu/repos/fabio/gdcwebapp

Changeset 32:1edc869cd008 (2017-06-07)
Previous changeset 31:9cb5e4f12ce5 (2017-06-06) Next changeset 33:228038cd0683 (2017-06-07)
Commit message:
Uploaded 20170607
modified:
gdcwebapp.xml
added:
._gdcwebapp.xml
._json_collect_data_source.py
json_collect_data_source.py
removed:
json_data_source_mod.py
diff -r 9cb5e4f12ce5 -r 1edc869cd008 ._gdcwebapp.xml
Binary file ._gdcwebapp.xml has changed
diff -r 9cb5e4f12ce5 -r 1edc869cd008 ._json_collect_data_source.py
Binary file ._json_collect_data_source.py has changed
diff -r 9cb5e4f12ce5 -r 1edc869cd008 gdcwebapp.xml
--- a/gdcwebapp.xml Tue Jun 06 23:04:41 2017 -0400
+++ b/gdcwebapp.xml Wed Jun 07 18:01:57 2017 -0400
@@ -11,7 +11,7 @@
     <command>
         <![CDATA[
             mkdir -p tmp && 
-            python ${__tool_directory__}/json_data_source_mod.py "${__app__.config.output_size_limit}" --json_param_file "${output1}" --path "." --appdata "tmp"
+            python ${__tool_directory__}/json_collect_data_source.py "${__app__.config.output_size_limit}" --json_param_file "${output1}" --path "." --appdata "tmp"
         ]]>
     </command>
     <inputs check_values="False" action="http://bioinf.iasi.cnr.it/gdcwebapp/app.php">
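The only change to gdcwebapp.xml is the script name in the <command> template: the tool now invokes json_collect_data_source.py instead of json_data_source_mod.py, with the same arguments. Once Galaxy substitutes its template variables, the invocation corresponds roughly to the minimal sketch below; the size limit and file names are assumed example values, not taken from the changeset.

    # Minimal sketch of the expanded <command> above, assuming example values for
    # Galaxy's template variables (not taken from the changeset):
    #   ${__tool_directory__}               -> current directory
    #   ${__app__.config.output_size_limit} -> "1048576"
    #   ${output1}                          -> "galaxy_params.json"
    import subprocess

    subprocess.check_call([
        "python", "json_collect_data_source.py",
        "1048576",                                  # positional max_size argument
        "--json_param_file", "galaxy_params.json",
        "--path", ".",
        "--appdata", "tmp",
    ])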
diff -r 9cb5e4f12ce5 -r 1edc869cd008 json_collect_data_source.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/json_collect_data_source.py Wed Jun 07 18:01:57 2017 -0400
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+import json
+import optparse
+import urllib
+import os.path
+import os
+from operator import itemgetter
+import tarfile
+
+__version__ = "1.0.0"
+CHUNK_SIZE = 2**20 #1mb
+VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
+
+
+def splitext(path):
+    for ext in ['.tar.gz', '.tar.bz2']:
+        if path.endswith(ext):
+            path, ext = path[:-len(ext)], path[-len(ext):]
+            break
+    else:
+        path, ext = os.path.splitext(path)
+    return path, ext[1:]
+
+
+def chunk_write( source_stream, target_stream, source_method = "read", target_method="write" ):
+    source_method = getattr( source_stream, source_method )
+    target_method = getattr( target_stream, target_method )
+    while True:
+        chunk = source_method( CHUNK_SIZE )
+        if chunk:
+            target_method( chunk )
+        else:
+            break
+
+
+def deconstruct_multi_filename( multi_filename ):
+    keys = [ 'primary', 'id', 'name', 'visible', 'file_type' ]
+    return ( dict( zip( keys, multi_filename.split('_') ) ) )
+
+
+def construct_multi_filename( id, name, file_type ):
+    """ Implementation of *Number of Output datasets cannot be determined until tool run* from documentation_.
+    .. _documentation: http://wiki.galaxyproject.org/Admin/Tools/Multiple%20Output%20Files
+    """
+    filename = "%s_%s_%s_%s_%s" % ( 'primary', id, name, 'visible', file_type )
+    return filename
+
+
+def download_from_query( query_data, target_output_filename  ):
+    """ Download file from the json data and write it to target_output_filename.
+    """
+    query_url = query_data.get( 'url' )
+    query_file_type = query_data.get( 'extension' )
+    query_stream = urllib.urlopen( query_url )
+    output_stream = open( target_output_filename, 'wb' )
+    chunk_write( query_stream, output_stream )
+    query_stream.close()
+    output_stream.close()
+
+def store_file_from_archive( file_object, target_output_filename, isString=False ):
+    """ Store file after extracting from archive and organize them as a collection using the structure
+    (collection-name)_(file-name).ext as file name
+    """
+    output_stream = open( target_output_filename, 'wb' )
+    #chunk_write( file_object.read(), output_stream )
+    if not isString:
+        output_stream.write(file_object.read())
+    else:
+        output_stream.write(file_object)
+    output_stream.close()
+
+
+def download_extra_data( query_ext_data, base_path ):
+    """ Download any extra data defined in the JSON.
+    NOTE: the "path" value is a relative path to the file on our
+    file system. This is slightly dangerous and we should make every effort
+    to avoid a malicious absolute path to write the file elsewhere on the
+    filesystem.
+    """
+    for ext_data in query_ext_data:
+        if not os.path.exists( base_path ):
+            os.mkdir( base_path )
+        query_stream = urllib.urlopen( ext_data.get( 'url' ) )
+        ext_path = ext_data.get( 'path' )
+        os.makedirs( os.path.normpath( '/'.join( [ base_path, os.path.dirname( ext_path ) ] ) ) )
+        output_stream = open( os.path.normpath( '/'.join( [ base_path, ext_path ] ) ), 'wb' )
+        chunk_write( query_stream, output_stream )
+        query_stream.close()
+        output_stream.close()
+
+
+def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
+    """ Return line separated JSON """
+    meta_dict = dict( type = ds_type,
+                      ext = metadata.get( 'extension' ),
+                      filename = filename,
+                      name = metadata.get( 'name' ),
+                      metadata = metadata.get( 'metadata', {} ) )
+    if metadata.get( 'extra_data', None ):
+        meta_dict[ 'extra_files' ] = '_'.join( [ filename, 'files' ] )
+    if primary:
+        meta_dict[ 'base_dataset_id' ] = dataset_id
+    else:
+        meta_dict[ 'dataset_id' ] = dataset_id
[... middle of the hunk truncated in the source ...]
+    """ Parse json_params file and return a tuple of necessary configuration
+    values.
+    """
+    datasource_params = json_params.get( 'param_dict' )
+    dataset_url = datasource_params.get( 'URL' )
+    output_filename = datasource_params.get( 'output1', None )
+    output_data = json_params.get( 'output_data' )
+    extra_files_path, file_name, ext, out_data_name, hda_id, dataset_id = \
+      itemgetter('extra_files_path', 'file_name', 'ext', 'out_data_name', 'hda_id', 'dataset_id')(output_data[0])
+    return (dataset_url, output_filename,
+            extra_files_path, file_name,
+            ext, out_data_name,
+            hda_id, dataset_id)
+
+
+def download_from_json_data( options, args ):
+    """ Parse the returned JSON data and download files. Write metadata
+    to flat JSON file.
+    """
+    output_base_path = options.path
+    appdata_path = options.appdata
+    if not os.path.exists(appdata_path):
+        os.makedirs(appdata_path)
+
+    # read tool job configuration file and parse parameters we need
+    json_params = json.loads( open( options.json_param_file, 'r' ).read() )
+    print("json_params: "+str(json_params))
+
+    dataset_url, output_filename, \
+        extra_files_path, file_name, \
+        ext, out_data_name, \
+        hda_id, dataset_id = set_up_config_values(json_params)
+    # line separated JSON file to contain all dataset metadata
+    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
+
+    # get JSON response from data source
+    # TODO: make sure response is not enormous
+    query_params = json.loads(urllib.urlopen( dataset_url ).read())
+    # download and write files
+    primary = False
+    #primary = True
+    # query_item, hda_id, output_base_path, dataset_id
+    for query_item in query_params:
+        if isinstance( query_item, list ):
+            # TODO: do something with the nested list as a collection
+            for query_subitem in query_item:
+                primary = download_files_and_write_metadata(query_subitem, json_params, output_base_path,
+                                                            metadata_parameter_file, primary, appdata_path, options, args)
+
+        elif isinstance( query_item, dict ):
+            primary = download_files_and_write_metadata(query_item, json_params, output_base_path,
+                                                        metadata_parameter_file, primary, appdata_path, options, args)
+    metadata_parameter_file.close()
+
+def __main__():
+    """ Read the JSON return from a data source. Parse each line and request
+    the data, download to "newfilepath", and write metadata.
+
+    Schema
+    ------
+
+        [ {"url":"http://url_of_file",
+           "name":"encode WigData",
+           "extension":"wig",
+           "metadata":{"db_key":"hg19"},
+           "extra_data":[ {"url":"http://url_of_ext_file",
+                           "path":"rel/path/to/ext_file"}
+                        ]
+          }
+        ]
+
+    """
+    # Parse the command line options
+    usage = "Usage: json_data_source_mod.py max_size --json_param_file filename [options]"
+    parser = optparse.OptionParser(usage = usage)
+    parser.add_option("-j", "--json_param_file", type="string",
+                    action="store", dest="json_param_file", help="json schema return data")
+    parser.add_option("-p", "--path", type="string",
+                    action="store", dest="path", help="new file path")
+    parser.add_option("-a", "--appdata", type="string",
+                    action="store", dest="appdata", help="appdata folder name")
+    parser.add_option("-v", "--version", action="store_true", dest="version",
+                    default=False, help="display version and exit")
+
+    (options, args) = parser.parse_args()
+    if options.version:
+        print __version__
+    else:
+        download_from_json_data( options, args )
+
+
+if __name__ == "__main__": __main__()
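For reference, the helper functions visible in the new file can be exercised directly. The snippet below is a minimal sketch that assumes json_collect_data_source.py is importable from the working directory and runs under Python 2 (the script relies on urllib.urlopen and a bare print statement); the sample response follows the schema documented in the __main__ docstring, and the id value "42" is an arbitrary example.

    # Minimal sketch (Python 2), exercising helpers from the added script.
    from json_collect_data_source import splitext, construct_multi_filename, deconstruct_multi_filename

    # Sample data-source response matching the schema in __main__'s docstring.
    sample_response = [{
        "url": "http://url_of_file",
        "name": "encode WigData",
        "extension": "wig",
        "metadata": {"db_key": "hg19"},
        "extra_data": [{"url": "http://url_of_ext_file",
                        "path": "rel/path/to/ext_file"}],
    }]

    entry = sample_response[0]

    # The custom splitext() treats '.tar.gz' and '.tar.bz2' as single extensions.
    print(splitext("samples.tar.gz"))        # ('samples', 'tar.gz')

    # Output datasets are named primary_<id>_<name>_visible_<file_type> so Galaxy
    # can pick them up after the job runs (see construct_multi_filename's docstring).
    multi_name = construct_multi_filename("42", entry["name"], entry["extension"])
    print(multi_name)                        # primary_42_encode WigData_visible_wig
    print(deconstruct_multi_filename(multi_name))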
diff -r 9cb5e4f12ce5 -r 1edc869cd008 json_data_source_mod.py
--- a/json_data_source_mod.py Tue Jun 06 23:04:41 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,281 +0,0 @@
-#!/usr/bin/env python
-import json
-import optparse
-import urllib
-import os.path
-import os
-from operator import itemgetter
-import tarfile
-
-__version__ = "1.0.0"
-CHUNK_SIZE = 2**20 #1mb
-VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
-
-
-def splitext(path):
-    for ext in ['.tar.gz', '.tar.bz2']:
-        if path.endswith(ext):
-            path, ext = path[:-len(ext)], path[-len(ext):]
-            break
-    else:
-        path, ext = os.path.splitext(path)
-    return path, ext[1:]
-
-
-def chunk_write( source_stream, target_stream, source_method = "read", target_method="write" ):
-    source_method = getattr( source_stream, source_method )
-    target_method = getattr( target_stream, target_method )
-    while True:
-        chunk = source_method( CHUNK_SIZE )
-        if chunk:
-            target_method( chunk )
-        else:
-            break
-
-
-def deconstruct_multi_filename( multi_filename ):
-    keys = [ 'primary', 'id', 'name', 'visible', 'file_type' ]
-    return ( dict( zip( keys, multi_filename.split('_') ) ) )
-
-
-def construct_multi_filename( id, name, file_type ):
-    """ Implementation of *Number of Output datasets cannot be determined until tool run* from documentation_.
-    .. _documentation: http://wiki.galaxyproject.org/Admin/Tools/Multiple%20Output%20Files
-    """
-    filename = "%s_%s_%s_%s_%s" % ( 'primary', id, name, 'visible', file_type )
-    return filename
-
-
-def download_from_query( query_data, target_output_filename  ):
-    """ Download file from the json data and write it to target_output_filename.
-    """
-    query_url = query_data.get( 'url' )
-    query_file_type = query_data.get( 'extension' )
-    query_stream = urllib.urlopen( query_url )
-    output_stream = open( target_output_filename, 'wb' )
-    chunk_write( query_stream, output_stream )
-    query_stream.close()
-    output_stream.close()
-
-def store_file_from_archive( file_object, target_output_filename, isString=False ):
-    """ Store file after extracting from archive and organize them as a collection using the structure
-    (collection-name)_(file-name).ext as file name
-    """
-    output_stream = open( target_output_filename, 'wb' )
-    #chunk_write( file_object.read(), output_stream )
-    if not isString:
-        output_stream.write(file_object.read())
-    else:
-        output_stream.write(file_object)
-    output_stream.close()
-
-
-def download_extra_data( query_ext_data, base_path ):
-    """ Download any extra data defined in the JSON.
-    NOTE: the "path" value is a relative path to the file on our
-    file system. This is slightly dangerous and we should make every effort
-    to avoid a malicious absolute path to write the file elsewhere on the
-    filesystem.
-    """
-    for ext_data in query_ext_data:
-        if not os.path.exists( base_path ):
-            os.mkdir( base_path )
-        query_stream = urllib.urlopen( ext_data.get( 'url' ) )
-        ext_path = ext_data.get( 'path' )
-        os.makedirs( os.path.normpath( '/'.join( [ base_path, os.path.dirname( ext_path ) ] ) ) )
-        output_stream = open( os.path.normpath( '/'.join( [ base_path, ext_path ] ) ), 'wb' )
-        chunk_write( query_stream, output_stream )
-        query_stream.close()
-        output_stream.close()
-
-
-def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
-    """ Return line separated JSON """
-    meta_dict = dict( type = ds_type,
-                      ext = metadata.get( 'extension' ),
-                      filename = filename,
-                      name = metadata.get( 'name' ),
-                      metadata = metadata.get( 'metadata', {} ) )
-    if metadata.get( 'extra_data', None ):
-        meta_dict[ 'extra_files' ] = '_'.join( [ filename, 'files' ] )
-    if primary:
-        meta_dict[ 'base_dataset_id' ] = dataset_id
-    else:
-        meta_dict[ 'dataset_id' ] = dataset_id
[... middle of the hunk truncated in the source ...]
-def set_up_config_values(json_params):
-    """ Parse json_params file and return a tuple of necessary configuration
-    values.
-    """
-    datasource_params = json_params.get( 'param_dict' )
-    dataset_url = datasource_params.get( 'URL' )
-    output_filename = datasource_params.get( 'output1', None )
-    output_data = json_params.get( 'output_data' )
-    extra_files_path, file_name, ext, out_data_name, hda_id, dataset_id = \
-      itemgetter('extra_files_path', 'file_name', 'ext', 'out_data_name', 'hda_id', 'dataset_id')(output_data[0])
-    return (dataset_url, output_filename,
-            extra_files_path, file_name,
-            ext, out_data_name,
-            hda_id, dataset_id)
-
-
-def download_from_json_data( options, args ):
-    """ Parse the returned JSON data and download files. Write metadata
-    to flat JSON file.
-    """
-    output_base_path = options.path
-    appdata_path = options.appdata
-    if not os.path.exists(appdata_path):
-        os.makedirs(appdata_path)
-
-    # read tool job configuration file and parse parameters we need
-    json_params = json.loads( open( options.json_param_file, 'r' ).read() )
-
-    dataset_url, output_filename, \
-        extra_files_path, file_name, \
-        ext, out_data_name, \
-        hda_id, dataset_id = set_up_config_values(json_params)
-    # line separated JSON file to contain all dataset metadata
-    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
-
-    # get JSON response from data source
-    # TODO: make sure response is not enormous
-    query_params = json.loads(urllib.urlopen( dataset_url ).read())
-    # download and write files
-    primary = False
-    #primary = True
-    # query_item, hda_id, output_base_path, dataset_id
-    for query_item in query_params:
-        if isinstance( query_item, list ):
-            # TODO: do something with the nested list as a collection
-            for query_subitem in query_item:
-                primary = download_files_and_write_metadata(query_subitem, json_params, output_base_path,
-                                                            metadata_parameter_file, primary, appdata_path, options, args)
-
-        elif isinstance( query_item, dict ):
-            primary = download_files_and_write_metadata(query_item, json_params, output_base_path,
-                                                        metadata_parameter_file, primary, appdata_path, options, args)
-    metadata_parameter_file.close()
-
-def __main__():
-    """ Read the JSON return from a data source. Parse each line and request
-    the data, download to "newfilepath", and write metadata.
-
-    Schema
-    ------
-
-        [ {"url":"http://url_of_file",
-           "name":"encode WigData",
-           "extension":"wig",
-           "metadata":{"db_key":"hg19"},
-           "extra_data":[ {"url":"http://url_of_ext_file",
-                           "path":"rel/path/to/ext_file"}
-                        ]
-          }
-        ]
-
-    """
-    # Parse the command line options
-    usage = "Usage: json_data_source_mod.py max_size --json_param_file filename [options]"
-    parser = optparse.OptionParser(usage = usage)
-    parser.add_option("-j", "--json_param_file", type="string",
-                    action="store", dest="json_param_file", help="json schema return data")
-    parser.add_option("-p", "--path", type="string",
-                    action="store", dest="path", help="new file path")
-    parser.add_option("-a", "--appdata", type="string",
-                    action="store", dest="appdata", help="appdata folder name")
-    parser.add_option("-v", "--version", action="store_true", dest="version",
-                    default=False, help="display version and exit")
-
-    (options, args) = parser.parse_args()
-    if options.version:
-        print __version__
-    else:
-        download_from_json_data( options, args )
-
-
-if __name__ == "__main__": __main__()