Repository 'ctat_genome_resource_libs_data_manager'
hg clone https://toolshed.g2.bx.psu.edu/repos/trinity_ctat/ctat_genome_resource_libs_data_manager

Changeset 0:ea7bc21cbb7a (2018-07-16)
Next changeset 1:da7f50809820 (2018-11-11)
Commit message:
Uploaded
added:
data_manager/add_ctat_resource_lib.py
data_manager/add_ctat_resource_lib.xml
data_manager_conf.xml
tool-data/ctat_genome_resource_libs.loc.sample
tool_data_table_conf.xml.sample
b
diff -r 000000000000 -r ea7bc21cbb7a data_manager/add_ctat_resource_lib.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/add_ctat_resource_lib.py Mon Jul 16 19:56:38 2018 -0400
[
b'@@ -0,0 +1,879 @@\n+#!/usr/bin/env python\n+# ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/\n+\n+# Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and\n+# other example code on the web.\n+# This now allows downloading of a user selected library\n+# but only from the CTAT Genome Resource Library website.\n+# Ultimately we might want to allow the user to specify any location \n+# from which to download.\n+# Users can create or download other libraries and use this tool to add them if they don\'t want\n+# to add them by hand.\n+\n+import argparse\n+import os\n+#import tarfile\n+#import urllib\n+import subprocess\n+\n+# Comment out the following line when testing without galaxy package.\n+from galaxy.util.json import to_json_string\n+# The following is not being used, but leaving as info\n+# in case we ever want to get input values using json.\n+# from galaxy.util.json import from_json_string\n+\n+# datetime.now() is used to create the unique_id\n+from datetime import datetime\n+\n+# The FileListParser is used by get_ctat_genome_filenames(),\n+# which is called by the Data Manager interface (.xml file) to get\n+# the filenames that are available online at broadinstitute.org\n+# Not sure best way to do it. 
\n+# This object uses HTMLParser to look through the html \n+# searching for the filenames within anchor tags.\n+import urllib2\n+from HTMLParser import HTMLParser\n+\n+_CTAT_ResourceLib_URL = \'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/\'\n+_CTAT_MutationIndex_URL = \'https://data.broadinstitute.org/Trinity/CTAT/mutation/\'\n+_CTAT_Build_dirname = \'ctat_genome_lib_build_dir\'\n+_CTAT_ResourceLib_DisplayNamePrefix = \'CTAT_GenomeResourceLib_\'\n+_CTAT_ResourceLib_DefaultGenome = \'Unspecified_Genome\'\n+_CTAT_HumanFusionLib_FilenamePrefix = \'CTAT_HumanFusionLib\'\n+_CTAT_RefGenome_Filename = \'ref_genome.fa\'\n+_CTAT_MouseGenome_Prefix = \'Mouse\'\n+_CTAT_HumanGenome_Prefix = \'GRCh\'\n+_NumBytesNeededForBuild = 66571993088 # 62 Gigabytes. FIX - This might not be correct.\n+_NumBytesNeededForIndexes = 21474836480 # 20 Gigabytes. FIX - This might not be correct.\n+_Download_TestFile = "write_testfile.txt"\n+_DownloadSuccessFile = \'download_succeeded.txt\'\n+_LibBuiltSuccessFile = \'build_succeeded.txt\'\n+_MutationDownloadSuccessFile = \'mutation_index_download_succeeded.txt\'\n+\n+class FileListParser(HTMLParser):\n+    def __init__(self):\n+        # Have to use direct call to super class rather than using super():\n+        # super(FileListParser, self).__init__()\n+        # because HTMLParser is an "old style" class and its inheritance chain does not include object.\n+        HTMLParser.__init__(self)\n+        self.urls = set()\n+    def handle_starttag(self, tag, attrs):\n+        # Look for filename references in anchor tags and add them to urls.\n+        if tag == "a":\n+            # The tag is an anchor tag.\n+            for attribute in attrs:\n+                # print "Checking: {:s}".format(str(attribute))\n+                if attribute[0] == "href":\n+                    # Does the href have a tar.gz in it?\n+                    if ("tar.gz" in attribute[1]) and ("md5" not in attribute[1]):\n+                        # Add the 
value to urls.\n+                        self.urls.add(attribute[1])            \n+# End of class FileListParser\n+\n+def get_ctat_genome_urls():\n+    # open the url and retrieve the urls of the files in the directory.\n+    resource = urllib2.urlopen(_CTAT_ResourceLib_URL)\n+    theHTML = resource.read()\n+    filelist_parser = FileListParser()\n+    filelist_parser.feed(theHTML)\n+    # For dynamic options need to return an interable with contents that are tuples with 3 items.\n+    # Item one is a string that is the display name put into the option list.\n+    # Item two is the value that is put into the parameter associated with the option list.\n+    # Item three is a True or False value, indicating whether the item is selected.\n+    options = []\n+    for i, url in enumerate(filelist_parser.urls):\n+        # The '..b'e\n+    # genome_build_directory.\n+    if (source_data_directory is not None):\n+        build_the_library(source_data_directory, \\\n+                          genome_build_directory, \\\n+                          args.rebuild, \\\n+                          args.gmap_build)\n+        lib_was_built = True\n+    elif genome_build_directory is None:\n+        raise ValueError("No CTAT Genome Resource Library was downloaded, " + \\\n+            "there is no source data specified, " + \\\n+            "and no build location has been set. 
" + \\\n+            "This line of code should never execute.")\n+    # The following looks to see if the library actually exists after the build,\n+    # and raises an error if it cannot find the library files.\n+    # The reassignment of genome_build_directory should be superfluous, \n+    # since genome_build_directory should already point to the correct directory,\n+    # unless I made a mistake somewhere above.\n+\n+    genome_build_directory = search_for_genome_build_dir(genome_build_directory)\n+\n+    if (args.gmap_build and not lib_was_built):\n+        # If we did not build the genome resource library\n+        # the user might still be asking for a gmap_build.\n+        gmap_the_library(genome_build_directory)\n+\n+    if (args.download_mutation_indexes_url != ""):\n+        download_mutation_indexes(source_url=args.download_mutation_indexes_url, \\\n+                                  genome_build_directory=genome_build_directory, \\\n+                                  force_download=args.new_mutation_indexes_download)\n+\n+    # Need to get the genome name.\n+    genome_name = find_genome_name_in_path(args.download_url)\n+    if genome_name is None:\n+        genome_name = find_genome_name_in_path(genome_build_directory)\n+    if genome_name is None:\n+        genome_name = find_genome_name_in_path(downloaded_directory)\n+    if genome_name is None:\n+        genome_name = find_genome_name_in_path(args.source_location)\n+    if genome_name is None:\n+        genome_name = find_genome_name_in_path(args.download_location)\n+    if genome_name is None:\n+        genome_name = find_genome_name_in_path(args.display_name)\n+    if genome_name is None:\n+        genome_name = _CTAT_ResourceLib_DefaultGenome\n+        print "WARNING: We could not find a genome name in any of the directory paths."\n+\n+    # Determine the display_name for the library.\n+    if (args.display_name is None) or (args.display_name == ""):\n+        # Create the display_name from the 
genome_name.\n+        display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name\n+    else:\n+        display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name\n+    display_name = display_name.replace(" ","_")\n+\n+    # Create a unique_id for the library.\n+    datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")\n+    unique_id = genome_name + "." + datetime_stamp\n+\n+    print "The Genome Resource Library\'s display_name will be set to: {:s}\\n".format(display_name)\n+    print "Its unique_id will be set to: {:s}\\n".format(unique_id)\n+    print "Its dir_path will be set to: {:s}\\n".format(genome_build_directory)\n+\n+    data_manager_dict = {}\n+    data_manager_dict[\'data_tables\'] = {}\n+    data_manager_dict[\'data_tables\'][\'ctat_genome_resource_libs\'] = []\n+    data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)\n+    data_manager_dict[\'data_tables\'][\'ctat_genome_resource_libs\'].append(data_table_entry)\n+\n+    # Temporarily the output file\'s dictionary is written for debugging:\n+    print "The dictionary for the output file is:\\n\\t{:s}".format(str(data_manager_dict))\n+    # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,\n+    # which then puts it into the correct .loc file (I think).\n+    # Comment out the following line when testing without galaxy package.\n+    open(args.output_filename, \'wb\').write(to_json_string(data_manager_dict))\n+\n+if __name__ == "__main__":\n+    main()\n'
b
diff -r 000000000000 -r ea7bc21cbb7a data_manager/add_ctat_resource_lib.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/add_ctat_resource_lib.xml Mon Jul 16 19:56:38 2018 -0400
[
b'@@ -0,0 +1,147 @@\n+<tool id="ctat_genome_resource_libs_data_manager" \n+    name="CTAT Genome Resource Libraries Data Manager" \n+    version="1.0.0" tool_type="manage_data">\n+    <description>Retrieve, and/or specify the location of, a CTAT Genome Resource Library. \n+    </description>\n+    <requirements>\n+        <requirement type="package" version="2.7">python</requirement>\n+        <requirement type="package" version="0.5.0">fusion-filter</requirement>\n+    </requirements>\n+    <command detect_errors="default">\n+        <![CDATA[\n+          python $__tool_directory__/add_ctat_resource_lib.py \n+            --output_filename="${out_file}" \n+            --display_name="${display_name}" \n+            #if str($genome_resource_library.build_type) == "download_and_build":\n+              --download_url="${genome_resource_library.download_url}" \n+              --download_location="${genome_resource_library.download_destination}"\n+              #if str($genome_resource_library.force_new_download) == "true":\n+                --new_archive_download\n+              #end if\n+            #end if\n+            #if str($genome_resource_library.build_type) == "build_from_source":\n+              --source_location "${genome_resource_library.source_location}"\n+              --build_location "${genome_resource_library.built_library_location}" \n+              #if str($genome_resource_library.rebuild) == "true":\n+                --rebuild\n+              #end if\n+            #end if\n+            #if str($genome_resource_library.build_type) == "specify_built_location":\n+              --build_location="${genome_resource_library.built_library_location}"\n+            #end if\n+            #if str($gmap_build) == "true":\n+              --gmap_build \n+            #end if\n+        ]]>\n+    </command>\n+    <inputs>\n+        <!-- The following are left in here, just as examples of various ways of doing options.\n+            <param name="force_download" 
type="boolean" checked="false"\n+                truevalue="- -force_download" falsevalue="" label="Force New Download? (yes/no)" />\n+            <param name="download" type="select" label="Need to Download?">\n+                <option value="single" selected="true">Single Dataset</option>\n+                <option value="paired_collection">Paired Collection</option>\n+            <when value="paired_collection">\n+                 <param name="fastq_input" format="fastqsanger" type="data_collection" collection_type="paired" label="Select dataset pair" help="Specify paired dataset collection containing paired reads"/>\n+            </when>\n+        -->\n+        <conditional name="genome_resource_library">\n+            <param name="build_type" type="select" label="Download CTAT Genome Resource Library?">\n+                <option value="download_and_build" selected="true">Download from CTAT and build if needed</option>\n+                <option value="build_from_source">Build library from local source data</option>\n+                <option value="specify_built_location">Specify location of built library</option>\n+            </param>\n+            <when value="download_and_build">\n+                <!-- The use of a code block to get dynamic options is now deprecated and discouraged.\n+                     I am still using it here. The only other way I can think of to do this is to\n+                     create another data_manager that gets the list of files and puts them into a\n+                     data_table, that is then used to get the filenames. 
That would require the admin\n+                     to first run the data_manager that builds the filename data_table before running\n+                     this data_manager.\n+                This is the dynamic way to get the options filled.\n+                <param name="filename" type="select" label="Select File" display="radio" \n+                    dynamic_options="get_ctat_genome_filenames()" \n+                    help="Select a CTAT Genome Resource Library to Download." />'..b'            <when value="build_from_source">\n+                <param name="source_location" type="text" label="Location of Source Files (full path)" />\n+                <param name="built_library_location" type="text" label="Location of the Built Library (full path)" />\n+                <param name="rebuild" type="boolean" checked="false" label="Force new build of Library?" />\n+            </when>\n+            <when value="specify_built_location">\n+                <param name="built_library_location" type="text" label="Location of the Built Library (full path)" />\n+            </when>\n+        </conditional>\n+        <param name="display_name" type="text" label="Reference Genome Display Name" />\n+        <param name="gmap_build" type="boolean" checked="true" label="Do a gmap_build on the Library?" 
/>\n+        <!-- Below is the most recent interface for download of mutation indexes, but it is not being used yet...\n+        <conditional name="mutation_indexes">\n+            <param name="download" type="boolean" checked="true" label="Download mutation indexes into the Library?">\n+            </param>\n+            <when value="true">\n+                <param name="source_url" type="select" label="Select a File"\n+                    dynamic_options="get_mutation_index_urls()" \n+                    help="Select CTAT Mutation Indexes File to Download.\\nMake sure it is the right one for your CTAT Genome Resource Library!">\n+                </param>\n+                <param name="force_download" type="boolean" checked="false" label="Force New Download?" />\n+            </when>\n+        </conditional>\n+        -->\n+    </inputs>\n+    <outputs>\n+        <data name="out_file" format="data_manager_json" />\n+    </outputs>\n+    <help>\n+        Retrieve, and/or specify the location of, a CTAT Genome Resource Library.\n+        When download is true, the files at https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/\n+        are used as selectors for the user to choose among.\n+        Specify the Full Path of the location where the CTAT Resource Library should be placed.\n+        You will need approximately 62GB of space for this library.\n+        The installation of this tool takes some time, due to building a conda environment for the dependencies.\n+        The download extracts the files during the download. The "source_data" files download faster, but then must be built.\n+        Building the library from the "source_data" files can take many hours, depending on the resources of your machine.\n+        The "plug-n-play" can take considerable time to download, depending on your internet connection. 
Even with high speed,\n+        it is about 25GB that is transferred, so plan accordingly.\n+        Neither the "source_data" nor the "plug-n-play" versions have had their gmap index built. If you are not going to be\n+        using gmap_fusion, then you can uncheck the gmap-build check box and save the space and time building the index consumes.\n+        Neither the "source_data" nor the "plug-n-play" versions have mutation indexes included. Those must be downloaded\n+        separately. If you are not going to be using the mutation tool, uncheck the Download mutation indexes check box and\n+        save the space and time it takes to include the mutation index files. \n+        - FIX - \n+        This version of the tool does not yet implement the download of mutation indexes.\n+        - FIX -\n+        If you already have a CTAT Genome Resource library installed on your system, \n+        specify the full path of the location where it exists and leave the download box unchecked.\n+        The Reference Genome name may be left empty if downloading. The filename will then be used as the selector text of the entry in the data table.\n+        For more information on CTAT Genome Resource Libraries, \n+        see <a href="https://github.com/FusionFilter/FusionFilter/wiki">FusionFilter</a>\n+    </help>\n+    <code file="add_ctat_resource_lib.py" />\n+</tool>\n'
b
diff -r 000000000000 -r ea7bc21cbb7a data_manager_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml Mon Jul 16 19:56:38 2018 -0400
b
@@ -0,0 +1,38 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/add_ctat_resource_lib.xml" id="ctat_genome_resource_libs_data_manager"> 
+        <data_table name="ctat_genome_resource_libs">
+            <output>
+                <column name="value" />
+                    <!-- value is used to uniquely identify this entry in the table.
+                    -->
+                <column name="name" />
+                    <!-- name is used as the selector in the pull down lists for items in this table.
+                    -->
+                <column name="path" />
+                    <!-- path is the absolute path of the top level directory of the CTAT Genome Resource Library.
+                    -->
+                <!-- <column name="path" output_ref="out_file"> -->
+                    <!-- It is typical to move the data file, but because our tool gets the destination
+                    location from the user, we do not want to move the data from that location.
+                    The full path of the CTAT Resource library is returned in the path column,
+                    so there is no need to translate the value either.
+                    The files are so big we do not want to be making copies of them.
+                    They are created where we want them.
+                    -->
+                    <!-- <move type="file" relativize_symlinks="False"> -->
+                        <!--<source>${path}</source> -->
+                        <!--<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ctat_genome_lib_build_dir</target> -->
+                    <!--</move> -->
+                    <!--
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/ctat_genome_lib_build_dir
+                    </value_translation>
+                    -->
+                    <!-- The location returned by the tool should already be an absolute path.
+                    <value_translation type="function">abspath</value_translation>
+                    -->
+                <!--</column> -->
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
b
diff -r 000000000000 -r ea7bc21cbb7a tool-data/ctat_genome_resource_libs.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ctat_genome_resource_libs.loc.sample Mon Jul 16 19:56:38 2018 -0400
b
@@ -0,0 +1,15 @@
+# This file lists the locations of CTAT Genome Resource Libraries
+# Usually there will only be one library, but it is conceivable
+# that there could be multiple libraries.
+# This file format is as follows
+# (white space characters are TAB characters):
+#
+#<value>    <name>  <path>
+# value is a unique id
+# name is the display name
+# path is the directory where the genome resource lib files are stored
+#
+#ctat_genome_resource_libs.loc could look like:
+#
+#GRCh38_v27_CTAT_lib_Feb092018 CTAT_GenomeResourceLib_GRCh38_v27_CTAT_lib_Feb092018 /path/to/ctat/genome/resource/lib/directory
+#
b
diff -r 000000000000 -r ea7bc21cbb7a tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Mon Jul 16 19:56:38 2018 -0400
b
@@ -0,0 +1,6 @@
+<tables>
+    <table name="ctat_genome_resource_libs" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, name, path</columns>
+        <file path="tool-data/ctat_genome_resource_libs.loc" />
+    </table>
+</tables>