Previous changeset 0:ea7bc21cbb7a (2018-07-16) |
Commit message:
Uploaded |
modified:
data_manager/add_ctat_resource_lib.py data_manager/add_ctat_resource_lib.xml data_manager_conf.xml tool_data_table_conf.xml.sample |
added:
tool-data/ctat_cravat_tissues.loc.sample |
b |
diff -r ea7bc21cbb7a -r da7f50809820 data_manager/add_ctat_resource_lib.py --- a/data_manager/add_ctat_resource_lib.py Mon Jul 16 19:56:38 2018 -0400 +++ b/data_manager/add_ctat_resource_lib.py Sun Nov 11 21:02:34 2018 -0500 |
[ |
b"@@ -1,56 +1,113 @@\n #!/usr/bin/env python\n # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/\n \n-# Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and\n+# Written by H.E. Cicada Brokaw Dennis of Indiana University for the Broad Institute.\n+# Initial starting point was some code downloaded from the toolshed and\n # other example code on the web.\n-# This now allows downloading of a user selected library\n-# but only from the CTAT Genome Resource Library website.\n-# Ultimately we might want to allow the user to specify any location \n-# from which to download.\n-# Users can create or download other libraries and use this tool to add them if they don't want\n-# to add them by hand.\n+# That code has however been extensively modified and augmented.\n+\n+# This is part of Data Manager code to be used within a Galaxy.\n+# This Data Manager allows users to add entries to the ctat_genome_resource_libs table.\n \n+# This code allows downloading of a user selected Genome Reference Library\n+# from the CTAT Genome Resource Library website.\n+# It also provides for building libraries from source, doing a gmap_build over,\n+# and/or integrating mutation resources with, a Genome Reference Library.\n+# For more information on CTAT Genome Resource Libraries, \n+# see https://github.com/FusionFilter/FusionFilter/wiki\n+# Users can create or download their own libraries and use this Data Manger to add them \n+# if they don't want to add them by hand.\n+\n+import sys\n+# The many calls to sys.stdout.flush() are done in order to get the output to be synchronized.\n+# Otherwise output from subprocesses can get streamed to stdout in a disjunct manner from \n+# the output of the process running this code.\n+# This is particularly evident in the stdout stream when running within a Galaxy instance.\n import argparse\n import os\n-#import tarfile\n-#import urllib\n+import shutil\n+import tarfile\n+import hashlib\n+import urllib\n+import urlparse\n+import contextlib\n import subprocess\n \n-# Comment out the following line when testing without galaxy package.\n+# One can comment out the following line when testing without galaxy package.\n+# In that case, also comment out the last line in main(). That is, the line that uses to_json_string.\n from galaxy.util.json import to_json_string\n-# The following is not being used, but leaving as info\n-# in case we ever want to get input values using json.\n+\n+# The following is not being used, but leaving here as info\n+# in case one ever wants to get input values using json.\n # from galaxy.util.json import from_json_string\n+# However in this datamanager, the command line arguments are used instead.\n \n # datetime.now() is used to create the unique_id\n from datetime import datetime\n \n-# The FileListParser is used by get_ctat_genome_filenames(),\n-# which is called by the Data Manager interface (.xml file) to get\n-# the filenames that are available online at broadinstitute.org\n-# Not sure best way to do it. \n-# This object uses HTMLParser to look through the html \n+# The Data Manager uses a subclass of HTMLParser to look through a web page's html \n # searching for the filenames within anchor tags.\n import urllib2\n from HTMLParser import HTMLParser\n \n _CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'\n-_CTAT_MutationIndex_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/'\n+_CTAT_Mutation_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/'\n _CTAT_Build_dirname = 'ctat_genome_lib_build_dir'\n+_CTAT_MutationLibDirname = 'ctat_mutation_lib'\n _CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'\n _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'\n _CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'\n _CTAT_RefGenome_Filename = 'ref_genome.fa'\n _CTAT_MouseGenome_Prefix = 'Mouse'\n _CTAT_HumanGenome_Prefix = 'GRCh'\n+_COSMIC_Mutant_Filename = 'CosmicMutantExport.tsv.gz'\n+_COSMIC_Coding_Filename = 'CosmicCodingMuts.vcf.gz'\n+\n+# FIX - The "..b'ild_dir(genome_build_directory)\n \n if (args.gmap_build and not lib_was_built):\n # If we did not build the genome resource library\n # the user might still be asking for a gmap_build.\n- gmap_the_library(genome_build_directory)\n+ gmap_the_library(genome_build_directory, args.force_gmap_build)\n+ sys.stdout.flush()\n \n- if (args.download_mutation_indexes_url != ""):\n- download_mutation_indexes(source_url=args.download_mutation_indexes_url, \\\n+ if mutation_url_is_set:\n+ download_and_integrate_mutation_resources(source_url=args.download_mutation_resources_url, \\\n genome_build_directory=genome_build_directory, \\\n- force_download=args.new_mutation_indexes_download)\n+ cosmic_resources_location=args.cosmic_resources_location, \\\n+ force_new_download=args.new_mutation_download, \\\n+ force_new_integration=args.new_mutation_integration)\n \n # Need to get the genome name.\n genome_name = find_genome_name_in_path(args.download_url)\n if genome_name is None:\n genome_name = find_genome_name_in_path(genome_build_directory)\n if genome_name is None:\n- genome_name = find_genome_name_in_path(downloaded_directory)\n+ genome_name = find_genome_name_in_path(extracted_directory)\n if genome_name is None:\n genome_name = find_genome_name_in_path(args.source_location)\n if genome_name is None:\n@@ -845,6 +1528,7 @@\n if genome_name is None:\n genome_name = _CTAT_ResourceLib_DefaultGenome\n print "WARNING: We could not find a genome name in any of the directory paths."\n+ sys.stdout.flush()\n \n # Determine the display_name for the library.\n if (args.display_name is None) or (args.display_name == ""):\n@@ -861,18 +1545,34 @@\n print "The Genome Resource Library\'s display_name will be set to: {:s}\\n".format(display_name)\n print "Its unique_id will be set to: {:s}\\n".format(unique_id)\n print "Its dir_path will be set to: {:s}\\n".format(genome_build_directory)\n+ sys.stdout.flush()\n \n data_manager_dict = {}\n data_manager_dict[\'data_tables\'] = {}\n data_manager_dict[\'data_tables\'][\'ctat_genome_resource_libs\'] = []\n data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)\n data_manager_dict[\'data_tables\'][\'ctat_genome_resource_libs\'].append(data_table_entry)\n+ \n+ # Create the data table for the cravat_tissues, if the file is given:\n+ print "The cravat tissues file is: {:s}".format(str(args.cravat_tissues_filepath))\n+ if (args.cravat_tissues_filepath is not None) and (args.cravat_tissues_filepath != ""):\n+ data_manager_dict[\'data_tables\'][\'ctat_cravat_tissues\'] = []\n+ cravat_file = open(args.cravat_tissues_filepath, \'r\')\n+ for line in cravat_file:\n+ # print line\n+ if line[0] != \'#\':\n+ # The line is not a comment, so parse it.\n+ items = [item.strip() for item in line.split("\\t")]\n+ print items\n+ data_table_entry = dict(value=items[0], name=items[1], code=items[2], date=items[3])\n+ data_manager_dict[\'data_tables\'][\'ctat_cravat_tissues\'].append(data_table_entry)\n \n # Temporarily the output file\'s dictionary is written for debugging:\n print "The dictionary for the output file is:\\n\\t{:s}".format(str(data_manager_dict))\n+ sys.stdout.flush()\n # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,\n # which then puts it into the correct .loc file (I think).\n- # Comment out the following line when testing without galaxy package.\n+ # One can comment out the following line when testing without galaxy package.\n open(args.output_filename, \'wb\').write(to_json_string(data_manager_dict))\n \n if __name__ == "__main__":\n' |
b |
diff -r ea7bc21cbb7a -r da7f50809820 data_manager/add_ctat_resource_lib.xml --- a/data_manager/add_ctat_resource_lib.xml Mon Jul 16 19:56:38 2018 -0400 +++ b/data_manager/add_ctat_resource_lib.xml Sun Nov 11 21:02:34 2018 -0500 |
[ |
b'@@ -1,50 +1,68 @@\n <tool id="ctat_genome_resource_libs_data_manager" \n name="CTAT Genome Resource Libraries Data Manager" \n- version="1.0.0" tool_type="manage_data">\n+ version="2.0.0" tool_type="manage_data">\n+ <!-- This Data Manager tool was written by Cicada Dennis of Indiana University for the Broad Institute.\n+ -->\n <description>Retrieve, and/or specify the location of, a CTAT Genome Resource Library. \n </description>\n <requirements>\n <requirement type="package" version="2.7">python</requirement>\n <requirement type="package" version="0.5.0">fusion-filter</requirement>\n+ <requirement type="package" version="2.0.1">ctat-mutations</requirement>\n </requirements>\n- <command detect_errors="default">\n+ <command detect_errors="exit_code">\n <![CDATA[\n python $__tool_directory__/add_ctat_resource_lib.py \n --output_filename="${out_file}" \n- --display_name="${display_name}" \n- #if str($genome_resource_library.build_type) == "download_and_build":\n+ --display_name="${display_name}"\n+ --cravat_tissues_filepath="${__tool_directory__}/../tool-data/ctat_cravat_tissues.loc.sample" \n+ #if str( $genome_resource_library.build_type ) == "download_and_build":\n --download_url="${genome_resource_library.download_url}" \n --download_location="${genome_resource_library.download_destination}"\n- #if str($genome_resource_library.force_new_download) == "true":\n+ #if str( $genome_resource_library.force_new_download ) == "true":\n --new_archive_download\n #end if\n+ #if str( $genome_resource_library.keep_archive ) == "true":\n+ --keep_archive\n+ #end if\n+ #if str( $genome_resource_library.rebuild ) == "true":\n+ --new_library_build\n+ #end if\n+ #if str( $genome_resource_library.specify_build_location.build_location ) == "true":\n+ --build_location="${genome_resource_library.specify_build_location.different_build_location}"\n+ #end if\n+ #elif str( $genome_resource_library.build_type ) == "build_from_source":\n+ --source_location "${genome_resource_library.source_location}"\n+ #if str( $genome_resource_library.rebuild ) == "true":\n+ --new_library_build\n+ #end if\n+ #if str( $genome_resource_library.specify_build_location.build_location ) == "true":\n+ --build_location="${genome_resource_library.specify_build_location.different_build_location}"\n+ #end if\n+ #elif str( $genome_resource_library.build_type ) == "specify_built_location":\n+ --build_location="${genome_resource_library.built_library_location}"\n+ #end if\n+ \n+ #if str( $gmap_options.gmap_build ) == "true":\n+ --gmap_build \n+ #if str( $gmap_options.force_gmap_build ) == "true":\n+ --force_gmap_build\n+ #end if\n #end if\n- #if str($genome_resource_library.build_type) == "build_from_source":\n- --source_location "${genome_resource_library.source_location}"\n- --build_location "${genome_resource_library.built_library_location}" \n- #if str($genome_resource_library.rebuild) == "true":\n- --rebuild\n+ \n+ #if str( $mutation_lib_options.mutation_build ) == "true":\n+ --download_mutation_resources_url="${mutation_lib_options.source_url}" \n+ --cosmic_resources_location="${mutation_lib_options.cosmic_files_location}"\n+ #if str( $mutation_lib_options.force_download ) == "true":\n+ --new_mutation_download\n #end if\n- #end if\n- #if str($genome_resource_library.build_type) == "sp'..b' do a gmap_build on the library \n+ and also to integrate ctat-mutation resources into the library.\n+\n+ You will need approximately 62GB of space for a human genome resource library, once it is built, \n+ but if downloading and building, to be safe provide at least 75GB.\n+\n The installation of this tool takes some time, due to building a conda environment for the dependencies.\n- The download extracts the files during the download. The "source_data" files download faster, but then must be built.\n+ The "source_data" files download faster, but then must be built.\n Building the library from the "source_data" files can take many hours, depending on the resources of your machine.\n The "plug-n-play" can take considerable time to download, depending on your internet connection. Even with high speed,\n it is about 25GB that is transfered, so plan accordingly.\n+ If you have a good speed internet connection, downloading the plug-n-play will usually be faster than building.\n+\n+ **If a download or a build is interrupted, re-running the job should pick up where it left off.**\n+\n Neither the "source_data" nor the "plug-n-play" versions have had their gmap index built. If you are not going to be\n- using gmap_fusion, then you can uncheck the gmap-build check box and save the space and time building the index consumes.\n- Neither the "source_data" nor the "plug-n-play" versions have mutation indexes included. Those must be downloaded\n- separately. If you are not going to be using the mutation tool, uncheck the Download mutation indexes check box and\n- save the space and time it takes to include the mutation index files. \n- - FIX - \n- This version of the tool does not yet implement the download of mutation indexes.\n- - FIX -\n- If you already have a CTAT Genome Resource library installed on your system, \n- specify the full path of the location where it exists and leave the download box unchecked.\n- The Reference Genome name may be left empty if downloading. The filename will then be used as the selector text of the entry in the data table.\n+ using gmap_fusion, then you can uncheck the gmap_build check box and save the space and time building the index consumes.\n+\n+ Neither the "source_data" nor the "plug-n-play" versions have Mutation Resources included. \n+ Those must be downloaded separately and integrated into the Library. If you are going to be using the \n+ ctat_mutations tool, check the Download Mutation Library check box. Whether or not you check this box, the\n+ ctat_cravat_tissues table needed by the ctat_mutations tool will be created.\n+\n+ In order to integrate the Mutation Resources into a CTAT Genome Resource Library, you must have previously downloaded\n+ COSMIC resources (See Step 2 from https://github.com/NCIP/ctat-mutations/tree/master/mutation_lib_prep )\n+ You can place them directly into the Genome Resource Library location, or if the Library is \n+ not built yet, or you do not know the full path to it, specify the directory where the COSMIC files are, so they can be \n+ integrated into the Library. \n+\n+ The Mouse genome is not currently supported by ctat_mutations.\n+\n+ If the Reference Genome Display Name is left empty a name will be created, \n+ but any text that will best guide the user can be entered here. \n+ It will be the text that is used for selecting the library in pull down lists \n+ requiring a Genome Reference Library resource (These are stored in the ctat_genome_resource_libs table).\n+\n For more information on CTAT Genome Resource Libraries, \n- see <a http="https://github.com/FusionFilter/FusionFilter/wiki">FusionFilter</a>\n+ see https://github.com/FusionFilter/FusionFilter/wiki\n </help>\n <code file="add_ctat_resource_lib.py" />\n </tool>\n' |
b |
diff -r ea7bc21cbb7a -r da7f50809820 data_manager_conf.xml --- a/data_manager_conf.xml Mon Jul 16 19:56:38 2018 -0400 +++ b/data_manager_conf.xml Sun Nov 11 21:02:34 2018 -0500 |
b |
@@ -34,5 +34,21 @@ <!--</column> --> </output> </data_table> + <data_table name="ctat_cravat_tissues"> + <output> + <column name="value" /> + <!-- value is used to uniquely identify this entry in the table. + --> + <column name="name" /> + <!-- name is used as the selector in the pull down lists for items in this table. + --> + <column name="code" /> + <!-- . + --> + <column name="date" /> + <!-- . + --> + </output> + </data_table> </data_manager> </data_managers> |
b |
diff -r ea7bc21cbb7a -r da7f50809820 tool-data/ctat_cravat_tissues.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ctat_cravat_tissues.loc.sample Sun Nov 11 21:02:34 2018 -0500 |
b |
@@ -0,0 +1,27 @@ +Bladder Bladder Urothelial Carcinoma BLCA (TCGA) Jun 2013 +Blood-Lymphocyte Chronic Lymphocytic Leukemia CLL (ICGC) Mar 2013 +Blood-Myeloid Acute Myeloid Leukemia LAML (TCGA) Jun 2013 +Brain-Cerebellum Medulloblastoma MB (mixed source) Dec 2010 +Brain-Glioblastoma-Multiforme Glioblastoma Multiforme GBM (TCGA) Jun 2013 +Brain-Lower-Grade-Glioma Brain Lower Grade Glioma LGG (TCGA) Jun 2013 +Breast Breast Invasive Carcinoma BRCA (TCGA) Jun 12012 +Cervix Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma CESC (TCGA) Jun 2013 +Colon Colon Adenocarcinoma COAD (TCGA) Jun 2013 +Head and Neck Head and Neck Squamous Cell Carcinoma HNSC (TCGA) Jun 2013 +Kidney-Chromophobe Kidney Chromophobe KICH (TCGA) Jun 2013 +Kidney-Clear-Cell Kidney Renal Clear Cell Carcinoma KIRC (TCGA) Jun 2013 +Kidney-Papillary-Cell Kidney Renal Papillary Cell Carcinoma KIRP (TCGA) Jun 2013 +Liver-Nonviral Hepatocellular Carcinoma (Secondary to Alcohol and Adiposity) HCCA (ICGC) Mar 2013 +Liver-Viral Hepatocellular Carcinoma (Viral) HCCV (ICGC) Mar 2013 +Lung-Adenocarcinoma Lung Adenocarcinoma LUAD (TCGA) Jun 2013 +Lung-Squamous Cell Lung Squamous Cell Carcinoma LUSC (TCGA) Jun 2013 +Melanoma Melanoma ML (Yardena Samuels lab) Dec 2011 +Other General purpose OV (TCGA) Jun 2013 +Ovary Ovarian Serous Cystadenocarcinoma OV (TCGA) Jun 2013 +Pancreas Pancreatic Cancer PNCC (ICGC)) Mar 2013 +Prostate-Adenocarcinoma Prostate Adenocarcinoma PRAD (TCGA) Jun 2013 +Rectum Rectum Adenocarcinoma READ (TCGA) Jun 2013 +Skin Skin Cutaneous Melanoma SKCM (TCGA) Jun 2013 +Stomach Stomach Adenocarcinoma STAD (TCGA) Jun 2013 +Thyroid Thyroid Carcinoma THCA (TCGA) Jun 2013 +Uterus Uterine Corpus Endometriod Carcinoma UCEC (TCGA) Jun 2013 |
b |
diff -r ea7bc21cbb7a -r da7f50809820 tool_data_table_conf.xml.sample --- a/tool_data_table_conf.xml.sample Mon Jul 16 19:56:38 2018 -0400 +++ b/tool_data_table_conf.xml.sample Sun Nov 11 21:02:34 2018 -0500 |
b |
@@ -3,4 +3,8 @@ <columns>value, name, path</columns> <file path="tool-data/ctat_genome_resource_libs.loc" /> </table> + <table name="ctat_cravat_tissues" comment_char="#" allow_duplicate_entries="False"> + <columns>value, name, code, date</columns> + <file path="tool-data/ctat_cravat_tissues.loc" /> + </table> </tables> |