Repository 'data_manager_build_alfa_indexes'
hg clone https://toolshed.g2.bx.psu.edu/repos/biocomp-ibens/data_manager_build_alfa_indexes

Changeset 0:f68a60c3d768 (2018-05-16)
Commit message:
Uploaded
added:
data_manager_build_alfa_indexes/.shed.yml
data_manager_build_alfa_indexes/data_manager/ALFA.py
data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py
data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.xml
data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes_testchr.py
data_manager_build_alfa_indexes/data_manager/tool_dependencies.xml
data_manager_build_alfa_indexes/data_manager_conf.xml
data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample
data_manager_build_alfa_indexes/tool_data_table_conf.xml.sample
diff -r 000000000000 -r f68a60c3d768 data_manager_build_alfa_indexes/.shed.yml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/.shed.yml Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,13 @@
+categories:
+- Data Managers
+description: A tool to build ALFA indexes from an automatically downloaded GTF annotation file
+long_description: |
+ 1. The tool asks the admin to enter a 'species_name' and automatically downloads the latest release of the corresponding GTF annotation file from Ensembl.
+ 2. The tool calls ALFA to generate the ALFA indexes from this GTF file.
+ 3. The resulting indexes are stored in the child directory 'alfa_indexes' of the <galaxy_data_manager_data_path> directory defined in config/galaxy.ini.
+ 4. Finally, the tool adds the new entry to the table 'alfa_indexes.loc'. This .loc file is what the data table 'alfa_indexes' points to, as defined in config/shed_tool_data_table.conf.xml.
+ 5. At the end of the process, when a user runs alfa (https://toolshed.g2.bx.psu.edu/view/charles-bernard/alfa), the built-in indexes corresponding to the 'species_name' will be available.
+name: data_manager_build_alfa_indexes
+owner: charles_bernard
+remote_repository_url: https://github.com/charles-bernard/Galaxy_tools/tree/master/data_manager_build_alfa_indexes
+type: unrestricted
\ No newline at end of file
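
For orientation, the layout that steps 3 and 4 produce looks as follows; the Homo sapiens names are illustrative assumptions, not output of a real run:

    <galaxy_data_manager_data_path>/alfa_indexes/Homo_sapiens_GRCh38_92/
        Homo_sapiens.GRCh38.92.stranded.index
        Homo_sapiens.GRCh38.92.unstranded.index

The matching alfa_indexes.loc row then records this path prefix in its 'prefix' column, so the alfa tool can locate both index files.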
diff -r 000000000000 -r f68a60c3d768 data_manager_build_alfa_indexes/data_manager/ALFA.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/ALFA.py Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,1160 @@
[file truncated by the repository viewer. ALFA.py is the bundled ALFA script (Python 2, 1160 lines): it parses the GTF annotation (optionally together with a chromosome-lengths file, otherwise approximating each chromosome length from its last annotation), writes stranded and unstranded genome index files, counts reads per category/biotype pair, and plots the category and biotype distributions with matplotlib (PDF/SVG/PNG output).]
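
From the data manager's perspective, only the index-generation entry point of this script matters. A minimal sketch of that call, mirroring generate_alfa_indexes in the next file (the GTF file name is a hypothetical example):

    # Sketch: how the data manager drives the bundled ALFA.py.
    # 'Homo_sapiens.GRCh38.92.gtf' is an assumed file name.
    import subprocess
    proc = subprocess.Popen(['python', 'ALFA.py', '-a', 'Homo_sapiens.GRCh38.92.gtf'],
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    out, err = proc.communicate()
    # On success ALFA prints '### End of program'; the data manager then expects
    # Homo_sapiens.GRCh38.92.stranded.index and .unstranded.index beside the GTF.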
diff -r 000000000000 -r f68a60c3d768 data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,239 @@
[file truncated by the repository viewer. data_manager_build_alfa_indexes.py (Python 2, 239 lines) implements the data manager logic: it standardizes the species name, resolves the Ensembl (or Ensembl Genomes) FTP root for the chosen kingdom, greps the species list to verify the species exists, downloads and uncompresses the latest GTF archive, runs 'python ALFA.py -a <gtf>' to build the stranded/unstranded indexes, moves them into the dataset's extra_files_path, and writes the new 'alfa_indexes' table entry (species, version, release, value, dbkey, name, prefix) back to the JSON output file.]
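
The JSON handshake with Galaxy is the core of the script: Galaxy pre-fills the declared output file with parameters (notably the extra_files_path where the indexes must land), and the script overwrites it with the new data table entry. A minimal sketch, with all entry values assumed for illustration:

    # Sketch of the data manager JSON exchange; the entry values are hypothetical.
    from galaxy.util.json import from_json_string, to_json_string
    params = from_json_string(open('galaxy_output.json').read())
    target_directory = params['output_data'][0]['extra_files_path']  # where indexes go
    entry = {'species': 'Homo_sapiens', 'version': 'GRCh38', 'release': '92',
             'value': 'Homo_sapiens_GRCh38_92', 'dbkey': 'Homo_sapiens_GRCh38_92',
             'name': 'Homo_sapiens: GRCh38 (release 92)', 'prefix': 'Homo_sapiens.GRCh38.92'}
    # Writing the dict back is what registers the row in the 'alfa_indexes' table:
    open('galaxy_output.json', 'wb').write(to_json_string({'data_tables': {'alfa_indexes': entry}}))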
diff -r 000000000000 -r f68a60c3d768 data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.xml Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,57 @@
+<tool id="build_alfa_indexes" name="ALFA indexes" version="0.0.1" tool_type="manage_data">
+  <description>build ALFA indexes from an automatically downloaded GTF annotation file</description>
+    <!-- ALFA requires bedtools suite v2.20.0 and above -->
+  <requirements>
+      <requirement type="package" version="2.24">bedtools</requirement>
+      <requirement type="package" version="1.2">samtools</requirement>
+      <requirement type="package" version="1.4">matplotlib</requirement>
+  </requirements>
+  
+  <command interpreter="python">data_manager_build_alfa_indexes.py -e "${reference_source['kingdom']}" "${reference_source['species_name']}" -o "${out_file}" "$__tool_directory__"</command>
+
+  <inputs>
+    <conditional name="reference_source">
+      <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
+        <option value="ensembl" selected="True">Ensembl Genomes Websites</option>
+      </param>
+      <when value="ensembl">
+        <param name="kingdom" type="select" label="Select the ensembl website where to fetch the genome">
+          <option value="vertebrates" selected="True">Ensembl (Vertebrates)</option>
+          <option value="bacteria" selected="True">Ensembl Bacteria</option>
+          <option value="fungi" selected="True">Ensembl Fungi</option>
+          <option value="metazoa" selected="True">Ensembl Metazoa</option>
+          <option value="plants" selected="True">Ensembl Plants</option>
+          <option value="protists" selected="True">Ensembl Protists</option>
+        </param>
+          <param name="species_name" type="text" value="Homo sapiens" label="Complete Species_Name" optional="False">
+          <validator type="empty_field" message="Please, enter a species name."/>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+  
+   <outputs>
+      <data name="out_file" format="data_manager_json"/>
+  </outputs>
+
+    <help>
+<![CDATA[
+**What it does**
+
+
+  | 1. The tool asks the admin to enter a *species_name* and automatically downloads the latest release of the corresponding GTF annotation file from Ensembl.
+  |
+  | 2. The tool calls ALFA.py to generate the ALFA indexes from this GTF file.
+  |
+  | 3. The resulting indexes are stored in the child directory *alfa_indexes/* of the *<galaxy_data_manager_data_path>* directory defined in *config/galaxy.ini*
+  |
+  | 4. Finally, the tool adds the new entry to the table *alfa_indexes.loc*. This .loc file is what the data table *alfa_indexes* points to, as defined in *config/shed_tool_data_table.conf.xml*
+  |
+  | 5. At the end of the process, when a user runs `alfa`_, the built-in indexes corresponding to the *species_name* will be available
+
+  .. _alfa: https://toolshed.g2.bx.psu.edu/view/charles-bernard/alfa
+
+]]>
+     </help>
+
+</tool>
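
With the inputs above, Galaxy expands the command template roughly as follows (paths are placeholders):

    python data_manager_build_alfa_indexes.py -e "vertebrates" "Homo sapiens" \
        -o "/galaxy/database/jobs/.../out_file.json" "/galaxy/shed_tools/.../data_manager"

the trailing positional argument being $__tool_directory__, from which the script locates the bundled ALFA.py.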
diff -r 000000000000 -r f68a60c3d768 data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes_testchr.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes_testchr.py Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,272 @@
[file truncated by the repository viewer. data_manager_build_alfa_indexes_testchr.py (Python 2, 272 lines) is a variant of data_manager_build_alfa_indexes.py that additionally handles the Ensembl '<species>.<version>.<release>.chr.gtf' archive when present: it builds a second pair of indexes from the chromosome-only GTF and appends a second 'alfa_indexes' entry whose value, name and prefix carry a '.chr' suffix.]
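
The only difference in the resulting table rows is the '.chr' suffix; with the same hypothetical Homo sapiens values as above, chr_get_data_table_new_entry yields:

    value  = 'Homo_sapiens_GRCh38_92.chr'
    dbkey  = 'Homo_sapiens_GRCh38_92.chr'
    name   = 'Homo_sapiens: GRCh38 (release 92) - Chr'
    prefix = 'Homo_sapiens.GRCh38.92.chr'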
diff -r 000000000000 -r f68a60c3d768 data_manager_build_alfa_indexes/data_manager/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/tool_dependencies.xml Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<tool_dependency>
+ <package name="bedtools" version="2.24">
+ <repository changeset_revision="3416a1d4a582" name="package_bedtools_2_24" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
+ </package>
+    <package name="samtools" version="1.2">
+     <repository changeset_revision="f6ae3ba3f3c1" name="package_samtools_1_2" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="matplotlib" version="1.4">
+     <repository changeset_revision="f7424e1cf115" name="package_python_2_7_matplotlib_1_4" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>
diff -r 000000000000 -r f68a60c3d768 data_manager_build_alfa_indexes/data_manager_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager_conf.xml Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/data_manager_build_alfa_indexes.xml" id="build_alfa_indexes">
+        <data_table name="alfa_indexes">
+            <output>
+                <column name="species" />
+                <column name="version" />
+                <column name="release" />
+                <column name="value" />
+                <column name="dbkey" />
+                <column name="name" />
+                <column name="prefix" output_ref="out_file">
+                    <move type="directory">
+                        <!-- <source>{prefix}</source> - out_file.extra_files_path is used as base by default -->
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">alfa_indexes/${dbkey}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/alfa_indexes/${dbkey}/${species}.${version}.${release}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
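
Concretely, after the <move> relocates the indexes, the two value_translation steps turn the prefix column into an absolute path such as (dbkey assumed for illustration):

    ${GALAXY_DATA_MANAGER_DATA_PATH}/alfa_indexes/Homo_sapiens_GRCh38_92/Homo_sapiens.GRCh38.92

i.e. the prefix from which the .stranded.index and .unstranded.index file names are formed.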
diff -r 000000000000 -r f68a60c3d768 data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,2 @@
+#<species> <version> <release> <value> <dbkey> <name> <prefix>
+#Dictyostelium_discoideum dicty_2 7 Dictyostelium_discoideum_dicty_2_7 Dictyostelium_discoideum_dicty_2_7 Dictyostelium_discoideum: dicty_2 (release 7) <path_to_dicty_indexes_dir>
diff -r 000000000000 -r f68a60c3d768 data_manager_build_alfa_indexes/tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/tool_data_table_conf.xml.sample Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Locations of all alfa indexes -->
+    <table name="alfa_indexes" comment_char="#" allow_duplicate_entries="False">
+        <columns>species, version, release, value, dbkey, name, prefix</columns>
+        <file path="tool-data/alfa_indexes.loc" />
+    </table>
+</tables>