Galaxy |

Changeset 0:c9699375fcf6 (2016-07-13)

Next changeset 1:4da91bb244dc (2016-07-14)

Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/glimmer_hmm commit 0dc67759bcbdf5a8a285ded9ba751340d741fe63

added:
glimmerHMM/BCBio/GFF/GFFOutput.py
glimmerHMM/BCBio/GFF/GFFParser.py
glimmerHMM/BCBio/GFF/__init__.py
glimmerHMM/BCBio/GFF/_utils.py
glimmerHMM/BCBio/__init__.py
glimmerHMM/__init__.py
glimmerHMM/glimmerhmm_gff_to_sequence.py
glimmerHMM/glimmerhmm_predict.xml
glimmerHMM/glimmerhmm_tabular_to_sequence.py
glimmerHMM/glimmerhmm_to_sequence.py
glimmerHMM/glimmerhmm_to_sequence.xml
glimmer_hmm.loc.sample
readme.md
readme.txt
tool_conf.xml

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/BCBio/GFF/GFFOutput.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/GFFOutput.py Wed Jul 13 10:55:48 2016 -0400

[

@@ -0,0 +1,169 @@
+"""Output Biopython SeqRecords and SeqFeatures to GFF3 format.
+
+The target format is GFF3, the current GFF standard:
+    http://www.sequenceontology.org/gff3.shtml
+"""
+import urllib
+
+class _IdHandler:
+    """Generate IDs for GFF3 Parent/Child relationships where they don't exist.
+    """
+    def __init__(self):
+        self._prefix = "biopygen"
+        self._counter = 1
+        self._seen_ids = []
+
+    def _generate_id(self, quals):
+        """Generate a unique ID not present in our existing IDs.
+        """
+        gen_id = self._get_standard_id(quals)
+        if gen_id is None:
+            while 1:
+                gen_id = "%s%s" % (self._prefix, self._counter)
+                if gen_id not in self._seen_ids:
+                    break
+                self._counter += 1
+        return gen_id
+
+    def _get_standard_id(self, quals):
+        """Retrieve standardized IDs from other sources like NCBI GenBank.
+
+        This tries to find IDs from known key/values when stored differently
+        than GFF3 specifications.
+        """
+        possible_keys = ["transcript_id", "protein_id"]
+        for test_key in possible_keys:
+            if quals.has_key(test_key):
+                cur_id = quals[test_key]
+                if isinstance(cur_id, tuple) or isinstance(cur_id, list):
+                    return cur_id[0]
+                else:
+                    return cur_id
+        return None
+
+    def update_quals(self, quals, has_children):
+        """Update a set of qualifiers, adding an ID if necessary.
+        """
+        cur_id = quals.get("ID", None)
+        # if we have an ID, record it
+        if cur_id:
+            if not isinstance(cur_id, list) and not isinstance(cur_id, tuple):
+                cur_id = [cur_id]
+            for add_id in cur_id:
+                self._seen_ids.append(add_id)
+        # if we need one and don't have it, create a new one
+        elif has_children:
+            new_id = self._generate_id(quals)
+            self._seen_ids.append(new_id)
+            quals["ID"] = [new_id]
+        return quals
+
+class GFF3Writer:
+    """Write GFF3 files starting with standard Biopython objects.
+    """
+    def __init__(self):
+        pass
+
+    def write(self, recs, out_handle):
+        """Write the provided records to the given handle in GFF3 format.
+        """
+        id_handler = _IdHandler()
+        self._write_header(out_handle)
+        for rec in recs:
+            self._write_rec(rec, out_handle)
+            self._write_annotations(rec.annotations, rec.id, out_handle)
+            for sf in rec.features:
+                sf = self._clean_feature(sf)
+                id_handler = self._write_feature(sf, rec.id, out_handle,
+                        id_handler)
+
+    def _clean_feature(self, feature):
+        quals = {}
+        for key, val in feature.qualifiers.items():
+            if not isinstance(val, (list, tuple)):
+                val = [val]
+            val = [str(x) for x in val]
+            quals[key] = val
+        feature.qualifiers = quals
+        clean_sub = [self._clean_feature(f) for f in feature.sub_features]
+        feature.sub_features = clean_sub
+        return feature
+
+    def _write_rec(self, rec, out_handle):
+        # if we have a SeqRecord, write out optional directive
+        if len(rec.seq) > 0:
+            out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq)))
+
+    def _write_feature(self, feature, rec_id, out_handle, id_handler,
+            parent_id=None):
+        """Write a feature with location information.
+        """
+        if feature.strand == 1:
+            strand = '+'
+        elif feature.strand == -1:
+            strand = '-'
+        else:
+            strand = '.'
+        # remove any standard features from the qualifiers
+        quals = feature.qualifiers.copy()
+        for std_qual in ["source", "score", "phase"]:
+            if quals.has_key(std_qual) and len(quals[std_qual]) == 1:
+                del quals[std_qual]
+        # add a link to a parent identifier if it exists
+        if parent_id:
+            if not quals.has_key("Parent"):
+                quals["Parent"] = []
+            quals["Parent"].append(parent_id)
+        quals = id_handler.update_quals(quals, len(feature.sub_features) > 0)
+        if feature.type:
+            ftype = feature.type
+        else:
+            ftype = "sequence_feature"
+        parts = [str(rec_id),
+                 feature.qualifiers.get("source", ["feature"])[0],
+                 ftype,
+                 str(feature.location.nofuzzy_start + 1), # 1-based indexing
+                 str(feature.location.nofuzzy_end),
+                 feature.qualifiers.get("score", ["."])[0],
+                 strand,
+                 str(feature.qualifiers.get("phase", ["."])[0]),
+                 self._format_keyvals(quals)]
+        out_handle.write("\t".join(parts) + "\n")
+        for sub_feature in feature.sub_features:
+            id_handler = self._write_feature(sub_feature, rec_id, out_handle,
+                    id_handler, quals["ID"][0])
+        return id_handler
+
+    def _format_keyvals(self, keyvals):
+        format_kvs = []
+        for key, values in keyvals.items():
+            key = key.strip()
+            format_vals = []
+            if not isinstance(values, list) or isinstance(values, tuple):
+                values = [values]
+            for val in values:
+                val = urllib.quote(str(val).strip())
+                if ((key and val) and val not in format_vals):
+                    format_vals.append(val)
+            format_kvs.append("%s=%s" % (key, ",".join(format_vals)))
+        return ";".join(format_kvs)
+
+    def _write_annotations(self, anns, rec_id, out_handle):
+        """Add annotations which refer to an entire sequence.
+        """
+        format_anns = self._format_keyvals(anns)
+        if format_anns:
+            parts = [rec_id, "annotation", "remark", ".", ".", ".", ".", ".",
+                     format_anns]
+            out_handle.write("\t".join(parts) + "\n")
+
+    def _write_header(self, out_handle):
+        """Write out standard header directives.
+        """
+        out_handle.write("##gff-version 3\n")
+
+def write(recs, out_handle):
+    """High level interface to write GFF3 files from SeqRecords and SeqFeatures.
+    """
+    writer = GFF3Writer()
+    return writer.write(recs, out_handle)

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/BCBio/GFF/GFFParser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/GFFParser.py Wed Jul 13 10:55:48 2016 -0400

[

b'@@ -0,0 +1,835 @@\n+"""Parse GFF files into features attached to Biopython SeqRecord objects.\n+\n+This deals with GFF3 formatted files, a tab delimited format for storing\n+sequence features and annotations:\n+\n+http://www.sequenceontology.org/gff3.shtml\n+\n+It will also deal with older GFF versions (GTF/GFF2):\n+\n+http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml\n+http://mblab.wustl.edu/GTF22.html\n+\n+The implementation utilizes map/reduce parsing of GFF using Disco. Disco\n+(http://discoproject.org) is a Map-Reduce framework for Python utilizing\n+Erlang for parallelization. The code works on a single processor without\n+Disco using the same architecture.\n+"""\n+import os\n+import copy\n+import re\n+import collections\n+import urllib\n+import itertools\n+\n+# Make defaultdict compatible with versions of python older than 2.4\n+try:\n+ collections.defaultdict\n+except AttributeError:\n+ import _utils\n+ collections.defaultdict = _utils.defaultdict\n+\n+from Bio.Seq import Seq, UnknownSeq\n+from Bio.SeqRecord import SeqRecord\n+from Bio.SeqFeature import SeqFeature, FeatureLocation\n+from Bio import SeqIO\n+\n+def _gff_line_map(line, params):\n+ """Map part of Map-Reduce; parses a line of GFF into a dictionary.\n+\n+ Given an input line from a GFF file, this:\n+ - decides if the file passes our filtering limits\n+ - if so:\n+ - breaks it into component elements\n+ - determines the type of attribute (flat, parent, child or annotation)\n+ - generates a dictionary of GFF info which can be serialized as JSON\n+ """\n+ gff3_kw_pat = re.compile("\\w+=")\n+ def _split_keyvals(keyval_str):\n+ """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.\n+\n+ GFF3 has key value pairs like:\n+ count=9;gene=amx-2;sequence=SAGE:aacggagccg\n+ GFF2 and GTF have: \n+ Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"\n+ name "fgenesh1_pg.C_chr_1000003"; transcriptId 869\n+ """\n+ quals = collections.defaultdict(list)\n+ if keyval_str is None:\n+ return quals\n+ # ensembl GTF has a stray semi-colon at the end\n+ if keyval_str[-1] == \';\':\n+ keyval_str = keyval_str[:-1]\n+ # GFF2/GTF has a semi-colon with at least one space after it.\n+ # It can have spaces on both sides; wormbase does this.\n+ # GFF3 works with no spaces.\n+ # Split at the first one we can recognize as working\n+ parts = keyval_str.split(" ; ")\n+ if len(parts) == 1:\n+ parts = keyval_str.split("; ")\n+ if len(parts) == 1:\n+ parts = keyval_str.split(";")\n+ # check if we have GFF3 style key-vals (with =)\n+ is_gff2 = True\n+ if gff3_kw_pat.match(parts[0]):\n+ is_gff2 = False\n+ key_vals = [p.split(\'=\') for p in parts]\n+ # otherwise, we are separated by a space with a key as the first item\n+ else:\n+ pieces = []\n+ for p in parts:\n+ # fix misplaced semi-colons in keys in some GFF2 files\n+ if p and p[0] == \';\':\n+ p = p[1:]\n+ pieces.append(p.strip().split(" "))\n+ key_vals = [(p[0], " ".join(p[1:])) for p in pieces]\n+ for item in key_vals:\n+ # standard in-spec items are key=value\n+ if len(item) == 2:\n+ key, val = item\n+ # out-of-spec files can have just key values. We set an empty value\n+ # which will be changed to true later to standardize.\n+ else:\n+ assert len(item) == 1, item\n+ key = item[0]\n+ val = \'\'\n+ # remove quotes in GFF2 files\n+ if (len(val) > 0 and val[0] == \'"\' and val[-1] == \'"\'):\n+ val = val[1:-1] \n+ if val:\n+ quals[key].extend([v for v in val.split(\',\') if v])\n+ # if we don\'t have a value, make this a key=True/False style\n+ '..b' the\n+ information you need. This class provides high level summary details to\n+ help in learning.\n+ """\n+ def __init__(self):\n+ self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2],\n+ gff_source = [1], gff_type = [2])\n+ \n+ def _get_local_params(self, limit_info=None):\n+ class _LocalParams:\n+ def __init__(self):\n+ self.jsonify = False\n+ params = _LocalParams()\n+ params.limit_info = limit_info\n+ params.filter_info = self._filter_info\n+ return params\n+ \n+ @_file_or_handle\n+ def available_limits(self, gff_handle):\n+ """Return dictionary information on possible limits for this file.\n+\n+ This returns a nested dictionary with the following structure:\n+ \n+ keys -- names of items to filter by\n+ values -- dictionary with:\n+ keys -- filter choice\n+ value -- counts of that filter in this file\n+\n+ Not a parallelized map-reduce implementation.\n+ """\n+ cur_limits = dict()\n+ for filter_key in self._filter_info.keys():\n+ cur_limits[filter_key] = collections.defaultdict(int)\n+ for line in gff_handle:\n+ # when we hit FASTA sequences, we are done with annotations\n+ if line.startswith("##FASTA"):\n+ break\n+ # ignore empty and comment lines\n+ if line.strip() and line.strip()[0] != "#":\n+ parts = [p.strip() for p in line.split(\'\\t\')]\n+ assert len(parts) == 9, line\n+ for filter_key, cur_indexes in self._filter_info.items():\n+ cur_id = tuple([parts[i] for i in cur_indexes])\n+ cur_limits[filter_key][cur_id] += 1\n+ # get rid of the default dicts\n+ final_dict = dict()\n+ for key, value_dict in cur_limits.items():\n+ if len(key) == 1:\n+ key = key[0]\n+ final_dict[key] = dict(value_dict)\n+ gff_handle.close()\n+ return final_dict\n+\n+ @_file_or_handle\n+ def parent_child_map(self, gff_handle):\n+ """Provide a mapping of parent to child relationships in the file.\n+\n+ Returns a dictionary of parent child relationships:\n+\n+ keys -- tuple of (source, type) for each parent\n+ values -- tuple of (source, type) as children of that parent\n+ \n+ Not a parallelized map-reduce implementation.\n+ """\n+ # collect all of the parent and child types mapped to IDs\n+ parent_sts = dict()\n+ child_sts = collections.defaultdict(list)\n+ for line in gff_handle:\n+ # when we hit FASTA sequences, we are done with annotations\n+ if line.startswith("##FASTA"):\n+ break\n+ if line.strip():\n+ line_type, line_info = _gff_line_map(line,\n+ self._get_local_params())[0]\n+ if (line_type == \'parent\' or (line_type == \'child\' and\n+ line_info[\'id\'])):\n+ parent_sts[line_info[\'id\']] = (\n+ line_info[\'quals\'][\'source\'][0], line_info[\'type\'])\n+ if line_type == \'child\':\n+ for parent_id in line_info[\'quals\'][\'Parent\']:\n+ child_sts[parent_id].append((\n+ line_info[\'quals\'][\'source\'][0], line_info[\'type\']))\n+ #print parent_sts, child_sts\n+ # generate a dictionary of the unique final type relationships\n+ pc_map = collections.defaultdict(list)\n+ for parent_id, parent_type in parent_sts.items():\n+ for child_type in child_sts[parent_id]:\n+ pc_map[parent_type].append(child_type)\n+ pc_final_map = dict()\n+ for ptype, ctypes in pc_map.items():\n+ unique_ctypes = list(set(ctypes))\n+ unique_ctypes.sort()\n+ pc_final_map[ptype] = unique_ctypes\n+ return pc_final_map\n'

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/BCBio/GFF/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/__init__.py Wed Jul 13 10:55:48 2016 -0400

@@ -0,0 +1,4 @@
+"""Top level of GFF parsing providing shortcuts for useful classes.
+"""
+from GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple
+from GFFOutput import GFF3Writer, write

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/BCBio/GFF/_utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/_utils.py Wed Jul 13 10:55:48 2016 -0400

[

@@ -0,0 +1,37 @@
+class defaultdict(dict):
+    """Back compatible defaultdict: http://code.activestate.com/recipes/523034/
+    """
+    def __init__(self, default_factory=None, *a, **kw):
+        if (default_factory is not None and
+            not hasattr(default_factory, '__call__')):
+            raise TypeError('first argument must be callable')
+        dict.__init__(self, *a, **kw)
+        self.default_factory = default_factory
+    def __getitem__(self, key):
+        try:
+            return dict.__getitem__(self, key)
+        except KeyError:
+            return self.__missing__(key)
+    def __missing__(self, key):
+        if self.default_factory is None:
+            raise KeyError(key)
+        self[key] = value = self.default_factory()
+        return value
+    def __reduce__(self):
+        if self.default_factory is None:
+            args = tuple()
+        else:
+            args = self.default_factory,
+        return type(self), args, None, None, self.items()
+    def copy(self):
+        return self.__copy__()
+    def __copy__(self):
+        return type(self)(self.default_factory, self)
+    def __deepcopy__(self, memo):
+        import copy
+        return type(self)(self.default_factory,
+                          copy.deepcopy(self.items()))
+    def __repr__(self):
+        return 'defaultdict(%s, %s)' % (self.default_factory,
+                                        dict.__repr__(self))
+

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/BCBio/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/__init__.py Wed Jul 13 10:55:48 2016 -0400

@@ -0,0 +1,1 @@
+__version__="0.1"

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/__init__.py Wed Jul 13 10:55:48 2016 -0400

@@ -0,0 +1,3 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/glimmerhmm_gff_to_sequence.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_gff_to_sequence.py Wed Jul 13 10:55:48 2016 -0400

[

@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+"""Convert GlimmerHMM GFF3 gene predictions into protein sequences.
+
+This works with the GlimmerHMM GFF3 output format:
+
+##gff-version 3
+##sequence-region Contig5.15 1 47390
+Contig5.15      GlimmerHMM      mRNA    323     325     .       +       .       ID=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1
+Contig5.15      GlimmerHMM      CDS     323     325     .       +       0       ID=Contig5.15.cds1.1;Parent=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1;Note=final-exon
+
+http://www.cbcb.umd.edu/software/GlimmerHMM/
+
+Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmergff_to_proteins.py
+
+Usage:
+    glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True>
+"""
+import sys
+import os
+import operator
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+from BCBio import GFF
+
+def main(glimmer_file, ref_file, out_file, to_protein):
+    with open(ref_file) as in_handle:
+        ref_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))
+
+    base, ext = os.path.splitext(glimmer_file)
+
+    with open(out_file, "w") as out_handle:
+        SeqIO.write(protein_recs(glimmer_file, ref_recs, to_protein), out_handle, "fasta")
+
+def protein_recs(glimmer_file, ref_recs, to_protein):
+    """Generate protein records from GlimmerHMM gene predictions.
+    """
+    with open(glimmer_file) as in_handle:
+        for rec in glimmer_predictions(in_handle, ref_recs):
+            for feature in rec.features:
+                seq_exons = []
+                for cds in feature.sub_features:
+                    seq_exons.append(rec.seq[
+                        cds.location.nofuzzy_start:
+                        cds.location.nofuzzy_end])
+                gene_seq = reduce(operator.add, seq_exons)
+                if feature.strand == -1:
+                    gene_seq = gene_seq.reverse_complement()
+
+                if to_protein:
+                    yield SeqRecord(gene_seq.translate(), feature.qualifiers["ID"][0], "", "")
+                else:
+                    yield SeqRecord(gene_seq, feature.qualifiers["ID"][0], "", "")
+
+def glimmer_predictions(in_handle, ref_recs):
+    """Parse Glimmer output, generating SeqRecord and SeqFeatures for predictions
+    """
+    for rec in GFF.parse(in_handle, target_lines=1000, base_dict=ref_recs):
+        yield rec
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print __doc__
+        sys.exit()
+    main(*sys.argv[1:])

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/glimmerhmm_predict.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_predict.xml Wed Jul 13 10:55:48 2016 -0400

[

@@ -0,0 +1,66 @@
+<tool id="glimmerhmm_predict" name="GlimmerHMM" version="2.0">
+    <description>Predict ORFs in eukaryotic genomes</description>
+    <command detect_errors="exit_code"><![CDATA[
+     glimmerhmm
+     $input $trained_specie.fields.path
+     -o $output
+     -g
+     $svm_splice_prediction
+     $partial_gene
+     #if str($top_n_predictions) != "-1":
+         -n $top_n_predictions
+        #end if
+ ]]></command>
+    <inputs>
+        <param name="input" type="data" format="fasta" label="Genome Sequence"/>
+        <param name="trained_specie" type="select" label="Select a specie">
+            <options from_data_table="glimmer_hmm_trained_dir">
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No indexes are available"/>
+            </options>
+        </param>
+        <param name="partial_gene" type="boolean" label="Don't make partial gene predictions" truevalue="-f" falsevalue="" checked="false" />
+        <param name="svm_splice_prediction" type="boolean" label="Don't use svm splice site predictions" truevalue="-v" falsevalue="" checked="false" />
+        <param name="top_n_predictions" type="integer" label="top n best predictions, -1 means infinite" value="-1"/>
+    </inputs>
+    <outputs>
+        <data format="gff3" name="output" />
+    </outputs>
+    <help>
+
+        **What it does**
+
+        GlimmerHMM is a new gene finder based on a Generalized Hidden Markov Model (GHMM).
+        Although the gene finder conforms to the overall mathematical framework of a GHMM,
+        additionally it incorporates splice site models adapted from the GeneSplicer program and a
+        decision tree adapted from GlimmerM. It also utilizes Interpolated Markov Models for the
+        coding and noncoding models . Currently, GlimmerHMM's GHMM structure includes introns of each phase,
+        intergenic regions, and four types of exons (initial, internal, final, and single).
+        A basic user manual can be consulted here.
+
+        -----
+
+        **Example**
+
+        Suppose you have the following DNA formatted sequences::
+
+        >SQ   Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
+        cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
+        ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
+        cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
+        cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
+        ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg
+
+        Running this tool will produce this::
+
+        ##gff-version 3
+        ##sequence-region ConsensusfromCH236920mapping 1 4148552
+        ConsensusfromCH236920mapping  GlimmerHMM  mRNA  1       122     .   +   .   ID=ConsensusfromCH236920mapping.path1.gene1;Name=ConsensusfromCH236920mapping.path1.gene1
+        ConsensusfromCH236920mapping  GlimmerHMM  CDS   1       122     .   +   0   ID=ConsensusfromCH236920mapping.cds1.1;
+        ConsensusfromCH236920mapping  GlimmerHMM  mRNA  14066   15205   .   -   .   ID=ConsensusfromCH236920mapping.path1.gene2;Name=ConsensusfromCH236920mapping.path1.gene2
+        ConsensusfromCH236920mapping  GlimmerHMM  CDS   14066   15034   .   -   0   ID=ConsensusfromCH236920mapping.cds2.1;
+        ConsensusfromCH236920mapping  GlimmerHMM  CDS   15137   15205   .   -   0   ID=ConsensusfromCH236920mapping.cds2.2;
+        ConsensusfromCH236920mapping  GlimmerHMM  mRNA  19910   24210   .   -   .   ID=ConsensusfromCH236920mapping.path1.gene3;Name=ConsensusfromCH236920mapping.path1.gene3
+
+    </help>
+</tool>

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/glimmerhmm_tabular_to_sequence.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_tabular_to_sequence.py Wed Jul 13 10:55:48 2016 -0400

[

@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+"""Convert GlimmerHMM gene predictions into protein sequences.
+
+This works with the GlimmerHMM specific output format:
+
+  12    1  +  Initial       10748      10762       15
+  12    2  +  Internal      10940      10971       32
+  12    3  +  Internal      11035      11039        5
+  12    4  +  Internal      11072      11110       39
+  12    5  +  Internal      11146      11221       76
+  12    6  +  Terminal      11265      11388      124
+
+http://www.cbcb.umd.edu/software/GlimmerHMM/
+
+Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py
+
+Usage:
+    glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True>
+"""
+
+import sys
+import os
+import operator
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+def main(glimmer_file, ref_file, out_file, to_protein):
+    with open(ref_file) as in_handle:
+        ref_rec = SeqIO.read(in_handle, "fasta")
+
+    base, ext = os.path.splitext(glimmer_file)
+
+    with open(out_file, "w") as out_handle:
+        SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle, "fasta")
+
+def protein_recs(glimmer_file, ref_rec, to_protein):
+    """Generate protein records
+    """
+    with open(glimmer_file) as in_handle:
+        for gene_num, exons, strand in glimmer_predictions(in_handle):
+            seq_exons = []
+            for start, end in exons:
+                seq_exons.append(ref_rec.seq[start:end])
+            gene_seq = reduce(operator.add, seq_exons)
+            if strand == '-':
+                gene_seq = gene_seq.reverse_complement()
+            if to_protein:
+                yield SeqRecord(gene_seq.translate(), gene_num, "", "")
+            else:
+                yield SeqRecord(gene_seq, gene_num, "", "")
+
+def glimmer_predictions(in_handle):
+    """Parse Glimmer output, generating a exons and strand for each prediction.
+    """
+    # read the header
+    while 1:
+        line = in_handle.readline()
+        if line.startswith("   #    #"):
+            break
+    in_handle.readline()
+    # read gene predictions one at a time
+    cur_exons, cur_gene_num, cur_strand = ([], None, None)
+    while 1:
+        line = in_handle.readline()
+        if not line:
+            break
+        parts = line.strip().split()
+        # new exon
+        if len(parts) == 0:
+            yield cur_gene_num, cur_exons, cur_strand
+            cur_exons, cur_gene_num, cur_strand = ([], None, None)
+        else:
+            this_gene_num = parts[0]
+            this_strand = parts[2]
+            this_start = int(parts[4]) - 1 # 1 based
+            this_end = int(parts[5])
+            if cur_gene_num is None:
+                cur_gene_num = this_gene_num
+                cur_strand = this_strand
+            else:
+                assert cur_gene_num == this_gene_num
+                assert cur_strand == this_strand
+            cur_exons.append((this_start, this_end))
+    if len(cur_exons) > 0:
+        yield cur_gene_num, cur_exons, cur_strand
+
+if __name__ == "__main__":
+    if len(sys.argv) != 5:
+        print __doc__
+        sys.exit()
+    main(*sys.argv[1:])

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/glimmerhmm_to_sequence.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_to_sequence.py Wed Jul 13 10:55:48 2016 -0400

[

@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+"""Convert GlimmerHMM gene predictions into protein sequences.
+
+This works with both the GFF and the costumn Tabular Output.
+And is only a wrapper to call the appropiate scripts.
+
+Usage:
+    glimmerhmm_to_sequence.py <glimmer output> <ref fasta> <output file> <format> <protein>
+
+"""
+import sys
+import os
+import glimmerhmm_tabular_to_sequence
+import glimmerhmm_gff_to_sequence
+
+def main(glimmer_file, ref_file, out_file, to_protein = False):
+    if to_protein == 'True':
+        to_protein = True
+    else:
+        to_protein = False
+
+    glimmerhmm_gff_to_sequence.main(glimmer_file, ref_file, out_file, to_protein)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 5:
+        print __doc__
+        sys.exit()
+    main(*sys.argv[1:])

diff -r 000000000000 -r c9699375fcf6 glimmerHMM/glimmerhmm_to_sequence.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_to_sequence.xml Wed Jul 13 10:55:48 2016 -0400

[

@@ -0,0 +1,29 @@
+<tool id="glimmerhmm_to_sequence" name="GlimmerHMM to Sequence" version="0.1">
+    <description>converter</description>
+    <command interpreter='python'>
+<![CDATA[
+    glimmerhmm_to_sequence.py
+        $glimmerhmm_input
+        $genome_input
+        $output
+        $to_protein
+]]>
+    </command>
+    <inputs>
+        <param name="genome_input" type="data" format="fasta" label="Genome Sequence" />
+        <param name="glimmerhmm_input" type="data" format="gff" label="GlimmerHmm result File" />
+        <param name="to_protein" type="boolean" label="Convert to protein sequence" truevalue="True" falsevalue="False" checked="false" />
+    </inputs>
+    <outputs>
+        <data name="output" format="fasta" />
+    </outputs>
+    <help>
+<![CDATA[
+
+**What it does**
+
+It converts the GlimmerHMM GFF output to a Fasta File.
+
+]]>
+    </help>
+</tool>

diff -r 000000000000 -r c9699375fcf6 glimmer_hmm.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_hmm.loc.sample Wed Jul 13 10:55:48 2016 -0400

@@ -0,0 +1,16 @@
+#This file lists the locations of all the trained_dir files
+#under the "trained_dir" directory (a directory that contains a directory
+#for each organism used by glimmer_hmm).
+#This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_id> <display_name> <file_path>
+#
+#glimmer_hmm.loc could look something like this:
+#
+#human_gc_0-43 Human with GC Content 0 to 43% /path/to/trained_dir/human
+#celegans Celegan /path/to/trained_dir/Celegans
+#arabidopsis Arabidopsis /path/to/trained_dir/arabidopsis
+#rice Rice /path/to/trained_dir/rice
+#zebrafish Zebrafish /path/to/trained_dir/zebrafish
+#

diff -r 000000000000 -r c9699375fcf6 readme.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.md Wed Jul 13 10:55:48 2016 -0400

[

@@ -0,0 +1,82 @@
+Galaxy wrapper for GlimmerHMM
+=====================================
+
+This wrapper has been written in 2012 by Björn Grüning, and updated by Rémi Marenco in 2016.
+
+This is a wrapper for the command line tool of GlimmerHMM.
+https://ccb.jhu.edu/software/glimmerhmm/
+
+GlimmerHMM is a gene finder based on a Generalized Hidden Markov Model (GHMM). Although the gene finder conforms to the overall mathematical framework of a GHMM,
+additionally it incorporates splice site models adapted from the GeneSplicer program and a decision tree adapted from GlimmerM. It also utilizes
+Interpolated Markov Models for the coding and noncoding models.
+Currently, GlimmerHMM's GHMM structure includes introns of each phase, intergenic regions, and four types of exons (initial, internal, final, and single).
+
+Majoros, W.H., Pertea, M., and Salzberg, S.L. TigrScan and GlimmerHMM: two open-source ab initio eukaryotic gene-finders Bioinformatics 20 2878-2879.
+Pertea, M. and S. L. Salzberg (2002). "Computational gene finding in plants." Plant Molecular Biology 48(1-2): 39-48.
+The Arabidopsis Genome Initiative, (2000) "Analysis of the genome sequence of the flowering plant Arabidopsis thaliana", Nature. Dec 14; 408(6814):796-815.
+Pertea, M., S. L. Salzberg, et al. (2000). "Finding genes in Plasmodium falciparum." Nature 404(6773): 34; discussion 34-5.
+Salzberg, S. L., M. Pertea, et al. (1999). "Interpolated Markov models for eukaryotic gene finding." Genomics 59(1): 24-31.
+
+
+Installation
+============
+
+To install Glimmer3, please download GlimmerHMM from
+
+ftp://ccb.jhu.edu/pub/software/glimmerhmm
+
+and follow the installation instructions.
+To extract the glimmerHMM predicted genes, the GFF Parser from Brad Chapman (ttp://github.com/chapmanb/bcbb/tree/master/gff) was used and is included.
+
+To install the wrapper copy the glimmerHMM folder in the galaxy tools
+folder and modify the $GALAXY_ROOT/config/tool_conf.xml file to make the tool available to Galaxy.
+For example:
+
+```xml
+<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" />
+<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" />
+```
+
+You also need to use a trained organism by adding them as reference data in Galaxy:
+
+1. Add the *glimmer_hmm_trained_dir* data table to `tool_data_table_conf.xml` in `$GALAXY_ROOT/config/`:
+
+    ```xml
+    
+    <table name="glimmer_hmm_trained_dir" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/glimmer_hmm.loc" />
+    </table>
+    ```
+
+2. Add the `glimmer_hmm.loc` file referencing your trained organism, in `tool-data`.
+    You have a sample [`glimmer_hmm.loc.sample`] available in the repository to help you configuring it properly
+3. Add your data in the chosen folder at step 2. You can get them from the GlimmerHMM tar, `$GLIMMERHMM/trained_dir`
+
+History
+=======
+
+- v2.0 - Update by Rémi Marenco to make it work without having to modify the wrapper + add ability to select the species
+- v0.1 - Initial public release
+
+
+Wrapper Licence (MIT/BSD style)
+===============================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.

diff -r 000000000000 -r c9699375fcf6 readme.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.txt Wed Jul 13 10:55:48 2016 -0400

@@ -0,0 +1,65 @@
+Galaxy wrapper for RepeatMasker
+=====================================
+
+This wrapper is copyright 2012 by Björn Grüning.
+
+This is a wrapper for the command line tool of GlimmerHMM.
+http://www.cbcb.umd.edu/software/GlimmerHMM/
+
+GlimmerHMM is a new gene finder based on a Generalized Hidden Markov Model (GHMM). Although the gene finder conforms to the overall mathematical framework of a GHMM,
+additionally it incorporates splice site models adapted from the GeneSplicer program and a decision tree adapted from GlimmerM. It also utilizes
+Interpolated Markov Models for the coding and noncoding models.
+Currently, GlimmerHMM's GHMM structure includes introns of each phase, intergenic regions, and four types of exons (initial, internal, final, and single).
+
+Majoros, W.H., Pertea, M., and Salzberg, S.L. TigrScan and GlimmerHMM: two open-source ab initio eukaryotic gene-finders Bioinformatics 20 2878-2879.
+Pertea, M. and S. L. Salzberg (2002). "Computational gene finding in plants." Plant Molecular Biology 48(1-2): 39-48.
+The Arabidopsis Genome Initiative, (2000) "Analysis of the genome sequence of the flowering plant Arabidopsis thaliana", Nature. Dec 14; 408(6814):796-815.
+Pertea, M., S. L. Salzberg, et al. (2000). "Finding genes in Plasmodium falciparum." Nature 404(6773): 34; discussion 34-5.
+Salzberg, S. L., M. Pertea, et al. (1999). "Interpolated Markov models for eukaryotic gene finding." Genomics 59(1): 24-31.
+
+
+Installation
+============
+
+To install Glimmer3, please download GlimmerHMM from
+
+ftp://ftp.cbcb.umd.edu/pub/software/glimmerhmm/
+
+and follow the installation instructions.
+To extract the glimmerHMM predicted genes, the GFF Parser from Brad Chapman (ttp://github.com/chapmanb/bcbb/tree/master/gff) was used and is included.
+
+To install the wrapper copy the glimmerHMM folder in the galaxy tools
+folder and modify the tools_conf.xml file to make the tool available to Galaxy.
+For example:
+
+<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" />
+<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" />
+
+
+History
+=======
+
+v0.1 - Initial public release
+
+
+Wrapper Licence (MIT/BSD style)
+===============================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
+

diff -r 000000000000 -r c9699375fcf6 tool_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_conf.xml Wed Jul 13 10:55:48 2016 -0400

@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<toolbox>
+  <section name="Gene Prediction" id="gene_prediction">
+
+    <label text="GlimmerHMM (eukaryotes)" id="glimmerHMM_eukaryotes" />
+        <tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" />
+        <tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" />
+
+  </section>
+</toolbox>