changeset 0:0a15677c6668 default tip

Uploaded
author bjoern-gruening
date Wed, 11 Jan 2012 09:58:35 -0500
parents
children
files glimmerHMM/BCBio/GFF/GFFOutput.py glimmerHMM/BCBio/GFF/GFFParser.py glimmerHMM/BCBio/GFF/__init__.py glimmerHMM/BCBio/GFF/_utils.py glimmerHMM/BCBio/__init__.py glimmerHMM/__init__.py glimmerHMM/glimmerhmm_gff_to_sequence.py glimmerHMM/glimmerhmm_predict.xml glimmerHMM/glimmerhmm_tabular_to_sequence.py glimmerHMM/glimmerhmm_to_sequence.py glimmerHMM/glimmerhmm_to_sequence.xml readme.txt tool_conf.xml
diffstat 13 files changed, 1400 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/GFFOutput.py	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,169 @@
+"""Output Biopython SeqRecords and SeqFeatures to GFF3 format.
+
+The target format is GFF3, the current GFF standard:
+    http://www.sequenceontology.org/gff3.shtml
+"""
+import urllib
+
+class _IdHandler:
+    """Generate IDs for GFF3 Parent/Child relationships where they don't exist.
+    """
+    def __init__(self):
+        self._prefix = "biopygen"
+        self._counter = 1
+        self._seen_ids = []
+
+    def _generate_id(self, quals):
+        """Generate a unique ID not present in our existing IDs.
+        """
+        gen_id = self._get_standard_id(quals)
+        if gen_id is None:
+            while 1:
+                gen_id = "%s%s" % (self._prefix, self._counter)
+                if gen_id not in self._seen_ids:
+                    break
+                self._counter += 1
+        return gen_id
+
+    def _get_standard_id(self, quals):
+        """Retrieve standardized IDs from other sources like NCBI GenBank.
+
+        This tries to find IDs from known key/values when stored differently
+        than GFF3 specifications.
+        """
+        possible_keys = ["transcript_id", "protein_id"]
+        for test_key in possible_keys:
+            if quals.has_key(test_key):
+                cur_id = quals[test_key]
+                if isinstance(cur_id, tuple) or isinstance(cur_id, list):
+                    return cur_id[0]
+                else:
+                    return cur_id
+        return None
+
+    def update_quals(self, quals, has_children):
+        """Update a set of qualifiers, adding an ID if necessary.
+        """
+        cur_id = quals.get("ID", None)
+        # if we have an ID, record it
+        if cur_id:
+            if not isinstance(cur_id, list) and not isinstance(cur_id, tuple):
+                cur_id = [cur_id]
+            for add_id in cur_id:
+                self._seen_ids.append(add_id)
+        # if we need one and don't have it, create a new one
+        elif has_children:
+            new_id = self._generate_id(quals)
+            self._seen_ids.append(new_id)
+            quals["ID"] = [new_id]
+        return quals
+
+class GFF3Writer:
+    """Write GFF3 files starting with standard Biopython objects.
+    """
+    def __init__(self):
+        pass
+
+    def write(self, recs, out_handle):
+        """Write the provided records to the given handle in GFF3 format.
+        """
+        id_handler = _IdHandler()
+        self._write_header(out_handle)
+        for rec in recs:
+            self._write_rec(rec, out_handle)
+            self._write_annotations(rec.annotations, rec.id, out_handle)
+            for sf in rec.features:
+                sf = self._clean_feature(sf)
+                id_handler = self._write_feature(sf, rec.id, out_handle,
+                        id_handler)
+
+    def _clean_feature(self, feature):
+        quals = {}
+        for key, val in feature.qualifiers.items():
+            if not isinstance(val, (list, tuple)):
+                val = [val]
+            val = [str(x) for x in val]
+            quals[key] = val
+        feature.qualifiers = quals
+        clean_sub = [self._clean_feature(f) for f in feature.sub_features]
+        feature.sub_features = clean_sub
+        return feature
+
+    def _write_rec(self, rec, out_handle):
+        # if we have a SeqRecord, write out optional directive
+        if len(rec.seq) > 0:
+            out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq)))
+
+    def _write_feature(self, feature, rec_id, out_handle, id_handler,
+            parent_id=None):
+        """Write a feature with location information.
+        """
+        if feature.strand == 1:
+            strand = '+'
+        elif feature.strand == -1:
+            strand = '-'
+        else:
+            strand = '.'
+        # remove any standard features from the qualifiers
+        quals = feature.qualifiers.copy()
+        for std_qual in ["source", "score", "phase"]:
+            if quals.has_key(std_qual) and len(quals[std_qual]) == 1:
+                del quals[std_qual]
+        # add a link to a parent identifier if it exists
+        if parent_id:
+            if not quals.has_key("Parent"):
+                quals["Parent"] = []
+            quals["Parent"].append(parent_id)
+        quals = id_handler.update_quals(quals, len(feature.sub_features) > 0)
+        if feature.type:
+            ftype = feature.type
+        else:
+            ftype = "sequence_feature"
+        parts = [str(rec_id),
+                 feature.qualifiers.get("source", ["feature"])[0],
+                 ftype,
+                 str(feature.location.nofuzzy_start + 1), # 1-based indexing
+                 str(feature.location.nofuzzy_end),
+                 feature.qualifiers.get("score", ["."])[0],
+                 strand,
+                 str(feature.qualifiers.get("phase", ["."])[0]),
+                 self._format_keyvals(quals)]
+        out_handle.write("\t".join(parts) + "\n")
+        for sub_feature in feature.sub_features:
+            id_handler = self._write_feature(sub_feature, rec_id, out_handle,
+                    id_handler, quals["ID"][0])
+        return id_handler
+
+    def _format_keyvals(self, keyvals):
+        format_kvs = []
+        for key, values in keyvals.items():
+            key = key.strip()
+            format_vals = []
+            if not isinstance(values, list) or isinstance(values, tuple):
+                values = [values]
+            for val in values:
+                val = urllib.quote(str(val).strip())
+                if ((key and val) and val not in format_vals):
+                    format_vals.append(val)
+            format_kvs.append("%s=%s" % (key, ",".join(format_vals)))
+        return ";".join(format_kvs)
+
+    def _write_annotations(self, anns, rec_id, out_handle):
+        """Add annotations which refer to an entire sequence.
+        """
+        format_anns = self._format_keyvals(anns)
+        if format_anns:
+            parts = [rec_id, "annotation", "remark", ".", ".", ".", ".", ".",
+                     format_anns]
+            out_handle.write("\t".join(parts) + "\n")
+
+    def _write_header(self, out_handle):
+        """Write out standard header directives.
+        """
+        out_handle.write("##gff-version 3\n")
+
+def write(recs, out_handle):
+    """High level interface to write GFF3 files from SeqRecords and SeqFeatures.
+    """
+    writer = GFF3Writer()
+    return writer.write(recs, out_handle)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/GFFParser.py	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,835 @@
+"""Parse GFF files into features attached to Biopython SeqRecord objects.
+
+This deals with GFF3 formatted files, a tab delimited format for storing
+sequence features and annotations:
+
+http://www.sequenceontology.org/gff3.shtml
+
+It will also deal with older GFF versions (GTF/GFF2):
+
+http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml
+http://mblab.wustl.edu/GTF22.html
+
+The implementation utilizes map/reduce parsing of GFF using Disco. Disco
+(http://discoproject.org) is a Map-Reduce framework for Python utilizing
+Erlang for parallelization. The code works on a single processor without
+Disco using the same architecture.
+"""
+import os
+import copy
+import re
+import collections
+import urllib
+import itertools
+
+# Make defaultdict compatible with versions of python older than 2.4
+try:
+    collections.defaultdict
+except AttributeError:
+    import _utils
+    collections.defaultdict = _utils.defaultdict
+
+from Bio.Seq import Seq, UnknownSeq
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+from Bio import SeqIO
+
+def _gff_line_map(line, params):
+    """Map part of Map-Reduce; parses a line of GFF into a dictionary.
+
+    Given an input line from a GFF file, this:
+    - decides if the file passes our filtering limits
+    - if so:
+        - breaks it into component elements
+        - determines the type of attribute (flat, parent, child or annotation)
+        - generates a dictionary of GFF info which can be serialized as JSON
+    """
+    gff3_kw_pat = re.compile("\w+=")
+    def _split_keyvals(keyval_str):
+        """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
+
+        GFF3 has key value pairs like:
+          count=9;gene=amx-2;sequence=SAGE:aacggagccg
+        GFF2 and GTF have:           
+          Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
+          name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
+        """
+        quals = collections.defaultdict(list)
+        if keyval_str is None:
+            return quals
+        # ensembl GTF has a stray semi-colon at the end
+        if keyval_str[-1] == ';':
+            keyval_str = keyval_str[:-1]
+        # GFF2/GTF has a semi-colon with at least one space after it.
+        # It can have spaces on both sides; wormbase does this.
+        # GFF3 works with no spaces.
+        # Split at the first one we can recognize as working
+        parts = keyval_str.split(" ; ")
+        if len(parts) == 1:
+            parts = keyval_str.split("; ")
+            if len(parts) == 1:
+                parts = keyval_str.split(";")
+        # check if we have GFF3 style key-vals (with =)
+        is_gff2 = True
+        if gff3_kw_pat.match(parts[0]):
+            is_gff2 = False
+            key_vals = [p.split('=') for p in parts]
+        # otherwise, we are separated by a space with a key as the first item
+        else:
+            pieces = []
+            for p in parts:
+                # fix misplaced semi-colons in keys in some GFF2 files
+                if p and p[0] == ';':
+                    p = p[1:]
+                pieces.append(p.strip().split(" "))
+            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
+        for item in key_vals:
+            # standard in-spec items are key=value
+            if len(item) == 2:
+                key, val = item
+            # out-of-spec files can have just key values. We set an empty value
+            # which will be changed to true later to standardize.
+            else:
+                assert len(item) == 1, item
+                key = item[0]
+                val = ''
+            # remove quotes in GFF2 files
+            if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
+                val = val[1:-1] 
+            if val:
+                quals[key].extend([v for v in val.split(',') if v])
+            # if we don't have a value, make this a key=True/False style
+            # attribute
+            else:
+                quals[key].append('true')
+        for key, vals in quals.items():
+            quals[key] = [urllib.unquote(v) for v in vals]
+        return quals, is_gff2
+
+    def _nest_gff2_features(gff_parts):
+        """Provide nesting of GFF2 transcript parts with transcript IDs.
+
+        exons and coding sequences are mapped to a parent with a transcript_id
+        in GFF2. This is implemented differently at different genome centers
+        and this function attempts to resolve that and map things to the GFF3
+        way of doing them.
+        """
+        # map protein or transcript ids to a parent
+        for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
+            try:
+                gff_parts["quals"]["Parent"] = \
+                        gff_parts["quals"][transcript_id]
+                break
+            except KeyError:
+                pass
+        # case for WormBase GFF -- everything labelled as Transcript or CDS
+        for flat_name in ["Transcript", "CDS"]:
+            if gff_parts["quals"].has_key(flat_name):
+                # parent types
+                if gff_parts["type"] in [flat_name]:
+                    if not gff_parts["id"]:
+                        gff_parts["id"] = gff_parts["quals"][flat_name][0]
+                        gff_parts["quals"]["ID"] = [gff_parts["id"]]
+                # children types
+                elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
+                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+                        "start_codon"]:
+                    gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
+                break
+
+        return gff_parts
+
+    strand_map = {'+' : 1, '-' : -1, '?' : None, None: None}
+    line = line.strip()
+    if line[:2] == "##":
+        return [('directive', line[2:])]
+    elif line and line[0] != "#":
+        parts = line.split('\t')
+        should_do = True
+        if params.limit_info:
+            for limit_name, limit_values in params.limit_info.items():
+                cur_id = tuple([parts[i] for i in 
+                    params.filter_info[limit_name]])
+                if cur_id not in limit_values:
+                    should_do = False
+                    break
+        if should_do:
+            assert len(parts) >= 8, line
+            # not python2.4 compatible but easier to understand
+            #gff_parts = [(None if p == '.' else p) for p in parts]
+            gff_parts = []
+            for p in parts:
+                if p == ".":
+                    gff_parts.append(None)
+                else:
+                    gff_parts.append(p)
+            gff_info = dict()
+            # collect all of the base qualifiers for this item
+            if len(parts) > 8:
+                quals, is_gff2 = _split_keyvals(gff_parts[8])
+            else:
+                quals, is_gff2 = dict(), False
+            gff_info["is_gff2"] = is_gff2
+            if gff_parts[1]:
+                quals["source"].append(gff_parts[1])
+            if gff_parts[5]:
+                quals["score"].append(gff_parts[5])
+            if gff_parts[7]:
+                quals["phase"].append(gff_parts[7])
+            gff_info['quals'] = dict(quals)
+            gff_info['rec_id'] = gff_parts[0]
+            # if we are describing a location, then we are a feature
+            if gff_parts[3] and gff_parts[4]:
+                gff_info['location'] = [int(gff_parts[3]) - 1,
+                        int(gff_parts[4])]
+                gff_info['type'] = gff_parts[2]
+                gff_info['id'] = quals.get('ID', [''])[0]
+                gff_info['strand'] = strand_map.get(gff_parts[6], None)
+                if is_gff2:
+                    gff_info = _nest_gff2_features(gff_info)
+                # features that have parents need to link so we can pick up
+                # the relationship
+                if gff_info['quals'].has_key('Parent'):
+                    # check for self referential parent/child relationships
+                    # remove the ID, which is not useful
+                    for p in gff_info['quals']['Parent']:
+                        if p == gff_info['id']:
+                            gff_info['id'] = ''
+                            del gff_info['quals']['ID']
+                            break
+                    final_key = 'child'
+                elif gff_info['id']:
+                    final_key = 'parent'
+                # Handle flat features
+                else:
+                    final_key = 'feature'
+            # otherwise, associate these annotations with the full record
+            else:
+                final_key = 'annotation'
+            if params.jsonify:
+                return [(final_key, simplejson.dumps(gff_info))]
+            else:
+                return [(final_key, gff_info)]
+    return []
+
+def _gff_line_reduce(map_results, out, params):
+    """Reduce part of Map-Reduce; combines results of parsed features.
+    """
+    final_items = dict()
+    for gff_type, final_val in map_results:
+        if params.jsonify and gff_type not in ['directive']:
+            final_val = simplejson.loads(final_val)
+        try:
+            final_items[gff_type].append(final_val)
+        except KeyError:
+            final_items[gff_type] = [final_val]
+    for key, vals in final_items.items():
+        if params.jsonify:
+            vals = simplejson.dumps(vals)
+        out.add(key, vals)
+
+class _MultiIDRemapper:
+    """Provide an ID remapping for cases where a parent has a non-unique ID.
+
+    Real life GFF3 cases have non-unique ID attributes, which we fix here
+    by using the unique sequence region to assign children to the right
+    parent.
+    """
+    def __init__(self, base_id, all_parents):
+        self._base_id = base_id
+        self._parents = all_parents
+
+    def remap_id(self, feature_dict):
+        rstart, rend = feature_dict['location']
+        for index, parent in enumerate(self._parents):
+            pstart, pend = parent['location']
+            if rstart >= pstart and rend <= pend:
+                if index > 0:
+                    return ("%s_%s" % (self._base_id, index + 1))
+                else:
+                    return self._base_id
+        raise ValueError("Did not find remapped ID location: %s, %s, %s" % (
+                self._base_id, [p['location'] for p in self._parents],
+                feature_dict['location']))
+
+class _AbstractMapReduceGFF:
+    """Base class providing general GFF parsing for local and remote classes.
+
+    This class should be subclassed to provide a concrete class to parse
+    GFF under specific conditions. These classes need to implement
+    the _gff_process function, which returns a dictionary of SeqRecord
+    information.
+    """
+    def __init__(self, create_missing=True):
+        """Initialize GFF parser 
+
+        create_missing - If True, create blank records for GFF ids not in
+        the base_dict. If False, an error will be raised.
+        """
+        self._create_missing = create_missing
+        self._map_fn = _gff_line_map
+        self._reduce_fn = _gff_line_reduce
+        self._examiner = GFFExaminer()
+
+    def _gff_process(self, gff_files, limit_info, target_lines=None):
+        raise NotImplementedError("Derived class must define")
+
+    def parse(self, gff_files, base_dict=None, limit_info=None):
+        """Parse a GFF file, returning an iterator of SeqRecords.
+
+        limit_info - A dictionary specifying the regions of the GFF file
+        which should be extracted. This allows only relevant portions of a file
+        to be parsed.
+        
+        base_dict - A base dictionary of SeqRecord objects which may be
+        pre-populated with sequences and other features. The new features from
+        the GFF file will be added to this dictionary.
+        """
+        for rec in self.parse_in_parts(gff_files, base_dict, limit_info):
+            yield rec
+
+    def parse_in_parts(self, gff_files, base_dict=None, limit_info=None,
+            target_lines=None):
+        """Parse a region of a GFF file specified, returning info as generated.
+
+        target_lines -- The number of lines in the file which should be used
+        for each partial parse. This should be determined based on available
+        memory.
+        """
+        for results in self.parse_simple(gff_files, limit_info, target_lines):
+            if base_dict is None:
+                cur_dict = dict()
+            else:
+                cur_dict = copy.deepcopy(base_dict)
+            cur_dict = self._results_to_features(cur_dict, results)
+            all_ids = cur_dict.keys()
+            all_ids.sort()
+            for cur_id in all_ids:
+                yield cur_dict[cur_id]
+
+    def parse_simple(self, gff_files, limit_info=None, target_lines=1):
+        """Simple parse which does not build or nest features.
+
+        This returns a simple dictionary representation of each line in the
+        GFF file.
+        """
+        # gracefully handle a single file passed
+        if not isinstance(gff_files, (list, tuple)):
+            gff_files = [gff_files]
+        limit_info = self._normalize_limit_info(limit_info)
+        for results in self._gff_process(gff_files, limit_info, target_lines):
+            yield results
+       
+    def _normalize_limit_info(self, limit_info):
+        """Turn all limit information into tuples for identical comparisons.
+        """
+        final_limit_info = {}
+        if limit_info:
+            for key, values in limit_info.items():
+                final_limit_info[key] = []
+                for v in values:
+                    if isinstance(v, str):
+                        final_limit_info[key].append((v,))
+                    else:
+                        final_limit_info[key].append(tuple(v))
+        return final_limit_info
+    
+    def _results_to_features(self, base, results):
+        """Add parsed dictionaries of results to Biopython SeqFeatures.
+        """
+        base = self._add_annotations(base, results.get('annotation', []))
+        for feature in results.get('feature', []):
+            (_, base) = self._add_toplevel_feature(base, feature)
+        base = self._add_parent_child_features(base, results.get('parent', []),
+                results.get('child', []))
+        base = self._add_seqs(base, results.get('fasta', []))
+        base = self._add_directives(base, results.get('directive', []))
+        return base
+
+    def _add_directives(self, base, directives):
+        """Handle any directives or meta-data in the GFF file.
+
+        Relevant items are added as annotation meta-data to each record.
+        """
+        dir_keyvals = collections.defaultdict(list)
+        for directive in directives:
+            parts = directive.split()
+            if len(parts) > 1:
+                key = parts[0]
+                if len(parts) == 2:
+                    val = parts[1]
+                else:
+                    val = tuple(parts[1:])
+                dir_keyvals[key].append(val)
+        for key, vals in dir_keyvals.items():
+            for rec in base.values():
+                self._add_ann_to_rec(rec, key, vals)
+        return base
+
+    def _add_seqs(self, base, recs):
+        """Add sequence information contained in the GFF3 to records.
+        """
+        for rec in recs:
+            if base.has_key(rec.id):
+                base[rec.id].seq = rec.seq
+            else:
+                base[rec.id] = rec
+        return base
+    
+    def _add_parent_child_features(self, base, parents, children):
+        """Add nested features with parent child relationships.
+        """
+        multi_remap = self._identify_dup_ids(parents)
+        # add children features
+        children_prep = collections.defaultdict(list)
+        for child_dict in children:
+            child_feature = self._get_feature(child_dict)
+            for pindex, pid in enumerate(child_feature.qualifiers['Parent']):
+                if multi_remap.has_key(pid):
+                    pid = multi_remap[pid].remap_id(child_dict)
+                    child_feature.qualifiers['Parent'][pindex] = pid
+                children_prep[pid].append((child_dict['rec_id'],
+                    child_feature))
+        children = dict(children_prep)
+        # add children to parents that exist
+        for cur_parent_dict in parents:
+            cur_id = cur_parent_dict['id']
+            if multi_remap.has_key(cur_id):
+                cur_parent_dict['id'] = multi_remap[cur_id].remap_id(
+                        cur_parent_dict)
+            cur_parent, base = self._add_toplevel_feature(base, cur_parent_dict)
+            cur_parent, children = self._add_children_to_parent(cur_parent,
+                    children)
+        # create parents for children without them (GFF2 or split/bad files)
+        while len(children) > 0:
+            parent_id, cur_children = itertools.islice(children.items(),
+                    1).next()
+            # one child, do not nest it
+            if len(cur_children) == 1:
+                rec_id, child = cur_children[0]
+                loc = (child.location.nofuzzy_start, child.location.nofuzzy_end)
+                rec, base = self._get_rec(base,
+                        dict(rec_id=rec_id, location=loc))
+                rec.features.append(child)
+                del children[parent_id]
+            else:
+                cur_parent, base = self._add_missing_parent(base, parent_id,
+                        cur_children)
+                cur_parent, children = self._add_children_to_parent(cur_parent,
+                        children)
+        return base
+
+    def _identify_dup_ids(self, parents):
+        """Identify duplicated ID attributes in potential nested parents.
+
+        According to the GFF3 spec ID attributes are supposed to be unique
+        for a file, but this is not always true in practice. This looks
+        for duplicates, and provides unique IDs sorted by locations.
+        """
+        multi_ids = collections.defaultdict(list)
+        for parent in parents:
+            multi_ids[parent['id']].append(parent)
+        multi_ids = [(mid, parents) for (mid, parents) in multi_ids.items()
+                if len(parents) > 1]
+        multi_remap = dict()
+        for mid, parents in multi_ids:
+            multi_remap[mid] = _MultiIDRemapper(mid, parents)
+        return multi_remap
+
+    def _add_children_to_parent(self, cur_parent, children):
+        """Recursively add children to parent features.
+        """
+        if children.has_key(cur_parent.id):
+            cur_children = children[cur_parent.id]
+            for rec_id, cur_child in cur_children:
+                cur_child, children = self._add_children_to_parent(cur_child,
+                        children)
+                cur_parent.location_operator = "join"
+                cur_parent.sub_features.append(cur_child)
+            del children[cur_parent.id]
+        return cur_parent, children
+
+    def _add_annotations(self, base, anns):
+        """Add annotation data from the GFF file to records.
+        """
+        # add these as a list of annotations, checking not to overwrite
+        # current values
+        for ann in anns:
+            rec, base = self._get_rec(base, ann)
+            for key, vals in ann['quals'].items():
+                self._add_ann_to_rec(rec, key, vals)
+        return base
+
+    def _add_ann_to_rec(self, rec, key, vals):
+        """Add a key/value annotation to the given SeqRecord.
+        """
+        if rec.annotations.has_key(key):
+            try:
+                rec.annotations[key].extend(vals)
+            except AttributeError:
+                rec.annotations[key] = [rec.annotations[key]] + vals
+        else:
+            rec.annotations[key] = vals
+
+    def _get_rec(self, base, info_dict):
+        """Retrieve a record to add features to.
+        """
+        max_loc = info_dict.get('location', (0, 1))[1]
+        try:
+            cur_rec = base[info_dict['rec_id']]
+            # update generated unknown sequences with the expected maximum length
+            if isinstance(cur_rec.seq, UnknownSeq):
+                cur_rec.seq._length = max([max_loc, cur_rec.seq._length])
+            return cur_rec, base
+        except KeyError:
+            if self._create_missing:
+                new_rec = SeqRecord(UnknownSeq(max_loc), info_dict['rec_id'])
+                base[info_dict['rec_id']] = new_rec
+                return new_rec, base
+            else:
+                raise
+
+    def _add_missing_parent(self, base, parent_id, cur_children):
+        """Add a new feature that is missing from the GFF file.
+        """
+        base_rec_id = list(set(c[0] for c in cur_children))
+        assert len(base_rec_id) == 1
+        feature_dict = dict(id=parent_id, strand=None,
+                type="inferred_parent", quals=dict(ID=[parent_id]),
+                rec_id=base_rec_id[0])
+        coords = [(c.location.nofuzzy_start, c.location.nofuzzy_end) 
+                for r, c in cur_children]
+        feature_dict["location"] = (min([c[0] for c in coords]),
+                max([c[1] for c in coords]))
+        return self._add_toplevel_feature(base, feature_dict)
+
+    def _add_toplevel_feature(self, base, feature_dict):
+        """Add a toplevel non-nested feature to the appropriate record.
+        """
+        new_feature = self._get_feature(feature_dict)
+        rec, base = self._get_rec(base, feature_dict)
+        rec.features.append(new_feature)
+        return new_feature, base
+
+    def _get_feature(self, feature_dict):
+        """Retrieve a Biopython feature from our dictionary representation.
+        """
+        location = FeatureLocation(*feature_dict['location'])
+        new_feature = SeqFeature(location, feature_dict['type'],
+                id=feature_dict['id'], strand=feature_dict['strand'])
+        new_feature.qualifiers = feature_dict['quals']
+        return new_feature
+
+    def _parse_fasta(self, in_handle):
+        """Parse FASTA sequence information contained in the GFF3 file.
+        """
+        return list(SeqIO.parse(in_handle, "fasta"))
+
+class _GFFParserLocalOut:
+    """Provide a collector for local GFF MapReduce file parsing.
+    """
+    def __init__(self, smart_breaks=False):
+        self._items = dict()
+        self._smart_breaks = smart_breaks
+        self._missing_keys = collections.defaultdict(int)
+        self._last_parent = None
+        self.can_break = True
+        self.num_lines = 0
+
+    def add(self, key, vals):
+        if self._smart_breaks:
+            # if we are not GFF2 we expect parents and break
+            # based on not having missing ones
+            if key == 'directive':
+                if vals[0] == '#':
+                    self.can_break = True
+                self._last_parent = None
+            elif not vals[0].get("is_gff2", False):
+                self._update_missing_parents(key, vals)
+                self.can_break = (len(self._missing_keys) == 0)
+            # break when we are done with stretches of child features
+            elif key != 'child':
+                self.can_break = True
+                self._last_parent = None
+            # break when we have lots of child features in a row
+            # and change between parents
+            else:
+                cur_parent = vals[0]["quals"]["Parent"][0]
+                if (self._last_parent):
+                    self.can_break = (cur_parent != self._last_parent)
+                self._last_parent = cur_parent
+        self.num_lines += 1
+        try:
+            self._items[key].extend(vals)
+        except KeyError:
+            self._items[key] = vals
+
+    def _update_missing_parents(self, key, vals):
+        # smart way of deciding if we can break this.
+        # if this is too much, can go back to not breaking in the
+        # middle of children
+        if key in ["child"]:
+            for val in vals:
+                for p_id in val["quals"]["Parent"]:
+                    self._missing_keys[p_id] += 1
+        for val in vals:
+            try:
+                del self._missing_keys[val["quals"]["ID"][0]]
+            except KeyError:
+                pass
+
+    def has_items(self):
+        return len(self._items) > 0
+
+    def get_results(self):
+        self._last_parent = None
+        return self._items
+
+class GFFParser(_AbstractMapReduceGFF):
+    """Local GFF parser providing standardized parsing of GFF3 and GFF2 files.
+    """
+    def __init__(self, line_adjust_fn=None, create_missing=True):
+        _AbstractMapReduceGFF.__init__(self, create_missing=create_missing)
+        self._line_adjust_fn = line_adjust_fn
+    
+    def _gff_process(self, gff_files, limit_info, target_lines):
+        """Process GFF addition without any parallelization.
+
+        In addition to limit filtering, this accepts a target_lines attribute
+        which provides a number of lines to parse before returning results.
+        This allows partial parsing of a file to prevent memory issues.
+        """
+        line_gen = self._file_line_generator(gff_files)
+        for out in self._lines_to_out_info(line_gen, limit_info, target_lines):
+            yield out
+
+    def _file_line_generator(self, gff_files):
+        """Generate single lines from a set of GFF files.
+        """
+        for gff_file in gff_files:
+            if hasattr(gff_file, "read"):
+                need_close = False
+                in_handle = gff_file
+            else:
+                need_close = True
+                in_handle = open(gff_file)
+            found_seqs = False
+            while 1:
+                line = in_handle.readline()
+                if not line:
+                    break
+                yield line
+            if need_close:
+                in_handle.close()
+
+    def _lines_to_out_info(self, line_iter, limit_info=None,
+            target_lines=None):
+        """Generate SeqRecord and SeqFeatures from GFF file lines.
+        """
+        params = self._examiner._get_local_params(limit_info)
+        out_info = _GFFParserLocalOut((target_lines is not None and
+                target_lines > 1))
+        found_seqs = False
+        for line in line_iter:
+            results = self._map_fn(line, params)
+            if self._line_adjust_fn and results:
+                if results[0][0] not in ['directive']:
+                    results = [(results[0][0],
+                        self._line_adjust_fn(results[0][1]))]
+            self._reduce_fn(results, out_info, params)
+            if (target_lines and out_info.num_lines >= target_lines and
+                    out_info.can_break):
+                yield out_info.get_results()
+                out_info = _GFFParserLocalOut((target_lines is not None and
+                        target_lines > 1))
+            if (results and results[0][0] == 'directive' and 
+                    results[0][1] == 'FASTA'):
+                found_seqs = True
+                break
+
+        class FakeHandle:
+            def __init__(self, line_iter):
+                self._iter = line_iter
+            def read(self):
+                return "".join(l for l in self._iter)
+            def readline(self):
+                try:
+                    return self._iter.next()
+                except StopIteration:
+                    return ""
+
+        if found_seqs:
+            fasta_recs = self._parse_fasta(FakeHandle(line_iter))
+            out_info.add('fasta', fasta_recs)
+        if out_info.has_items():
+            yield out_info.get_results()
+
+class DiscoGFFParser(_AbstractMapReduceGFF):
+    """GFF Parser with parallelization through Disco (http://discoproject.org.
+    """
+    def __init__(self, disco_host, create_missing=True):
+        """Initialize parser.
+        
+        disco_host - Web reference to a Disco host which will be used for
+        parallelizing the GFF reading job.
+        """
+        _AbstractMapReduceGFF.__init__(self, create_missing=create_missing)
+        self._disco_host = disco_host
+
+    def _gff_process(self, gff_files, limit_info, target_lines=None):
+        """Process GFF addition, using Disco to parallelize the process.
+        """
+        assert target_lines is None, "Cannot split parallelized jobs"
+        # make these imports local; only need them when using disco
+        import simplejson
+        import disco
+        # absolute path names unless they are special disco files 
+        full_files = []
+        for f in gff_files:
+            if f.split(":")[0] != "disco":
+                full_files.append(os.path.abspath(f))
+            else:
+                full_files.append(f)
+        results = disco.job(self._disco_host, name="gff_reader",
+                input=full_files,
+                params=disco.Params(limit_info=limit_info, jsonify=True,
+                    filter_info=self._examiner._filter_info),
+                required_modules=["simplejson", "collections", "re"],
+                map=self._map_fn, reduce=self._reduce_fn)
+        processed = dict()
+        for out_key, out_val in disco.result_iterator(results):
+            processed[out_key] = simplejson.loads(out_val)
+        yield processed
+
+def parse(gff_files, base_dict=None, limit_info=None, target_lines=None):
+    """High level interface to parse GFF files into SeqRecords and SeqFeatures.
+    """
+    parser = GFFParser()
+    for rec in parser.parse_in_parts(gff_files, base_dict, limit_info,
+            target_lines):
+        yield rec
+
+def parse_simple(gff_files, limit_info=None):
+    """Parse GFF files as line by line dictionary of parts.
+    """
+    parser = GFFParser()
+    for rec in parser.parse_simple(gff_files, limit_info=limit_info):
+        yield rec["child"][0]
+
+def _file_or_handle(fn):
+    """Decorator to handle either an input handle or a file.
+    """
+    def _file_or_handle_inside(*args, **kwargs):
+        in_file = args[1]
+        if hasattr(in_file, "read"):
+            need_close = False
+            in_handle = in_file
+        else:
+            need_close = True
+            in_handle = open(in_file)
+        args = (args[0], in_handle) + args[2:]
+        out = fn(*args, **kwargs)
+        if need_close:
+            in_handle.close()
+        return out
+    return _file_or_handle_inside
+
+class GFFExaminer:
+    """Provide high level details about a GFF file to refine parsing.
+
+    GFF is a spec and is provided by many different centers. Real life files
+    will present the same information in slightly different ways. Becoming
+    familiar with the file you are dealing with is the best way to extract the
+    information you need. This class provides high level summary details to
+    help in learning.
+    """
+    def __init__(self):
+        self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2],
+                gff_source = [1], gff_type = [2])
+    
+    def _get_local_params(self, limit_info=None):
+        class _LocalParams:
+            def __init__(self):
+                self.jsonify = False
+        params = _LocalParams()
+        params.limit_info = limit_info
+        params.filter_info = self._filter_info
+        return params
+    
+    @_file_or_handle
+    def available_limits(self, gff_handle):
+        """Return dictionary information on possible limits for this file.
+
+        This returns a nested dictionary with the following structure:
+        
+        keys -- names of items to filter by
+        values -- dictionary with:
+            keys -- filter choice
+            value -- counts of that filter in this file
+
+        Not a parallelized map-reduce implementation.
+        """
+        cur_limits = dict()
+        for filter_key in self._filter_info.keys():
+            cur_limits[filter_key] = collections.defaultdict(int)
+        for line in gff_handle:
+            # when we hit FASTA sequences, we are done with annotations
+            if line.startswith("##FASTA"):
+                break
+            # ignore empty and comment lines
+            if line.strip() and line.strip()[0] != "#":
+                parts = [p.strip() for p in line.split('\t')]
+                assert len(parts) == 9, line
+                for filter_key, cur_indexes in self._filter_info.items():
+                    cur_id = tuple([parts[i] for i in cur_indexes])
+                    cur_limits[filter_key][cur_id] += 1
+        # get rid of the default dicts
+        final_dict = dict()
+        for key, value_dict in cur_limits.items():
+            if len(key) == 1:
+                key = key[0]
+            final_dict[key] = dict(value_dict)
+        gff_handle.close()
+        return final_dict
+
+    @_file_or_handle
+    def parent_child_map(self, gff_handle):
+        """Provide a mapping of parent to child relationships in the file.
+
+        Returns a dictionary of parent child relationships:
+
+        keys -- tuple of (source, type) for each parent
+        values -- tuple of (source, type) as children of that parent
+        
+        Not a parallelized map-reduce implementation.
+        """
+        # collect all of the parent and child types mapped to IDs
+        parent_sts = dict()
+        child_sts = collections.defaultdict(list)
+        for line in gff_handle:
+            # when we hit FASTA sequences, we are done with annotations
+            if line.startswith("##FASTA"):
+                break
+            if line.strip():
+                line_type, line_info = _gff_line_map(line,
+                        self._get_local_params())[0]
+                if (line_type == 'parent' or (line_type == 'child' and
+                        line_info['id'])):
+                    parent_sts[line_info['id']] = (
+                            line_info['quals']['source'][0], line_info['type'])
+                if line_type == 'child':
+                    for parent_id in line_info['quals']['Parent']:
+                        child_sts[parent_id].append((
+                            line_info['quals']['source'][0], line_info['type']))
+        #print parent_sts, child_sts
+        # generate a dictionary of the unique final type relationships
+        pc_map = collections.defaultdict(list)
+        for parent_id, parent_type in parent_sts.items():
+            for child_type in child_sts[parent_id]:
+                pc_map[parent_type].append(child_type)
+        pc_final_map = dict()
+        for ptype, ctypes in pc_map.items():
+            unique_ctypes = list(set(ctypes))
+            unique_ctypes.sort()
+            pc_final_map[ptype] = unique_ctypes
+        return pc_final_map
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/__init__.py	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,4 @@
+"""Top level of GFF parsing providing shortcuts for useful classes.
+"""
+from GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple
+from GFFOutput import GFF3Writer, write
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/_utils.py	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,37 @@
+class defaultdict(dict):
+    """Back compatible defaultdict: http://code.activestate.com/recipes/523034/
+    """
+    def __init__(self, default_factory=None, *a, **kw):
+        if (default_factory is not None and
+            not hasattr(default_factory, '__call__')):
+            raise TypeError('first argument must be callable')
+        dict.__init__(self, *a, **kw)
+        self.default_factory = default_factory
+    def __getitem__(self, key):
+        try:
+            return dict.__getitem__(self, key)
+        except KeyError:
+            return self.__missing__(key)
+    def __missing__(self, key):
+        if self.default_factory is None:
+            raise KeyError(key)
+        self[key] = value = self.default_factory()
+        return value
+    def __reduce__(self):
+        if self.default_factory is None:
+            args = tuple()
+        else:
+            args = self.default_factory,
+        return type(self), args, None, None, self.items()
+    def copy(self):
+        return self.__copy__()
+    def __copy__(self):
+        return type(self)(self.default_factory, self)
+    def __deepcopy__(self, memo):
+        import copy
+        return type(self)(self.default_factory,
+                          copy.deepcopy(self.items()))
+    def __repr__(self):
+        return 'defaultdict(%s, %s)' % (self.default_factory,
+                                        dict.__repr__(self))
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/__init__.py	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,1 @@
+__version__="0.1"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/__init__.py	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,3 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_gff_to_sequence.py	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+"""Convert GlimmerHMM GFF3 gene predictions into protein sequences.
+
+This works with the GlimmerHMM GFF3 output format:
+
+##gff-version 3
+##sequence-region Contig5.15 1 47390
+Contig5.15      GlimmerHMM      mRNA    323     325     .       +       .       ID=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1
+Contig5.15      GlimmerHMM      CDS     323     325     .       +       0       ID=Contig5.15.cds1.1;Parent=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1;Note=final-exon
+
+http://www.cbcb.umd.edu/software/GlimmerHMM/
+
+Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmergff_to_proteins.py
+
+Usage:
+    glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True>
+"""
+import sys
+import os
+import operator
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+from BCBio import GFF
+
+def main(glimmer_file, ref_file, out_file, to_protein):
+    with open(ref_file) as in_handle:
+        ref_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))
+
+    base, ext = os.path.splitext(glimmer_file)
+
+    with open(out_file, "w") as out_handle:
+        SeqIO.write(protein_recs(glimmer_file, ref_recs, to_protein), out_handle, "fasta")
+
+def protein_recs(glimmer_file, ref_recs, to_protein):
+    """Generate protein records from GlimmerHMM gene predictions.
+    """
+    with open(glimmer_file) as in_handle:
+        for rec in glimmer_predictions(in_handle, ref_recs):
+            for feature in rec.features:
+                seq_exons = []
+                for cds in feature.sub_features:
+                    seq_exons.append(rec.seq[
+                        cds.location.nofuzzy_start:
+                        cds.location.nofuzzy_end])
+                gene_seq = reduce(operator.add, seq_exons)
+                if feature.strand == -1:
+                    gene_seq = gene_seq.reverse_complement()
+
+                if to_protein:
+                    yield SeqRecord(gene_seq.translate(), feature.qualifiers["ID"][0], "", "")
+                else:
+                    yield SeqRecord(gene_seq, feature.qualifiers["ID"][0], "", "")
+
+def glimmer_predictions(in_handle, ref_recs):
+    """Parse Glimmer output, generating SeqRecord and SeqFeatures for predictions
+    """
+    for rec in GFF.parse(in_handle, target_lines=1000, base_dict=ref_recs):
+        yield rec
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print __doc__
+        sys.exit()
+    main(*sys.argv[1:])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_predict.xml	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,65 @@
+<tool id="glimmerhmm_predict" name="GlimmerHMM" version="0.1">
+	<description>Predict ORFs in eukaryotic genomes</description>
+	<command>glimmerhmm 
+	    $input /home/galaxy/lib/glimmer_trainings/train_crypto 
+	    -o $output 
+	    -g 
+	    $svm_splice_prediction 
+	    $partial_gene
+	    #if str($top_n_predictions) != "-1":
+	        -n $top_n_predictions
+        #end if
+	    2> /dev/null</command>
+	<inputs>
+		<param name="input" type="data" format="fasta" label="Genome Sequence"/>
+		<param name="partial_gene" type="boolean" label="Don't make partial gene predictions" truevalue="-f" falsevalue="" checked="false" />
+		<param name="svm_splice_prediction" type="boolean" label="Don't use svm splice site predictions" truevalue="-v" falsevalue="" checked="false" />
+		<param name="top_n_predictions" type="integer" label="top n best predictions, -1 means infinite" value="-1"/>
+	</inputs>
+	<outputs>
+        <data format="tabular" name="output">
+            <change_format>
+                <when input="gff" value="-g" format="gff" />
+            </change_format>
+        </data>
+    </outputs>
+	<help>
+
+**What it does**
+
+GlimmerHMM is a new gene finder based on a Generalized Hidden Markov Model (GHMM).
+Although the gene finder conforms to the overall mathematical framework of a GHMM,
+additionally it incorporates splice site models adapted from the GeneSplicer program and a
+decision tree adapted from GlimmerM. It also utilizes Interpolated Markov Models for the
+coding and noncoding models . Currently, GlimmerHMM's GHMM structure includes introns of each phase,
+intergenic regions, and four types of exons (initial, internal, final, and single).
+A basic user manual can be consulted here.
+
+-----	
+
+**Example**
+
+Suppose you have the following DNA formatted sequences::
+
+    >SQ   Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
+    cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
+    ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
+    cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
+    cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
+    ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg
+
+Running this tool will produce this::
+
+    ##gff-version 3
+    ##sequence-region ConsensusfromCH236920mapping 1 4148552
+    ConsensusfromCH236920mapping  GlimmerHMM  mRNA  1       122     .   +   .   ID=ConsensusfromCH236920mapping.path1.gene1;Name=ConsensusfromCH236920mapping.path1.gene1
+    ConsensusfromCH236920mapping  GlimmerHMM  CDS   1       122     .   +   0   ID=ConsensusfromCH236920mapping.cds1.1;
+    ConsensusfromCH236920mapping  GlimmerHMM  mRNA  14066   15205   .   -   .   ID=ConsensusfromCH236920mapping.path1.gene2;Name=ConsensusfromCH236920mapping.path1.gene2
+    ConsensusfromCH236920mapping  GlimmerHMM  CDS   14066   15034   .   -   0   ID=ConsensusfromCH236920mapping.cds2.1;
+    ConsensusfromCH236920mapping  GlimmerHMM  CDS   15137   15205   .   -   0   ID=ConsensusfromCH236920mapping.cds2.2;
+    ConsensusfromCH236920mapping  GlimmerHMM  mRNA  19910   24210   .   -   .   ID=ConsensusfromCH236920mapping.path1.gene3;Name=ConsensusfromCH236920mapping.path1.gene3
+
+
+
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_tabular_to_sequence.py	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+"""Convert GlimmerHMM gene predictions into protein sequences.
+
+This works with the GlimmerHMM specific output format:
+
+  12    1  +  Initial       10748      10762       15
+  12    2  +  Internal      10940      10971       32
+  12    3  +  Internal      11035      11039        5
+  12    4  +  Internal      11072      11110       39
+  12    5  +  Internal      11146      11221       76
+  12    6  +  Terminal      11265      11388      124
+
+http://www.cbcb.umd.edu/software/GlimmerHMM/
+
+Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py
+
+Usage:
+    glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True>
+"""
+
+import sys
+import os
+import operator
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+def main(glimmer_file, ref_file, out_file, to_protein):
+    with open(ref_file) as in_handle:
+        ref_rec = SeqIO.read(in_handle, "fasta")
+
+    base, ext = os.path.splitext(glimmer_file)
+
+    with open(out_file, "w") as out_handle:
+        SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle, "fasta")
+
+def protein_recs(glimmer_file, ref_rec, to_protein):
+    """Generate protein records
+    """
+    with open(glimmer_file) as in_handle:
+        for gene_num, exons, strand in glimmer_predictions(in_handle):
+            seq_exons = []
+            for start, end in exons:
+                seq_exons.append(ref_rec.seq[start:end])
+            gene_seq = reduce(operator.add, seq_exons)
+            if strand == '-':
+                gene_seq = gene_seq.reverse_complement()
+            if to_protein:
+                yield SeqRecord(gene_seq.translate(), gene_num, "", "")
+            else:
+                yield SeqRecord(gene_seq, gene_num, "", "")
+
+def glimmer_predictions(in_handle):
+    """Parse Glimmer output, generating a exons and strand for each prediction.
+    """
+    # read the header
+    while 1:
+        line = in_handle.readline()
+        if line.startswith("   #    #"):
+            break
+    in_handle.readline()
+    # read gene predictions one at a time
+    cur_exons, cur_gene_num, cur_strand = ([], None, None)
+    while 1:
+        line = in_handle.readline()
+        if not line:
+            break
+        parts = line.strip().split()
+        # new exon
+        if len(parts) == 0:
+            yield cur_gene_num, cur_exons, cur_strand
+            cur_exons, cur_gene_num, cur_strand = ([], None, None)
+        else:
+            this_gene_num = parts[0]
+            this_strand = parts[2]
+            this_start = int(parts[4]) - 1 # 1 based
+            this_end = int(parts[5])
+            if cur_gene_num is None:
+                cur_gene_num = this_gene_num
+                cur_strand = this_strand
+            else:
+                assert cur_gene_num == this_gene_num
+                assert cur_strand == this_strand
+            cur_exons.append((this_start, this_end))
+    if len(cur_exons) > 0:
+        yield cur_gene_num, cur_exons, cur_strand
+
+if __name__ == "__main__":
+    if len(sys.argv) != 5:
+        print __doc__
+        sys.exit()
+    main(*sys.argv[1:])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_to_sequence.py	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+"""Convert GlimmerHMM gene predictions into protein sequences.
+
+This works with both the GFF and the costumn Tabular Output.
+And is only a wrapper to call the appropiate scripts.
+
+Usage:
+    glimmerhmm_to_sequence.py <glimmer output> <ref fasta> <output file> <format> <protein>
+
+"""
+import sys
+import os
+import glimmerhmm_tabular_to_sequence
+import glimmerhmm_gff_to_sequence
+
+def main(glimmer_file, ref_file, out_file, to_protein = False):
+    if to_protein == 'True':
+        to_protein = True
+    else:
+        to_protein = False
+
+    glimmerhmm_gff_to_sequence.main(glimmer_file, ref_file, out_file, to_protein)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 5:
+        print __doc__
+        sys.exit()
+    main(*sys.argv[1:])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_to_sequence.xml	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,24 @@
+<tool id="glimmerhmm_to_sequence" name="GlimmerHMM to Sequence" version="0.1">
+    <description>converter</description>
+    <command interpreter='python'>glimmerhmm_to_sequence.py 
+    $glimmerhmm_input
+    $genome_input
+    $output
+    $to_protein
+    </command>
+    <inputs>
+        <param name="genome_input" type="data" format="fasta" label="Genome Sequence" />
+        <param name="glimmerhmm_input" type="data" format="gff" label="GlimmerHmm result File" />
+        <param name="to_protein" type="boolean" label="Convert to protein sequence" truevalue="True" falsevalue="False" checked="false" />
+    </inputs>
+    <outputs>
+        <data name="output" format="fasta" />
+    </outputs>
+    <help>
+
+**What it does**
+
+It converts the GlimmerHMM GFF output to a Fasta File.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.txt	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,65 @@
+Galaxy wrapper for RepeatMasker
+=====================================
+
+This wrapper is copyright 2012 by Björn Grüning.
+
+This is a wrapper for the command line tool of GlimmerHMM.
+http://www.cbcb.umd.edu/software/GlimmerHMM/
+
+GlimmerHMM is a new gene finder based on a Generalized Hidden Markov Model (GHMM). Although the gene finder conforms to the overall mathematical framework of a GHMM,
+additionally it incorporates splice site models adapted from the GeneSplicer program and a decision tree adapted from GlimmerM. It also utilizes
+Interpolated Markov Models for the coding and noncoding models.
+Currently, GlimmerHMM's GHMM structure includes introns of each phase, intergenic regions, and four types of exons (initial, internal, final, and single).
+
+Majoros, W.H., Pertea, M., and Salzberg, S.L. TigrScan and GlimmerHMM: two open-source ab initio eukaryotic gene-finders Bioinformatics 20 2878-2879.
+Pertea, M. and S. L. Salzberg (2002). "Computational gene finding in plants." Plant Molecular Biology 48(1-2): 39-48.
+The Arabidopsis Genome Initiative, (2000) "Analysis of the genome sequence of the flowering plant Arabidopsis thaliana", Nature. Dec 14; 408(6814):796-815.
+Pertea, M., S. L. Salzberg, et al. (2000). "Finding genes in Plasmodium falciparum." Nature 404(6773): 34; discussion 34-5.
+Salzberg, S. L., M. Pertea, et al. (1999). "Interpolated Markov models for eukaryotic gene finding." Genomics 59(1): 24-31. 
+
+
+Installation
+============
+
+To install Glimmer3, please download GlimmerHMM from 
+
+ftp://ftp.cbcb.umd.edu/pub/software/glimmerhmm/
+
+and follow the installation instructions.
+To extract the glimmerHMM predicted genes, the GFF Parser from Brad Chapman (ttp://github.com/chapmanb/bcbb/tree/master/gff) was used and is included.
+
+To install the wrapper copy the glimmerHMM folder in the galaxy tools
+folder and modify the tools_conf.xml file to make the tool available to Galaxy.
+For example:
+
+<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" />
+<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" />
+
+
+History
+=======
+
+v0.1 - Initial public release
+
+
+Wrapper Licence (MIT/BSD style)
+===============================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_conf.xml	Wed Jan 11 09:58:35 2012 -0500
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<toolbox>
+  <section name="Gene Prediction" id="gene_prediction">
+
+    <label text="GlimmerHMM (eukaryotes)" id="glimmerHMM_eukaryotes" />
+        <tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" />
+        <tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" />
+
+  </section>
+</toolbox>