changeset 0:c9699375fcf6 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/glimmer_hmm commit 0dc67759bcbdf5a8a285ded9ba751340d741fe63

author    bgruening
date      Wed, 13 Jul 2016 10:55:48 -0400
parents   (none)
children  4da91bb244dc
files     glimmerHMM/BCBio/GFF/GFFOutput.py glimmerHMM/BCBio/GFF/GFFParser.py glimmerHMM/BCBio/GFF/__init__.py glimmerHMM/BCBio/GFF/_utils.py glimmerHMM/BCBio/__init__.py glimmerHMM/__init__.py glimmerHMM/glimmerhmm_gff_to_sequence.py glimmerHMM/glimmerhmm_predict.xml glimmerHMM/glimmerhmm_tabular_to_sequence.py glimmerHMM/glimmerhmm_to_sequence.py glimmerHMM/glimmerhmm_to_sequence.xml glimmer_hmm.loc.sample readme.md readme.txt tool_conf.xml
diffstat  15 files changed, 1504 insertions(+), 0 deletions(-)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/GFFOutput.py Wed Jul 13 10:55:48 2016 -0400

"""Output Biopython SeqRecords and SeqFeatures to GFF3 format.

The target format is GFF3, the current GFF standard:
    http://www.sequenceontology.org/gff3.shtml
"""
import urllib

class _IdHandler:
    """Generate IDs for GFF3 Parent/Child relationships where they don't exist.
    """
    def __init__(self):
        self._prefix = "biopygen"
        self._counter = 1
        self._seen_ids = []

    def _generate_id(self, quals):
        """Generate a unique ID not present in our existing IDs.
        """
        gen_id = self._get_standard_id(quals)
        if gen_id is None:
            while 1:
                gen_id = "%s%s" % (self._prefix, self._counter)
                if gen_id not in self._seen_ids:
                    break
                self._counter += 1
        return gen_id

    def _get_standard_id(self, quals):
        """Retrieve standardized IDs from other sources like NCBI GenBank.

        This tries to find IDs from known key/values when stored differently
        than GFF3 specifications.
        """
        possible_keys = ["transcript_id", "protein_id"]
        for test_key in possible_keys:
            if quals.has_key(test_key):
                cur_id = quals[test_key]
                if isinstance(cur_id, tuple) or isinstance(cur_id, list):
                    return cur_id[0]
                else:
                    return cur_id
        return None

    def update_quals(self, quals, has_children):
        """Update a set of qualifiers, adding an ID if necessary.
        """
        cur_id = quals.get("ID", None)
        # if we have an ID, record it
        if cur_id:
            if not isinstance(cur_id, list) and not isinstance(cur_id, tuple):
                cur_id = [cur_id]
            for add_id in cur_id:
                self._seen_ids.append(add_id)
        # if we need one and don't have it, create a new one
        elif has_children:
            new_id = self._generate_id(quals)
            self._seen_ids.append(new_id)
            quals["ID"] = [new_id]
        return quals

class GFF3Writer:
    """Write GFF3 files starting with standard Biopython objects.
    """
    def __init__(self):
        pass

    def write(self, recs, out_handle):
        """Write the provided records to the given handle in GFF3 format.
        """
        id_handler = _IdHandler()
        self._write_header(out_handle)
        for rec in recs:
            self._write_rec(rec, out_handle)
            self._write_annotations(rec.annotations, rec.id, out_handle)
            for sf in rec.features:
                sf = self._clean_feature(sf)
                id_handler = self._write_feature(sf, rec.id, out_handle,
                        id_handler)

    def _clean_feature(self, feature):
        quals = {}
        for key, val in feature.qualifiers.items():
            if not isinstance(val, (list, tuple)):
                val = [val]
            val = [str(x) for x in val]
            quals[key] = val
        feature.qualifiers = quals
        clean_sub = [self._clean_feature(f) for f in feature.sub_features]
        feature.sub_features = clean_sub
        return feature

    def _write_rec(self, rec, out_handle):
        # if the record carries sequence, write the optional sequence-region directive
        if len(rec.seq) > 0:
            out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq)))

    def _write_feature(self, feature, rec_id, out_handle, id_handler,
            parent_id=None):
        """Write a feature with location information.
        """
        if feature.strand == 1:
            strand = '+'
        elif feature.strand == -1:
            strand = '-'
        else:
            strand = '.'
        # remove any standard features from the qualifiers
        quals = feature.qualifiers.copy()
        for std_qual in ["source", "score", "phase"]:
            if quals.has_key(std_qual) and len(quals[std_qual]) == 1:
                del quals[std_qual]
        # add a link to a parent identifier if it exists
        if parent_id:
            if not quals.has_key("Parent"):
                quals["Parent"] = []
            quals["Parent"].append(parent_id)
        quals = id_handler.update_quals(quals, len(feature.sub_features) > 0)
        if feature.type:
            ftype = feature.type
        else:
            ftype = "sequence_feature"
        parts = [str(rec_id),
                 feature.qualifiers.get("source", ["feature"])[0],
                 ftype,
                 str(feature.location.nofuzzy_start + 1), # 1-based indexing
                 str(feature.location.nofuzzy_end),
                 feature.qualifiers.get("score", ["."])[0],
                 strand,
                 str(feature.qualifiers.get("phase", ["."])[0]),
                 self._format_keyvals(quals)]
        out_handle.write("\t".join(parts) + "\n")
        for sub_feature in feature.sub_features:
            id_handler = self._write_feature(sub_feature, rec_id, out_handle,
                    id_handler, quals["ID"][0])
        return id_handler

    def _format_keyvals(self, keyvals):
        format_kvs = []
        for key, values in keyvals.items():
            key = key.strip()
            format_vals = []
            if not isinstance(values, (list, tuple)):
                values = [values]
            for val in values:
                val = urllib.quote(str(val).strip())
                if ((key and val) and val not in format_vals):
                    format_vals.append(val)
            format_kvs.append("%s=%s" % (key, ",".join(format_vals)))
        return ";".join(format_kvs)

    def _write_annotations(self, anns, rec_id, out_handle):
        """Add annotations which refer to an entire sequence.
        """
        format_anns = self._format_keyvals(anns)
        if format_anns:
            parts = [rec_id, "annotation", "remark", ".", ".", ".", ".", ".",
                     format_anns]
            out_handle.write("\t".join(parts) + "\n")

    def _write_header(self, out_handle):
        """Write out standard header directives.
        """
        out_handle.write("##gff-version 3\n")

def write(recs, out_handle):
    """High level interface to write GFF3 files from SeqRecords and SeqFeatures.
    """
    writer = GFF3Writer()
    return writer.write(recs, out_handle)
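A minimal usage sketch for this writer (not part of the changeset; it assumes the Python 2 / `sub_features`-era Biopython stack this code targets, and the record, feature, and file names are illustrative):

```python
# Write one record with a single gene feature to GFF3 using the vendored writer.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from BCBio.GFF import GFF3Writer

rec = SeqRecord(Seq("ATGC" * 25), "scaffold1")
gene = SeqFeature(FeatureLocation(0, 100), type="gene", strand=1,
                  qualifiers={"ID": ["gene1"], "source": ["example"]})
gene.sub_features = []  # this Biopython generation still uses sub_features
rec.features = [gene]

with open("example.gff3", "w") as out_handle:
    GFF3Writer().write([rec], out_handle)
```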
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmerHMM/BCBio/GFF/GFFParser.py Wed Jul 13 10:55:48 2016 -0400 @@ -0,0 +1,835 @@ +"""Parse GFF files into features attached to Biopython SeqRecord objects. + +This deals with GFF3 formatted files, a tab delimited format for storing +sequence features and annotations: + +http://www.sequenceontology.org/gff3.shtml + +It will also deal with older GFF versions (GTF/GFF2): + +http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml +http://mblab.wustl.edu/GTF22.html + +The implementation utilizes map/reduce parsing of GFF using Disco. Disco +(http://discoproject.org) is a Map-Reduce framework for Python utilizing +Erlang for parallelization. The code works on a single processor without +Disco using the same architecture. +""" +import os +import copy +import re +import collections +import urllib +import itertools + +# Make defaultdict compatible with versions of python older than 2.4 +try: + collections.defaultdict +except AttributeError: + import _utils + collections.defaultdict = _utils.defaultdict + +from Bio.Seq import Seq, UnknownSeq +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation +from Bio import SeqIO + +def _gff_line_map(line, params): + """Map part of Map-Reduce; parses a line of GFF into a dictionary. + + Given an input line from a GFF file, this: + - decides if the file passes our filtering limits + - if so: + - breaks it into component elements + - determines the type of attribute (flat, parent, child or annotation) + - generates a dictionary of GFF info which can be serialized as JSON + """ + gff3_kw_pat = re.compile("\w+=") + def _split_keyvals(keyval_str): + """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. + + GFF3 has key value pairs like: + count=9;gene=amx-2;sequence=SAGE:aacggagccg + GFF2 and GTF have: + Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" + name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 + """ + quals = collections.defaultdict(list) + if keyval_str is None: + return quals + # ensembl GTF has a stray semi-colon at the end + if keyval_str[-1] == ';': + keyval_str = keyval_str[:-1] + # GFF2/GTF has a semi-colon with at least one space after it. + # It can have spaces on both sides; wormbase does this. + # GFF3 works with no spaces. + # Split at the first one we can recognize as working + parts = keyval_str.split(" ; ") + if len(parts) == 1: + parts = keyval_str.split("; ") + if len(parts) == 1: + parts = keyval_str.split(";") + # check if we have GFF3 style key-vals (with =) + is_gff2 = True + if gff3_kw_pat.match(parts[0]): + is_gff2 = False + key_vals = [p.split('=') for p in parts] + # otherwise, we are separated by a space with a key as the first item + else: + pieces = [] + for p in parts: + # fix misplaced semi-colons in keys in some GFF2 files + if p and p[0] == ';': + p = p[1:] + pieces.append(p.strip().split(" ")) + key_vals = [(p[0], " ".join(p[1:])) for p in pieces] + for item in key_vals: + # standard in-spec items are key=value + if len(item) == 2: + key, val = item + # out-of-spec files can have just key values. We set an empty value + # which will be changed to true later to standardize. 
+ else: + assert len(item) == 1, item + key = item[0] + val = '' + # remove quotes in GFF2 files + if (len(val) > 0 and val[0] == '"' and val[-1] == '"'): + val = val[1:-1] + if val: + quals[key].extend([v for v in val.split(',') if v]) + # if we don't have a value, make this a key=True/False style + # attribute + else: + quals[key].append('true') + for key, vals in quals.items(): + quals[key] = [urllib.unquote(v) for v in vals] + return quals, is_gff2 + + def _nest_gff2_features(gff_parts): + """Provide nesting of GFF2 transcript parts with transcript IDs. + + exons and coding sequences are mapped to a parent with a transcript_id + in GFF2. This is implemented differently at different genome centers + and this function attempts to resolve that and map things to the GFF3 + way of doing them. + """ + # map protein or transcript ids to a parent + for transcript_id in ["transcript_id", "transcriptId", "proteinId"]: + try: + gff_parts["quals"]["Parent"] = \ + gff_parts["quals"][transcript_id] + break + except KeyError: + pass + # case for WormBase GFF -- everything labelled as Transcript or CDS + for flat_name in ["Transcript", "CDS"]: + if gff_parts["quals"].has_key(flat_name): + # parent types + if gff_parts["type"] in [flat_name]: + if not gff_parts["id"]: + gff_parts["id"] = gff_parts["quals"][flat_name][0] + gff_parts["quals"]["ID"] = [gff_parts["id"]] + # children types + elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", + "coding_exon", "five_prime_UTR", "CDS", "stop_codon", + "start_codon"]: + gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name] + break + + return gff_parts + + strand_map = {'+' : 1, '-' : -1, '?' : None, None: None} + line = line.strip() + if line[:2] == "##": + return [('directive', line[2:])] + elif line and line[0] != "#": + parts = line.split('\t') + should_do = True + if params.limit_info: + for limit_name, limit_values in params.limit_info.items(): + cur_id = tuple([parts[i] for i in + params.filter_info[limit_name]]) + if cur_id not in limit_values: + should_do = False + break + if should_do: + assert len(parts) >= 8, line + # not python2.4 compatible but easier to understand + #gff_parts = [(None if p == '.' 
else p) for p in parts] + gff_parts = [] + for p in parts: + if p == ".": + gff_parts.append(None) + else: + gff_parts.append(p) + gff_info = dict() + # collect all of the base qualifiers for this item + if len(parts) > 8: + quals, is_gff2 = _split_keyvals(gff_parts[8]) + else: + quals, is_gff2 = dict(), False + gff_info["is_gff2"] = is_gff2 + if gff_parts[1]: + quals["source"].append(gff_parts[1]) + if gff_parts[5]: + quals["score"].append(gff_parts[5]) + if gff_parts[7]: + quals["phase"].append(gff_parts[7]) + gff_info['quals'] = dict(quals) + gff_info['rec_id'] = gff_parts[0] + # if we are describing a location, then we are a feature + if gff_parts[3] and gff_parts[4]: + gff_info['location'] = [int(gff_parts[3]) - 1, + int(gff_parts[4])] + gff_info['type'] = gff_parts[2] + gff_info['id'] = quals.get('ID', [''])[0] + gff_info['strand'] = strand_map.get(gff_parts[6], None) + if is_gff2: + gff_info = _nest_gff2_features(gff_info) + # features that have parents need to link so we can pick up + # the relationship + if gff_info['quals'].has_key('Parent'): + # check for self referential parent/child relationships + # remove the ID, which is not useful + for p in gff_info['quals']['Parent']: + if p == gff_info['id']: + gff_info['id'] = '' + del gff_info['quals']['ID'] + break + final_key = 'child' + elif gff_info['id']: + final_key = 'parent' + # Handle flat features + else: + final_key = 'feature' + # otherwise, associate these annotations with the full record + else: + final_key = 'annotation' + if params.jsonify: + return [(final_key, simplejson.dumps(gff_info))] + else: + return [(final_key, gff_info)] + return [] + +def _gff_line_reduce(map_results, out, params): + """Reduce part of Map-Reduce; combines results of parsed features. + """ + final_items = dict() + for gff_type, final_val in map_results: + if params.jsonify and gff_type not in ['directive']: + final_val = simplejson.loads(final_val) + try: + final_items[gff_type].append(final_val) + except KeyError: + final_items[gff_type] = [final_val] + for key, vals in final_items.items(): + if params.jsonify: + vals = simplejson.dumps(vals) + out.add(key, vals) + +class _MultiIDRemapper: + """Provide an ID remapping for cases where a parent has a non-unique ID. + + Real life GFF3 cases have non-unique ID attributes, which we fix here + by using the unique sequence region to assign children to the right + parent. + """ + def __init__(self, base_id, all_parents): + self._base_id = base_id + self._parents = all_parents + + def remap_id(self, feature_dict): + rstart, rend = feature_dict['location'] + for index, parent in enumerate(self._parents): + pstart, pend = parent['location'] + if rstart >= pstart and rend <= pend: + if index > 0: + return ("%s_%s" % (self._base_id, index + 1)) + else: + return self._base_id + raise ValueError("Did not find remapped ID location: %s, %s, %s" % ( + self._base_id, [p['location'] for p in self._parents], + feature_dict['location'])) + +class _AbstractMapReduceGFF: + """Base class providing general GFF parsing for local and remote classes. + + This class should be subclassed to provide a concrete class to parse + GFF under specific conditions. These classes need to implement + the _gff_process function, which returns a dictionary of SeqRecord + information. + """ + def __init__(self, create_missing=True): + """Initialize GFF parser + + create_missing - If True, create blank records for GFF ids not in + the base_dict. If False, an error will be raised. 
+ """ + self._create_missing = create_missing + self._map_fn = _gff_line_map + self._reduce_fn = _gff_line_reduce + self._examiner = GFFExaminer() + + def _gff_process(self, gff_files, limit_info, target_lines=None): + raise NotImplementedError("Derived class must define") + + def parse(self, gff_files, base_dict=None, limit_info=None): + """Parse a GFF file, returning an iterator of SeqRecords. + + limit_info - A dictionary specifying the regions of the GFF file + which should be extracted. This allows only relevant portions of a file + to be parsed. + + base_dict - A base dictionary of SeqRecord objects which may be + pre-populated with sequences and other features. The new features from + the GFF file will be added to this dictionary. + """ + for rec in self.parse_in_parts(gff_files, base_dict, limit_info): + yield rec + + def parse_in_parts(self, gff_files, base_dict=None, limit_info=None, + target_lines=None): + """Parse a region of a GFF file specified, returning info as generated. + + target_lines -- The number of lines in the file which should be used + for each partial parse. This should be determined based on available + memory. + """ + for results in self.parse_simple(gff_files, limit_info, target_lines): + if base_dict is None: + cur_dict = dict() + else: + cur_dict = copy.deepcopy(base_dict) + cur_dict = self._results_to_features(cur_dict, results) + all_ids = cur_dict.keys() + all_ids.sort() + for cur_id in all_ids: + yield cur_dict[cur_id] + + def parse_simple(self, gff_files, limit_info=None, target_lines=1): + """Simple parse which does not build or nest features. + + This returns a simple dictionary representation of each line in the + GFF file. + """ + # gracefully handle a single file passed + if not isinstance(gff_files, (list, tuple)): + gff_files = [gff_files] + limit_info = self._normalize_limit_info(limit_info) + for results in self._gff_process(gff_files, limit_info, target_lines): + yield results + + def _normalize_limit_info(self, limit_info): + """Turn all limit information into tuples for identical comparisons. + """ + final_limit_info = {} + if limit_info: + for key, values in limit_info.items(): + final_limit_info[key] = [] + for v in values: + if isinstance(v, str): + final_limit_info[key].append((v,)) + else: + final_limit_info[key].append(tuple(v)) + return final_limit_info + + def _results_to_features(self, base, results): + """Add parsed dictionaries of results to Biopython SeqFeatures. + """ + base = self._add_annotations(base, results.get('annotation', [])) + for feature in results.get('feature', []): + (_, base) = self._add_toplevel_feature(base, feature) + base = self._add_parent_child_features(base, results.get('parent', []), + results.get('child', [])) + base = self._add_seqs(base, results.get('fasta', [])) + base = self._add_directives(base, results.get('directive', [])) + return base + + def _add_directives(self, base, directives): + """Handle any directives or meta-data in the GFF file. + + Relevant items are added as annotation meta-data to each record. + """ + dir_keyvals = collections.defaultdict(list) + for directive in directives: + parts = directive.split() + if len(parts) > 1: + key = parts[0] + if len(parts) == 2: + val = parts[1] + else: + val = tuple(parts[1:]) + dir_keyvals[key].append(val) + for key, vals in dir_keyvals.items(): + for rec in base.values(): + self._add_ann_to_rec(rec, key, vals) + return base + + def _add_seqs(self, base, recs): + """Add sequence information contained in the GFF3 to records. 
+ """ + for rec in recs: + if base.has_key(rec.id): + base[rec.id].seq = rec.seq + else: + base[rec.id] = rec + return base + + def _add_parent_child_features(self, base, parents, children): + """Add nested features with parent child relationships. + """ + multi_remap = self._identify_dup_ids(parents) + # add children features + children_prep = collections.defaultdict(list) + for child_dict in children: + child_feature = self._get_feature(child_dict) + for pindex, pid in enumerate(child_feature.qualifiers['Parent']): + if multi_remap.has_key(pid): + pid = multi_remap[pid].remap_id(child_dict) + child_feature.qualifiers['Parent'][pindex] = pid + children_prep[pid].append((child_dict['rec_id'], + child_feature)) + children = dict(children_prep) + # add children to parents that exist + for cur_parent_dict in parents: + cur_id = cur_parent_dict['id'] + if multi_remap.has_key(cur_id): + cur_parent_dict['id'] = multi_remap[cur_id].remap_id( + cur_parent_dict) + cur_parent, base = self._add_toplevel_feature(base, cur_parent_dict) + cur_parent, children = self._add_children_to_parent(cur_parent, + children) + # create parents for children without them (GFF2 or split/bad files) + while len(children) > 0: + parent_id, cur_children = itertools.islice(children.items(), + 1).next() + # one child, do not nest it + if len(cur_children) == 1: + rec_id, child = cur_children[0] + loc = (child.location.nofuzzy_start, child.location.nofuzzy_end) + rec, base = self._get_rec(base, + dict(rec_id=rec_id, location=loc)) + rec.features.append(child) + del children[parent_id] + else: + cur_parent, base = self._add_missing_parent(base, parent_id, + cur_children) + cur_parent, children = self._add_children_to_parent(cur_parent, + children) + return base + + def _identify_dup_ids(self, parents): + """Identify duplicated ID attributes in potential nested parents. + + According to the GFF3 spec ID attributes are supposed to be unique + for a file, but this is not always true in practice. This looks + for duplicates, and provides unique IDs sorted by locations. + """ + multi_ids = collections.defaultdict(list) + for parent in parents: + multi_ids[parent['id']].append(parent) + multi_ids = [(mid, parents) for (mid, parents) in multi_ids.items() + if len(parents) > 1] + multi_remap = dict() + for mid, parents in multi_ids: + multi_remap[mid] = _MultiIDRemapper(mid, parents) + return multi_remap + + def _add_children_to_parent(self, cur_parent, children): + """Recursively add children to parent features. + """ + if children.has_key(cur_parent.id): + cur_children = children[cur_parent.id] + for rec_id, cur_child in cur_children: + cur_child, children = self._add_children_to_parent(cur_child, + children) + cur_parent.location_operator = "join" + cur_parent.sub_features.append(cur_child) + del children[cur_parent.id] + return cur_parent, children + + def _add_annotations(self, base, anns): + """Add annotation data from the GFF file to records. + """ + # add these as a list of annotations, checking not to overwrite + # current values + for ann in anns: + rec, base = self._get_rec(base, ann) + for key, vals in ann['quals'].items(): + self._add_ann_to_rec(rec, key, vals) + return base + + def _add_ann_to_rec(self, rec, key, vals): + """Add a key/value annotation to the given SeqRecord. 
+ """ + if rec.annotations.has_key(key): + try: + rec.annotations[key].extend(vals) + except AttributeError: + rec.annotations[key] = [rec.annotations[key]] + vals + else: + rec.annotations[key] = vals + + def _get_rec(self, base, info_dict): + """Retrieve a record to add features to. + """ + max_loc = info_dict.get('location', (0, 1))[1] + try: + cur_rec = base[info_dict['rec_id']] + # update generated unknown sequences with the expected maximum length + if isinstance(cur_rec.seq, UnknownSeq): + cur_rec.seq._length = max([max_loc, cur_rec.seq._length]) + return cur_rec, base + except KeyError: + if self._create_missing: + new_rec = SeqRecord(UnknownSeq(max_loc), info_dict['rec_id']) + base[info_dict['rec_id']] = new_rec + return new_rec, base + else: + raise + + def _add_missing_parent(self, base, parent_id, cur_children): + """Add a new feature that is missing from the GFF file. + """ + base_rec_id = list(set(c[0] for c in cur_children)) + assert len(base_rec_id) == 1 + feature_dict = dict(id=parent_id, strand=None, + type="inferred_parent", quals=dict(ID=[parent_id]), + rec_id=base_rec_id[0]) + coords = [(c.location.nofuzzy_start, c.location.nofuzzy_end) + for r, c in cur_children] + feature_dict["location"] = (min([c[0] for c in coords]), + max([c[1] for c in coords])) + return self._add_toplevel_feature(base, feature_dict) + + def _add_toplevel_feature(self, base, feature_dict): + """Add a toplevel non-nested feature to the appropriate record. + """ + new_feature = self._get_feature(feature_dict) + rec, base = self._get_rec(base, feature_dict) + rec.features.append(new_feature) + return new_feature, base + + def _get_feature(self, feature_dict): + """Retrieve a Biopython feature from our dictionary representation. + """ + location = FeatureLocation(*feature_dict['location']) + new_feature = SeqFeature(location, feature_dict['type'], + id=feature_dict['id'], strand=feature_dict['strand']) + new_feature.qualifiers = feature_dict['quals'] + return new_feature + + def _parse_fasta(self, in_handle): + """Parse FASTA sequence information contained in the GFF3 file. + """ + return list(SeqIO.parse(in_handle, "fasta")) + +class _GFFParserLocalOut: + """Provide a collector for local GFF MapReduce file parsing. + """ + def __init__(self, smart_breaks=False): + self._items = dict() + self._smart_breaks = smart_breaks + self._missing_keys = collections.defaultdict(int) + self._last_parent = None + self.can_break = True + self.num_lines = 0 + + def add(self, key, vals): + if self._smart_breaks: + # if we are not GFF2 we expect parents and break + # based on not having missing ones + if key == 'directive': + if vals[0] == '#': + self.can_break = True + self._last_parent = None + elif not vals[0].get("is_gff2", False): + self._update_missing_parents(key, vals) + self.can_break = (len(self._missing_keys) == 0) + # break when we are done with stretches of child features + elif key != 'child': + self.can_break = True + self._last_parent = None + # break when we have lots of child features in a row + # and change between parents + else: + cur_parent = vals[0]["quals"]["Parent"][0] + if (self._last_parent): + self.can_break = (cur_parent != self._last_parent) + self._last_parent = cur_parent + self.num_lines += 1 + try: + self._items[key].extend(vals) + except KeyError: + self._items[key] = vals + + def _update_missing_parents(self, key, vals): + # smart way of deciding if we can break this. 
+ # if this is too much, can go back to not breaking in the + # middle of children + if key in ["child"]: + for val in vals: + for p_id in val["quals"]["Parent"]: + self._missing_keys[p_id] += 1 + for val in vals: + try: + del self._missing_keys[val["quals"]["ID"][0]] + except KeyError: + pass + + def has_items(self): + return len(self._items) > 0 + + def get_results(self): + self._last_parent = None + return self._items + +class GFFParser(_AbstractMapReduceGFF): + """Local GFF parser providing standardized parsing of GFF3 and GFF2 files. + """ + def __init__(self, line_adjust_fn=None, create_missing=True): + _AbstractMapReduceGFF.__init__(self, create_missing=create_missing) + self._line_adjust_fn = line_adjust_fn + + def _gff_process(self, gff_files, limit_info, target_lines): + """Process GFF addition without any parallelization. + + In addition to limit filtering, this accepts a target_lines attribute + which provides a number of lines to parse before returning results. + This allows partial parsing of a file to prevent memory issues. + """ + line_gen = self._file_line_generator(gff_files) + for out in self._lines_to_out_info(line_gen, limit_info, target_lines): + yield out + + def _file_line_generator(self, gff_files): + """Generate single lines from a set of GFF files. + """ + for gff_file in gff_files: + if hasattr(gff_file, "read"): + need_close = False + in_handle = gff_file + else: + need_close = True + in_handle = open(gff_file) + found_seqs = False + while 1: + line = in_handle.readline() + if not line: + break + yield line + if need_close: + in_handle.close() + + def _lines_to_out_info(self, line_iter, limit_info=None, + target_lines=None): + """Generate SeqRecord and SeqFeatures from GFF file lines. + """ + params = self._examiner._get_local_params(limit_info) + out_info = _GFFParserLocalOut((target_lines is not None and + target_lines > 1)) + found_seqs = False + for line in line_iter: + results = self._map_fn(line, params) + if self._line_adjust_fn and results: + if results[0][0] not in ['directive']: + results = [(results[0][0], + self._line_adjust_fn(results[0][1]))] + self._reduce_fn(results, out_info, params) + if (target_lines and out_info.num_lines >= target_lines and + out_info.can_break): + yield out_info.get_results() + out_info = _GFFParserLocalOut((target_lines is not None and + target_lines > 1)) + if (results and results[0][0] == 'directive' and + results[0][1] == 'FASTA'): + found_seqs = True + break + + class FakeHandle: + def __init__(self, line_iter): + self._iter = line_iter + def read(self): + return "".join(l for l in self._iter) + def readline(self): + try: + return self._iter.next() + except StopIteration: + return "" + + if found_seqs: + fasta_recs = self._parse_fasta(FakeHandle(line_iter)) + out_info.add('fasta', fasta_recs) + if out_info.has_items(): + yield out_info.get_results() + +class DiscoGFFParser(_AbstractMapReduceGFF): + """GFF Parser with parallelization through Disco (http://discoproject.org. + """ + def __init__(self, disco_host, create_missing=True): + """Initialize parser. + + disco_host - Web reference to a Disco host which will be used for + parallelizing the GFF reading job. + """ + _AbstractMapReduceGFF.__init__(self, create_missing=create_missing) + self._disco_host = disco_host + + def _gff_process(self, gff_files, limit_info, target_lines=None): + """Process GFF addition, using Disco to parallelize the process. 
+ """ + assert target_lines is None, "Cannot split parallelized jobs" + # make these imports local; only need them when using disco + import simplejson + import disco + # absolute path names unless they are special disco files + full_files = [] + for f in gff_files: + if f.split(":")[0] != "disco": + full_files.append(os.path.abspath(f)) + else: + full_files.append(f) + results = disco.job(self._disco_host, name="gff_reader", + input=full_files, + params=disco.Params(limit_info=limit_info, jsonify=True, + filter_info=self._examiner._filter_info), + required_modules=["simplejson", "collections", "re"], + map=self._map_fn, reduce=self._reduce_fn) + processed = dict() + for out_key, out_val in disco.result_iterator(results): + processed[out_key] = simplejson.loads(out_val) + yield processed + +def parse(gff_files, base_dict=None, limit_info=None, target_lines=None): + """High level interface to parse GFF files into SeqRecords and SeqFeatures. + """ + parser = GFFParser() + for rec in parser.parse_in_parts(gff_files, base_dict, limit_info, + target_lines): + yield rec + +def parse_simple(gff_files, limit_info=None): + """Parse GFF files as line by line dictionary of parts. + """ + parser = GFFParser() + for rec in parser.parse_simple(gff_files, limit_info=limit_info): + yield rec["child"][0] + +def _file_or_handle(fn): + """Decorator to handle either an input handle or a file. + """ + def _file_or_handle_inside(*args, **kwargs): + in_file = args[1] + if hasattr(in_file, "read"): + need_close = False + in_handle = in_file + else: + need_close = True + in_handle = open(in_file) + args = (args[0], in_handle) + args[2:] + out = fn(*args, **kwargs) + if need_close: + in_handle.close() + return out + return _file_or_handle_inside + +class GFFExaminer: + """Provide high level details about a GFF file to refine parsing. + + GFF is a spec and is provided by many different centers. Real life files + will present the same information in slightly different ways. Becoming + familiar with the file you are dealing with is the best way to extract the + information you need. This class provides high level summary details to + help in learning. + """ + def __init__(self): + self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2], + gff_source = [1], gff_type = [2]) + + def _get_local_params(self, limit_info=None): + class _LocalParams: + def __init__(self): + self.jsonify = False + params = _LocalParams() + params.limit_info = limit_info + params.filter_info = self._filter_info + return params + + @_file_or_handle + def available_limits(self, gff_handle): + """Return dictionary information on possible limits for this file. + + This returns a nested dictionary with the following structure: + + keys -- names of items to filter by + values -- dictionary with: + keys -- filter choice + value -- counts of that filter in this file + + Not a parallelized map-reduce implementation. 
+ """ + cur_limits = dict() + for filter_key in self._filter_info.keys(): + cur_limits[filter_key] = collections.defaultdict(int) + for line in gff_handle: + # when we hit FASTA sequences, we are done with annotations + if line.startswith("##FASTA"): + break + # ignore empty and comment lines + if line.strip() and line.strip()[0] != "#": + parts = [p.strip() for p in line.split('\t')] + assert len(parts) == 9, line + for filter_key, cur_indexes in self._filter_info.items(): + cur_id = tuple([parts[i] for i in cur_indexes]) + cur_limits[filter_key][cur_id] += 1 + # get rid of the default dicts + final_dict = dict() + for key, value_dict in cur_limits.items(): + if len(key) == 1: + key = key[0] + final_dict[key] = dict(value_dict) + gff_handle.close() + return final_dict + + @_file_or_handle + def parent_child_map(self, gff_handle): + """Provide a mapping of parent to child relationships in the file. + + Returns a dictionary of parent child relationships: + + keys -- tuple of (source, type) for each parent + values -- tuple of (source, type) as children of that parent + + Not a parallelized map-reduce implementation. + """ + # collect all of the parent and child types mapped to IDs + parent_sts = dict() + child_sts = collections.defaultdict(list) + for line in gff_handle: + # when we hit FASTA sequences, we are done with annotations + if line.startswith("##FASTA"): + break + if line.strip(): + line_type, line_info = _gff_line_map(line, + self._get_local_params())[0] + if (line_type == 'parent' or (line_type == 'child' and + line_info['id'])): + parent_sts[line_info['id']] = ( + line_info['quals']['source'][0], line_info['type']) + if line_type == 'child': + for parent_id in line_info['quals']['Parent']: + child_sts[parent_id].append(( + line_info['quals']['source'][0], line_info['type'])) + #print parent_sts, child_sts + # generate a dictionary of the unique final type relationships + pc_map = collections.defaultdict(list) + for parent_id, parent_type in parent_sts.items(): + for child_type in child_sts[parent_id]: + pc_map[parent_type].append(child_type) + pc_final_map = dict() + for ptype, ctypes in pc_map.items(): + unique_ctypes = list(set(ctypes)) + unique_ctypes.sort() + pc_final_map[ptype] = unique_ctypes + return pc_final_map
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/__init__.py Wed Jul 13 10:55:48 2016 -0400

"""Top level of GFF parsing providing shortcuts for useful classes.
"""
from GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple
from GFFOutput import GFF3Writer, write
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/GFF/_utils.py Wed Jul 13 10:55:48 2016 -0400

class defaultdict(dict):
    """Back compatible defaultdict: http://code.activestate.com/recipes/523034/
    """
    def __init__(self, default_factory=None, *a, **kw):
        if (default_factory is not None and
            not hasattr(default_factory, '__call__')):
            raise TypeError('first argument must be callable')
        dict.__init__(self, *a, **kw)
        self.default_factory = default_factory
    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)
    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value
    def __reduce__(self):
        if self.default_factory is None:
            args = tuple()
        else:
            args = self.default_factory,
        return type(self), args, None, None, self.items()
    def copy(self):
        return self.__copy__()
    def __copy__(self):
        return type(self)(self.default_factory, self)
    def __deepcopy__(self, memo):
        import copy
        return type(self)(self.default_factory,
                          copy.deepcopy(self.items()))
    def __repr__(self):
        return 'defaultdict(%s, %s)' % (self.default_factory,
                                        dict.__repr__(self))
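The backport is meant to match the stdlib semantics; a quick sanity sketch (hypothetical, Python 2, run next to `_utils.py`):

```python
from _utils import defaultdict

counts = defaultdict(int)
for fmt in ["gff3", "gtf", "gff3"]:
    counts[fmt] += 1
print counts["gff3"], counts["missing"]  # -> 2 0; missing keys are created on access
```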
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/BCBio/__init__.py Wed Jul 13 10:55:48 2016 -0400

__version__ = "0.1"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/__init__.py Wed Jul 13 10:55:48 2016 -0400

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_gff_to_sequence.py Wed Jul 13 10:55:48 2016 -0400

#!/usr/bin/env python
"""Convert GlimmerHMM GFF3 gene predictions into protein sequences.

This works with the GlimmerHMM GFF3 output format:

##gff-version 3
##sequence-region Contig5.15 1 47390
Contig5.15 GlimmerHMM mRNA 323 325 . + . ID=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1
Contig5.15 GlimmerHMM CDS 323 325 . + 0 ID=Contig5.15.cds1.1;Parent=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1;Note=final-exon

http://www.cbcb.umd.edu/software/GlimmerHMM/

Modified version of the converter from Brad Chapman:
https://github.com/chapmanb/bcbb/blob/master/biopython/glimmergff_to_proteins.py

Usage:
    glimmerhmm_gff_to_sequence.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True>
"""
import sys
import operator

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

from BCBio import GFF

def main(glimmer_file, ref_file, out_file, to_protein):
    with open(ref_file) as in_handle:
        ref_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))
    with open(out_file, "w") as out_handle:
        SeqIO.write(protein_recs(glimmer_file, ref_recs, to_protein), out_handle,
                    "fasta")

def protein_recs(glimmer_file, ref_recs, to_protein):
    """Generate protein records from GlimmerHMM gene predictions.
    """
    with open(glimmer_file) as in_handle:
        for rec in glimmer_predictions(in_handle, ref_recs):
            for feature in rec.features:
                seq_exons = []
                for cds in feature.sub_features:
                    seq_exons.append(rec.seq[
                        cds.location.nofuzzy_start:
                        cds.location.nofuzzy_end])
                gene_seq = reduce(operator.add, seq_exons)
                if feature.strand == -1:
                    gene_seq = gene_seq.reverse_complement()
                if to_protein:
                    yield SeqRecord(gene_seq.translate(), feature.qualifiers["ID"][0], "", "")
                else:
                    yield SeqRecord(gene_seq, feature.qualifiers["ID"][0], "", "")

def glimmer_predictions(in_handle, ref_recs):
    """Parse GlimmerHMM output, generating SeqRecord and SeqFeatures for predictions.
    """
    for rec in GFF.parse(in_handle, target_lines=1000, base_dict=ref_recs):
        yield rec

if __name__ == "__main__":
    # the usage line above expects four arguments
    if len(sys.argv) != 5:
        print __doc__
        sys.exit()
    glimmer_file, ref_file, out_file, to_protein = sys.argv[1:]
    # argv values are strings; only the literal "True" turns on translation
    main(glimmer_file, ref_file, out_file, to_protein == "True")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_predict.xml Wed Jul 13 10:55:48 2016 -0400

<tool id="glimmerhmm_predict" name="GlimmerHMM" version="2.0">
    <description>Predict ORFs in eukaryotic genomes</description>
    <command detect_errors="exit_code"><![CDATA[
        glimmerhmm
            $input
            $trained_specie.fields.path
            -o $output
            -g
            $svm_splice_prediction
            $partial_gene
            #if str($top_n_predictions) != "-1":
                -n $top_n_predictions
            #end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="fasta" label="Genome Sequence"/>
        <param name="trained_specie" type="select" label="Select a species">
            <options from_data_table="glimmer_hmm_trained_dir">
                <filter type="sort_by" column="2"/>
                <validator type="no_options" message="No indexes are available"/>
            </options>
        </param>
        <param name="partial_gene" type="boolean" label="Don't make partial gene predictions" truevalue="-f" falsevalue="" checked="false" />
        <param name="svm_splice_prediction" type="boolean" label="Don't use SVM splice site predictions" truevalue="-v" falsevalue="" checked="false" />
        <param name="top_n_predictions" type="integer" label="Top n best predictions (-1 means no limit)" value="-1"/>
    </inputs>
    <outputs>
        <data format="gff3" name="output" />
    </outputs>
    <help>

**What it does**

GlimmerHMM is a gene finder based on a Generalized Hidden Markov Model (GHMM).
Although the gene finder conforms to the overall mathematical framework of a GHMM,
it additionally incorporates splice site models adapted from the GeneSplicer program and a
decision tree adapted from GlimmerM. It also utilizes Interpolated Markov Models for the
coding and noncoding models. Currently, GlimmerHMM's GHMM structure includes introns of each phase,
intergenic regions, and four types of exons (initial, internal, final, and single).
A basic user manual is available at the GlimmerHMM website (https://ccb.jhu.edu/software/glimmerhmm/).

-----

**Example**

Suppose you have the following DNA formatted sequences::

    >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
    cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
    ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
    cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
    cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
    ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg

Running this tool will produce this::

    ##gff-version 3
    ##sequence-region ConsensusfromCH236920mapping 1 4148552
    ConsensusfromCH236920mapping GlimmerHMM mRNA 1 122 . + . ID=ConsensusfromCH236920mapping.path1.gene1;Name=ConsensusfromCH236920mapping.path1.gene1
    ConsensusfromCH236920mapping GlimmerHMM CDS 1 122 . + 0 ID=ConsensusfromCH236920mapping.cds1.1;
    ConsensusfromCH236920mapping GlimmerHMM mRNA 14066 15205 . - . ID=ConsensusfromCH236920mapping.path1.gene2;Name=ConsensusfromCH236920mapping.path1.gene2
    ConsensusfromCH236920mapping GlimmerHMM CDS 14066 15034 . - 0 ID=ConsensusfromCH236920mapping.cds2.1;
    ConsensusfromCH236920mapping GlimmerHMM CDS 15137 15205 . - 0 ID=ConsensusfromCH236920mapping.cds2.2;
    ConsensusfromCH236920mapping GlimmerHMM mRNA 19910 24210 . - . ID=ConsensusfromCH236920mapping.path1.gene3;Name=ConsensusfromCH236920mapping.path1.gene3

    </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_tabular_to_sequence.py Wed Jul 13 10:55:48 2016 -0400

#!/usr/bin/env python
"""Convert GlimmerHMM gene predictions into protein sequences.

This works with the GlimmerHMM specific output format:

  12 1 + Initial 10748 10762 15
  12 2 + Internal 10940 10971 32
  12 3 + Internal 11035 11039 5
  12 4 + Internal 11072 11110 39
  12 5 + Internal 11146 11221 76
  12 6 + Terminal 11265 11388 124

http://www.cbcb.umd.edu/software/GlimmerHMM/

Modified version of the converter from Brad Chapman:
https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py

Usage:
    glimmerhmm_tabular_to_sequence.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True>
"""

import sys
import operator

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def main(glimmer_file, ref_file, out_file, to_protein):
    with open(ref_file) as in_handle:
        ref_rec = SeqIO.read(in_handle, "fasta")
    with open(out_file, "w") as out_handle:
        SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle,
                    "fasta")

def protein_recs(glimmer_file, ref_rec, to_protein):
    """Generate protein records
    """
    with open(glimmer_file) as in_handle:
        for gene_num, exons, strand in glimmer_predictions(in_handle):
            seq_exons = []
            for start, end in exons:
                seq_exons.append(ref_rec.seq[start:end])
            gene_seq = reduce(operator.add, seq_exons)
            if strand == '-':
                gene_seq = gene_seq.reverse_complement()
            if to_protein:
                yield SeqRecord(gene_seq.translate(), gene_num, "", "")
            else:
                yield SeqRecord(gene_seq, gene_num, "", "")

def glimmer_predictions(in_handle):
    """Parse GlimmerHMM output, generating exons and strand for each prediction.
    """
    # read past the header
    while 1:
        line = in_handle.readline()
        if line.startswith(" # #"):
            break
    in_handle.readline()
    # read gene predictions one at a time
    cur_exons, cur_gene_num, cur_strand = ([], None, None)
    while 1:
        line = in_handle.readline()
        if not line:
            break
        parts = line.strip().split()
        # a blank line marks the end of the current gene prediction
        if len(parts) == 0:
            yield cur_gene_num, cur_exons, cur_strand
            cur_exons, cur_gene_num, cur_strand = ([], None, None)
        else:
            this_gene_num = parts[0]
            this_strand = parts[2]
            this_start = int(parts[4]) - 1 # 1 based
            this_end = int(parts[5])
            if cur_gene_num is None:
                cur_gene_num = this_gene_num
                cur_strand = this_strand
            else:
                assert cur_gene_num == this_gene_num
                assert cur_strand == this_strand
            cur_exons.append((this_start, this_end))
    if len(cur_exons) > 0:
        yield cur_gene_num, cur_exons, cur_strand

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print __doc__
        sys.exit()
    glimmer_file, ref_file, out_file, to_protein = sys.argv[1:]
    # argv values are strings; only the literal "True" turns on translation
    main(glimmer_file, ref_file, out_file, to_protein == "True")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_to_sequence.py Wed Jul 13 10:55:48 2016 -0400

#!/usr/bin/env python
"""Convert GlimmerHMM gene predictions into nucleotide or protein sequences.

This is only a wrapper that calls the appropriate converter script. This
version always uses the GFF converter (glimmerhmm_gff_to_sequence.py);
glimmerhmm_tabular_to_sequence.py handles the custom tabular output.

Usage:
    glimmerhmm_to_sequence.py <glimmer output> <ref fasta> <output file> <protein ... False|True>
"""
import sys

import glimmerhmm_gff_to_sequence

def main(glimmer_file, ref_file, out_file, to_protein=False):
    # argv values are strings; only the literal 'True' enables translation
    to_protein = (to_protein == 'True')
    glimmerhmm_gff_to_sequence.main(glimmer_file, ref_file, out_file, to_protein)

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print __doc__
        sys.exit()
    main(*sys.argv[1:])
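A hedged invocation sketch for the wrapper (file names are placeholders; Python 2):

```python
# Call the wrapper programmatically instead of via the command line.
import glimmerhmm_to_sequence

glimmerhmm_to_sequence.main("predictions.gff3", "genome.fasta",
                            "proteins.fasta", "True")  # "True" -> translate to protein
```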
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_to_sequence.xml Wed Jul 13 10:55:48 2016 -0400

<tool id="glimmerhmm_to_sequence" name="GlimmerHMM to Sequence" version="0.1">
    <description>converter</description>
    <command interpreter='python'>
<![CDATA[
    glimmerhmm_to_sequence.py
    $glimmerhmm_input
    $genome_input
    $output
    $to_protein
]]>
    </command>
    <inputs>
        <param name="genome_input" type="data" format="fasta" label="Genome Sequence" />
        <param name="glimmerhmm_input" type="data" format="gff" label="GlimmerHMM result file" />
        <param name="to_protein" type="boolean" label="Convert to protein sequence" truevalue="True" falsevalue="False" checked="false" />
    </inputs>
    <outputs>
        <data name="output" format="fasta" />
    </outputs>
    <help>
<![CDATA[

**What it does**

It converts the GlimmerHMM GFF output to a FASTA file.

]]>
    </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_hmm.loc.sample Wed Jul 13 10:55:48 2016 -0400

#This file lists the locations of all the trained_dir files
#under the "trained_dir" directory (a directory that contains a directory
#for each organism used by glimmer_hmm).
#This file has the format (white space characters are
#TAB characters):
#
#<unique_id>	<display_name>	<file_path>
#
#glimmer_hmm.loc could look something like this:
#
#human_gc_0-43	Human with GC Content 0 to 43%	/path/to/trained_dir/human
#celegans	Celegans	/path/to/trained_dir/Celegans
#arabidopsis	Arabidopsis	/path/to/trained_dir/arabidopsis
#rice	Rice	/path/to/trained_dir/rice
#zebrafish	Zebrafish	/path/to/trained_dir/zebrafish
#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.md Wed Jul 13 10:55:48 2016 -0400

Galaxy wrapper for GlimmerHMM
=====================================

This wrapper was written in 2012 by Björn Grüning and updated by Rémi Marenco in 2016.

It is a wrapper for the command line tool of GlimmerHMM.
https://ccb.jhu.edu/software/glimmerhmm/

GlimmerHMM is a gene finder based on a Generalized Hidden Markov Model (GHMM). Although the gene finder conforms to the overall mathematical framework of a GHMM,
it additionally incorporates splice site models adapted from the GeneSplicer program and a decision tree adapted from GlimmerM. It also utilizes
Interpolated Markov Models for the coding and noncoding models.
Currently, GlimmerHMM's GHMM structure includes introns of each phase, intergenic regions, and four types of exons (initial, internal, final, and single).

Majoros, W.H., Pertea, M., and Salzberg, S.L. (2004). "TigrScan and GlimmerHMM: two open-source ab initio eukaryotic gene-finders." Bioinformatics 20: 2878-2879.
Pertea, M. and S. L. Salzberg (2002). "Computational gene finding in plants." Plant Molecular Biology 48(1-2): 39-48.
The Arabidopsis Genome Initiative (2000). "Analysis of the genome sequence of the flowering plant Arabidopsis thaliana." Nature 408(6814): 796-815.
Pertea, M., S. L. Salzberg, et al. (2000). "Finding genes in Plasmodium falciparum." Nature 404(6773): 34; discussion 34-5.
Salzberg, S. L., M. Pertea, et al. (1999). "Interpolated Markov models for eukaryotic gene finding." Genomics 59(1): 24-31.


Installation
============

To install GlimmerHMM, please download it from

ftp://ccb.jhu.edu/pub/software/glimmerhmm

and follow the installation instructions.
To extract the GlimmerHMM predicted genes, the GFF parser from Brad Chapman (http://github.com/chapmanb/bcbb/tree/master/gff) is used and is included.

To install the wrapper, copy the glimmerHMM folder into the Galaxy tools
folder and modify the $GALAXY_ROOT/config/tool_conf.xml file to make the tool available to Galaxy.
For example:

```xml
<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" />
<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" />
```

You also need to provide trained organisms by adding them as reference data in Galaxy:

1. Add the *glimmer_hmm_trained_dir* data table to `tool_data_table_conf.xml` in `$GALAXY_ROOT/config/`:

   ```xml
   <!-- glimmer_hmm trained_dir -->
   <table name="glimmer_hmm_trained_dir" comment_char="#">
       <columns>value, name, path</columns>
       <file path="tool-data/glimmer_hmm.loc" />
   </table>
   ```

2. Add the `glimmer_hmm.loc` file referencing your trained organisms in `tool-data`.
   A sample, `glimmer_hmm.loc.sample`, is available in the repository to help you configure it properly.
3. Add your data in the folder chosen at step 2.
   You can get it from the GlimmerHMM tar, `$GLIMMERHMM/trained_dir`.

History
=======

- v2.0 - Update by Rémi Marenco to make it work without having to modify the wrapper; add ability to select the species
- v0.1 - Initial public release


Wrapper Licence (MIT/BSD style)
===============================

Permission to use, copy, modify, and distribute this software and its
documentation with or without modifications and for any purpose and
without fee is hereby granted, provided that any copyright notices
appear in all copies and that both those copyright notices and this
permission notice appear in supporting documentation, and that the
names of the contributors or copyright holders not be used in
advertising or publicity pertaining to distribution of the software
without specific prior permission.

THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.txt Wed Jul 13 10:55:48 2016 -0400

Galaxy wrapper for GlimmerHMM
=====================================

This wrapper is copyright 2012 by Björn Grüning.

This is a wrapper for the command line tool of GlimmerHMM.
http://www.cbcb.umd.edu/software/GlimmerHMM/

GlimmerHMM is a gene finder based on a Generalized Hidden Markov Model (GHMM). Although the gene finder conforms to the overall mathematical framework of a GHMM,
it additionally incorporates splice site models adapted from the GeneSplicer program and a decision tree adapted from GlimmerM. It also utilizes
Interpolated Markov Models for the coding and noncoding models.
Currently, GlimmerHMM's GHMM structure includes introns of each phase, intergenic regions, and four types of exons (initial, internal, final, and single).

Majoros, W.H., Pertea, M., and Salzberg, S.L. (2004). "TigrScan and GlimmerHMM: two open-source ab initio eukaryotic gene-finders." Bioinformatics 20: 2878-2879.
Pertea, M. and S. L. Salzberg (2002). "Computational gene finding in plants." Plant Molecular Biology 48(1-2): 39-48.
The Arabidopsis Genome Initiative (2000). "Analysis of the genome sequence of the flowering plant Arabidopsis thaliana." Nature 408(6814): 796-815.
Pertea, M., S. L. Salzberg, et al. (2000). "Finding genes in Plasmodium falciparum." Nature 404(6773): 34; discussion 34-5.
Salzberg, S. L., M. Pertea, et al. (1999). "Interpolated Markov models for eukaryotic gene finding." Genomics 59(1): 24-31.


Installation
============

To install GlimmerHMM, please download it from

ftp://ftp.cbcb.umd.edu/pub/software/glimmerhmm/

and follow the installation instructions.
To extract the GlimmerHMM predicted genes, the GFF parser from Brad Chapman (http://github.com/chapmanb/bcbb/tree/master/gff) is used and is included.

To install the wrapper, copy the glimmerHMM folder into the Galaxy tools
folder and modify the tool_conf.xml file to make the tool available to Galaxy.
For example:

<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" />
<tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" />


History
=======

v0.1 - Initial public release


Wrapper Licence (MIT/BSD style)
===============================

Permission to use, copy, modify, and distribute this software and its
documentation with or without modifications and for any purpose and
without fee is hereby granted, provided that any copyright notices
appear in all copies and that both those copyright notices and this
permission notice appear in supporting documentation, and that the
names of the contributors or copyright holders not be used in
advertising or publicity pertaining to distribution of the software
without specific prior permission.

THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_conf.xml Wed Jul 13 10:55:48 2016 -0400

<?xml version="1.0"?>
<toolbox>
  <section name="Gene Prediction" id="gene_prediction">

    <label text="GlimmerHMM (eukaryotes)" id="glimmerHMM_eukaryotes" />
    <tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_predict.xml" />
    <tool file="gene_prediction/tools/glimmerHMM/glimmerhmm_to_sequence.xml" />

  </section>
</toolbox>