changeset 17:4c201a3d4755 draft

planemo upload for repository https://github.com/usegalaxy-eu/temporary-tools/tree/master/jbrowse2 commit a37bfdfc108501b11c7b2aa15efb1bd16f0c4b66
author fubar
date Sun, 28 Jan 2024 06:48:52 +0000
parents 1fe91657bfd6
children 2e6c48910819
files Dockerfile GFFOutput.py blastxml_to_gapped_gff3.py gff3_rebase.py jb2_GFF/GFFOutput.py jb2_GFF/GFFParser.py jb2_GFF/__init__.py jb2_GFF/__pycache__/GFFOutput.cpython-310.pyc jb2_GFF/__pycache__/GFFParser.cpython-310.pyc jb2_GFF/__pycache__/__init__.cpython-310.pyc jb2_GFF/_utils.py jb2_webserver.py jbrowse2.py jbrowse2.xml macros.xml plants.sh readme.rst static/images/bam.png static/images/bigwig.png static/images/blast.png static/images/opacity.png static/images/sections.png static/images/styling.png test-data/merlin.fa
diffstat 23 files changed, 1709 insertions(+), 68 deletions(-) [+]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dockerfile	Sun Jan 28 06:48:52 2024 +0000
@@ -0,0 +1,3 @@
+FROM quay.io/bioconda/base-glibc-busybox-bash:3.0
+
+RUN adduser -u 1000 user1000 -D && adduser -u 1001 user1001 -D
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GFFOutput.py	Sun Jan 28 06:48:52 2024 +0000
@@ -0,0 +1,213 @@
+"""Output Biopython SeqRecords and SeqFeatures to GFF3 format.
+
+The target format is GFF3, the current GFF standard:
+    http://www.sequenceontology.org/gff3.shtml
+"""
+from six.moves import urllib
+
+from Bio import SeqIO
+
+
+class _IdHandler:
+    """Generate IDs for GFF3 Parent/Child
+    relationships where they don't exist."""
+
+    def __init__(self):
+        self._prefix = "biopygen"
+        self._counter = 1
+        self._seen_ids = []
+
+    def _generate_id(self, quals):
+        """Generate a unique ID not present in our existing IDs."""
+        gen_id = self._get_standard_id(quals)
+        if gen_id is None:
+            while 1:
+                gen_id = "%s%s" % (self._prefix, self._counter)
+                if gen_id not in self._seen_ids:
+                    break
+                self._counter += 1
+        return gen_id
+
+    def _get_standard_id(self, quals):
+        """Retrieve standardized IDs from other sources like NCBI GenBank.
+
+        This tries to find IDs from known key/values when stored differently
+        than GFF3 specifications.
+        """
+        possible_keys = ["transcript_id", "protein_id"]
+        for test_key in possible_keys:
+            if test_key in quals:
+                cur_id = quals[test_key]
+                if isinstance(cur_id, tuple) or isinstance(cur_id, list):
+                    return cur_id[0]
+                else:
+                    return cur_id
+        return None
+
+    def update_quals(self, quals, has_children):
+        """Update a set of qualifiers, adding an ID if necessary."""
+        cur_id = quals.get("ID", None)
+        # if we have an ID, record it
+        if cur_id:
+            if not isinstance(cur_id, list) and not isinstance(cur_id, tuple):
+                cur_id = [cur_id]
+            for add_id in cur_id:
+                self._seen_ids.append(add_id)
+        # if we need one and don't have it, create a new one
+        elif has_children:
+            new_id = self._generate_id(quals)
+            self._seen_ids.append(new_id)
+            quals["ID"] = [new_id]
+        return quals
+
+
+class GFF3Writer:
+    """Write GFF3 files starting with standard Biopython objects."""
+
+    def __init__(self):
+        pass
+
+    def write(self, recs, out_handle, include_fasta=False):
+        """Write the provided records to the given handle in GFF3 format."""
+        id_handler = _IdHandler()
+        self._write_header(out_handle)
+        fasta_recs = []
+        try:
+            recs = iter(recs)
+        except TypeError:
+            recs = [recs]
+        for rec in recs:
+            self._write_rec(rec, out_handle)
+            self._write_annotations(rec.annotations, rec.id, len(rec.seq), out_handle)
+            for sf in rec.features:
+                sf = self._clean_feature(sf)
+                id_handler = self._write_feature(sf, rec.id, out_handle, id_handler)
+            if include_fasta and len(rec.seq) > 0:
+                fasta_recs.append(rec)
+        if len(fasta_recs) > 0:
+            self._write_fasta(fasta_recs, out_handle)
+
+    def _clean_feature(self, feature):
+        quals = {}
+        for key, val in feature.qualifiers.items():
+            if not isinstance(val, (list, tuple)):
+                val = [val]
+            val = [str(x) for x in val]
+            quals[key] = val
+        feature.qualifiers = quals
+        # Support for Biopython 1.68 and above, which removed sub_features
+        if not hasattr(feature, "sub_features"):
+            feature.sub_features = []
+        clean_sub = [self._clean_feature(f) for f in feature.sub_features]
+        feature.sub_features = clean_sub
+        return feature
+
+    def _write_rec(self, rec, out_handle):
+        # if we have a SeqRecord, write out optional directive
+        if len(rec.seq) > 0:
+            out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq)))
+
+    def _get_phase(self, feature):
+        if "phase" in feature.qualifiers:
+            phase = feature.qualifiers["phase"][0]
+        elif feature.type == "CDS":
+            phase = int(feature.qualifiers.get("codon_start", [1])[0]) - 1
+        else:
+            phase = "."
+        return str(phase)
+
+    def _write_feature(self, feature, rec_id, out_handle, id_handler, parent_id=None):
+        """Write a feature with location information."""
+        if feature.location.strand == 1:
+            strand = "+"
+        elif feature.location.strand == -1:
+            strand = "-"
+        else:
+            strand = "."
+        # remove any standard features from the qualifiers
+        quals = feature.qualifiers.copy()
+        for std_qual in ["source", "score", "phase"]:
+            if std_qual in quals and len(quals[std_qual]) == 1:
+                del quals[std_qual]
+        # add a link to a parent identifier if it exists
+        if parent_id:
+            if "Parent" not in quals:
+                quals["Parent"] = []
+            quals["Parent"].append(parent_id)
+        quals = id_handler.update_quals(quals, len(feature.sub_features) > 0)
+        if feature.type:
+            ftype = feature.type
+        else:
+            ftype = "sequence_feature"
+        parts = [
+            str(rec_id),
+            feature.qualifiers.get("source", ["feature"])[0],
+            ftype,
+            str(feature.location.start + 1),  # 1-based indexing
+            str(feature.location.end),
+            feature.qualifiers.get("score", ["."])[0],
+            strand,
+            self._get_phase(feature),
+            self._format_keyvals(quals),
+        ]
+        out_handle.write("\t".join(parts) + "\n")
+        for sub_feature in feature.sub_features:
+            id_handler = self._write_feature(
+                sub_feature,
+                rec_id,
+                out_handle,
+                id_handler,
+                quals["ID"][0],
+            )
+        return id_handler
+
+    def _format_keyvals(self, keyvals):
+        format_kvs = []
+        for key in sorted(keyvals.keys()):
+            values = keyvals[key]
+            key = key.strip()
+            format_vals = []
+            if not isinstance(values, list) or isinstance(values, tuple):
+                values = [values]
+            for val in values:
+                val = urllib.parse.quote(str(val).strip(), safe=":/ ")
+                if (key and val) and val not in format_vals:
+                    format_vals.append(val)
+            format_kvs.append("%s=%s" % (key, ",".join(format_vals)))
+        return ";".join(format_kvs)
+
+    def _write_annotations(self, anns, rec_id, size, out_handle):
+        """Add annotations which refer to an entire sequence."""
+        format_anns = self._format_keyvals(anns)
+        if format_anns:
+            parts = [
+                rec_id,
+                "annotation",
+                "remark",
+                "1",
+                str(size if size > 1 else 1),
+                ".",
+                ".",
+                ".",
+                format_anns,
+            ]
+            out_handle.write("\t".join(parts) + "\n")
+
+    def _write_header(self, out_handle):
+        """Write out standard header directives."""
+        out_handle.write("##gff-version 3\n")
+
+    def _write_fasta(self, recs, out_handle):
+        """Write sequence records using the ##FASTA directive."""
+        out_handle.write("##FASTA\n")
+        SeqIO.write(recs, out_handle, "fasta")
+
+
+def write(recs, out_handle, include_fasta=False):
+    """High level interface to write GFF3 files from SeqRecords and SeqFeatures.
+
+    If include_fasta is True, the GFF3 file will include sequence information
+    using the ##FASTA directive.
+    """
+    writer = GFF3Writer()
+    return writer.write(recs, out_handle, include_fasta)
--- a/blastxml_to_gapped_gff3.py	Thu Jan 25 07:58:28 2024 +0000
+++ b/blastxml_to_gapped_gff3.py	Sun Jan 28 06:48:52 2024 +0000
@@ -32,7 +32,7 @@
 
         recid = record.query
         if " " in recid:
-            recid = recid[0: recid.index(" ")]
+            recid = recid[0 : recid.index(" ")]
 
         rec = SeqRecord(Seq("ACTG"), id=recid)
         for idx_hit, hit in enumerate(record.alignments):
@@ -72,7 +72,7 @@
                     qualifiers["blast_" + prop] = getattr(hsp, prop, None)
 
                 desc = hit.title.split(" >")[0]
-                qualifiers["description"] = desc[desc.index(" "):]
+                qualifiers["description"] = desc[desc.index(" ") :]
 
                 # This required a fair bit of sketching out/match to figure out
                 # the first time.
@@ -161,9 +161,9 @@
     fm = ""
     fs = ""
     for position in re.finditer("-", query):
-        fq += query[prev: position.start()]
-        fm += match[prev: position.start()]
-        fs += subject[prev: position.start()]
+        fq += query[prev : position.start()]
+        fm += match[prev : position.start()]
+        fs += subject[prev : position.start()]
         prev = position.start() + 1
     fq += query[prev:]
     fm += match[prev:]
@@ -290,7 +290,9 @@
         help="Trim blast hits to be only as long as the parent feature",
     )
     parser.add_argument(
-        "--trim_end", action="store_true", help="Cut blast results off at end of gene"
+        "--trim_end",
+        action="store_true",
+        help="Cut blast results off at end of gene",
     )
     parser.add_argument("--include_seq", action="store_true", help="Include sequence")
     args = parser.parse_args()
--- a/gff3_rebase.py	Thu Jan 25 07:58:28 2024 +0000
+++ b/gff3_rebase.py	Sun Jan 28 06:48:52 2024 +0000
@@ -65,7 +65,10 @@
 
         if hasattr(feature, "sub_features"):
             for x in feature_lambda(
-                feature.sub_features, test, test_kwargs, subfeatures=subfeatures
+                feature.sub_features,
+                test,
+                test_kwargs,
+                subfeatures=subfeatures,
             ):
                 yield x
 
@@ -197,7 +200,9 @@
         help="Child GFF3 annotations to rebase against parent",
     )
     parser.add_argument(
-        "--interpro", action="store_true", help="Interpro specific modifications"
+        "--interpro",
+        action="store_true",
+        help="Interpro specific modifications",
     )
     parser.add_argument(
         "--protein2dna",
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jb2_GFF/GFFOutput.py	Sun Jan 28 06:48:52 2024 +0000
@@ -0,0 +1,233 @@
+"""Output Biopython SeqRecords and SeqFeatures to GFF3 format.
+
+The target format is GFF3, the current GFF standard:
+    http://www.sequenceontology.org/gff3.shtml
+"""
+from six.moves import urllib
+
+from Bio import SeqIO
+
+
+class _IdHandler:
+    """Generate IDs for GFF3 Parent/Child
+    relationships where they don't exist."""
+
+    def __init__(self):
+        self._prefix = "biopygen"
+        self._counter = 1
+        self._seen_ids = []
+
+    def _generate_id(self, quals):
+        """Generate a unique ID not present in our existing IDs."""
+        gen_id = self._get_standard_id(quals)
+        if gen_id is None:
+            while 1:
+                gen_id = "%s%s" % (self._prefix, self._counter)
+                if gen_id not in self._seen_ids:
+                    break
+                self._counter += 1
+        return gen_id
+
+    def _get_standard_id(self, quals):
+        """Retrieve standardized IDs from other sources like NCBI GenBank.
+
+        This tries to find IDs from known key/values when stored differently
+        than GFF3 specifications.
+        """
+        possible_keys = ["transcript_id", "protein_id"]
+        for test_key in possible_keys:
+            if test_key in quals:
+                cur_id = quals[test_key]
+                if isinstance(cur_id, tuple) or isinstance(
+                    cur_id, list
+                ):
+                    return cur_id[0]
+                else:
+                    return cur_id
+        return None
+
+    def update_quals(self, quals, has_children):
+        """Update a set of qualifiers, adding an ID if necessary."""
+        cur_id = quals.get("ID", None)
+        # if we have an ID, record it
+        if cur_id:
+            if not isinstance(cur_id, list) and not isinstance(
+                cur_id, tuple
+            ):
+                cur_id = [cur_id]
+            for add_id in cur_id:
+                self._seen_ids.append(add_id)
+        # if we need one and don't have it, create a new one
+        elif has_children:
+            new_id = self._generate_id(quals)
+            self._seen_ids.append(new_id)
+            quals["ID"] = [new_id]
+        return quals
+
+
+class GFF3Writer:
+    """Write GFF3 files starting with standard Biopython objects."""
+
+    def __init__(self):
+        pass
+
+    def write(self, recs, out_handle, include_fasta=False):
+        """Write the provided records to the given handle in GFF3 format."""
+        id_handler = _IdHandler()
+        self._write_header(out_handle)
+        fasta_recs = []
+        try:
+            recs = iter(recs)
+        except TypeError:
+            recs = [recs]
+        for rec in recs:
+            self._write_rec(rec, out_handle)
+            self._write_annotations(
+                rec.annotations, rec.id, len(rec.seq), out_handle
+            )
+            for sf in rec.features:
+                sf = self._clean_feature(sf)
+                id_handler = self._write_feature(
+                    sf, rec.id, out_handle, id_handler
+                )
+            if include_fasta and len(rec.seq) > 0:
+                fasta_recs.append(rec)
+        if len(fasta_recs) > 0:
+            self._write_fasta(fasta_recs, out_handle)
+
+    def _clean_feature(self, feature):
+        quals = {}
+        for key, val in feature.qualifiers.items():
+            if not isinstance(val, (list, tuple)):
+                val = [val]
+            val = [str(x) for x in val]
+            quals[key] = val
+        feature.qualifiers = quals
+        # Support for Biopython 1.68 and above, which removed sub_features
+        if not hasattr(feature, "sub_features"):
+            feature.sub_features = []
+        clean_sub = [
+            self._clean_feature(f) for f in feature.sub_features
+        ]
+        feature.sub_features = clean_sub
+        return feature
+
+    def _write_rec(self, rec, out_handle):
+        # if we have a SeqRecord, write out optional directive
+        if len(rec.seq) > 0:
+            out_handle.write(
+                "##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq))
+            )
+
+    def _get_phase(self, feature):
+        if "phase" in feature.qualifiers:
+            phase = feature.qualifiers["phase"][0]
+        elif feature.type == "CDS":
+            phase = (
+                int(feature.qualifiers.get("codon_start", [1])[0]) - 1
+            )
+        else:
+            phase = "."
+        return str(phase)
+
+    def _write_feature(
+        self, feature, rec_id, out_handle, id_handler, parent_id=None
+    ):
+        """Write a feature with location information."""
+        if feature.location.strand == 1:
+            strand = "+"
+        elif feature.location.strand == -1:
+            strand = "-"
+        else:
+            strand = "."
+        # remove any standard features from the qualifiers
+        quals = feature.qualifiers.copy()
+        for std_qual in ["source", "score", "phase"]:
+            if std_qual in quals and len(quals[std_qual]) == 1:
+                del quals[std_qual]
+        # add a link to a parent identifier if it exists
+        if parent_id:
+            if "Parent" not in quals:
+                quals["Parent"] = []
+            quals["Parent"].append(parent_id)
+        quals = id_handler.update_quals(
+            quals, len(feature.sub_features) > 0
+        )
+        if feature.type:
+            ftype = feature.type
+        else:
+            ftype = "sequence_feature"
+        parts = [
+            str(rec_id),
+            feature.qualifiers.get("source", ["feature"])[0],
+            ftype,
+            str(feature.location.start + 1),  # 1-based indexing
+            str(feature.location.end),
+            feature.qualifiers.get("score", ["."])[0],
+            strand,
+            self._get_phase(feature),
+            self._format_keyvals(quals),
+        ]
+        out_handle.write("\t".join(parts) + "\n")
+        for sub_feature in feature.sub_features:
+            id_handler = self._write_feature(
+                sub_feature,
+                rec_id,
+                out_handle,
+                id_handler,
+                quals["ID"][0],
+            )
+        return id_handler
+
+    def _format_keyvals(self, keyvals):
+        format_kvs = []
+        for key in sorted(keyvals.keys()):
+            values = keyvals[key]
+            key = key.strip()
+            format_vals = []
+            if not isinstance(values, list) or isinstance(
+                values, tuple
+            ):
+                values = [values]
+            for val in values:
+                val = urllib.parse.quote(str(val).strip(), safe=":/ ")
+                if (key and val) and val not in format_vals:
+                    format_vals.append(val)
+            format_kvs.append("%s=%s" % (key, ",".join(format_vals)))
+        return ";".join(format_kvs)
+
+    def _write_annotations(self, anns, rec_id, size, out_handle):
+        """Add annotations which refer to an entire sequence."""
+        format_anns = self._format_keyvals(anns)
+        if format_anns:
+            parts = [
+                rec_id,
+                "annotation",
+                "remark",
+                "1",
+                str(size if size > 1 else 1),
+                ".",
+                ".",
+                ".",
+                format_anns,
+            ]
+            out_handle.write("\t".join(parts) + "\n")
+
+    def _write_header(self, out_handle):
+        """Write out standard header directives."""
+        out_handle.write("##gff-version 3\n")
+
+    def _write_fasta(self, recs, out_handle):
+        """Write sequence records using the ##FASTA directive."""
+        out_handle.write("##FASTA\n")
+        SeqIO.write(recs, out_handle, "fasta")
+
+
+def write(recs, out_handle, include_fasta=False):
+    """High level interface to write GFF3 files from SeqRecords and SeqFeatures.
+
+    If include_fasta is True, the GFF3 file will include sequence information
+    using the ##FASTA directive.
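+
+    A brief sketch (illustrative names) that also embeds the sequence via
+    include_fasta:
+
+        from Bio.Seq import Seq
+        from Bio.SeqRecord import SeqRecord
+
+        rec = SeqRecord(Seq("ACGTACGT"), id="contig1")
+        with open("out.gff3", "w") as out_handle:
+            write([rec], out_handle, include_fasta=True)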
+    """
+    writer = GFF3Writer()
+    return writer.write(recs, out_handle, include_fasta)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jb2_GFF/GFFParser.py	Sun Jan 28 06:48:52 2024 +0000
@@ -0,0 +1,1099 @@
+"""Parse GFF files into features attached to Biopython SeqRecord objects.
+
+This deals with GFF3 formatted files, a tab delimited format for storing
+sequence features and annotations:
+
+http://www.sequenceontology.org/gff3.shtml
+
+It will also deal with older GFF versions (GTF/GFF2):
+
+http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml
+http://mblab.wustl.edu/GTF22.html
+
+The implementation utilizes map/reduce parsing of GFF using Disco. Disco
+(http://discoproject.org) is a Map-Reduce framework for Python utilizing
+Erlang for parallelization. The code works on a single processor without
+Disco using the same architecture.
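+
+A typical usage sketch ("example.gff3" is a placeholder path; the bundled
+jb2_GFF package is assumed to be importable):
+
+    import jb2_GFF.GFFParser as GFF
+
+    with open("example.gff3") as in_handle:
+        for rec in GFF.parse(in_handle):
+            print(rec.id, len(rec.features))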
+"""
+import os
+import copy
+import json
+import re
+import collections
+import io
+import itertools
+import warnings
+import six
+from six.moves import urllib
+
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqFeature
+from Bio import SeqIO
+from Bio import BiopythonDeprecationWarning
+
+
+# Make defaultdict compatible with versions of python older than 2.4
+try:
+    collections.defaultdict
+except AttributeError:
+    import _utils
+
+    collections.defaultdict = _utils.defaultdict
+
+unknown_seq_avail = False
+try:
+    from Bio.Seq import UnknownSeq
+
+    unknown_seq_avail = True
+except ImportError:
+    # UnknownSeq was removed starting with Biopython 1.81
+    from Bio.Seq import _UndefinedSequenceData
+    from Bio.Seq import Seq
+
+
+warnings.simplefilter("ignore", BiopythonDeprecationWarning)
+
+
+def _gff_line_map(line, params):
+    """Map part of Map-Reduce; parses a line of GFF into a dictionary.
+
+    Given an input line from a GFF file, this:
+    - decides if the file passes our filtering limits
+    - if so:
+        - breaks it into component elements
+        - determines the type of attribute (flat, parent, child or annotation)
+        - generates a dictionary of GFF info which can be serialized as JSON
+    """
+
+    def _merge_keyvals(parts):
+        """Merge key-values escaped by quotes
+        that are improperly split at semicolons."""
+        out = []
+        for i, p in enumerate(parts):
+            if (
+                i > 0
+                and len(p) == 1
+                and p[0].endswith('"')
+                and not p[0].startswith('"')
+            ):
+                if out[-1][-1].startswith('"'):
+                    prev_p = out.pop(-1)
+                    to_merge = prev_p[-1]
+                    prev_p[-1] = "%s; %s" % (to_merge, p[0])
+                    out.append(prev_p)
+            else:
+                out.append(p)
+        return out
+
+    gff3_kw_pat = re.compile(r"\w+=")
+
+    def _split_keyvals(keyval_str):
+        """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
+
+        GFF3 has key value pairs like:
+          count=9;gene=amx-2;sequence=SAGE:aacggagccg
+        GFF2 and GTF have:
+          Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
+          name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
+        """
+        quals = collections.defaultdict(list)
+        if keyval_str is None:
+            return quals
+        # ensembl GTF has a stray semi-colon at the end
+        if keyval_str[-1] == ";":
+            keyval_str = keyval_str[:-1]
+        # GFF2/GTF has a semi-colon with at least one space after it.
+        # It can have spaces on both sides; wormbase does this.
+        # GFF3 works with no spaces.
+        # Split at the first one we can recognize as working
+        parts = keyval_str.split(" ; ")
+        if len(parts) == 1:
+            parts = [x.strip() for x in keyval_str.split(";")]
+        # check if we have GFF3 style key-vals (with =)
+        is_gff2 = True
+        if gff3_kw_pat.match(parts[0]):
+            is_gff2 = False
+            key_vals = _merge_keyvals([p.split("=") for p in parts])
+        # otherwise, we are separated by a space with a key as the first item
+        else:
+            pieces = []
+            for p in parts:
+                # fix misplaced semi-colons in keys in some GFF2 files
+                if p and p[0] == ";":
+                    p = p[1:]
+                pieces.append(p.strip().split(" "))
+            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
+        for item in key_vals:
+            # standard in-spec items are key=value
+            if len(item) == 2:
+                key, val = item
+            # out-of-spec files can have just key values. We set an empty value
+            # which will be changed to true later to standardize.
+            else:
+                assert len(item) == 1, item
+                key = item[0]
+                val = ""
+            # remove quotes in GFF2 files
+            quoted = False
+            if len(val) > 0 and val[0] == '"' and val[-1] == '"':
+                quoted = True
+                val = val[1:-1]
+            if val:
+                if quoted:
+                    quals[key].append(val)
+                else:
+                    quals[key].extend(
+                        [v for v in val.split(",") if v]
+                    )
+            # if we don't have a value, make this a key=True/False style
+            # attribute
+            else:
+                quals[key].append("true")
+        for key, vals in quals.items():
+            quals[key] = [urllib.parse.unquote(v) for v in vals]
+        return quals, is_gff2
+
+    def _nest_gff2_features(gff_parts):
+        """Provide nesting of GFF2 transcript parts with transcript IDs.
+
+        exons and coding sequences are mapped to a parent with a transcript_id
+        in GFF2. This is implemented differently at different genome centers
+        and this function attempts to resolve that and map things to the GFF3
+        way of doing them.
+        """
+        # map protein or transcript ids to a parent
+        for transcript_id in [
+            "transcript_id",
+            "transcriptId",
+            "proteinId",
+        ]:
+            try:
+                gff_parts["quals"]["Parent"] = gff_parts["quals"][
+                    transcript_id
+                ]
+                break
+            except KeyError:
+                pass
+        # case for WormBase GFF -- everything labelled as Transcript or CDS
+        for flat_name in ["Transcript", "CDS"]:
+            if flat_name in gff_parts["quals"]:
+                # parent types
+                if gff_parts["type"] in [flat_name]:
+                    if not gff_parts["id"]:
+                        gff_parts["id"] = gff_parts["quals"][
+                            flat_name
+                        ][0]
+                        gff_parts["quals"]["ID"] = [gff_parts["id"]]
+                # children types
+                elif gff_parts["type"] in [
+                    "intron",
+                    "exon",
+                    "three_prime_UTR",
+                    "coding_exon",
+                    "five_prime_UTR",
+                    "CDS",
+                    "stop_codon",
+                    "start_codon",
+                ]:
+                    gff_parts["quals"]["Parent"] = gff_parts["quals"][
+                        flat_name
+                    ]
+                break
+
+        return gff_parts
+
+    strand_map = {"+": 1, "-": -1, "?": None, None: None}
+    line = line.strip()
+    if line[:2] == "##":
+        return [("directive", line[2:])]
+    elif line and line[0] != "#":
+        parts = line.split("\t")
+        should_do = True
+        if params.limit_info:
+            for limit_name, limit_values in params.limit_info.items():
+                cur_id = tuple(
+                    [parts[i] for i in params.filter_info[limit_name]]
+                )
+                if cur_id not in limit_values:
+                    should_do = False
+                    break
+        if should_do:
+            assert len(parts) >= 8, line
+            # not python2.4 compatible but easier to understand
+            # gff_parts = [(None if p == '.' else p) for p in parts]
+            gff_parts = []
+            for p in parts:
+                if p == ".":
+                    gff_parts.append(None)
+                else:
+                    gff_parts.append(p)
+            gff_info = dict()
+            # collect all of the base qualifiers for this item
+            if len(parts) > 8:
+                quals, is_gff2 = _split_keyvals(gff_parts[8])
+            else:
+                quals, is_gff2 = collections.defaultdict(list), False
+            gff_info["is_gff2"] = is_gff2
+            if gff_parts[1]:
+                quals["source"].append(gff_parts[1])
+            if gff_parts[5]:
+                quals["score"].append(gff_parts[5])
+            if gff_parts[7]:
+                quals["phase"].append(gff_parts[7])
+            gff_info["quals"] = dict(quals)
+            gff_info["rec_id"] = gff_parts[0]
+            # if we are describing a location, then we are a feature
+            if gff_parts[3] and gff_parts[4]:
+                gff_info["location"] = [
+                    int(gff_parts[3]) - 1,
+                    int(gff_parts[4]),
+                ]
+                gff_info["type"] = gff_parts[2]
+                gff_info["id"] = quals.get("ID", [""])[0]
+                gff_info["strand"] = strand_map.get(
+                    gff_parts[6], None
+                )
+                if is_gff2:
+                    gff_info = _nest_gff2_features(gff_info)
+                # features that have parents need to link so we can pick up
+                # the relationship
+                if "Parent" in gff_info["quals"]:
+                    # check for self referential parent/child relationships
+                    # remove the ID, which is not useful
+                    for p in gff_info["quals"]["Parent"]:
+                        if p == gff_info["id"]:
+                            gff_info["id"] = ""
+                            del gff_info["quals"]["ID"]
+                            break
+                    final_key = "child"
+                elif gff_info["id"]:
+                    final_key = "parent"
+                # Handle flat features
+                else:
+                    final_key = "feature"
+            # otherwise, associate these annotations with the full record
+            else:
+                final_key = "annotation"
+            if params.jsonify:
+                return [(final_key, json.dumps(gff_info))]
+            else:
+                return [(final_key, gff_info)]
+    return []
+
+
+def _gff_line_reduce(map_results, out, params):
+    """Reduce part of Map-Reduce; combines results of parsed features."""
+    final_items = dict()
+    for gff_type, final_val in map_results:
+        if params.jsonify and gff_type not in ["directive"]:
+            final_val = json.loads(final_val)
+        try:
+            final_items[gff_type].append(final_val)
+        except KeyError:
+            final_items[gff_type] = [final_val]
+    for key, vals in final_items.items():
+        if params.jsonify:
+            vals = json.dumps(vals)
+        out.add(key, vals)
+
+
+class _MultiIDRemapper:
+    """Provide an ID remapping for cases where a parent has a non-unique ID.
+
+    Real life GFF3 cases have non-unique ID attributes, which we fix here
+    by using the unique sequence region to assign children to the right
+    parent.
+    """
+
+    def __init__(self, base_id, all_parents):
+        self._base_id = base_id
+        self._parents = all_parents
+
+    def remap_id(self, feature_dict):
+        rstart, rend = feature_dict["location"]
+        for index, parent in enumerate(self._parents):
+            pstart, pend = parent["location"]
+            if rstart >= pstart and rend <= pend:
+                if index > 0:
+                    return "%s_%s" % (self._base_id, index + 1)
+                else:
+                    return self._base_id
+        # if we haven't found a location match but parents are unambiguous,
+        # return that
+        if len(self._parents) == 1:
+            return self._base_id
+        raise ValueError(
+            "Did not find remapped ID location: %s, %s, %s"
+            % (
+                self._base_id,
+                [p["location"] for p in self._parents],
+                feature_dict["location"],
+            )
+        )
+
+
+class _AbstractMapReduceGFF:
+    """Base class providing general GFF parsing for local and remote classes.
+
+    This class should be subclassed to provide a concrete class to parse
+    GFF under specific conditions. These classes need to implement
+    the _gff_process function, which returns a dictionary of SeqRecord
+    information.
+    """
+
+    def __init__(self, create_missing=True):
+        """Initialize GFF parser
+
+        create_missing - If True, create blank records for GFF ids not in
+        the base_dict. If False, an error will be raised.
+        """
+        self._create_missing = create_missing
+        self._map_fn = _gff_line_map
+        self._reduce_fn = _gff_line_reduce
+        self._examiner = GFFExaminer()
+
+    def _gff_process(self, gff_files, limit_info, target_lines=None):
+        raise NotImplementedError("Derived class must define")
+
+    def parse(self, gff_files, base_dict=None, limit_info=None):
+        """Parse a GFF file, returning an iterator of SeqRecords.
+
+        limit_info - A dictionary specifying the regions of the GFF file
+        which should be extracted. This allows only relevant portions of a file
+        to be parsed.
+
+        base_dict - A base dictionary of SeqRecord objects which may be
+        pre-populated with sequences and other features. The new features from
+        the GFF file will be added to this dictionary.
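+
+        A sketch of a limit_info dictionary (the sequence names and file
+        name are placeholders); valid keys are those in
+        GFFExaminer._filter_info, for example:
+
+            limit_info = dict(gff_id=["chr1", "chr2"])
+            for rec in GFFParser().parse("example.gff3", limit_info=limit_info):
+                print(rec.id)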
+        """
+        for rec in self.parse_in_parts(
+            gff_files, base_dict, limit_info
+        ):
+            yield rec
+
+    def parse_in_parts(
+        self,
+        gff_files,
+        base_dict=None,
+        limit_info=None,
+        target_lines=None,
+    ):
+        """Parse a region of a GFF file specified, returning info as generated.
+
+        target_lines -- The number of lines in the file which should be used
+        for each partial parse. This should be determined based on available
+        memory.
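+
+        A sketch (the file name is a placeholder) parsing roughly 1000
+        lines at a time to bound memory use:
+
+            parser = GFFParser()
+            for rec in parser.parse_in_parts("big.gff3", target_lines=1000):
+                print(rec.id)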
+        """
+        for results in self.parse_simple(
+            gff_files, limit_info, target_lines
+        ):
+            if base_dict is None:
+                cur_dict = dict()
+            else:
+                cur_dict = copy.deepcopy(base_dict)
+            cur_dict = self._results_to_features(cur_dict, results)
+            all_ids = list(cur_dict.keys())
+            all_ids.sort()
+            for cur_id in all_ids:
+                yield cur_dict[cur_id]
+
+    def parse_simple(
+        self, gff_files, limit_info=None, target_lines=1
+    ):
+        """Simple parse which does not build or nest features.
+
+        This returns a simple dictionary representation of each line in the
+        GFF file.
+        """
+        # gracefully handle a single file passed
+        if not isinstance(gff_files, (list, tuple)):
+            gff_files = [gff_files]
+        limit_info = self._normalize_limit_info(limit_info)
+        for results in self._gff_process(
+            gff_files, limit_info, target_lines
+        ):
+            yield results
+
+    def _normalize_limit_info(self, limit_info):
+        """Turn all limit information into tuples for identical comparisons."""
+        final_limit_info = {}
+        if limit_info:
+            for key, values in limit_info.items():
+                final_limit_info[key] = []
+                for v in values:
+                    if isinstance(v, str):
+                        final_limit_info[key].append((v,))
+                    else:
+                        final_limit_info[key].append(tuple(v))
+        return final_limit_info
+
+    def _results_to_features(self, base, results):
+        """Add parsed dictionaries of results to Biopython SeqFeatures."""
+        base = self._add_annotations(
+            base, results.get("annotation", [])
+        )
+        for feature in results.get("feature", []):
+            (_, base) = self._add_toplevel_feature(base, feature)
+        base = self._add_parent_child_features(
+            base, results.get("parent", []), results.get("child", [])
+        )
+        base = self._add_seqs(base, results.get("fasta", []))
+        base = self._add_directives(
+            base, results.get("directive", [])
+        )
+        return base
+
+    def _add_directives(self, base, directives):
+        """Handle any directives or meta-data in the GFF file.
+
+        Relevant items are added as annotation meta-data to each record.
+        """
+        dir_keyvals = collections.defaultdict(list)
+        for directive in directives:
+            parts = directive.split()
+            if len(parts) > 1:
+                key = parts[0]
+                if len(parts) == 2:
+                    val = parts[1]
+                else:
+                    val = tuple(parts[1:])
+                # specific directives that need special handling
+                if (
+                    key == "sequence-region"
+                ):  # convert to Python 0-based coordinates
+                    if len(val) == 2:  # handle regions missing contig
+                        val = (int(val[0]) - 1, int(val[1]))
+                    elif len(val) == 3:
+                        val = (val[0], int(val[1]) - 1, int(val[2]))
+                dir_keyvals[key].append(val)
+        for key, vals in dir_keyvals.items():
+            for rec in base.values():
+                self._add_ann_to_rec(rec, key, vals)
+        return base
+
+    def _get_matching_record_id(self, base, find_id):
+        """Find a matching base record with the test identifier, handling tricky cases.
+
+        NCBI IDs https://en.wikipedia.org/wiki/FASTA_format#NCBI_identifiers
+        """
+        # Straight matches for identifiers
+        if find_id in base:
+            return find_id
+        # NCBI style IDs in find_id
+        elif find_id and find_id.find("|") > 0:
+            for test_id in [
+                x.strip() for x in find_id.split("|")[1:]
+            ]:
+                if test_id and test_id in base:
+                    return test_id
+        # NCBI style IDs in base IDs
+        else:
+            for base_id in base.keys():
+                if base_id.find("|") > 0:
+                    for test_id in [
+                        x.strip() for x in base_id.split("|")[1:]
+                    ]:
+                        if test_id and test_id == find_id:
+                            return base_id
+        return None
+
+    def _add_seqs(self, base, recs):
+        """Add sequence information contained in the GFF3 to records."""
+        for rec in recs:
+            match_id = self._get_matching_record_id(base, rec.id)
+            if match_id:
+                base[match_id].seq = rec.seq
+            else:
+                base[rec.id] = rec
+        return base
+
+    def _add_parent_child_features(self, base, parents, children):
+        """Add nested features with parent child relationships."""
+        multi_remap = self._identify_dup_ids(parents)
+        # add children features
+        children_prep = collections.defaultdict(list)
+        for child_dict in children:
+            child_feature = self._get_feature(child_dict)
+            for pindex, pid in enumerate(
+                child_feature.qualifiers["Parent"]
+            ):
+                if pid in multi_remap:
+                    pid = multi_remap[pid].remap_id(child_dict)
+                    child_feature.qualifiers["Parent"][pindex] = pid
+                children_prep[pid].append(
+                    (child_dict["rec_id"], child_feature)
+                )
+        children = dict(children_prep)
+        # add children to parents that exist
+        for cur_parent_dict in parents:
+            cur_id = cur_parent_dict["id"]
+            if cur_id in multi_remap:
+                cur_parent_dict["id"] = multi_remap[cur_id].remap_id(
+                    cur_parent_dict
+                )
+            cur_parent, base = self._add_toplevel_feature(
+                base, cur_parent_dict
+            )
+            cur_parent, children = self._add_children_to_parent(
+                cur_parent, children
+            )
+        # create parents for children without them (GFF2 or split/bad files)
+        while len(children) > 0:
+            parent_id, cur_children = next(
+                itertools.islice(children.items(), 1)
+            )
+            # one child, do not nest it
+            if len(cur_children) == 1:
+                rec_id, child = cur_children[0]
+                loc = (child.location.start, child.location.end)
+                rec, base = self._get_rec(
+                    base, dict(rec_id=rec_id, location=loc)
+                )
+                rec.features.append(child)
+                del children[parent_id]
+            else:
+                cur_parent, base = self._add_missing_parent(
+                    base, parent_id, cur_children
+                )
+                cur_parent, children = self._add_children_to_parent(
+                    cur_parent, children
+                )
+        return base
+
+    def _identify_dup_ids(self, parents):
+        """Identify duplicated ID attributes in potential nested parents.
+
+        According to the GFF3 spec ID attributes are supposed to be unique
+        for a file, but this is not always true in practice. This looks
+        for duplicates, and provides unique IDs sorted by locations.
+        """
+        multi_ids = collections.defaultdict(list)
+        for parent in parents:
+            multi_ids[parent["id"]].append(parent)
+        multi_ids = [
+            (mid, ps)
+            for (mid, ps) in multi_ids.items()
+            if len(ps) > 1
+        ]
+        multi_remap = dict()
+        for mid, parents in multi_ids:
+            multi_remap[mid] = _MultiIDRemapper(mid, parents)
+        return multi_remap
+
+    def _add_children_to_parent(self, cur_parent, children):
+        """Recursively add children to parent features."""
+        if cur_parent.id in children:
+            cur_children = children[cur_parent.id]
+            ready_children = []
+            for _, cur_child in cur_children:
+                cur_child, _ = self._add_children_to_parent(
+                    cur_child, children
+                )
+                ready_children.append(cur_child)
+            # Support both Biopython 1.62+ (CompoundLocation) and
+            # pre-1.62 (location_operator) ways of joining sub-features
+            if not hasattr(SeqFeature, "CompoundLocation"):
+                cur_parent.location_operator = "join"
+            for cur_child in ready_children:
+                cur_parent.sub_features.append(cur_child)
+            del children[cur_parent.id]
+        return cur_parent, children
+
+    def _add_annotations(self, base, anns):
+        """Add annotation data from the GFF file to records."""
+        # add these as a list of annotations, checking not to overwrite
+        # current values
+        for ann in anns:
+            rec, base = self._get_rec(base, ann)
+            for key, vals in ann["quals"].items():
+                self._add_ann_to_rec(rec, key, vals)
+        return base
+
+    def _add_ann_to_rec(self, rec, key, vals):
+        """Add a key/value annotation to the given SeqRecord."""
+        if key in rec.annotations:
+            try:
+                rec.annotations[key].extend(vals)
+            except AttributeError:
+                rec.annotations[key] = [rec.annotations[key]] + vals
+        else:
+            rec.annotations[key] = vals
+
+    def _get_rec(self, base, info_dict):
+        """Retrieve a record to add features to."""
+        max_loc = info_dict.get("location", (0, 1))[1]
+        match_id = self._get_matching_record_id(
+            base, info_dict["rec_id"]
+        )
+        if match_id:
+            cur_rec = base[match_id]
+            # update generated unknown sequences
+            if unknown_seq_avail and isinstance(
+                cur_rec.seq, UnknownSeq
+            ):
+                cur_rec.seq._length = max(
+                    [max_loc, cur_rec.seq._length]
+                )
+            elif not unknown_seq_avail and isinstance(
+                cur_rec.seq._data, _UndefinedSequenceData
+            ):
+                cur_rec.seq._data._length = max(
+                    [max_loc, cur_rec.seq._data._length]
+                )
+            return cur_rec, base
+        elif self._create_missing:
+            if unknown_seq_avail:
+                new_rec = SeqRecord(
+                    UnknownSeq(max_loc), info_dict["rec_id"]
+                )
+            else:
+                new_rec = SeqRecord(
+                    Seq(None, length=max_loc), info_dict["rec_id"]
+                )
+            base[info_dict["rec_id"]] = new_rec
+            return new_rec, base
+        else:
+            raise KeyError(
+                "Did not find matching record in %s for %s"
+                % (base.keys(), info_dict)
+            )
+
+    def _add_missing_parent(self, base, parent_id, cur_children):
+        """Add a new feature that is missing from the GFF file."""
+        base_rec_id = list(set(c[0] for c in cur_children))
+        child_strands = list(
+            set(c[1].location.strand for c in cur_children)
+        )
+        inferred_strand = (
+            child_strands[0] if len(child_strands) == 1 else None
+        )
+        assert len(base_rec_id) > 0
+        feature_dict = dict(
+            id=parent_id,
+            strand=inferred_strand,
+            type="inferred_parent",
+            quals=dict(ID=[parent_id]),
+            rec_id=base_rec_id[0],
+        )
+        coords = [
+            (c.location.start, c.location.end)
+            for r, c in cur_children
+        ]
+        feature_dict["location"] = (
+            min([c[0] for c in coords]),
+            max([c[1] for c in coords]),
+        )
+        return self._add_toplevel_feature(base, feature_dict)
+
+    def _add_toplevel_feature(self, base, feature_dict):
+        """Add a toplevel non-nested feature to the appropriate record."""
+        new_feature = self._get_feature(feature_dict)
+        rec, base = self._get_rec(base, feature_dict)
+        rec.features.append(new_feature)
+        return new_feature, base
+
+    def _get_feature(self, feature_dict):
+        """Retrieve a Biopython feature from our dictionary representation."""
+        # location = SeqFeature.FeatureLocation(*feature_dict['location'])
+        rstart, rend = feature_dict["location"]
+        new_feature = SeqFeature.SeqFeature(
+            SeqFeature.SimpleLocation(
+                start=rstart, end=rend, strand=feature_dict["strand"]
+            ),
+            feature_dict["type"],
+            id=feature_dict["id"],
+        )
+        # Support for Biopython 1.68 and above, which removed sub_features
+        if not hasattr(new_feature, "sub_features"):
+            new_feature.sub_features = []
+        new_feature.qualifiers = feature_dict["quals"]
+        return new_feature
+
+    def _parse_fasta(self, in_handle):
+        """Parse FASTA sequence information contained in the GFF3 file."""
+        return list(SeqIO.parse(in_handle, "fasta"))
+
+
+class _GFFParserLocalOut:
+    """Provide a collector for local GFF MapReduce file parsing."""
+
+    def __init__(self, smart_breaks=False):
+        self._items = dict()
+        self._smart_breaks = smart_breaks
+        self._missing_keys = collections.defaultdict(int)
+        self._last_parent = None
+        self.can_break = True
+        self.num_lines = 0
+
+    def add(self, key, vals):
+        if self._smart_breaks:
+            # if we are not GFF2 we expect parents and break
+            # based on not having missing ones
+            if key == "directive":
+                if vals[0] == "#":
+                    self.can_break = True
+                self._last_parent = None
+            elif not vals[0].get("is_gff2", False):
+                self._update_missing_parents(key, vals)
+                self.can_break = len(self._missing_keys) == 0
+            # break when we are done with stretches of child features
+            elif key != "child":
+                self.can_break = True
+                self._last_parent = None
+            # break when we have lots of child features in a row
+            # and change between parents
+            else:
+                cur_parent = vals[0]["quals"]["Parent"][0]
+                if self._last_parent:
+                    self.can_break = cur_parent != self._last_parent
+                self._last_parent = cur_parent
+        self.num_lines += 1
+        try:
+            self._items[key].extend(vals)
+        except KeyError:
+            self._items[key] = vals
+
+    def _update_missing_parents(self, key, vals):
+        # smart way of deciding if we can break this.
+        # if this is too much, can go back to not breaking in the
+        # middle of children
+        if key in ["child"]:
+            for val in vals:
+                for p_id in val["quals"]["Parent"]:
+                    self._missing_keys[p_id] += 1
+        for val in vals:
+            try:
+                del self._missing_keys[val["quals"]["ID"][0]]
+            except KeyError:
+                pass
+
+    def has_items(self):
+        return len(self._items) > 0
+
+    def get_results(self):
+        self._last_parent = None
+        return self._items
+
+
+class GFFParser(_AbstractMapReduceGFF):
+    """Local GFF parser providing standardized parsing of GFF files."""
+
+    def __init__(self, line_adjust_fn=None, create_missing=True):
+        _AbstractMapReduceGFF.__init__(
+            self, create_missing=create_missing
+        )
+        self._line_adjust_fn = line_adjust_fn
+
+    def _gff_process(self, gff_files, limit_info, target_lines):
+        """Process GFF addition without any parallelization.
+
+        In addition to limit filtering, this accepts a target_lines attribute
+        which provides a number of lines to parse before returning results.
+        This allows partial parsing of a file to prevent memory issues.
+        """
+        line_gen = self._file_line_generator(gff_files)
+        for out in self._lines_to_out_info(
+            line_gen, limit_info, target_lines
+        ):
+            yield out
+
+    def _file_line_generator(self, gff_files):
+        """Generate single lines from a set of GFF files."""
+        for gff_file in gff_files:
+            if hasattr(gff_file, "read"):
+                need_close = False
+                in_handle = gff_file
+            else:
+                need_close = True
+                in_handle = open(gff_file)
+            while 1:
+                line = in_handle.readline()
+                if not line:
+                    break
+                yield line
+            if need_close:
+                in_handle.close()
+
+    def _lines_to_out_info(
+        self, line_iter, limit_info=None, target_lines=None
+    ):
+        """Generate SeqRecord and SeqFeatures from GFF file lines."""
+        params = self._examiner._get_local_params(limit_info)
+        out_info = _GFFParserLocalOut(
+            (target_lines is not None and target_lines > 1)
+        )
+        found_seqs = False
+        for line in line_iter:
+            results = self._map_fn(line, params)
+            if self._line_adjust_fn and results:
+                if results[0][0] not in ["directive"]:
+                    results = [
+                        (
+                            results[0][0],
+                            self._line_adjust_fn(results[0][1]),
+                        )
+                    ]
+            self._reduce_fn(results, out_info, params)
+            if (
+                target_lines
+                and out_info.num_lines >= target_lines
+                and out_info.can_break
+            ):
+                yield out_info.get_results()
+                out_info = _GFFParserLocalOut(
+                    (target_lines is not None and target_lines > 1)
+                )
+            if (
+                results
+                and results[0][0] == "directive"
+                and results[0][1] == "FASTA"
+            ):
+                found_seqs = True
+                break
+
+        class FakeHandle:
+            def __init__(self, line_iter):
+                self._iter = line_iter
+
+            def __iter__(self):
+                return self
+
+            def __next__(self):
+                return next(self._iter)
+
+            next = __next__
+
+            def read(self, size=-1):
+                if size < 0:
+                    return "".join(x for x in self._iter)
+                elif size == 0:
+                    return ""  # Used by Biopython to sniff unicode vs bytes
+                else:
+                    raise NotImplementedError
+
+            def readline(self):
+                try:
+                    return next(self._iter)
+                except StopIteration:
+                    return ""
+
+        if found_seqs:
+            fasta_recs = self._parse_fasta(FakeHandle(line_iter))
+            out_info.add("fasta", fasta_recs)
+        if out_info.has_items():
+            yield out_info.get_results()
+
+
+class DiscoGFFParser(_AbstractMapReduceGFF):
+    """GFF Parser with parallelization  (http://discoproject.org."""
+
+    def __init__(self, disco_host, create_missing=True):
+        """Initialize parser.
+
+        disco_host - Web reference to a Disco host which will be used for
+        parallelizing the GFF reading job.
+        """
+        _AbstractMapReduceGFF.__init__(
+            self, create_missing=create_missing
+        )
+        self._disco_host = disco_host
+
+    def _gff_process(self, gff_files, limit_info, target_lines=None):
+        """Process GFF addition, using Disco to parallelize the process."""
+        assert target_lines is None, "Cannot split parallelized jobs"
+        # make these imports local; only need them when using disco
+        import disco
+        # absolute path names unless they are special disco files
+        full_files = []
+        for f in gff_files:
+            if f.split(":")[0] != "disco":
+                full_files.append(os.path.abspath(f))
+            else:
+                full_files.append(f)
+        results = disco.job(
+            self._disco_host,
+            name="gff_reader",
+            input=full_files,
+            params=disco.Params(
+                limit_info=limit_info,
+                jsonify=True,
+                filter_info=self._examiner._filter_info,
+            ),
+            required_modules=["json", "collections", "re"],
+            map=self._map_fn,
+            reduce=self._reduce_fn,
+        )
+        processed = dict()
+        for out_key, out_val in disco.result_iterator(results):
+            processed[out_key] = json.loads(out_val)
+        yield processed
+
+
+def parse(
+    gff_files, base_dict=None, limit_info=None, target_lines=None
+):
+    """parse GFF files into SeqRecords and SeqFeatures."""
+    parser = GFFParser()
+    for rec in parser.parse_in_parts(
+        gff_files, base_dict, limit_info, target_lines
+    ):
+        yield rec
+
+
+def parse_simple(gff_files, limit_info=None):
+    """Parse GFF files as line by line dictionary of parts."""
+    parser = GFFParser()
+    for rec in parser.parse_simple(gff_files, limit_info=limit_info):
+        if "child" in rec:
+            assert "parent" not in rec
+            yield rec["child"][0]
+        elif "parent" in rec:
+            yield rec["parent"][0]
+        elif "feature" in rec:
+            yield rec["feature"][0]
+        # ignore directive lines
+        else:
+            assert "directive" in rec
+
+
+def _file_or_handle(fn):
+    """Decorator to handle either an input handle or a file."""
+
+    def _file_or_handle_inside(*args, **kwargs):
+        in_file = args[1]
+        if hasattr(in_file, "read"):
+            need_close = False
+            in_handle = in_file
+            if six.PY3 and not isinstance(in_handle, io.TextIOBase):
+                raise TypeError(
+                    "input handle must be opened in text mode"
+                )
+        else:
+            need_close = True
+            in_handle = open(in_file)
+        args = (args[0], in_handle) + args[2:]
+        out = fn(*args, **kwargs)
+        if need_close:
+            in_handle.close()
+        return out
+
+    return _file_or_handle_inside
+
+
+class GFFExaminer:
+    """Provide high level details about a GFF file to refine parsing.
+
+    GFF is a spec and is provided by many different centers. Real life files
+    will present the same information in slightly different ways. Becoming
+    familiar with the file you are dealing with is the best way to extract the
+    information you need. This class provides high level summary details to
+    help in learning.
+    """
+
+    def __init__(self):
+        self._filter_info = dict(
+            gff_id=[0],
+            gff_source_type=[1, 2],
+            gff_source=[1],
+            gff_type=[2],
+        )
+
+    def _get_local_params(self, limit_info=None):
+        class _LocalParams:
+            def __init__(self):
+                self.jsonify = False
+
+        params = _LocalParams()
+        params.limit_info = limit_info
+        params.filter_info = self._filter_info
+        return params
+
+    @_file_or_handle
+    def available_limits(self, gff_handle):
+        """Return dictionary information on possible limits for this file.
+
+        This returns a nested dictionary with the following structure:
+
+        keys -- names of items to filter by
+        values -- dictionary with:
+            keys -- filter choice
+            value -- counts of that filter in this file
+
+        Not a parallelized map-reduce implementation.
+        """
+        cur_limits = dict()
+        for filter_key in self._filter_info.keys():
+            cur_limits[filter_key] = collections.defaultdict(int)
+        for line in gff_handle:
+            # when we hit FASTA sequences, we are done with annotations
+            if line.startswith("##FASTA"):
+                break
+            # ignore empty and comment lines
+            if line.strip() and line.strip()[0] != "#":
+                parts = [p.strip() for p in line.split("\t")]
+                assert len(parts) >= 8, line
+                parts = parts[:9]
+                for (
+                    filter_key,
+                    cur_indexes,
+                ) in self._filter_info.items():
+                    cur_id = tuple([parts[i] for i in cur_indexes])
+                    cur_limits[filter_key][cur_id] += 1
+        # get rid of the default dicts
+        final_dict = dict()
+        for key, value_dict in cur_limits.items():
+            if len(key) == 1:
+                key = key[0]
+            final_dict[key] = dict(value_dict)
+        gff_handle.close()
+        return final_dict
+
+    @_file_or_handle
+    def parent_child_map(self, gff_handle):
+        """Provide a mapping of parent to child relationships in the file.
+
+        Returns a dictionary of parent child relationships:
+
+        keys -- tuple of (source, type) for each parent
+        values -- tuple of (source, type) as children of that parent
+
+        Not a parallelized map-reduce implementation.
+        """
+        # collect all of the parent and child types mapped to IDs
+        parent_sts = dict()
+        child_sts = collections.defaultdict(list)
+        for line in gff_handle:
+            # when we hit FASTA sequences, we are done with annotations
+            if line.startswith("##FASTA"):
+                break
+            if line.strip() and not line.startswith("#"):
+                line_type, line_info = _gff_line_map(
+                    line, self._get_local_params()
+                )[0]
+                if line_type == "parent" or (
+                    line_type == "child" and line_info["id"]
+                ):
+                    parent_sts[line_info["id"]] = (
+                        line_info["quals"].get("source", [""])[0],
+                        line_info["type"],
+                    )
+                if line_type == "child":
+                    for parent_id in line_info["quals"]["Parent"]:
+                        child_sts[parent_id].append(
+                            (
+                                line_info["quals"].get(
+                                    "source", [""]
+                                )[0],
+                                line_info["type"],
+                            )
+                        )
+        # print parent_sts, child_sts
+        # generate a dictionary of the unique final type relationships
+        pc_map = collections.defaultdict(list)
+        for parent_id, parent_type in parent_sts.items():
+            for child_type in child_sts[parent_id]:
+                pc_map[parent_type].append(child_type)
+        pc_final_map = dict()
+        for ptype, ctypes in pc_map.items():
+            unique_ctypes = list(set(ctypes))
+            unique_ctypes.sort()
+            pc_final_map[ptype] = unique_ctypes
+        return pc_final_map
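+
+
+if __name__ == "__main__":
+    # Usage sketch only: assumes a plain text-mode GFF3 file whose path is
+    # supplied on the command line; nothing here is required by the classes
+    # above.
+    import pprint
+    import sys
+
+    gff_file = sys.argv[1]
+    examiner = GFFExaminer()
+    # counts of the values seen for each filter key (gff_id, gff_source,
+    # gff_type, gff_source_type); useful when building a limit_info dict
+    pprint.pprint(examiner.available_limits(gff_file))
+    # unique (source, type) relationships between parent and child features
+    pprint.pprint(examiner.parent_child_map(gff_file))
+    # full parse into Biopython SeqRecords carrying nested SeqFeatures
+    for rec in parse(gff_file):
+        print(rec.id, len(rec.features))
+    # flat, line-by-line alternative for very large files
+    for feature in parse_simple(gff_file):
+        print(feature["type"])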
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jb2_GFF/__init__.py	Sun Jan 28 06:48:52 2024 +0000
@@ -0,0 +1,6 @@
+"""Top level of GFF parsing providing shortcuts for useful classes.
+"""
+from jb2_GFF.GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple
+from jb2_GFF.GFFOutput import GFF3Writer, write
+
+__version__ = "0.6.9"
Binary file jb2_GFF/__pycache__/GFFOutput.cpython-310.pyc has changed
Binary file jb2_GFF/__pycache__/GFFParser.cpython-310.pyc has changed
Binary file jb2_GFF/__pycache__/__init__.cpython-310.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jb2_GFF/_utils.py	Sun Jan 28 06:48:52 2024 +0000
@@ -0,0 +1,49 @@
+class defaultdict(dict):
+    """Back compatible defaultdict:
+    http://code.activestate.com/recipes/523034/"""
+
+    def __init__(self, default_factory=None, *a, **kw):
+        if default_factory is not None and not hasattr(
+            default_factory, "__call__"
+        ):
+            raise TypeError("first argument must be callable")
+        dict.__init__(self, *a, **kw)
+        self.default_factory = default_factory
+
+    def __getitem__(self, key):
+        try:
+            return dict.__getitem__(self, key)
+        except KeyError:
+            return self.__missing__(key)
+
+    def __missing__(self, key):
+        if self.default_factory is None:
+            raise KeyError(key)
+        self[key] = value = self.default_factory()
+        return value
+
+    def __reduce__(self):
+        if self.default_factory is None:
+            args = tuple()
+        else:
+            args = (self.default_factory,)
+        return type(self), args, None, None, self.items()
+
+    def copy(self):
+        return self.__copy__()
+
+    def __copy__(self):
+        return type(self)(self.default_factory, self)
+
+    def __deepcopy__(self, memo):
+        import copy
+
+        return type(self)(
+            self.default_factory, copy.deepcopy(self.items())
+        )
+
+    def __repr__(self):
+        return "defaultdict(%s, %s)" % (
+            self.default_factory,
+            dict.__repr__(self),
+        )
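+
+
+if __name__ == "__main__":
+    # Usage sketch: this backport behaves like collections.defaultdict,
+    # e.g. counting feature types without pre-initialising each key.
+    counts = defaultdict(int)
+    for ftype in ["gene", "mRNA", "exon", "exon"]:
+        counts[ftype] += 1
+    print(counts)  # defaultdict(<class 'int'>, {'gene': 1, 'mRNA': 1, 'exon': 2})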
--- a/jb2_webserver.py	Thu Jan 25 07:58:28 2024 +0000
+++ b/jb2_webserver.py	Sun Jan 28 06:48:52 2024 +0000
@@ -169,7 +169,9 @@
         help=f"Port to listen on (default: {DEFAULT_PORT})",
     )
     parser.add_argument(
-        "--bind", default="0.0.0.0", help="IP address to bind to (default: 0.0.0.0)"
+        "--bind",
+        default="0.0.0.0",
+        help="IP address to bind to (default: 0.0.0.0)",
     )
     args = parser.parse_args()
 
--- a/jbrowse2.py	Thu Jan 25 07:58:28 2024 +0000
+++ b/jbrowse2.py	Sun Jan 28 06:48:52 2024 +0000
@@ -458,6 +458,10 @@
                 self.genome_name = (
                     genome_name  # first one for all tracks - other than paf
                 )
+                self.genome_firstcontig = None
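+                # remember the first contig name so a usable default view
+                # can be built when no defaultLocation is supplied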
+                fl = open(fapath, "r").readline().strip().split(">", 1)
+                if len(fl) > 1:
+                    # keep only the sequence name, not any description after it
+                    self.genome_firstcontig = fl[1].split(" ")[0].strip()
         if self.config_json.get("assemblies", None):
             self.config_json["assemblies"] += assemblies
         else:
@@ -560,7 +564,7 @@
         # can be served - if public.
         # dsId = trackData["metadata"]["dataset_id"]
         # url = "%s/api/datasets/%s/display?to_ext=hic " % (self.giURL, dsId)
-        hname = trackData["label"]
+        hname = trackData["name"]
         dest = os.path.join(self.outdir, hname)
         cmd = ["cp", data, dest]
         # these can be very big.
@@ -648,7 +652,10 @@
                     "type": "LinearBasicDisplay",
                     "displayId": "%s-LinearBasicDisplay" % tId,
                 },
-                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
+                {
+                    "type": "LinearArcDisplay",
+                    "displayId": "%s-LinearArcDisplay" % tId,
+                },
             ],
         }
         style_json = self._prepare_track_style(trackDict)
@@ -717,7 +724,10 @@
                     "type": "LinearBasicDisplay",
                     "displayId": "%s-LinearBasicDisplay" % tId,
                 },
-                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
+                {
+                    "type": "LinearArcDisplay",
+                    "displayId": "%s-LinearArcDisplay" % tId,
+                },
             ],
         }
         style_json = self._prepare_track_style(trackDict)
@@ -906,7 +916,10 @@
                     "type": "LinearBasicDisplay",
                     "displayId": "%s-LinearBasicDisplay" % tId,
                 },
-                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
+                {
+                    "type": "LinearArcDisplay",
+                    "displayId": "%s-LinearArcDisplay" % tId,
+                },
             ],
         }
         style_json = self._prepare_track_style(trackDict)
@@ -945,7 +958,10 @@
                     "type": "LinearPileupDisplay",
                     "displayId": "%s-LinearPileupDisplay" % tId,
                 },
-                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
+                {
+                    "type": "LinearArcDisplay",
+                    "displayId": "%s-LinearArcDisplay" % tId,
+                },
             ],
         }
         style_json = self._prepare_track_style(trackDict)
@@ -983,14 +999,14 @@
                 "assemblyNames": [self.genome_name, pgname],
             },
             # "displays": [
-                # {
-                    # "type": "LinearSyntenyDisplay",
-                    # "displayId": "%s-LinearSyntenyDisplay" % tId,
-                # },
-                # {
-                    # "type": "DotPlotDisplay",
-                    # "displayId": "%s-DotPlotDisplay" % tId,
-                # },
+            # {
+            # "type": "LinearSyntenyDisplay",
+            # "displayId": "%s-LinearSyntenyDisplay" % tId,
+            # },
+            # {
+            # "type": "DotPlotDisplay",
+            # "displayId": "%s-DotPlotDisplay" % tId,
+            # },
             # ],
         }
         style_json = self._prepare_track_style(trackDict)
@@ -1143,13 +1159,17 @@
                 )
             elif dataset_ext == "blastxml":
                 self.add_blastxml(
-                    dataset_path, outputTrackConfig, track["conf"]["options"]["blast"]
+                    dataset_path,
+                    outputTrackConfig,
+                    track["conf"]["options"]["blast"],
                 )
             elif dataset_ext == "vcf":
                 self.add_vcf(dataset_path, outputTrackConfig)
             elif dataset_ext == "paf":
                 self.add_paf(
-                    dataset_path, outputTrackConfig, track["conf"]["options"]["synteny"]
+                    dataset_path,
+                    outputTrackConfig,
+                    track["conf"]["options"]["synteny"],
                 )
             else:
                 log.warn("Do not know how to handle %s", dataset_ext)
@@ -1194,43 +1214,42 @@
         view_json = {"type": "LinearGenomeView", "tracks": tracks_data}
 
         refName = None
+        drdict = {
+            "reversed": False,
+            "assemblyName": self.genome_name,
+            "start": 0,
+            "end": 100000,
+        }
+
         if data.get("defaultLocation", ""):
             ddl = data["defaultLocation"]
-            loc_match = re.search(
-                r"^([^:]+):(\d+)\.+(\d+)$", ddl
-            )
+            loc_match = re.search(r"^([^:]+):(\d*)\.*(\d*)$", ddl)
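+            # e.g. "chr1:1000..20000"; start and end are optional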
             if loc_match:
                 refName = loc_match.group(1)
-                start = int(loc_match.group(2))
-                end = int(loc_match.group(3))
+                drdict["refName"] = refName
+                if loc_match.group(2):
+                    drdict["start"] = int(loc_match.group(2))
+                if loc_match.group(3):
+                    drdict["end"] = int(loc_match.group(3))
             else:
                 logging.info(
                     "@@@ regexp could not match contig:start..end in the supplied location %s - please fix"
                     % ddl
                 )
-        elif self.genome_name is not None:
-            start = 0
-            end = 10000  # Booh, hard coded! waiting for https://github.com/GMOD/jbrowse-components/issues/2708
+        elif self.genome_firstcontig is not None:
+            drdict["refName"] = self.genome_firstcontig
             logging.info(
-                "@@@ no defaultlocation found for default session - please add one"
+                "@@@ no defaultlocation found for default session - using %s as first contig found"
+                % self.genome_firstcontig
             )
 
-        if refName is not None:
+        if drdict.get("refName", None):
             # TODO displayedRegions is not just zooming to the region, it hides the rest of the chromosome
             view_json["displayedRegions"] = [
-                {
-                    "refName": refName,
-                    "start": start,
-                    "end": end,
-                    "reversed": False,
-                    "assemblyName": self.genome_name,
-                }
+                drdict,
             ]
 
-            logging.info(
-                "@@@ defaultlocation %s for default session"
-                % view_json["displayedRegions"]
-            )
+            logging.info("@@@ defaultlocation %s for default session" % drdict)
         else:
             logging.info(
                 "@@@ no contig name found for default session - please add one!"
@@ -1307,12 +1326,19 @@
         ]:
             cmd = ["rm", "-rf", os.path.join(self.outdir, fn)]
             self.subprocess_check_call(cmd)
-        cmd = ["cp", os.path.join(INSTALLED_TO, "jb2_webserver.py"), self.outdir]
+        cmd = [
+            "cp",
+            os.path.join(INSTALLED_TO, "jb2_webserver.py"),
+            self.outdir,
+        ]
         self.subprocess_check_call(cmd)
 
 
 def parse_style_conf(item):
-    if "type" in item.attrib and item.attrib["type"] in ["boolean", "integer"]:
+    if "type" in item.attrib and item.attrib["type"] in [
+        "boolean",
+        "integer",
+    ]:
         if item.attrib["type"] == "boolean":
             return item.text in ("yes", "true", "True")
         elif item.attrib["type"] == "integer":
@@ -1379,7 +1405,10 @@
             for x in track.findall("files/trackFile"):
                 if is_multi_bigwig:
                     multi_bigwig_paths.append(
-                        (x.attrib["label"], os.path.realpath(x.attrib["path"]))
+                        (
+                            x.attrib["label"],
+                            os.path.realpath(x.attrib["path"]),
+                        )
                     )
                 else:
                     if trackfiles:
--- a/jbrowse2.xml	Thu Jan 25 07:58:28 2024 +0000
+++ b/jbrowse2.xml	Sun Jan 28 06:48:52 2024 +0000
@@ -1,4 +1,4 @@
- <tool id="jbrowse2" name="jbrowse2" version="@TOOL_VERSION@+@WRAPPER_VERSION@_1" profile="22.05">
+ <tool id="jbrowse2" name="jbrowse2" version="@TOOL_VERSION@+@WRAPPER_VERSION@.1" profile="22.05">
     <description>genome browser</description>
     <macros>
         <import>macros.xml</import>
@@ -781,38 +781,38 @@
 about how to run the command line tools to format your data, and which
 options need to be supplied and where.
 
-The JBrowse-in-Galaxy tool is maintained by `the Galaxy IUC
-<https://github.com/galaxyproject/tools-iuc/issues>`__, who you can help you
-with missing features or bugs in the tool.
+The JBrowse-in-Galaxy tool has been rejected by `a Galaxy IUC
+<https://github.com/galaxyproject/tools-iuc/issues>`__ reviewer.
+It is maintained by https://github.com/fubar2, who can help you
+with missing features or bugs in the tool. For the record, he remains unconvinced by the reviewer's logic,
+and is disturbed by the distinctly coercive approach to introducing new code,
+compared to the more usual method of providing a working PR.
 
 Options
 -------
 
-The first option you encounter is the **Reference sequence(s)** to use. This option
-now accepts multiple fasta files, allowing you to build JBrowse2
-instances that contain data for multiple genomes or chrosomomes
-(generally known as "landmark features" in gff3 terminology.)
+**Reference or Assembly**
+
+Choose a built-in reference genome or select one from your history.
 
-**Track Groups** represent a set of tracks in a single category. These
-can be used to let your users understand relationships between large
-groups of tracks.
+Track coordinates and contig names *must* match this reference precisely
+or they will not display.
+
+**Track Groups** represent a set of tracks in a single category.
 
 Annotation Tracks
 -----------------
 
-There are a few different types of tracks supported, each with their own
-set of options:
-
 GFF3/BED
 ~~~~~~~~
 
-These are standard feature tracks. They usually highlight genes,
-mRNAs and other features of interest along a genomic region.
+Standard feature tracks. They usually highlight genes, mRNAs and other features of interest along a genomic region.
 
 When these contain tens of millions of features, such as repeat regions from a VGP assembly, displaying one at a time leads
 to extremely slow loading times when a large region is in view, unless the "LinearPileupDisplay" display option is
 selected for that track in the styling options section. The default is LinearBasicDisplay, which shows all details and works
-well for relatively sparse bed files.
+well for relatively sparse bed files. A better option is to make a bigwig track using a set of windows based on the
+lengths of each assembly or reference contig.
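+
+A minimal sketch of that windowing approach (the file names ``features.bed``,
+``contigs.fai`` and ``density.bw`` are placeholders, and it assumes the
+``pyBigWig`` package is available)::
+
+    import collections
+    import pyBigWig
+
+    window = 10000
+    # contig names and lengths, e.g. from a samtools faidx .fai index
+    contigs = []
+    with open("contigs.fai") as fh:
+        for line in fh:
+            name, length = line.split("\t")[:2]
+            contigs.append((name, int(length)))
+
+    # count features by the window containing their start position
+    counts = collections.Counter()
+    with open("features.bed") as fh:
+        for line in fh:
+            chrom, start = line.split("\t")[:2]
+            counts[(chrom, int(start) // window)] += 1
+
+    bw = pyBigWig.open("density.bw", "w")
+    bw.addHeader(contigs)
+    for name, length in contigs:
+        starts = list(range(0, length, window))
+        ends = [min(s + window, length) for s in starts]
+        values = [float(counts[(name, s // window)]) for s in starts]
+        bw.addEntries([name] * len(starts), starts, ends=ends, values=values)
+    bw.close()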
 
 BAM Pileups
 ~~~~~~~~~~~
--- a/macros.xml	Thu Jan 25 07:58:28 2024 +0000
+++ b/macros.xml	Sun Jan 28 06:48:52 2024 +0000
@@ -24,7 +24,7 @@
         </requirements>
     </xml>
     <token name="@DATA_DIR@">\$GALAXY_JBROWSE_SHARED_DIR</token>
-    <token name="@WRAPPER_VERSION@">galaxy0</token>
+    <token name="@WRAPPER_VERSION@">galaxy2</token>
     <token name="@ATTRIBUTION@"><![CDATA[
 **Attribution**
 This Galaxy tool relies on the JBrowse2, maintained by the GMOD Community. The Galaxy wrapper is developed by the IUC
--- a/plants.sh	Thu Jan 25 07:58:28 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-planemo shed_update --shed_target toolshed --owner fubar --name jbrowse2 --shed_key 8d01f2f35d48a0405f72d6d37aedde60 ./
--- a/readme.rst	Thu Jan 25 07:58:28 2024 +0000
+++ b/readme.rst	Sun Jan 28 06:48:52 2024 +0000
@@ -46,7 +46,8 @@
     - works well enough to be useful in workflows such as TreeValGal.
     - JB2 seems to set defaults wisely.
     - not yet ideal for users who need fine grained track control.
-    - synteny works.
+    - synteny (paf + reference) now works.
+    - rehomed at https://github.com/usegalaxy-eu/temporary-tools/tree/master/jbrowse2 while IUC reviews are slowly sorted out.
 
 
 Wrapper License (MIT/BSD Style)
Binary file static/images/bam.png has changed
Binary file static/images/bigwig.png has changed
Binary file static/images/blast.png has changed
Binary file static/images/opacity.png has changed
Binary file static/images/sections.png has changed
Binary file static/images/styling.png has changed