# HG changeset patch # User fubar # Date 1706424532 0 # Node ID 4c201a3d4755ae24034e057f2460a8b99097738c # Parent 1fe91657bfd6acd1d61c72138c261ae232edd9f8 planemo upload for repository https://github.com/usegalaxy-eu/temporary-tools/tree/master/jbrowse2 commit a37bfdfc108501b11c7b2aa15efb1bd16f0c4b66 diff -r 1fe91657bfd6 -r 4c201a3d4755 Dockerfile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dockerfile Sun Jan 28 06:48:52 2024 +0000 @@ -0,0 +1,3 @@ +FROM quay.io/bioconda/base-glibc-busybox-bash:3.0 + +RUN adduser -u 1000 user1000 -D && adduser -u 1001 user1001 -D diff -r 1fe91657bfd6 -r 4c201a3d4755 GFFOutput.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GFFOutput.py Sun Jan 28 06:48:52 2024 +0000 @@ -0,0 +1,213 @@ +"""Output Biopython SeqRecords and SeqFeatures to GFF3 format. + +The target format is GFF3, the current GFF standard: + http://www.sequenceontology.org/gff3.shtml +""" +from six.moves import urllib + +from Bio import SeqIO + + +class _IdHandler: + """Generate IDs for GFF3 Parent/Child + relationships where they don't exist.""" + + def __init__(self): + self._prefix = "biopygen" + self._counter = 1 + self._seen_ids = [] + + def _generate_id(self, quals): + """Generate a unique ID not present in our existing IDs.""" + gen_id = self._get_standard_id(quals) + if gen_id is None: + while 1: + gen_id = "%s%s" % (self._prefix, self._counter) + if gen_id not in self._seen_ids: + break + self._counter += 1 + return gen_id + + def _get_standard_id(self, quals): + """Retrieve standardized IDs from other sources like NCBI GenBank. + + This tries to find IDs from known key/values when stored differently + than GFF3 specifications. + """ + possible_keys = ["transcript_id", "protein_id"] + for test_key in possible_keys: + if test_key in quals: + cur_id = quals[test_key] + if isinstance(cur_id, tuple) or isinstance(cur_id, list): + return cur_id[0] + else: + return cur_id + return None + + def update_quals(self, quals, has_children): + """Update a set of qualifiers, adding an ID if necessary.""" + cur_id = quals.get("ID", None) + # if we have an ID, record it + if cur_id: + if not isinstance(cur_id, list) and not isinstance(cur_id, tuple): + cur_id = [cur_id] + for add_id in cur_id: + self._seen_ids.append(add_id) + # if we need one and don't have it, create a new one + elif has_children: + new_id = self._generate_id(quals) + self._seen_ids.append(new_id) + quals["ID"] = [new_id] + return quals + + +class GFF3Writer: + """Write GFF3 files starting with standard Biopython objects.""" + + def __init__(self): + pass + + def write(self, recs, out_handle, include_fasta=False): + """Write the provided records to the given handle in GFF3 format.""" + id_handler = _IdHandler() + self._write_header(out_handle) + fasta_recs = [] + try: + recs = iter(recs) + except TypeError: + recs = [recs] + for rec in recs: + self._write_rec(rec, out_handle) + self._write_annotations(rec.annotations, rec.id, len(rec.seq), out_handle) + for sf in rec.features: + sf = self._clean_feature(sf) + id_handler = self._write_feature(sf, rec.id, out_handle, id_handler) + if include_fasta and len(rec.seq) > 0: + fasta_recs.append(rec) + if len(fasta_recs) > 0: + self._write_fasta(fasta_recs, out_handle) + + def _clean_feature(self, feature): + quals = {} + for key, val in feature.qualifiers.items(): + if not isinstance(val, (list, tuple)): + val = [val] + val = [str(x) for x in val] + quals[key] = val + feature.qualifiers = quals + # Support for Biopython 1.68 and above, which removed sub_features + if not hasattr(feature, "sub_features"): + feature.sub_features = [] + clean_sub = [self._clean_feature(f) for f in feature.sub_features] + feature.sub_features = clean_sub + return feature + + def _write_rec(self, rec, out_handle): + # if we have a SeqRecord, write out optional directive + if len(rec.seq) > 0: + out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq))) + + def _get_phase(self, feature): + if "phase" in feature.qualifiers: + phase = feature.qualifiers["phase"][0] + elif feature.type == "CDS": + phase = int(feature.qualifiers.get("codon_start", [1])[0]) - 1 + else: + phase = "." + return str(phase) + + def _write_feature(self, feature, rec_id, out_handle, id_handler, parent_id=None): + """Write a feature with location information.""" + if feature.location.strand == 1: + strand = "+" + elif feature.location.strand == -1: + strand = "-" + else: + strand = "." + # remove any standard features from the qualifiers + quals = feature.qualifiers.copy() + for std_qual in ["source", "score", "phase"]: + if std_qual in quals and len(quals[std_qual]) == 1: + del quals[std_qual] + # add a link to a parent identifier if it exists + if parent_id: + if "Parent" not in quals: + quals["Parent"] = [] + quals["Parent"].append(parent_id) + quals = id_handler.update_quals(quals, len(feature.sub_features) > 0) + if feature.type: + ftype = feature.type + else: + ftype = "sequence_feature" + parts = [ + str(rec_id), + feature.qualifiers.get("source", ["feature"])[0], + ftype, + str(feature.location.start + 1), # 1-based indexing + str(feature.location.end), + feature.qualifiers.get("score", ["."])[0], + strand, + self._get_phase(feature), + self._format_keyvals(quals), + ] + out_handle.write("\t".join(parts) + "\n") + for sub_feature in feature.sub_features: + id_handler = self._write_feature( + sub_feature, + rec_id, + out_handle, + id_handler, + quals["ID"][0], + ) + return id_handler + + def _format_keyvals(self, keyvals): + format_kvs = [] + for key in sorted(keyvals.keys()): + values = keyvals[key] + key = key.strip() + format_vals = [] + if not isinstance(values, list) or isinstance(values, tuple): + values = [values] + for val in values: + val = urllib.parse.quote(str(val).strip(), safe=":/ ") + if (key and val) and val not in format_vals: + format_vals.append(val) + format_kvs.append("%s=%s" % (key, ",".join(format_vals))) + return ";".join(format_kvs) + + def _write_annotations(self, anns, rec_id, size, out_handle): + """Add annotations which refer to an entire sequence.""" + format_anns = self._format_keyvals(anns) + if format_anns: + parts = [ + rec_id, + "annotation", + "remark", + "1", + str(size if size > 1 else 1), + ".", + ".", + ".", + format_anns, + ] + out_handle.write("\t".join(parts) + "\n") + + def _write_header(self, out_handle): + """Write out standard header directives.""" + out_handle.write("##gff-version 3\n") + + def _write_fasta(self, recs, out_handle): + """Write sequence records using the ##FASTA directive.""" + out_handle.write("##FASTA\n") + SeqIO.write(recs, out_handle, "fasta") + + +def write(recs, out_handle, include_fasta=False): + """High level interface to write GFF3 files from SeqRecords and SeqFeatures. + + If include_fasta is True, the GFF3 file will include sequence information + using the ##FASTA directive. + """ + writer = GFF3Writer() + return writer.write(recs, out_handle, include_fasta) diff -r 1fe91657bfd6 -r 4c201a3d4755 blastxml_to_gapped_gff3.py --- a/blastxml_to_gapped_gff3.py Thu Jan 25 07:58:28 2024 +0000 +++ b/blastxml_to_gapped_gff3.py Sun Jan 28 06:48:52 2024 +0000 @@ -32,7 +32,7 @@ recid = record.query if " " in recid: - recid = recid[0: recid.index(" ")] + recid = recid[0 : recid.index(" ")] rec = SeqRecord(Seq("ACTG"), id=recid) for idx_hit, hit in enumerate(record.alignments): @@ -72,7 +72,7 @@ qualifiers["blast_" + prop] = getattr(hsp, prop, None) desc = hit.title.split(" >")[0] - qualifiers["description"] = desc[desc.index(" "):] + qualifiers["description"] = desc[desc.index(" ") :] # This required a fair bit of sketching out/match to figure out # the first time. @@ -161,9 +161,9 @@ fm = "" fs = "" for position in re.finditer("-", query): - fq += query[prev: position.start()] - fm += match[prev: position.start()] - fs += subject[prev: position.start()] + fq += query[prev : position.start()] + fm += match[prev : position.start()] + fs += subject[prev : position.start()] prev = position.start() + 1 fq += query[prev:] fm += match[prev:] @@ -290,7 +290,9 @@ help="Trim blast hits to be only as long as the parent feature", ) parser.add_argument( - "--trim_end", action="store_true", help="Cut blast results off at end of gene" + "--trim_end", + action="store_true", + help="Cut blast results off at end of gene", ) parser.add_argument("--include_seq", action="store_true", help="Include sequence") args = parser.parse_args() diff -r 1fe91657bfd6 -r 4c201a3d4755 gff3_rebase.py --- a/gff3_rebase.py Thu Jan 25 07:58:28 2024 +0000 +++ b/gff3_rebase.py Sun Jan 28 06:48:52 2024 +0000 @@ -65,7 +65,10 @@ if hasattr(feature, "sub_features"): for x in feature_lambda( - feature.sub_features, test, test_kwargs, subfeatures=subfeatures + feature.sub_features, + test, + test_kwargs, + subfeatures=subfeatures, ): yield x @@ -197,7 +200,9 @@ help="Child GFF3 annotations to rebase against parent", ) parser.add_argument( - "--interpro", action="store_true", help="Interpro specific modifications" + "--interpro", + action="store_true", + help="Interpro specific modifications", ) parser.add_argument( "--protein2dna", diff -r 1fe91657bfd6 -r 4c201a3d4755 jb2_GFF/GFFOutput.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jb2_GFF/GFFOutput.py Sun Jan 28 06:48:52 2024 +0000 @@ -0,0 +1,233 @@ +"""Output Biopython SeqRecords and SeqFeatures to GFF3 format. + +The target format is GFF3, the current GFF standard: + http://www.sequenceontology.org/gff3.shtml +""" +from six.moves import urllib + +from Bio import SeqIO + + +class _IdHandler: + """Generate IDs for GFF3 Parent/Child + relationships where they don't exist.""" + + def __init__(self): + self._prefix = "biopygen" + self._counter = 1 + self._seen_ids = [] + + def _generate_id(self, quals): + """Generate a unique ID not present in our existing IDs.""" + gen_id = self._get_standard_id(quals) + if gen_id is None: + while 1: + gen_id = "%s%s" % (self._prefix, self._counter) + if gen_id not in self._seen_ids: + break + self._counter += 1 + return gen_id + + def _get_standard_id(self, quals): + """Retrieve standardized IDs from other sources like NCBI GenBank. + + This tries to find IDs from known key/values when stored differently + than GFF3 specifications. + """ + possible_keys = ["transcript_id", "protein_id"] + for test_key in possible_keys: + if test_key in quals: + cur_id = quals[test_key] + if isinstance(cur_id, tuple) or isinstance( + cur_id, list + ): + return cur_id[0] + else: + return cur_id + return None + + def update_quals(self, quals, has_children): + """Update a set of qualifiers, adding an ID if necessary.""" + cur_id = quals.get("ID", None) + # if we have an ID, record it + if cur_id: + if not isinstance(cur_id, list) and not isinstance( + cur_id, tuple + ): + cur_id = [cur_id] + for add_id in cur_id: + self._seen_ids.append(add_id) + # if we need one and don't have it, create a new one + elif has_children: + new_id = self._generate_id(quals) + self._seen_ids.append(new_id) + quals["ID"] = [new_id] + return quals + + +class GFF3Writer: + """Write GFF3 files starting with standard Biopython objects.""" + + def __init__(self): + pass + + def write(self, recs, out_handle, include_fasta=False): + """Write the provided records to the given handle in GFF3 format.""" + id_handler = _IdHandler() + self._write_header(out_handle) + fasta_recs = [] + try: + recs = iter(recs) + except TypeError: + recs = [recs] + for rec in recs: + self._write_rec(rec, out_handle) + self._write_annotations( + rec.annotations, rec.id, len(rec.seq), out_handle + ) + for sf in rec.features: + sf = self._clean_feature(sf) + id_handler = self._write_feature( + sf, rec.id, out_handle, id_handler + ) + if include_fasta and len(rec.seq) > 0: + fasta_recs.append(rec) + if len(fasta_recs) > 0: + self._write_fasta(fasta_recs, out_handle) + + def _clean_feature(self, feature): + quals = {} + for key, val in feature.qualifiers.items(): + if not isinstance(val, (list, tuple)): + val = [val] + val = [str(x) for x in val] + quals[key] = val + feature.qualifiers = quals + # Support for Biopython 1.68 and above, which removed sub_features + if not hasattr(feature, "sub_features"): + feature.sub_features = [] + clean_sub = [ + self._clean_feature(f) for f in feature.sub_features + ] + feature.sub_features = clean_sub + return feature + + def _write_rec(self, rec, out_handle): + # if we have a SeqRecord, write out optional directive + if len(rec.seq) > 0: + out_handle.write( + "##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq)) + ) + + def _get_phase(self, feature): + if "phase" in feature.qualifiers: + phase = feature.qualifiers["phase"][0] + elif feature.type == "CDS": + phase = ( + int(feature.qualifiers.get("codon_start", [1])[0]) - 1 + ) + else: + phase = "." + return str(phase) + + def _write_feature( + self, feature, rec_id, out_handle, id_handler, parent_id=None + ): + """Write a feature with location information.""" + if feature.location.strand == 1: + strand = "+" + elif feature.location.strand == -1: + strand = "-" + else: + strand = "." + # remove any standard features from the qualifiers + quals = feature.qualifiers.copy() + for std_qual in ["source", "score", "phase"]: + if std_qual in quals and len(quals[std_qual]) == 1: + del quals[std_qual] + # add a link to a parent identifier if it exists + if parent_id: + if "Parent" not in quals: + quals["Parent"] = [] + quals["Parent"].append(parent_id) + quals = id_handler.update_quals( + quals, len(feature.sub_features) > 0 + ) + if feature.type: + ftype = feature.type + else: + ftype = "sequence_feature" + parts = [ + str(rec_id), + feature.qualifiers.get("source", ["feature"])[0], + ftype, + str(feature.location.start + 1), # 1-based indexing + str(feature.location.end), + feature.qualifiers.get("score", ["."])[0], + strand, + self._get_phase(feature), + self._format_keyvals(quals), + ] + out_handle.write("\t".join(parts) + "\n") + for sub_feature in feature.sub_features: + id_handler = self._write_feature( + sub_feature, + rec_id, + out_handle, + id_handler, + quals["ID"][0], + ) + return id_handler + + def _format_keyvals(self, keyvals): + format_kvs = [] + for key in sorted(keyvals.keys()): + values = keyvals[key] + key = key.strip() + format_vals = [] + if not isinstance(values, list) or isinstance( + values, tuple + ): + values = [values] + for val in values: + val = urllib.parse.quote(str(val).strip(), safe=":/ ") + if (key and val) and val not in format_vals: + format_vals.append(val) + format_kvs.append("%s=%s" % (key, ",".join(format_vals))) + return ";".join(format_kvs) + + def _write_annotations(self, anns, rec_id, size, out_handle): + """Add annotations which refer to an entire sequence.""" + format_anns = self._format_keyvals(anns) + if format_anns: + parts = [ + rec_id, + "annotation", + "remark", + "1", + str(size if size > 1 else 1), + ".", + ".", + ".", + format_anns, + ] + out_handle.write("\t".join(parts) + "\n") + + def _write_header(self, out_handle): + """Write out standard header directives.""" + out_handle.write("##gff-version 3\n") + + def _write_fasta(self, recs, out_handle): + """Write sequence records using the ##FASTA directive.""" + out_handle.write("##FASTA\n") + SeqIO.write(recs, out_handle, "fasta") + + +def write(recs, out_handle, include_fasta=False): + """High level interface to write GFF3 files from SeqRecords and SeqFeatures. + + If include_fasta is True, the GFF3 file will include sequence information + using the ##FASTA directive. + """ + writer = GFF3Writer() + return writer.write(recs, out_handle, include_fasta) diff -r 1fe91657bfd6 -r 4c201a3d4755 jb2_GFF/GFFParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jb2_GFF/GFFParser.py Sun Jan 28 06:48:52 2024 +0000 @@ -0,0 +1,1099 @@ +"""Parse GFF files into features attached to Biopython SeqRecord objects. + +This deals with GFF3 formatted files, a tab delimited format for storing +sequence features and annotations: + +http://www.sequenceontology.org/gff3.shtml + +It will also deal with older GFF versions (GTF/GFF2): + +http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml +http://mblab.wustl.edu/GTF22.html + +The implementation utilizes map/reduce parsing of GFF using Disco. Disco +(http://discoproject.org) is a Map-Reduce framework for Python utilizing +Erlang for parallelization. The code works on a single processor without +Disco using the same architecture. +""" +import os +import copy +import json +import re +import collections +import io +import itertools +import warnings +import six +from six.moves import urllib + +from Bio.SeqRecord import SeqRecord +from Bio import SeqFeature +from Bio import SeqIO +from Bio import BiopythonDeprecationWarning + +import disco + +# Make defaultdict compatible with versions of python older than 2.4 +try: + collections.defaultdict +except AttributeError: + import _utils + + collections.defaultdict = _utils.defaultdict + +unknown_seq_avail = False +try: + from Bio.Seq import UnknownSeq + + unknown_seq_avail = True +except ImportError: + # Starting with biopython 1.81, has been removed + from Bio.Seq import _UndefinedSequenceData + from Bio.Seq import Seq + + +warnings.simplefilter("ignore", BiopythonDeprecationWarning) + + +def _gff_line_map(line, params): + """Map part of Map-Reduce; parses a line of GFF into a dictionary. + + Given an input line from a GFF file, this: + - decides if the file passes our filtering limits + - if so: + - breaks it into component elements + - determines the type of attribute (flat, parent, child or annotation) + - generates a dictionary of GFF info which can be serialized as JSON + """ + + def _merge_keyvals(parts): + """Merge key-values escaped by quotes + that are improperly split at semicolons.""" + out = [] + for i, p in enumerate(parts): + if ( + i > 0 + and len(p) == 1 + and p[0].endswith('"') + and not p[0].startswith('"') + ): + if out[-1][-1].startswith('"'): + prev_p = out.pop(-1) + to_merge = prev_p[-1] + prev_p[-1] = "%s; %s" % (to_merge, p[0]) + out.append(prev_p) + else: + out.append(p) + return out + + gff3_kw_pat = re.compile(r"\w+=") + + def _split_keyvals(keyval_str): + """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. + + GFF3 has key value pairs like: + count=9;gene=amx-2;sequence=SAGE:aacggagccg + GFF2 and GTF have: + Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" + name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 + """ + quals = collections.defaultdict(list) + if keyval_str is None: + return quals + # ensembl GTF has a stray semi-colon at the end + if keyval_str[-1] == ";": + keyval_str = keyval_str[:-1] + # GFF2/GTF has a semi-colon with at least one space after it. + # It can have spaces on both sides; wormbase does this. + # GFF3 works with no spaces. + # Split at the first one we can recognize as working + parts = keyval_str.split(" ; ") + if len(parts) == 1: + parts = [x.strip() for x in keyval_str.split(";")] + # check if we have GFF3 style key-vals (with =) + is_gff2 = True + if gff3_kw_pat.match(parts[0]): + is_gff2 = False + key_vals = _merge_keyvals([p.split("=") for p in parts]) + # otherwise, we are separated by a space with a key as the first item + else: + pieces = [] + for p in parts: + # fix misplaced semi-colons in keys in some GFF2 files + if p and p[0] == ";": + p = p[1:] + pieces.append(p.strip().split(" ")) + key_vals = [(p[0], " ".join(p[1:])) for p in pieces] + for item in key_vals: + # standard in-spec items are key=value + if len(item) == 2: + key, val = item + # out-of-spec files can have just key values. We set an empty value + # which will be changed to true later to standardize. + else: + assert len(item) == 1, item + key = item[0] + val = "" + # remove quotes in GFF2 files + quoted = False + if len(val) > 0 and val[0] == '"' and val[-1] == '"': + quoted = True + val = val[1:-1] + if val: + if quoted: + quals[key].append(val) + else: + quals[key].extend( + [v for v in val.split(",") if v] + ) + # if we don't have a value, make this a key=True/False style + # attribute + else: + quals[key].append("true") + for key, vals in quals.items(): + quals[key] = [urllib.parse.unquote(v) for v in vals] + return quals, is_gff2 + + def _nest_gff2_features(gff_parts): + """Provide nesting of GFF2 transcript parts with transcript IDs. + + exons and coding sequences are mapped to a parent with a transcript_id + in GFF2. This is implemented differently at different genome centers + and this function attempts to resolve that and map things to the GFF3 + way of doing them. + """ + # map protein or transcript ids to a parent + for transcript_id in [ + "transcript_id", + "transcriptId", + "proteinId", + ]: + try: + gff_parts["quals"]["Parent"] = gff_parts["quals"][ + transcript_id + ] + break + except KeyError: + pass + # case for WormBase GFF -- everything labelled as Transcript or CDS + for flat_name in ["Transcript", "CDS"]: + if flat_name in gff_parts["quals"]: + # parent types + if gff_parts["type"] in [flat_name]: + if not gff_parts["id"]: + gff_parts["id"] = gff_parts["quals"][ + flat_name + ][0] + gff_parts["quals"]["ID"] = [gff_parts["id"]] + # children types + elif gff_parts["type"] in [ + "intron", + "exon", + "three_prime_UTR", + "coding_exon", + "five_prime_UTR", + "CDS", + "stop_codon", + "start_codon", + ]: + gff_parts["quals"]["Parent"] = gff_parts["quals"][ + flat_name + ] + break + + return gff_parts + + strand_map = {"+": 1, "-": -1, "?": None, None: None} + line = line.strip() + if line[:2] == "##": + return [("directive", line[2:])] + elif line and line[0] != "#": + parts = line.split("\t") + should_do = True + if params.limit_info: + for limit_name, limit_values in params.limit_info.items(): + cur_id = tuple( + [parts[i] for i in params.filter_info[limit_name]] + ) + if cur_id not in limit_values: + should_do = False + break + if should_do: + assert len(parts) >= 8, line + # not python2.4 compatible but easier to understand + # gff_parts = [(None if p == '.' else p) for p in parts] + gff_parts = [] + for p in parts: + if p == ".": + gff_parts.append(None) + else: + gff_parts.append(p) + gff_info = dict() + # collect all of the base qualifiers for this item + if len(parts) > 8: + quals, is_gff2 = _split_keyvals(gff_parts[8]) + else: + quals, is_gff2 = collections.defaultdict(list), False + gff_info["is_gff2"] = is_gff2 + if gff_parts[1]: + quals["source"].append(gff_parts[1]) + if gff_parts[5]: + quals["score"].append(gff_parts[5]) + if gff_parts[7]: + quals["phase"].append(gff_parts[7]) + gff_info["quals"] = dict(quals) + gff_info["rec_id"] = gff_parts[0] + # if we are describing a location, then we are a feature + if gff_parts[3] and gff_parts[4]: + gff_info["location"] = [ + int(gff_parts[3]) - 1, + int(gff_parts[4]), + ] + gff_info["type"] = gff_parts[2] + gff_info["id"] = quals.get("ID", [""])[0] + gff_info["strand"] = strand_map.get( + gff_parts[6], None + ) + if is_gff2: + gff_info = _nest_gff2_features(gff_info) + # features that have parents need to link so we can pick up + # the relationship + if "Parent" in gff_info["quals"]: + # check for self referential parent/child relationships + # remove the ID, which is not useful + for p in gff_info["quals"]["Parent"]: + if p == gff_info["id"]: + gff_info["id"] = "" + del gff_info["quals"]["ID"] + break + final_key = "child" + elif gff_info["id"]: + final_key = "parent" + # Handle flat features + else: + final_key = "feature" + # otherwise, associate these annotations with the full record + else: + final_key = "annotation" + if params.jsonify: + return [(final_key, json.dumps(gff_info))] + else: + return [(final_key, gff_info)] + return [] + + +def _gff_line_reduce(map_results, out, params): + """Reduce part of Map-Reduce; combines results of parsed features.""" + final_items = dict() + for gff_type, final_val in map_results: + if params.jsonify and gff_type not in ["directive"]: + final_val = json.loads(final_val) + try: + final_items[gff_type].append(final_val) + except KeyError: + final_items[gff_type] = [final_val] + for key, vals in final_items.items(): + if params.jsonify: + vals = json.dumps(vals) + out.add(key, vals) + + +class _MultiIDRemapper: + """Provide an ID remapping for cases where a parent has a non-unique ID. + + Real life GFF3 cases have non-unique ID attributes, which we fix here + by using the unique sequence region to assign children to the right + parent. + """ + + def __init__(self, base_id, all_parents): + self._base_id = base_id + self._parents = all_parents + + def remap_id(self, feature_dict): + rstart, rend = feature_dict["location"] + for index, parent in enumerate(self._parents): + pstart, pend = parent["location"] + if rstart >= pstart and rend <= pend: + if index > 0: + return "%s_%s" % (self._base_id, index + 1) + else: + return self._base_id + # if we haven't found a location match but parents are umabiguous, + # return that + if len(self._parents) == 1: + return self._base_id + raise ValueError( + "Did not find remapped ID location: %s, %s, %s" + % ( + self._base_id, + [p["location"] for p in self._parents], + feature_dict["location"], + ) + ) + + +class _AbstractMapReduceGFF: + """Base class providing general GFF parsing for local and remote classes. + + This class should be subclassed to provide a concrete class to parse + GFF under specific conditions. These classes need to implement + the _gff_process function, which returns a dictionary of SeqRecord + information. + """ + + def __init__(self, create_missing=True): + """Initialize GFF parser + + create_missing - If True, create blank records for GFF ids not in + the base_dict. If False, an error will be raised. + """ + self._create_missing = create_missing + self._map_fn = _gff_line_map + self._reduce_fn = _gff_line_reduce + self._examiner = GFFExaminer() + + def _gff_process(self, gff_files, limit_info, target_lines=None): + raise NotImplementedError("Derived class must define") + + def parse(self, gff_files, base_dict=None, limit_info=None): + """Parse a GFF file, returning an iterator of SeqRecords. + + limit_info - A dictionary specifying the regions of the GFF file + which should be extracted. This allows only relevant portions of a file + to be parsed. + + base_dict - A base dictionary of SeqRecord objects which may be + pre-populated with sequences and other features. The new features from + the GFF file will be added to this dictionary. + """ + for rec in self.parse_in_parts( + gff_files, base_dict, limit_info + ): + yield rec + + def parse_in_parts( + self, + gff_files, + base_dict=None, + limit_info=None, + target_lines=None, + ): + """Parse a region of a GFF file specified, returning info as generated. + + target_lines -- The number of lines in the file which should be used + for each partial parse. This should be determined based on available + memory. + """ + for results in self.parse_simple( + gff_files, limit_info, target_lines + ): + if base_dict is None: + cur_dict = dict() + else: + cur_dict = copy.deepcopy(base_dict) + cur_dict = self._results_to_features(cur_dict, results) + all_ids = list(cur_dict.keys()) + all_ids.sort() + for cur_id in all_ids: + yield cur_dict[cur_id] + + def parse_simple( + self, gff_files, limit_info=None, target_lines=1 + ): + """Simple parse which does not build or nest features. + + This returns a simple dictionary representation of each line in the + GFF file. + """ + # gracefully handle a single file passed + if not isinstance(gff_files, (list, tuple)): + gff_files = [gff_files] + limit_info = self._normalize_limit_info(limit_info) + for results in self._gff_process( + gff_files, limit_info, target_lines + ): + yield results + + def _normalize_limit_info(self, limit_info): + """Turn all limit information into tuples for identical comparisons.""" + final_limit_info = {} + if limit_info: + for key, values in limit_info.items(): + final_limit_info[key] = [] + for v in values: + if isinstance(v, str): + final_limit_info[key].append((v,)) + else: + final_limit_info[key].append(tuple(v)) + return final_limit_info + + def _results_to_features(self, base, results): + """Add parsed dictionaries of results to Biopython SeqFeatures.""" + base = self._add_annotations( + base, results.get("annotation", []) + ) + for feature in results.get("feature", []): + (_, base) = self._add_toplevel_feature(base, feature) + base = self._add_parent_child_features( + base, results.get("parent", []), results.get("child", []) + ) + base = self._add_seqs(base, results.get("fasta", [])) + base = self._add_directives( + base, results.get("directive", []) + ) + return base + + def _add_directives(self, base, directives): + """Handle any directives or meta-data in the GFF file. + + Relevant items are added as annotation meta-data to each record. + """ + dir_keyvals = collections.defaultdict(list) + for directive in directives: + parts = directive.split() + if len(parts) > 1: + key = parts[0] + if len(parts) == 2: + val = parts[1] + else: + val = tuple(parts[1:]) + # specific directives that need special handling + if ( + key == "sequence-region" + ): # convert to Python 0-based coordinates + if len(val) == 2: # handle regions missing contig + val = (int(val[0]) - 1, int(val[1])) + elif len(val) == 3: + val = (val[0], int(val[1]) - 1, int(val[2])) + dir_keyvals[key].append(val) + for key, vals in dir_keyvals.items(): + for rec in base.values(): + self._add_ann_to_rec(rec, key, vals) + return base + + def _get_matching_record_id(self, base, find_id): + """Find a matching base record with the test identifier, handling tricky cases. + + NCBI IDs https://en.wikipedia.org/wiki/FASTA_format#NCBI_identifiers + """ + # Straight matches for identifiers + if find_id in base: + return find_id + # NCBI style IDs in find_id + elif find_id and find_id.find("|") > 0: + for test_id in [ + x.strip() for x in find_id.split("|")[1:] + ]: + if test_id and test_id in base: + return test_id + # NCBI style IDs in base IDs + else: + for base_id in base.keys(): + if base_id.find("|") > 0: + for test_id in [ + x.strip() for x in base_id.split("|")[1:] + ]: + if test_id and test_id == find_id: + return base_id + return None + + def _add_seqs(self, base, recs): + """Add sequence information contained in the GFF3 to records.""" + for rec in recs: + match_id = self._get_matching_record_id(base, rec.id) + if match_id: + base[match_id].seq = rec.seq + else: + base[rec.id] = rec + return base + + def _add_parent_child_features(self, base, parents, children): + """Add nested features with parent child relationships.""" + multi_remap = self._identify_dup_ids(parents) + # add children features + children_prep = collections.defaultdict(list) + for child_dict in children: + child_feature = self._get_feature(child_dict) + for pindex, pid in enumerate( + child_feature.qualifiers["Parent"] + ): + if pid in multi_remap: + pid = multi_remap[pid].remap_id(child_dict) + child_feature.qualifiers["Parent"][pindex] = pid + children_prep[pid].append( + (child_dict["rec_id"], child_feature) + ) + children = dict(children_prep) + # add children to parents that exist + for cur_parent_dict in parents: + cur_id = cur_parent_dict["id"] + if cur_id in multi_remap: + cur_parent_dict["id"] = multi_remap[cur_id].remap_id( + cur_parent_dict + ) + cur_parent, base = self._add_toplevel_feature( + base, cur_parent_dict + ) + cur_parent, children = self._add_children_to_parent( + cur_parent, children + ) + # create parents for children without them (GFF2 or split/bad files) + while len(children) > 0: + parent_id, cur_children = next( + itertools.islice(children.items(), 1) + ) + # one child, do not nest it + if len(cur_children) == 1: + rec_id, child = cur_children[0] + loc = (child.location.start, child.location.end) + rec, base = self._get_rec( + base, dict(rec_id=rec_id, location=loc) + ) + rec.features.append(child) + del children[parent_id] + else: + cur_parent, base = self._add_missing_parent( + base, parent_id, cur_children + ) + cur_parent, children = self._add_children_to_parent( + cur_parent, children + ) + return base + + def _identify_dup_ids(self, parents): + """Identify duplicated ID attributes in potential nested parents. + + According to the GFF3 spec ID attributes are supposed to be unique + for a file, but this is not always true in practice. This looks + for duplicates, and provides unique IDs sorted by locations. + """ + multi_ids = collections.defaultdict(list) + for parent in parents: + multi_ids[parent["id"]].append(parent) + multi_ids = [ + (mid, ps) + for (mid, ps) in multi_ids.items() + if len(parents) > 1 + ] + multi_remap = dict() + for mid, parents in multi_ids: + multi_remap[mid] = _MultiIDRemapper(mid, parents) + return multi_remap + + def _add_children_to_parent(self, cur_parent, children): + """Recursively add children to parent features.""" + if cur_parent.id in children: + cur_children = children[cur_parent.id] + ready_children = [] + for _, cur_child in cur_children: + cur_child, _ = self._add_children_to_parent( + cur_child, children + ) + ready_children.append(cur_child) + # Support Biopython features for 1.62+ + # CompoundLocations and pre-1.62 + if not hasattr(SeqFeature, "CompoundLocation"): + cur_parent.location_operator = "join" + for cur_child in ready_children: + cur_parent.sub_features.append(cur_child) + del children[cur_parent.id] + return cur_parent, children + + def _add_annotations(self, base, anns): + """Add annotation data from the GFF file to records.""" + # add these as a list of annotations, checking not to overwrite + # current values + for ann in anns: + rec, base = self._get_rec(base, ann) + for key, vals in ann["quals"].items(): + self._add_ann_to_rec(rec, key, vals) + return base + + def _add_ann_to_rec(self, rec, key, vals): + """Add a key/value annotation to the given SeqRecord.""" + if key in rec.annotations: + try: + rec.annotations[key].extend(vals) + except AttributeError: + rec.annotations[key] = [rec.annotations[key]] + vals + else: + rec.annotations[key] = vals + + def _get_rec(self, base, info_dict): + """Retrieve a record to add features to.""" + max_loc = info_dict.get("location", (0, 1))[1] + match_id = self._get_matching_record_id( + base, info_dict["rec_id"] + ) + if match_id: + cur_rec = base[match_id] + # update generated unknown sequences + if unknown_seq_avail and isinstance( + cur_rec.seq, UnknownSeq + ): + cur_rec.seq._length = max( + [max_loc, cur_rec.seq._length] + ) + elif not unknown_seq_avail and isinstance( + cur_rec.seq._data, _UndefinedSequenceData + ): + cur_rec.seq._data._length = max( + [max_loc, cur_rec.seq._data._length] + ) + return cur_rec, base + elif self._create_missing: + if unknown_seq_avail: + new_rec = SeqRecord( + UnknownSeq(max_loc), info_dict["rec_id"] + ) + else: + new_rec = SeqRecord( + Seq(None, length=max_loc), info_dict["rec_id"] + ) + base[info_dict["rec_id"]] = new_rec + return new_rec, base + else: + raise KeyError( + "Did not find matching record in %s for %s" + % (base.keys(), info_dict) + ) + + def _add_missing_parent(self, base, parent_id, cur_children): + """Add a new feature that is missing from the GFF file.""" + base_rec_id = list(set(c[0] for c in cur_children)) + child_strands = list( + set(c[1].location.strand for c in cur_children) + ) + inferred_strand = ( + child_strands[0] if len(child_strands) == 1 else None + ) + assert len(base_rec_id) > 0 + feature_dict = dict( + id=parent_id, + strand=inferred_strand, + type="inferred_parent", + quals=dict(ID=[parent_id]), + rec_id=base_rec_id[0], + ) + coords = [ + (c.location.start, c.location.end) + for r, c in cur_children + ] + feature_dict["location"] = ( + min([c[0] for c in coords]), + max([c[1] for c in coords]), + ) + return self._add_toplevel_feature(base, feature_dict) + + def _add_toplevel_feature(self, base, feature_dict): + """Add a toplevel non-nested feature to the appropriate record.""" + new_feature = self._get_feature(feature_dict) + rec, base = self._get_rec(base, feature_dict) + rec.features.append(new_feature) + return new_feature, base + + def _get_feature(self, feature_dict): + """Retrieve a Biopython feature from our dictionary representation.""" + # location = SeqFeature.FeatureLocation(*feature_dict['location']) + rstart, rend = feature_dict["location"] + new_feature = SeqFeature.SeqFeature( + SeqFeature.SimpleLocation( + start=rstart, end=rend, strand=feature_dict["strand"] + ), + feature_dict["type"], + id=feature_dict["id"], + ) + # Support for Biopython 1.68 and above, which removed sub_features + if not hasattr(new_feature, "sub_features"): + new_feature.sub_features = [] + new_feature.qualifiers = feature_dict["quals"] + return new_feature + + def _parse_fasta(self, in_handle): + """Parse FASTA sequence information contained in the GFF3 file.""" + return list(SeqIO.parse(in_handle, "fasta")) + + +class _GFFParserLocalOut: + """Provide a collector for local GFF MapReduce file parsing.""" + + def __init__(self, smart_breaks=False): + self._items = dict() + self._smart_breaks = smart_breaks + self._missing_keys = collections.defaultdict(int) + self._last_parent = None + self.can_break = True + self.num_lines = 0 + + def add(self, key, vals): + if self._smart_breaks: + # if we are not GFF2 we expect parents and break + # based on not having missing ones + if key == "directive": + if vals[0] == "#": + self.can_break = True + self._last_parent = None + elif not vals[0].get("is_gff2", False): + self._update_missing_parents(key, vals) + self.can_break = len(self._missing_keys) == 0 + # break when we are done with stretches of child features + elif key != "child": + self.can_break = True + self._last_parent = None + # break when we have lots of child features in a row + # and change between parents + else: + cur_parent = vals[0]["quals"]["Parent"][0] + if self._last_parent: + self.can_break = cur_parent != self._last_parent + self._last_parent = cur_parent + self.num_lines += 1 + try: + self._items[key].extend(vals) + except KeyError: + self._items[key] = vals + + def _update_missing_parents(self, key, vals): + # smart way of deciding if we can break this. + # if this is too much, can go back to not breaking in the + # middle of children + if key in ["child"]: + for val in vals: + for p_id in val["quals"]["Parent"]: + self._missing_keys[p_id] += 1 + for val in vals: + try: + del self._missing_keys[val["quals"]["ID"][0]] + except KeyError: + pass + + def has_items(self): + return len(self._items) > 0 + + def get_results(self): + self._last_parent = None + return self._items + + +class GFFParser(_AbstractMapReduceGFF): + """Local GFF parser providing standardized parsing of GFF files.""" + + def __init__(self, line_adjust_fn=None, create_missing=True): + _AbstractMapReduceGFF.__init__( + self, create_missing=create_missing + ) + self._line_adjust_fn = line_adjust_fn + + def _gff_process(self, gff_files, limit_info, target_lines): + """Process GFF addition without any parallelization. + + In addition to limit filtering, this accepts a target_lines attribute + which provides a number of lines to parse before returning results. + This allows partial parsing of a file to prevent memory issues. + """ + line_gen = self._file_line_generator(gff_files) + for out in self._lines_to_out_info( + line_gen, limit_info, target_lines + ): + yield out + + def _file_line_generator(self, gff_files): + """Generate single lines from a set of GFF files.""" + for gff_file in gff_files: + if hasattr(gff_file, "read"): + need_close = False + in_handle = gff_file + else: + need_close = True + in_handle = open(gff_file) + while 1: + line = in_handle.readline() + if not line: + break + yield line + if need_close: + in_handle.close() + + def _lines_to_out_info(self, line_iter, limit_info=None, target_lines=None): + """Generate SeqRecord and SeqFeatures from GFF file lines.""" + params = self._examiner._get_local_params(limit_info) + out_info = _GFFParserLocalOut( + (target_lines is not None and target_lines > 1) + ) + found_seqs = False + for line in line_iter: + results = self._map_fn(line, params) + if self._line_adjust_fn and results: + if results[0][0] not in ["directive"]: + results = [ + ( + results[0][0], + self._line_adjust_fn(results[0][1]), + ) + ] + self._reduce_fn(results, out_info, params) + if ( + target_lines + and out_info.num_lines >= target_lines + and out_info.can_break + ): + yield out_info.get_results() + out_info = _GFFParserLocalOut( + (target_lines is not None and target_lines > 1) + ) + if ( + results + and results[0][0] == "directive" + and results[0][1] == "FASTA" + ): + found_seqs = True + break + + class FakeHandle: + def __init__(self, line_iter): + self._iter = line_iter + + def __iter__(self): + return self + + def __next__(self): + return next(self._iter) + + next = __next__ + + def read(self, size=-1): + if size < 0: + return "".join(x for x in self._iter) + elif size == 0: + return "" # Used by Biopython to sniff unicode vs bytes + else: + raise NotImplementedError + + def readline(self): + try: + return next(self._iter) + except StopIteration: + return "" + + if found_seqs: + fasta_recs = self._parse_fasta(FakeHandle(line_iter)) + out_info.add("fasta", fasta_recs) + if out_info.has_items(): + yield out_info.get_results() + + +class DiscoGFFParser(_AbstractMapReduceGFF): + """GFF Parser with parallelization (http://discoproject.org.""" + + def __init__(self, disco_host, create_missing=True): + """Initialize parser. + + disco_host - Web reference to a Disco host which will be used for + parallelizing the GFF reading job. + """ + _AbstractMapReduceGFF.__init__( + self, create_missing=create_missing + ) + self._disco_host = disco_host + + def _gff_process(self, gff_files, limit_info, target_lines=None): + """Process GFF addition, using Disco to parallelize the process.""" + assert target_lines is None, "Cannot split parallelized jobs" + # make these imports local; only need them when using disco + # absolute path names unless they are special disco files + full_files = [] + for f in gff_files: + if f.split(":")[0] != "disco": + full_files.append(os.path.abspath(f)) + else: + full_files.append(f) + results = disco.job( + self._disco_host, + name="gff_reader", + input=full_files, + params=disco.Params( + limit_info=limit_info, + jsonify=True, + filter_info=self._examiner._filter_info, + ), + required_modules=["json", "collections", "re"], + map=self._map_fn, + reduce=self._reduce_fn, + ) + processed = dict() + for out_key, out_val in disco.result_iterator(results): + processed[out_key] = json.loads(out_val) + yield processed + + +def parse( + gff_files, base_dict=None, limit_info=None, target_lines=None +): + """parse GFF files into SeqRecords and SeqFeatures.""" + parser = GFFParser() + for rec in parser.parse_in_parts( + gff_files, base_dict, limit_info, target_lines + ): + yield rec + + +def parse_simple(gff_files, limit_info=None): + """Parse GFF files as line by line dictionary of parts.""" + parser = GFFParser() + for rec in parser.parse_simple(gff_files, limit_info=limit_info): + if "child" in rec: + assert "parent" not in rec + yield rec["child"][0] + elif "parent" in rec: + yield rec["parent"][0] + elif "feature" in rec: + yield rec["feature"][0] + # ignore directive lines + else: + assert "directive" in rec + + +def _file_or_handle(fn): + """Decorator to handle either an input handle or a file.""" + + def _file_or_handle_inside(*args, **kwargs): + in_file = args[1] + if hasattr(in_file, "read"): + need_close = False + in_handle = in_file + if six.PY3 and not isinstance(in_handle, io.TextIOBase): + raise TypeError( + "input handle must be opened in text mode" + ) + else: + need_close = True + in_handle = open(in_file) + args = (args[0], in_handle) + args[2:] + out = fn(*args, **kwargs) + if need_close: + in_handle.close() + return out + + return _file_or_handle_inside + + +class GFFExaminer: + """Provide high level details about a GFF file to refine parsing. + + GFF is a spec and is provided by many different centers. Real life files + will present the same information in slightly different ways. Becoming + familiar with the file you are dealing with is the best way to extract the + information you need. This class provides high level summary details to + help in learning. + """ + + def __init__(self): + self._filter_info = dict( + gff_id=[0], + gff_source_type=[1, 2], + gff_source=[1], + gff_type=[2], + ) + + def _get_local_params(self, limit_info=None): + class _LocalParams: + def __init__(self): + self.jsonify = False + + params = _LocalParams() + params.limit_info = limit_info + params.filter_info = self._filter_info + return params + + @_file_or_handle + def available_limits(self, gff_handle): + """Return dictionary information on possible limits for this file. + + This returns a nested dictionary with the following structure: + + keys -- names of items to filter by + values -- dictionary with: + keys -- filter choice + value -- counts of that filter in this file + + Not a parallelized map-reduce implementation. + """ + cur_limits = dict() + for filter_key in self._filter_info.keys(): + cur_limits[filter_key] = collections.defaultdict(int) + for line in gff_handle: + # when we hit FASTA sequences, we are done with annotations + if line.startswith("##FASTA"): + break + # ignore empty and comment lines + if line.strip() and line.strip()[0] != "#": + parts = [p.strip() for p in line.split("\t")] + assert len(parts) >= 8, line + parts = parts[:9] + for ( + filter_key, + cur_indexes, + ) in self._filter_info.items(): + cur_id = tuple([parts[i] for i in cur_indexes]) + cur_limits[filter_key][cur_id] += 1 + # get rid of the default dicts + final_dict = dict() + for key, value_dict in cur_limits.items(): + if len(key) == 1: + key = key[0] + final_dict[key] = dict(value_dict) + gff_handle.close() + return final_dict + + @_file_or_handle + def parent_child_map(self, gff_handle): + """Provide a mapping of parent to child relationships in the file. + + Returns a dictionary of parent child relationships: + + keys -- tuple of (source, type) for each parent + values -- tuple of (source, type) as children of that parent + + Not a parallelized map-reduce implementation. + """ + # collect all of the parent and child types mapped to IDs + parent_sts = dict() + child_sts = collections.defaultdict(list) + for line in gff_handle: + # when we hit FASTA sequences, we are done with annotations + if line.startswith("##FASTA"): + break + if line.strip() and not line.startswith("#"): + line_type, line_info = _gff_line_map( + line, self._get_local_params() + )[0] + if line_type == "parent" or ( + line_type == "child" and line_info["id"] + ): + parent_sts[line_info["id"]] = ( + line_info["quals"].get("source", [""])[0], + line_info["type"], + ) + if line_type == "child": + for parent_id in line_info["quals"]["Parent"]: + child_sts[parent_id].append( + ( + line_info["quals"].get( + "source", [""] + )[0], + line_info["type"], + ) + ) + # print parent_sts, child_sts + # generate a dictionary of the unique final type relationships + pc_map = collections.defaultdict(list) + for parent_id, parent_type in parent_sts.items(): + for child_type in child_sts[parent_id]: + pc_map[parent_type].append(child_type) + pc_final_map = dict() + for ptype, ctypes in pc_map.items(): + unique_ctypes = list(set(ctypes)) + unique_ctypes.sort() + pc_final_map[ptype] = unique_ctypes + return pc_final_map diff -r 1fe91657bfd6 -r 4c201a3d4755 jb2_GFF/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jb2_GFF/__init__.py Sun Jan 28 06:48:52 2024 +0000 @@ -0,0 +1,6 @@ +"""Top level of GFF parsing providing shortcuts for useful classes. +""" +from jb2_GFF.GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple +from jb2_GFF.GFFOutput import GFF3Writer, write + +__version__ = "0.6.9" diff -r 1fe91657bfd6 -r 4c201a3d4755 jb2_GFF/__pycache__/GFFOutput.cpython-310.pyc Binary file jb2_GFF/__pycache__/GFFOutput.cpython-310.pyc has changed diff -r 1fe91657bfd6 -r 4c201a3d4755 jb2_GFF/__pycache__/GFFParser.cpython-310.pyc Binary file jb2_GFF/__pycache__/GFFParser.cpython-310.pyc has changed diff -r 1fe91657bfd6 -r 4c201a3d4755 jb2_GFF/__pycache__/__init__.cpython-310.pyc Binary file jb2_GFF/__pycache__/__init__.cpython-310.pyc has changed diff -r 1fe91657bfd6 -r 4c201a3d4755 jb2_GFF/_utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jb2_GFF/_utils.py Sun Jan 28 06:48:52 2024 +0000 @@ -0,0 +1,49 @@ +class defaultdict(dict): + """Back compatible defaultdict: + http://code.activestate.com/recipes/523034/""" + + def __init__(self, default_factory=None, *a, **kw): + if default_factory is not None and not hasattr( + default_factory, "__call__" + ): + raise TypeError("first argument must be callable") + dict.__init__(self, *a, **kw) + self.default_factory = default_factory + + def __getitem__(self, key): + try: + return dict.__getitem__(self, key) + except KeyError: + return self.__missing__(key) + + def __missing__(self, key): + if self.default_factory is None: + raise KeyError(key) + self[key] = value = self.default_factory() + return value + + def __reduce__(self): + if self.default_factory is None: + args = tuple() + else: + args = (self.default_factory,) + return type(self), args, None, None, self.items() + + def copy(self): + return self.__copy__() + + def __copy__(self): + return type(self)(self.default_factory, self) + + def __deepcopy__(self, memo): + import copy + + return type(self)( + self.default_factory, copy.deepcopy(self.items()) + ) + + def __repr__(self): + return "defaultdict(%s, %s)" % ( + self.default_factory, + dict.__repr__(self), + ) diff -r 1fe91657bfd6 -r 4c201a3d4755 jb2_webserver.py --- a/jb2_webserver.py Thu Jan 25 07:58:28 2024 +0000 +++ b/jb2_webserver.py Sun Jan 28 06:48:52 2024 +0000 @@ -169,7 +169,9 @@ help=f"Port to listen on (default: {DEFAULT_PORT})", ) parser.add_argument( - "--bind", default="0.0.0.0", help="IP address to bind to (default: 0.0.0.0)" + "--bind", + default="0.0.0.0", + help="IP address to bind to (default: 0.0.0.0)", ) args = parser.parse_args() diff -r 1fe91657bfd6 -r 4c201a3d4755 jbrowse2.py --- a/jbrowse2.py Thu Jan 25 07:58:28 2024 +0000 +++ b/jbrowse2.py Sun Jan 28 06:48:52 2024 +0000 @@ -458,6 +458,10 @@ self.genome_name = ( genome_name # first one for all tracks - other than paf ) + self.genome_firstcontig = None + fl = open(fapath, "r").readline().strip().split(">", 1) + if len(fl) > 1: + self.genome_firstcontig = fl[1].strip() if self.config_json.get("assemblies", None): self.config_json["assemblies"] += assemblies else: @@ -560,7 +564,7 @@ # can be served - if public. # dsId = trackData["metadata"]["dataset_id"] # url = "%s/api/datasets/%s/display?to_ext=hic " % (self.giURL, dsId) - hname = trackData["label"] + hname = trackData["name"] dest = os.path.join(self.outdir, hname) cmd = ["cp", data, dest] # these can be very big. @@ -648,7 +652,10 @@ "type": "LinearBasicDisplay", "displayId": "%s-LinearBasicDisplay" % tId, }, - {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + { + "type": "LinearArcDisplay", + "displayId": "%s-LinearArcDisplay" % tId, + }, ], } style_json = self._prepare_track_style(trackDict) @@ -717,7 +724,10 @@ "type": "LinearBasicDisplay", "displayId": "%s-LinearBasicDisplay" % tId, }, - {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + { + "type": "LinearArcDisplay", + "displayId": "%s-LinearArcDisplay" % tId, + }, ], } style_json = self._prepare_track_style(trackDict) @@ -906,7 +916,10 @@ "type": "LinearBasicDisplay", "displayId": "%s-LinearBasicDisplay" % tId, }, - {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + { + "type": "LinearArcDisplay", + "displayId": "%s-LinearArcDisplay" % tId, + }, ], } style_json = self._prepare_track_style(trackDict) @@ -945,7 +958,10 @@ "type": "LinearPileupDisplay", "displayId": "%s-LinearPileupDisplay" % tId, }, - {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + { + "type": "LinearArcDisplay", + "displayId": "%s-LinearArcDisplay" % tId, + }, ], } style_json = self._prepare_track_style(trackDict) @@ -983,14 +999,14 @@ "assemblyNames": [self.genome_name, pgname], }, # "displays": [ - # { - # "type": "LinearSyntenyDisplay", - # "displayId": "%s-LinearSyntenyDisplay" % tId, - # }, - # { - # "type": "DotPlotDisplay", - # "displayId": "%s-DotPlotDisplay" % tId, - # }, + # { + # "type": "LinearSyntenyDisplay", + # "displayId": "%s-LinearSyntenyDisplay" % tId, + # }, + # { + # "type": "DotPlotDisplay", + # "displayId": "%s-DotPlotDisplay" % tId, + # }, # ], } style_json = self._prepare_track_style(trackDict) @@ -1143,13 +1159,17 @@ ) elif dataset_ext == "blastxml": self.add_blastxml( - dataset_path, outputTrackConfig, track["conf"]["options"]["blast"] + dataset_path, + outputTrackConfig, + track["conf"]["options"]["blast"], ) elif dataset_ext == "vcf": self.add_vcf(dataset_path, outputTrackConfig) elif dataset_ext == "paf": self.add_paf( - dataset_path, outputTrackConfig, track["conf"]["options"]["synteny"] + dataset_path, + outputTrackConfig, + track["conf"]["options"]["synteny"], ) else: log.warn("Do not know how to handle %s", dataset_ext) @@ -1194,43 +1214,42 @@ view_json = {"type": "LinearGenomeView", "tracks": tracks_data} refName = None + drdict = { + "reversed": False, + "assemblyName": self.genome_name, + "start": 0, + "end": 100000, + } + if data.get("defaultLocation", ""): ddl = data["defaultLocation"] - loc_match = re.search( - r"^([^:]+):(\d+)\.+(\d+)$", ddl - ) + loc_match = re.search(r"^([^:]+):(\d*)\.*(\d*)$", ddl) if loc_match: refName = loc_match.group(1) - start = int(loc_match.group(2)) - end = int(loc_match.group(3)) + drdict["refName"] = refName + if loc_match.group(2) > "": + drdict["start"] = int(loc_match.group(2)) + if loc_match.group(3) > "": + drdict["end"] = int(loc_match.group(3)) else: logging.info( "@@@ regexp could not match contig:start..end in the supplied location %s - please fix" % ddl ) - elif self.genome_name is not None: - start = 0 - end = 10000 # Booh, hard coded! waiting for https://github.com/GMOD/jbrowse-components/issues/2708 + elif self.genome_firstcontig is not None: + drdict["refName"] = self.genome_firstcontig logging.info( - "@@@ no defaultlocation found for default session - please add one" + "@@@ no defaultlocation found for default session - using %s as first contig found" + % self.genome_firstcontig ) - if refName is not None: + if drdict.get("refName", None): # TODO displayedRegions is not just zooming to the region, it hides the rest of the chromosome view_json["displayedRegions"] = [ - { - "refName": refName, - "start": start, - "end": end, - "reversed": False, - "assemblyName": self.genome_name, - } + drdict, ] - logging.info( - "@@@ defaultlocation %s for default session" - % view_json["displayedRegions"] - ) + logging.info("@@@ defaultlocation %s for default session" % drdict) else: logging.info( "@@@ no contig name found for default session - please add one!" @@ -1307,12 +1326,19 @@ ]: cmd = ["rm", "-rf", os.path.join(self.outdir, fn)] self.subprocess_check_call(cmd) - cmd = ["cp", os.path.join(INSTALLED_TO, "jb2_webserver.py"), self.outdir] + cmd = [ + "cp", + os.path.join(INSTALLED_TO, "jb2_webserver.py"), + self.outdir, + ] self.subprocess_check_call(cmd) def parse_style_conf(item): - if "type" in item.attrib and item.attrib["type"] in ["boolean", "integer"]: + if "type" in item.attrib and item.attrib["type"] in [ + "boolean", + "integer", + ]: if item.attrib["type"] == "boolean": return item.text in ("yes", "true", "True") elif item.attrib["type"] == "integer": @@ -1379,7 +1405,10 @@ for x in track.findall("files/trackFile"): if is_multi_bigwig: multi_bigwig_paths.append( - (x.attrib["label"], os.path.realpath(x.attrib["path"])) + ( + x.attrib["label"], + os.path.realpath(x.attrib["path"]), + ) ) else: if trackfiles: diff -r 1fe91657bfd6 -r 4c201a3d4755 jbrowse2.xml --- a/jbrowse2.xml Thu Jan 25 07:58:28 2024 +0000 +++ b/jbrowse2.xml Sun Jan 28 06:48:52 2024 +0000 @@ -1,4 +1,4 @@ - + genome browser macros.xml @@ -781,38 +781,38 @@ about how to run the command line tools to format your data, and which options need to be supplied and where. -The JBrowse-in-Galaxy tool is maintained by `the Galaxy IUC -`__, who you can help you -with missing features or bugs in the tool. +The JBrowse-in-Galaxy tool has been rejected by `a Galaxy IUC +`__, reviewer. +It is maintained by https://github.com/fubar2 who you can help you +with missing features or bugs in the tool. For the record, he remains unconvinced by the reviewer's logic, +and disturbed by the distinctly coercive approach to introducing new code, +compared to the more usual method of providing a working PR. Options ------- -The first option you encounter is the **Reference sequence(s)** to use. This option -now accepts multiple fasta files, allowing you to build JBrowse2 -instances that contain data for multiple genomes or chrosomomes -(generally known as "landmark features" in gff3 terminology.) +**Reference or Assembly** + +Choose either a built-in or select one from your history. -**Track Groups** represent a set of tracks in a single category. These -can be used to let your users understand relationships between large -groups of tracks. +Track coordinates and contig names *must* match this reference precisely +or they will not display. + +**Track Groups** represent a set of tracks in a single category. Annotation Tracks ----------------- -There are a few different types of tracks supported, each with their own -set of options: - GFF3/BED ~~~~~~~~ -These are standard feature tracks. They usually highlight genes, -mRNAs and other features of interest along a genomic region. +Standard feature tracks. They usually highlight genes, mRNAs and other features of interest along a genomic region. When these contain tens of millions of features, such as repeat regions from a VGP assembly, displaying one at a time leads to extremely slow loading times when a large region is in view, unless the "LinearPileupDisplay" display option is selected for that track in the styling options section. The default is LinearBasicDisplay, which shows all details and works -well for relatively sparse bed files. +well for relatively sparse bed files. A better option is to make a bigwig track using a set of windows based on the +lengths of each assembly or reference contig. BAM Pileups ~~~~~~~~~~~ diff -r 1fe91657bfd6 -r 4c201a3d4755 macros.xml --- a/macros.xml Thu Jan 25 07:58:28 2024 +0000 +++ b/macros.xml Sun Jan 28 06:48:52 2024 +0000 @@ -24,7 +24,7 @@ \$GALAXY_JBROWSE_SHARED_DIR - galaxy0 + galaxy2