# HG changeset patch # User fubar # Date 1706594703 0 # Node ID bde6b1d09f7d0b0f03026df29bf4f8263149eded # Parent 2e6c489108199f56d3b9221d5ce5fa5780efb9fb planemo upload for repository https://github.com/usegalaxy-eu/temporary-tools/tree/master/jbrowse2 commit 1290bf486bc55c02fecd0327de10a28655a18e81-dirty diff -r 2e6c48910819 -r bde6b1d09f7d GFFOutput.py --- a/GFFOutput.py Mon Jan 29 02:34:43 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,213 +0,0 @@ -"""Output Biopython SeqRecords and SeqFeatures to GFF3 format. - -The target format is GFF3, the current GFF standard: - http://www.sequenceontology.org/gff3.shtml -""" -from six.moves import urllib - -from Bio import SeqIO - - -class _IdHandler: - """Generate IDs for GFF3 Parent/Child - relationships where they don't exist.""" - - def __init__(self): - self._prefix = "biopygen" - self._counter = 1 - self._seen_ids = [] - - def _generate_id(self, quals): - """Generate a unique ID not present in our existing IDs.""" - gen_id = self._get_standard_id(quals) - if gen_id is None: - while 1: - gen_id = "%s%s" % (self._prefix, self._counter) - if gen_id not in self._seen_ids: - break - self._counter += 1 - return gen_id - - def _get_standard_id(self, quals): - """Retrieve standardized IDs from other sources like NCBI GenBank. - - This tries to find IDs from known key/values when stored differently - than GFF3 specifications. - """ - possible_keys = ["transcript_id", "protein_id"] - for test_key in possible_keys: - if test_key in quals: - cur_id = quals[test_key] - if isinstance(cur_id, tuple) or isinstance(cur_id, list): - return cur_id[0] - else: - return cur_id - return None - - def update_quals(self, quals, has_children): - """Update a set of qualifiers, adding an ID if necessary.""" - cur_id = quals.get("ID", None) - # if we have an ID, record it - if cur_id: - if not isinstance(cur_id, list) and not isinstance(cur_id, tuple): - cur_id = [cur_id] - for add_id in cur_id: - self._seen_ids.append(add_id) - # if we need one and don't have it, create a new one - elif has_children: - new_id = self._generate_id(quals) - self._seen_ids.append(new_id) - quals["ID"] = [new_id] - return quals - - -class GFF3Writer: - """Write GFF3 files starting with standard Biopython objects.""" - - def __init__(self): - pass - - def write(self, recs, out_handle, include_fasta=False): - """Write the provided records to the given handle in GFF3 format.""" - id_handler = _IdHandler() - self._write_header(out_handle) - fasta_recs = [] - try: - recs = iter(recs) - except TypeError: - recs = [recs] - for rec in recs: - self._write_rec(rec, out_handle) - self._write_annotations(rec.annotations, rec.id, len(rec.seq), out_handle) - for sf in rec.features: - sf = self._clean_feature(sf) - id_handler = self._write_feature(sf, rec.id, out_handle, id_handler) - if include_fasta and len(rec.seq) > 0: - fasta_recs.append(rec) - if len(fasta_recs) > 0: - self._write_fasta(fasta_recs, out_handle) - - def _clean_feature(self, feature): - quals = {} - for key, val in feature.qualifiers.items(): - if not isinstance(val, (list, tuple)): - val = [val] - val = [str(x) for x in val] - quals[key] = val - feature.qualifiers = quals - # Support for Biopython 1.68 and above, which removed sub_features - if not hasattr(feature, "sub_features"): - feature.sub_features = [] - clean_sub = [self._clean_feature(f) for f in feature.sub_features] - feature.sub_features = clean_sub - return feature - - def _write_rec(self, rec, out_handle): - # if we have a SeqRecord, write out optional 
directive - if len(rec.seq) > 0: - out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq))) - - def _get_phase(self, feature): - if "phase" in feature.qualifiers: - phase = feature.qualifiers["phase"][0] - elif feature.type == "CDS": - phase = int(feature.qualifiers.get("codon_start", [1])[0]) - 1 - else: - phase = "." - return str(phase) - - def _write_feature(self, feature, rec_id, out_handle, id_handler, parent_id=None): - """Write a feature with location information.""" - if feature.location.strand == 1: - strand = "+" - elif feature.location.strand == -1: - strand = "-" - else: - strand = "." - # remove any standard features from the qualifiers - quals = feature.qualifiers.copy() - for std_qual in ["source", "score", "phase"]: - if std_qual in quals and len(quals[std_qual]) == 1: - del quals[std_qual] - # add a link to a parent identifier if it exists - if parent_id: - if "Parent" not in quals: - quals["Parent"] = [] - quals["Parent"].append(parent_id) - quals = id_handler.update_quals(quals, len(feature.sub_features) > 0) - if feature.type: - ftype = feature.type - else: - ftype = "sequence_feature" - parts = [ - str(rec_id), - feature.qualifiers.get("source", ["feature"])[0], - ftype, - str(feature.location.start + 1), # 1-based indexing - str(feature.location.end), - feature.qualifiers.get("score", ["."])[0], - strand, - self._get_phase(feature), - self._format_keyvals(quals), - ] - out_handle.write("\t".join(parts) + "\n") - for sub_feature in feature.sub_features: - id_handler = self._write_feature( - sub_feature, - rec_id, - out_handle, - id_handler, - quals["ID"][0], - ) - return id_handler - - def _format_keyvals(self, keyvals): - format_kvs = [] - for key in sorted(keyvals.keys()): - values = keyvals[key] - key = key.strip() - format_vals = [] - if not isinstance(values, list) or isinstance(values, tuple): - values = [values] - for val in values: - val = urllib.parse.quote(str(val).strip(), safe=":/ ") - if (key and val) and val not in format_vals: - format_vals.append(val) - format_kvs.append("%s=%s" % (key, ",".join(format_vals))) - return ";".join(format_kvs) - - def _write_annotations(self, anns, rec_id, size, out_handle): - """Add annotations which refer to an entire sequence.""" - format_anns = self._format_keyvals(anns) - if format_anns: - parts = [ - rec_id, - "annotation", - "remark", - "1", - str(size if size > 1 else 1), - ".", - ".", - ".", - format_anns, - ] - out_handle.write("\t".join(parts) + "\n") - - def _write_header(self, out_handle): - """Write out standard header directives.""" - out_handle.write("##gff-version 3\n") - - def _write_fasta(self, recs, out_handle): - """Write sequence records using the ##FASTA directive.""" - out_handle.write("##FASTA\n") - SeqIO.write(recs, out_handle, "fasta") - - -def write(recs, out_handle, include_fasta=False): - """High level interface to write GFF3 files from SeqRecords and SeqFeatures. - - If include_fasta is True, the GFF3 file will include sequence information - using the ##FASTA directive. 
- """ - writer = GFF3Writer() - return writer.write(recs, out_handle, include_fasta) diff -r 2e6c48910819 -r bde6b1d09f7d autogenJB2.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/autogenJB2.py Tue Jan 30 06:05:03 2024 +0000 @@ -0,0 +1,88 @@ +import argparse +import os + +from jbrowse2 import jbrowseConnector as jbC + + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="", epilog="") + parser.add_argument("--yaml", help="Track Configuration") + parser.add_argument("--outdir", help="Output directory", default="out") + parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.0.1") + args = parser.parse_args() + if os.path.exists(args.outdir): + root = args.outdir + dirList = os.scandir(root) + subdirs = [f.path for f in dirList if f.is_dir()] + genome_paths = [f.path for f in dirList if f.name.startswith('genome') and f.is_file()] + if len(genome_paths) > 0: + genome_fnames = [os.path.basename(x).split('_')[2:] for x in genome_paths] # expect genome_1_genomename.fasta etc + jc = jbC( + outdir=args.outdir, + genomes=[ + { + "path": x, + "meta": {"name" : genome_fnames[i], }, + } + for i,x in enumerate(genome_paths) + ], + ) + jc.process_genomes() + # .add_default_view() replace from https://github.com/abretaud/tools-iuc/blob/jbrowse2/tools/jbrowse2/jbrowse2.py + default_session_data = { + "visibility": { + "default_on": [], + "default_off": [], + }, + "style": {}, + "style_labels": {}, + } + + track_paths = [x for x in genome_paths if not x.startswith('genome') and x.is_file()] + for i, track in enumerate(track_paths): + track_conf = {} + track_conf['format']= os.path.basename(track).split('_')[0] + track_conf["name"] = os.path.basename(track).split('_')[2:] # expect genome_1_genomename.fasta etc + fext = os.path.splitext(os.path.basename(track)).replace('.','') + track_conf["label"] = "%s_%i" % (os.path.basename(track), i) + track_conf["trackfiles"] = [] + keys = jc.process_annotations(track_conf) + + if keys: + for key in keys: + default_session_data["visibility"][ + track.attrib.get("visibility", "default_off") + ].append(key) + if track_conf.get("style", None): + default_session_data["style"][key] = track_conf[ + "style" + ] # TODO do we need this anymore? + if track_conf.get("style_lables", None): + default_session_data["style_labels"][key] = track_conf.get( + "style_labels", None + ) + # default_session_data["defaultLocation"] = root.find( + # "metadata/general/defaultLocation" + # ).text + # default_session_data["session_name"] = root.find( + # "metadata/general/session_name" + # ).text + # general_data = { + # "analytics": root.find("metadata/general/analytics").text, + # "primary_color": root.find("metadata/general/primary_color").text, + # "secondary_color": root.find("metadata/general/secondary_color").text, + # "tertiary_color": root.find("metadata/general/tertiary_color").text, + # "quaternary_color": root.find("metadata/general/quaternary_color").text, + # "font_size": root.find("metadata/general/font_size").text, + # } + # jc.add_general_configuration(general_data) + trackconf = jc.config_json.get("tracks", None) + if trackconf: + jc.config_json["tracks"].update(jc.tracksToAdd) + else: + jc.config_json["tracks"] = jc.tracksToAdd + jc.write_config() + jc.add_default_session(default_session_data) + # jc.text_index() not sure what broke here. 
diff -r 2e6c48910819 -r bde6b1d09f7d autogenJB2.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/autogenJB2.xml Tue Jan 30 06:05:03 2024 +0000 @@ -0,0 +1,148 @@ + + Files to JBrowse2 + + macros.xml + + + + jbrowse2 + + + python '${__tool_directory__}/autogenJB2.py' --version + to test the files I want to test. Hmph. +#if str($uglyTestingHack) == "enabled": + && cp '$trackxml' '$output' +#end if + ]]> + + + + + + + + + + `__, reviewer. +It is maintained by https://github.com/fubar2 who you can help you +with missing features or bugs in the tool. For the record, he remains unconvinced by the reviewer's logic, +and disturbed by the distinctly coercive approach to introducing new code, +compared to the more usual method of providing a working PR. + +Options +------- + +**Reference or Assembly** + +Choose either a built-in or select one from your history. + +Track coordinates and contig names *must* match this reference precisely +or they will not display. + +**Track Groups** represent a set of tracks in a single category. + +Annotation Tracks +----------------- + +GFF3/BED +~~~~~~~~ + +Standard feature tracks. They usually highlight genes, mRNAs and other features of interest along a genomic region. + +When these contain tens of millions of features, such as repeat regions from a VGP assembly, displaying one at a time leads +to extremely slow loading times when a large region is in view, unless the "LinearPileupDisplay" display option is +selected for that track in the styling options section. The default is LinearBasicDisplay, which shows all details and works +well for relatively sparse bed files. A better option is to make a bigwig track using a set of windows based on the +lengths of each assembly or reference contig. + +BAM Pileups +~~~~~~~~~~~ + +We support BAM files and can automatically generate SNP tracks based on +that bam data. + + +BlastXML +~~~~~~~~ + +JiG now supports both blastn and blastp datasets. JiG internally uses a +blastXML to gapped GFF3 tool to convert your blastxml datasets into a +format amenable to visualization in JBrowse. This tool is also +available separately from the IUC on the toolshed. + +**Minimum Gap Size** reflects how long a gap must be before it becomes a +real gap in the processed gff3 file. In the picture above, various sizes +of gaps can be seen. If the minimum gap size was set much higher, say +100nt, many of the smaller gaps would disappear, and the features on +both sides would be merged into one, longer feature. This setting is +inversely proportional to runtime and output file size. *Do not set this +to a low value for large datasets*. By setting this number lower, you +will have extremely large outputs and extremely long runtimes. The +default was configured based off of the author's experience, but the +author only works on small viruses. It is *strongly* recommended that +you filter your blast results before display, e.g. picking out the top +10 hits or so. + +**Protein blast search** option merely informs underlying tools that +they should adjust feature locations by 3x. 
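To make the minimum gap size behaviour concrete, the sketch below (hypothetical, not the converter shipped with the tool) merges adjacent hit segments whenever the gap separating them is shorter than the threshold; the coordinates and the function name are illustrative only::

    def merge_small_gaps(segments, min_gap):
        """Merge (start, end) segments separated by gaps shorter than min_gap."""
        merged = []
        for start, end in sorted(segments):
            if merged and start - merged[-1][1] < min_gap:
                # The gap is below the threshold: extend the previous feature.
                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
            else:
                merged.append((start, end))
        return merged

    # With a 100 nt minimum, two hits separated by a 40 nt gap become one feature:
    # merge_small_gaps([(1, 300), (340, 600)], min_gap=100) -> [(1, 600)]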
+ + +@ATTRIBUTION@ +]]> + + diff -r 2e6c48910819 -r bde6b1d09f7d gff3_rebase.py --- a/gff3_rebase.py Mon Jan 29 02:34:43 2024 +0000 +++ b/gff3_rebase.py Tue Jan 30 06:05:03 2024 +0000 @@ -117,7 +117,7 @@ start *= 3 end *= 3 - if parent.location.strand >= 0: + if parent.location.strand !=None and parent.location.strand >= 0: ns = parent.location.start + start ne = parent.location.start + end st = +1 @@ -136,7 +136,8 @@ ns %= 3 if ne < 0: ne %= 3 - + if ns > ne: + ne, ns = ns, ne # dunno why but sometimes happens feature.location = FeatureLocation(ns, ne, strand=st) if hasattr(feature, "sub_features"): diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/GFFOutput.py --- a/jb2_GFF/GFFOutput.py Mon Jan 29 02:34:43 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,233 +0,0 @@ -"""Output Biopython SeqRecords and SeqFeatures to GFF3 format. - -The target format is GFF3, the current GFF standard: - http://www.sequenceontology.org/gff3.shtml -""" -from six.moves import urllib - -from Bio import SeqIO - - -class _IdHandler: - """Generate IDs for GFF3 Parent/Child - relationships where they don't exist.""" - - def __init__(self): - self._prefix = "biopygen" - self._counter = 1 - self._seen_ids = [] - - def _generate_id(self, quals): - """Generate a unique ID not present in our existing IDs.""" - gen_id = self._get_standard_id(quals) - if gen_id is None: - while 1: - gen_id = "%s%s" % (self._prefix, self._counter) - if gen_id not in self._seen_ids: - break - self._counter += 1 - return gen_id - - def _get_standard_id(self, quals): - """Retrieve standardized IDs from other sources like NCBI GenBank. - - This tries to find IDs from known key/values when stored differently - than GFF3 specifications. - """ - possible_keys = ["transcript_id", "protein_id"] - for test_key in possible_keys: - if test_key in quals: - cur_id = quals[test_key] - if isinstance(cur_id, tuple) or isinstance( - cur_id, list - ): - return cur_id[0] - else: - return cur_id - return None - - def update_quals(self, quals, has_children): - """Update a set of qualifiers, adding an ID if necessary.""" - cur_id = quals.get("ID", None) - # if we have an ID, record it - if cur_id: - if not isinstance(cur_id, list) and not isinstance( - cur_id, tuple - ): - cur_id = [cur_id] - for add_id in cur_id: - self._seen_ids.append(add_id) - # if we need one and don't have it, create a new one - elif has_children: - new_id = self._generate_id(quals) - self._seen_ids.append(new_id) - quals["ID"] = [new_id] - return quals - - -class GFF3Writer: - """Write GFF3 files starting with standard Biopython objects.""" - - def __init__(self): - pass - - def write(self, recs, out_handle, include_fasta=False): - """Write the provided records to the given handle in GFF3 format.""" - id_handler = _IdHandler() - self._write_header(out_handle) - fasta_recs = [] - try: - recs = iter(recs) - except TypeError: - recs = [recs] - for rec in recs: - self._write_rec(rec, out_handle) - self._write_annotations( - rec.annotations, rec.id, len(rec.seq), out_handle - ) - for sf in rec.features: - sf = self._clean_feature(sf) - id_handler = self._write_feature( - sf, rec.id, out_handle, id_handler - ) - if include_fasta and len(rec.seq) > 0: - fasta_recs.append(rec) - if len(fasta_recs) > 0: - self._write_fasta(fasta_recs, out_handle) - - def _clean_feature(self, feature): - quals = {} - for key, val in feature.qualifiers.items(): - if not isinstance(val, (list, tuple)): - val = [val] - val = [str(x) for x in val] - quals[key] = val - feature.qualifiers = quals - # Support 
for Biopython 1.68 and above, which removed sub_features - if not hasattr(feature, "sub_features"): - feature.sub_features = [] - clean_sub = [ - self._clean_feature(f) for f in feature.sub_features - ] - feature.sub_features = clean_sub - return feature - - def _write_rec(self, rec, out_handle): - # if we have a SeqRecord, write out optional directive - if len(rec.seq) > 0: - out_handle.write( - "##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq)) - ) - - def _get_phase(self, feature): - if "phase" in feature.qualifiers: - phase = feature.qualifiers["phase"][0] - elif feature.type == "CDS": - phase = ( - int(feature.qualifiers.get("codon_start", [1])[0]) - 1 - ) - else: - phase = "." - return str(phase) - - def _write_feature( - self, feature, rec_id, out_handle, id_handler, parent_id=None - ): - """Write a feature with location information.""" - if feature.location.strand == 1: - strand = "+" - elif feature.location.strand == -1: - strand = "-" - else: - strand = "." - # remove any standard features from the qualifiers - quals = feature.qualifiers.copy() - for std_qual in ["source", "score", "phase"]: - if std_qual in quals and len(quals[std_qual]) == 1: - del quals[std_qual] - # add a link to a parent identifier if it exists - if parent_id: - if "Parent" not in quals: - quals["Parent"] = [] - quals["Parent"].append(parent_id) - quals = id_handler.update_quals( - quals, len(feature.sub_features) > 0 - ) - if feature.type: - ftype = feature.type - else: - ftype = "sequence_feature" - parts = [ - str(rec_id), - feature.qualifiers.get("source", ["feature"])[0], - ftype, - str(feature.location.start + 1), # 1-based indexing - str(feature.location.end), - feature.qualifiers.get("score", ["."])[0], - strand, - self._get_phase(feature), - self._format_keyvals(quals), - ] - out_handle.write("\t".join(parts) + "\n") - for sub_feature in feature.sub_features: - id_handler = self._write_feature( - sub_feature, - rec_id, - out_handle, - id_handler, - quals["ID"][0], - ) - return id_handler - - def _format_keyvals(self, keyvals): - format_kvs = [] - for key in sorted(keyvals.keys()): - values = keyvals[key] - key = key.strip() - format_vals = [] - if not isinstance(values, list) or isinstance( - values, tuple - ): - values = [values] - for val in values: - val = urllib.parse.quote(str(val).strip(), safe=":/ ") - if (key and val) and val not in format_vals: - format_vals.append(val) - format_kvs.append("%s=%s" % (key, ",".join(format_vals))) - return ";".join(format_kvs) - - def _write_annotations(self, anns, rec_id, size, out_handle): - """Add annotations which refer to an entire sequence.""" - format_anns = self._format_keyvals(anns) - if format_anns: - parts = [ - rec_id, - "annotation", - "remark", - "1", - str(size if size > 1 else 1), - ".", - ".", - ".", - format_anns, - ] - out_handle.write("\t".join(parts) + "\n") - - def _write_header(self, out_handle): - """Write out standard header directives.""" - out_handle.write("##gff-version 3\n") - - def _write_fasta(self, recs, out_handle): - """Write sequence records using the ##FASTA directive.""" - out_handle.write("##FASTA\n") - SeqIO.write(recs, out_handle, "fasta") - - -def write(recs, out_handle, include_fasta=False): - """High level interface to write GFF3 files from SeqRecords and SeqFeatures. - - If include_fasta is True, the GFF3 file will include sequence information - using the ##FASTA directive. 
- """ - writer = GFF3Writer() - return writer.write(recs, out_handle, include_fasta) diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/GFFParser.py --- a/jb2_GFF/GFFParser.py Mon Jan 29 02:34:43 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1099 +0,0 @@ -"""Parse GFF files into features attached to Biopython SeqRecord objects. - -This deals with GFF3 formatted files, a tab delimited format for storing -sequence features and annotations: - -http://www.sequenceontology.org/gff3.shtml - -It will also deal with older GFF versions (GTF/GFF2): - -http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml -http://mblab.wustl.edu/GTF22.html - -The implementation utilizes map/reduce parsing of GFF using Disco. Disco -(http://discoproject.org) is a Map-Reduce framework for Python utilizing -Erlang for parallelization. The code works on a single processor without -Disco using the same architecture. -""" -import os -import copy -import json -import re -import collections -import io -import itertools -import warnings -import six -from six.moves import urllib - -from Bio.SeqRecord import SeqRecord -from Bio import SeqFeature -from Bio import SeqIO -from Bio import BiopythonDeprecationWarning - -import disco - -# Make defaultdict compatible with versions of python older than 2.4 -try: - collections.defaultdict -except AttributeError: - import _utils - - collections.defaultdict = _utils.defaultdict - -unknown_seq_avail = False -try: - from Bio.Seq import UnknownSeq - - unknown_seq_avail = True -except ImportError: - # Starting with biopython 1.81, has been removed - from Bio.Seq import _UndefinedSequenceData - from Bio.Seq import Seq - - -warnings.simplefilter("ignore", BiopythonDeprecationWarning) - - -def _gff_line_map(line, params): - """Map part of Map-Reduce; parses a line of GFF into a dictionary. - - Given an input line from a GFF file, this: - - decides if the file passes our filtering limits - - if so: - - breaks it into component elements - - determines the type of attribute (flat, parent, child or annotation) - - generates a dictionary of GFF info which can be serialized as JSON - """ - - def _merge_keyvals(parts): - """Merge key-values escaped by quotes - that are improperly split at semicolons.""" - out = [] - for i, p in enumerate(parts): - if ( - i > 0 - and len(p) == 1 - and p[0].endswith('"') - and not p[0].startswith('"') - ): - if out[-1][-1].startswith('"'): - prev_p = out.pop(-1) - to_merge = prev_p[-1] - prev_p[-1] = "%s; %s" % (to_merge, p[0]) - out.append(prev_p) - else: - out.append(p) - return out - - gff3_kw_pat = re.compile(r"\w+=") - - def _split_keyvals(keyval_str): - """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. - - GFF3 has key value pairs like: - count=9;gene=amx-2;sequence=SAGE:aacggagccg - GFF2 and GTF have: - Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" - name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 - """ - quals = collections.defaultdict(list) - if keyval_str is None: - return quals - # ensembl GTF has a stray semi-colon at the end - if keyval_str[-1] == ";": - keyval_str = keyval_str[:-1] - # GFF2/GTF has a semi-colon with at least one space after it. - # It can have spaces on both sides; wormbase does this. - # GFF3 works with no spaces. 
- # Split at the first one we can recognize as working - parts = keyval_str.split(" ; ") - if len(parts) == 1: - parts = [x.strip() for x in keyval_str.split(";")] - # check if we have GFF3 style key-vals (with =) - is_gff2 = True - if gff3_kw_pat.match(parts[0]): - is_gff2 = False - key_vals = _merge_keyvals([p.split("=") for p in parts]) - # otherwise, we are separated by a space with a key as the first item - else: - pieces = [] - for p in parts: - # fix misplaced semi-colons in keys in some GFF2 files - if p and p[0] == ";": - p = p[1:] - pieces.append(p.strip().split(" ")) - key_vals = [(p[0], " ".join(p[1:])) for p in pieces] - for item in key_vals: - # standard in-spec items are key=value - if len(item) == 2: - key, val = item - # out-of-spec files can have just key values. We set an empty value - # which will be changed to true later to standardize. - else: - assert len(item) == 1, item - key = item[0] - val = "" - # remove quotes in GFF2 files - quoted = False - if len(val) > 0 and val[0] == '"' and val[-1] == '"': - quoted = True - val = val[1:-1] - if val: - if quoted: - quals[key].append(val) - else: - quals[key].extend( - [v for v in val.split(",") if v] - ) - # if we don't have a value, make this a key=True/False style - # attribute - else: - quals[key].append("true") - for key, vals in quals.items(): - quals[key] = [urllib.parse.unquote(v) for v in vals] - return quals, is_gff2 - - def _nest_gff2_features(gff_parts): - """Provide nesting of GFF2 transcript parts with transcript IDs. - - exons and coding sequences are mapped to a parent with a transcript_id - in GFF2. This is implemented differently at different genome centers - and this function attempts to resolve that and map things to the GFF3 - way of doing them. - """ - # map protein or transcript ids to a parent - for transcript_id in [ - "transcript_id", - "transcriptId", - "proteinId", - ]: - try: - gff_parts["quals"]["Parent"] = gff_parts["quals"][ - transcript_id - ] - break - except KeyError: - pass - # case for WormBase GFF -- everything labelled as Transcript or CDS - for flat_name in ["Transcript", "CDS"]: - if flat_name in gff_parts["quals"]: - # parent types - if gff_parts["type"] in [flat_name]: - if not gff_parts["id"]: - gff_parts["id"] = gff_parts["quals"][ - flat_name - ][0] - gff_parts["quals"]["ID"] = [gff_parts["id"]] - # children types - elif gff_parts["type"] in [ - "intron", - "exon", - "three_prime_UTR", - "coding_exon", - "five_prime_UTR", - "CDS", - "stop_codon", - "start_codon", - ]: - gff_parts["quals"]["Parent"] = gff_parts["quals"][ - flat_name - ] - break - - return gff_parts - - strand_map = {"+": 1, "-": -1, "?": None, None: None} - line = line.strip() - if line[:2] == "##": - return [("directive", line[2:])] - elif line and line[0] != "#": - parts = line.split("\t") - should_do = True - if params.limit_info: - for limit_name, limit_values in params.limit_info.items(): - cur_id = tuple( - [parts[i] for i in params.filter_info[limit_name]] - ) - if cur_id not in limit_values: - should_do = False - break - if should_do: - assert len(parts) >= 8, line - # not python2.4 compatible but easier to understand - # gff_parts = [(None if p == '.' 
else p) for p in parts] - gff_parts = [] - for p in parts: - if p == ".": - gff_parts.append(None) - else: - gff_parts.append(p) - gff_info = dict() - # collect all of the base qualifiers for this item - if len(parts) > 8: - quals, is_gff2 = _split_keyvals(gff_parts[8]) - else: - quals, is_gff2 = collections.defaultdict(list), False - gff_info["is_gff2"] = is_gff2 - if gff_parts[1]: - quals["source"].append(gff_parts[1]) - if gff_parts[5]: - quals["score"].append(gff_parts[5]) - if gff_parts[7]: - quals["phase"].append(gff_parts[7]) - gff_info["quals"] = dict(quals) - gff_info["rec_id"] = gff_parts[0] - # if we are describing a location, then we are a feature - if gff_parts[3] and gff_parts[4]: - gff_info["location"] = [ - int(gff_parts[3]) - 1, - int(gff_parts[4]), - ] - gff_info["type"] = gff_parts[2] - gff_info["id"] = quals.get("ID", [""])[0] - gff_info["strand"] = strand_map.get( - gff_parts[6], None - ) - if is_gff2: - gff_info = _nest_gff2_features(gff_info) - # features that have parents need to link so we can pick up - # the relationship - if "Parent" in gff_info["quals"]: - # check for self referential parent/child relationships - # remove the ID, which is not useful - for p in gff_info["quals"]["Parent"]: - if p == gff_info["id"]: - gff_info["id"] = "" - del gff_info["quals"]["ID"] - break - final_key = "child" - elif gff_info["id"]: - final_key = "parent" - # Handle flat features - else: - final_key = "feature" - # otherwise, associate these annotations with the full record - else: - final_key = "annotation" - if params.jsonify: - return [(final_key, json.dumps(gff_info))] - else: - return [(final_key, gff_info)] - return [] - - -def _gff_line_reduce(map_results, out, params): - """Reduce part of Map-Reduce; combines results of parsed features.""" - final_items = dict() - for gff_type, final_val in map_results: - if params.jsonify and gff_type not in ["directive"]: - final_val = json.loads(final_val) - try: - final_items[gff_type].append(final_val) - except KeyError: - final_items[gff_type] = [final_val] - for key, vals in final_items.items(): - if params.jsonify: - vals = json.dumps(vals) - out.add(key, vals) - - -class _MultiIDRemapper: - """Provide an ID remapping for cases where a parent has a non-unique ID. - - Real life GFF3 cases have non-unique ID attributes, which we fix here - by using the unique sequence region to assign children to the right - parent. - """ - - def __init__(self, base_id, all_parents): - self._base_id = base_id - self._parents = all_parents - - def remap_id(self, feature_dict): - rstart, rend = feature_dict["location"] - for index, parent in enumerate(self._parents): - pstart, pend = parent["location"] - if rstart >= pstart and rend <= pend: - if index > 0: - return "%s_%s" % (self._base_id, index + 1) - else: - return self._base_id - # if we haven't found a location match but parents are umabiguous, - # return that - if len(self._parents) == 1: - return self._base_id - raise ValueError( - "Did not find remapped ID location: %s, %s, %s" - % ( - self._base_id, - [p["location"] for p in self._parents], - feature_dict["location"], - ) - ) - - -class _AbstractMapReduceGFF: - """Base class providing general GFF parsing for local and remote classes. - - This class should be subclassed to provide a concrete class to parse - GFF under specific conditions. These classes need to implement - the _gff_process function, which returns a dictionary of SeqRecord - information. 
- """ - - def __init__(self, create_missing=True): - """Initialize GFF parser - - create_missing - If True, create blank records for GFF ids not in - the base_dict. If False, an error will be raised. - """ - self._create_missing = create_missing - self._map_fn = _gff_line_map - self._reduce_fn = _gff_line_reduce - self._examiner = GFFExaminer() - - def _gff_process(self, gff_files, limit_info, target_lines=None): - raise NotImplementedError("Derived class must define") - - def parse(self, gff_files, base_dict=None, limit_info=None): - """Parse a GFF file, returning an iterator of SeqRecords. - - limit_info - A dictionary specifying the regions of the GFF file - which should be extracted. This allows only relevant portions of a file - to be parsed. - - base_dict - A base dictionary of SeqRecord objects which may be - pre-populated with sequences and other features. The new features from - the GFF file will be added to this dictionary. - """ - for rec in self.parse_in_parts( - gff_files, base_dict, limit_info - ): - yield rec - - def parse_in_parts( - self, - gff_files, - base_dict=None, - limit_info=None, - target_lines=None, - ): - """Parse a region of a GFF file specified, returning info as generated. - - target_lines -- The number of lines in the file which should be used - for each partial parse. This should be determined based on available - memory. - """ - for results in self.parse_simple( - gff_files, limit_info, target_lines - ): - if base_dict is None: - cur_dict = dict() - else: - cur_dict = copy.deepcopy(base_dict) - cur_dict = self._results_to_features(cur_dict, results) - all_ids = list(cur_dict.keys()) - all_ids.sort() - for cur_id in all_ids: - yield cur_dict[cur_id] - - def parse_simple( - self, gff_files, limit_info=None, target_lines=1 - ): - """Simple parse which does not build or nest features. - - This returns a simple dictionary representation of each line in the - GFF file. - """ - # gracefully handle a single file passed - if not isinstance(gff_files, (list, tuple)): - gff_files = [gff_files] - limit_info = self._normalize_limit_info(limit_info) - for results in self._gff_process( - gff_files, limit_info, target_lines - ): - yield results - - def _normalize_limit_info(self, limit_info): - """Turn all limit information into tuples for identical comparisons.""" - final_limit_info = {} - if limit_info: - for key, values in limit_info.items(): - final_limit_info[key] = [] - for v in values: - if isinstance(v, str): - final_limit_info[key].append((v,)) - else: - final_limit_info[key].append(tuple(v)) - return final_limit_info - - def _results_to_features(self, base, results): - """Add parsed dictionaries of results to Biopython SeqFeatures.""" - base = self._add_annotations( - base, results.get("annotation", []) - ) - for feature in results.get("feature", []): - (_, base) = self._add_toplevel_feature(base, feature) - base = self._add_parent_child_features( - base, results.get("parent", []), results.get("child", []) - ) - base = self._add_seqs(base, results.get("fasta", [])) - base = self._add_directives( - base, results.get("directive", []) - ) - return base - - def _add_directives(self, base, directives): - """Handle any directives or meta-data in the GFF file. - - Relevant items are added as annotation meta-data to each record. 
- """ - dir_keyvals = collections.defaultdict(list) - for directive in directives: - parts = directive.split() - if len(parts) > 1: - key = parts[0] - if len(parts) == 2: - val = parts[1] - else: - val = tuple(parts[1:]) - # specific directives that need special handling - if ( - key == "sequence-region" - ): # convert to Python 0-based coordinates - if len(val) == 2: # handle regions missing contig - val = (int(val[0]) - 1, int(val[1])) - elif len(val) == 3: - val = (val[0], int(val[1]) - 1, int(val[2])) - dir_keyvals[key].append(val) - for key, vals in dir_keyvals.items(): - for rec in base.values(): - self._add_ann_to_rec(rec, key, vals) - return base - - def _get_matching_record_id(self, base, find_id): - """Find a matching base record with the test identifier, handling tricky cases. - - NCBI IDs https://en.wikipedia.org/wiki/FASTA_format#NCBI_identifiers - """ - # Straight matches for identifiers - if find_id in base: - return find_id - # NCBI style IDs in find_id - elif find_id and find_id.find("|") > 0: - for test_id in [ - x.strip() for x in find_id.split("|")[1:] - ]: - if test_id and test_id in base: - return test_id - # NCBI style IDs in base IDs - else: - for base_id in base.keys(): - if base_id.find("|") > 0: - for test_id in [ - x.strip() for x in base_id.split("|")[1:] - ]: - if test_id and test_id == find_id: - return base_id - return None - - def _add_seqs(self, base, recs): - """Add sequence information contained in the GFF3 to records.""" - for rec in recs: - match_id = self._get_matching_record_id(base, rec.id) - if match_id: - base[match_id].seq = rec.seq - else: - base[rec.id] = rec - return base - - def _add_parent_child_features(self, base, parents, children): - """Add nested features with parent child relationships.""" - multi_remap = self._identify_dup_ids(parents) - # add children features - children_prep = collections.defaultdict(list) - for child_dict in children: - child_feature = self._get_feature(child_dict) - for pindex, pid in enumerate( - child_feature.qualifiers["Parent"] - ): - if pid in multi_remap: - pid = multi_remap[pid].remap_id(child_dict) - child_feature.qualifiers["Parent"][pindex] = pid - children_prep[pid].append( - (child_dict["rec_id"], child_feature) - ) - children = dict(children_prep) - # add children to parents that exist - for cur_parent_dict in parents: - cur_id = cur_parent_dict["id"] - if cur_id in multi_remap: - cur_parent_dict["id"] = multi_remap[cur_id].remap_id( - cur_parent_dict - ) - cur_parent, base = self._add_toplevel_feature( - base, cur_parent_dict - ) - cur_parent, children = self._add_children_to_parent( - cur_parent, children - ) - # create parents for children without them (GFF2 or split/bad files) - while len(children) > 0: - parent_id, cur_children = next( - itertools.islice(children.items(), 1) - ) - # one child, do not nest it - if len(cur_children) == 1: - rec_id, child = cur_children[0] - loc = (child.location.start, child.location.end) - rec, base = self._get_rec( - base, dict(rec_id=rec_id, location=loc) - ) - rec.features.append(child) - del children[parent_id] - else: - cur_parent, base = self._add_missing_parent( - base, parent_id, cur_children - ) - cur_parent, children = self._add_children_to_parent( - cur_parent, children - ) - return base - - def _identify_dup_ids(self, parents): - """Identify duplicated ID attributes in potential nested parents. - - According to the GFF3 spec ID attributes are supposed to be unique - for a file, but this is not always true in practice. 
This looks - for duplicates, and provides unique IDs sorted by locations. - """ - multi_ids = collections.defaultdict(list) - for parent in parents: - multi_ids[parent["id"]].append(parent) - multi_ids = [ - (mid, ps) - for (mid, ps) in multi_ids.items() - if len(parents) > 1 - ] - multi_remap = dict() - for mid, parents in multi_ids: - multi_remap[mid] = _MultiIDRemapper(mid, parents) - return multi_remap - - def _add_children_to_parent(self, cur_parent, children): - """Recursively add children to parent features.""" - if cur_parent.id in children: - cur_children = children[cur_parent.id] - ready_children = [] - for _, cur_child in cur_children: - cur_child, _ = self._add_children_to_parent( - cur_child, children - ) - ready_children.append(cur_child) - # Support Biopython features for 1.62+ - # CompoundLocations and pre-1.62 - if not hasattr(SeqFeature, "CompoundLocation"): - cur_parent.location_operator = "join" - for cur_child in ready_children: - cur_parent.sub_features.append(cur_child) - del children[cur_parent.id] - return cur_parent, children - - def _add_annotations(self, base, anns): - """Add annotation data from the GFF file to records.""" - # add these as a list of annotations, checking not to overwrite - # current values - for ann in anns: - rec, base = self._get_rec(base, ann) - for key, vals in ann["quals"].items(): - self._add_ann_to_rec(rec, key, vals) - return base - - def _add_ann_to_rec(self, rec, key, vals): - """Add a key/value annotation to the given SeqRecord.""" - if key in rec.annotations: - try: - rec.annotations[key].extend(vals) - except AttributeError: - rec.annotations[key] = [rec.annotations[key]] + vals - else: - rec.annotations[key] = vals - - def _get_rec(self, base, info_dict): - """Retrieve a record to add features to.""" - max_loc = info_dict.get("location", (0, 1))[1] - match_id = self._get_matching_record_id( - base, info_dict["rec_id"] - ) - if match_id: - cur_rec = base[match_id] - # update generated unknown sequences - if unknown_seq_avail and isinstance( - cur_rec.seq, UnknownSeq - ): - cur_rec.seq._length = max( - [max_loc, cur_rec.seq._length] - ) - elif not unknown_seq_avail and isinstance( - cur_rec.seq._data, _UndefinedSequenceData - ): - cur_rec.seq._data._length = max( - [max_loc, cur_rec.seq._data._length] - ) - return cur_rec, base - elif self._create_missing: - if unknown_seq_avail: - new_rec = SeqRecord( - UnknownSeq(max_loc), info_dict["rec_id"] - ) - else: - new_rec = SeqRecord( - Seq(None, length=max_loc), info_dict["rec_id"] - ) - base[info_dict["rec_id"]] = new_rec - return new_rec, base - else: - raise KeyError( - "Did not find matching record in %s for %s" - % (base.keys(), info_dict) - ) - - def _add_missing_parent(self, base, parent_id, cur_children): - """Add a new feature that is missing from the GFF file.""" - base_rec_id = list(set(c[0] for c in cur_children)) - child_strands = list( - set(c[1].location.strand for c in cur_children) - ) - inferred_strand = ( - child_strands[0] if len(child_strands) == 1 else None - ) - assert len(base_rec_id) > 0 - feature_dict = dict( - id=parent_id, - strand=inferred_strand, - type="inferred_parent", - quals=dict(ID=[parent_id]), - rec_id=base_rec_id[0], - ) - coords = [ - (c.location.start, c.location.end) - for r, c in cur_children - ] - feature_dict["location"] = ( - min([c[0] for c in coords]), - max([c[1] for c in coords]), - ) - return self._add_toplevel_feature(base, feature_dict) - - def _add_toplevel_feature(self, base, feature_dict): - """Add a toplevel non-nested feature to 
the appropriate record.""" - new_feature = self._get_feature(feature_dict) - rec, base = self._get_rec(base, feature_dict) - rec.features.append(new_feature) - return new_feature, base - - def _get_feature(self, feature_dict): - """Retrieve a Biopython feature from our dictionary representation.""" - # location = SeqFeature.FeatureLocation(*feature_dict['location']) - rstart, rend = feature_dict["location"] - new_feature = SeqFeature.SeqFeature( - SeqFeature.SimpleLocation( - start=rstart, end=rend, strand=feature_dict["strand"] - ), - feature_dict["type"], - id=feature_dict["id"], - ) - # Support for Biopython 1.68 and above, which removed sub_features - if not hasattr(new_feature, "sub_features"): - new_feature.sub_features = [] - new_feature.qualifiers = feature_dict["quals"] - return new_feature - - def _parse_fasta(self, in_handle): - """Parse FASTA sequence information contained in the GFF3 file.""" - return list(SeqIO.parse(in_handle, "fasta")) - - -class _GFFParserLocalOut: - """Provide a collector for local GFF MapReduce file parsing.""" - - def __init__(self, smart_breaks=False): - self._items = dict() - self._smart_breaks = smart_breaks - self._missing_keys = collections.defaultdict(int) - self._last_parent = None - self.can_break = True - self.num_lines = 0 - - def add(self, key, vals): - if self._smart_breaks: - # if we are not GFF2 we expect parents and break - # based on not having missing ones - if key == "directive": - if vals[0] == "#": - self.can_break = True - self._last_parent = None - elif not vals[0].get("is_gff2", False): - self._update_missing_parents(key, vals) - self.can_break = len(self._missing_keys) == 0 - # break when we are done with stretches of child features - elif key != "child": - self.can_break = True - self._last_parent = None - # break when we have lots of child features in a row - # and change between parents - else: - cur_parent = vals[0]["quals"]["Parent"][0] - if self._last_parent: - self.can_break = cur_parent != self._last_parent - self._last_parent = cur_parent - self.num_lines += 1 - try: - self._items[key].extend(vals) - except KeyError: - self._items[key] = vals - - def _update_missing_parents(self, key, vals): - # smart way of deciding if we can break this. - # if this is too much, can go back to not breaking in the - # middle of children - if key in ["child"]: - for val in vals: - for p_id in val["quals"]["Parent"]: - self._missing_keys[p_id] += 1 - for val in vals: - try: - del self._missing_keys[val["quals"]["ID"][0]] - except KeyError: - pass - - def has_items(self): - return len(self._items) > 0 - - def get_results(self): - self._last_parent = None - return self._items - - -class GFFParser(_AbstractMapReduceGFF): - """Local GFF parser providing standardized parsing of GFF files.""" - - def __init__(self, line_adjust_fn=None, create_missing=True): - _AbstractMapReduceGFF.__init__( - self, create_missing=create_missing - ) - self._line_adjust_fn = line_adjust_fn - - def _gff_process(self, gff_files, limit_info, target_lines): - """Process GFF addition without any parallelization. - - In addition to limit filtering, this accepts a target_lines attribute - which provides a number of lines to parse before returning results. - This allows partial parsing of a file to prevent memory issues. 
- """ - line_gen = self._file_line_generator(gff_files) - for out in self._lines_to_out_info( - line_gen, limit_info, target_lines - ): - yield out - - def _file_line_generator(self, gff_files): - """Generate single lines from a set of GFF files.""" - for gff_file in gff_files: - if hasattr(gff_file, "read"): - need_close = False - in_handle = gff_file - else: - need_close = True - in_handle = open(gff_file) - while 1: - line = in_handle.readline() - if not line: - break - yield line - if need_close: - in_handle.close() - - def _lines_to_out_info(self, line_iter, limit_info=None, target_lines=None): - """Generate SeqRecord and SeqFeatures from GFF file lines.""" - params = self._examiner._get_local_params(limit_info) - out_info = _GFFParserLocalOut( - (target_lines is not None and target_lines > 1) - ) - found_seqs = False - for line in line_iter: - results = self._map_fn(line, params) - if self._line_adjust_fn and results: - if results[0][0] not in ["directive"]: - results = [ - ( - results[0][0], - self._line_adjust_fn(results[0][1]), - ) - ] - self._reduce_fn(results, out_info, params) - if ( - target_lines - and out_info.num_lines >= target_lines - and out_info.can_break - ): - yield out_info.get_results() - out_info = _GFFParserLocalOut( - (target_lines is not None and target_lines > 1) - ) - if ( - results - and results[0][0] == "directive" - and results[0][1] == "FASTA" - ): - found_seqs = True - break - - class FakeHandle: - def __init__(self, line_iter): - self._iter = line_iter - - def __iter__(self): - return self - - def __next__(self): - return next(self._iter) - - next = __next__ - - def read(self, size=-1): - if size < 0: - return "".join(x for x in self._iter) - elif size == 0: - return "" # Used by Biopython to sniff unicode vs bytes - else: - raise NotImplementedError - - def readline(self): - try: - return next(self._iter) - except StopIteration: - return "" - - if found_seqs: - fasta_recs = self._parse_fasta(FakeHandle(line_iter)) - out_info.add("fasta", fasta_recs) - if out_info.has_items(): - yield out_info.get_results() - - -class DiscoGFFParser(_AbstractMapReduceGFF): - """GFF Parser with parallelization (http://discoproject.org.""" - - def __init__(self, disco_host, create_missing=True): - """Initialize parser. - - disco_host - Web reference to a Disco host which will be used for - parallelizing the GFF reading job. 
- """ - _AbstractMapReduceGFF.__init__( - self, create_missing=create_missing - ) - self._disco_host = disco_host - - def _gff_process(self, gff_files, limit_info, target_lines=None): - """Process GFF addition, using Disco to parallelize the process.""" - assert target_lines is None, "Cannot split parallelized jobs" - # make these imports local; only need them when using disco - # absolute path names unless they are special disco files - full_files = [] - for f in gff_files: - if f.split(":")[0] != "disco": - full_files.append(os.path.abspath(f)) - else: - full_files.append(f) - results = disco.job( - self._disco_host, - name="gff_reader", - input=full_files, - params=disco.Params( - limit_info=limit_info, - jsonify=True, - filter_info=self._examiner._filter_info, - ), - required_modules=["json", "collections", "re"], - map=self._map_fn, - reduce=self._reduce_fn, - ) - processed = dict() - for out_key, out_val in disco.result_iterator(results): - processed[out_key] = json.loads(out_val) - yield processed - - -def parse( - gff_files, base_dict=None, limit_info=None, target_lines=None -): - """parse GFF files into SeqRecords and SeqFeatures.""" - parser = GFFParser() - for rec in parser.parse_in_parts( - gff_files, base_dict, limit_info, target_lines - ): - yield rec - - -def parse_simple(gff_files, limit_info=None): - """Parse GFF files as line by line dictionary of parts.""" - parser = GFFParser() - for rec in parser.parse_simple(gff_files, limit_info=limit_info): - if "child" in rec: - assert "parent" not in rec - yield rec["child"][0] - elif "parent" in rec: - yield rec["parent"][0] - elif "feature" in rec: - yield rec["feature"][0] - # ignore directive lines - else: - assert "directive" in rec - - -def _file_or_handle(fn): - """Decorator to handle either an input handle or a file.""" - - def _file_or_handle_inside(*args, **kwargs): - in_file = args[1] - if hasattr(in_file, "read"): - need_close = False - in_handle = in_file - if six.PY3 and not isinstance(in_handle, io.TextIOBase): - raise TypeError( - "input handle must be opened in text mode" - ) - else: - need_close = True - in_handle = open(in_file) - args = (args[0], in_handle) + args[2:] - out = fn(*args, **kwargs) - if need_close: - in_handle.close() - return out - - return _file_or_handle_inside - - -class GFFExaminer: - """Provide high level details about a GFF file to refine parsing. - - GFF is a spec and is provided by many different centers. Real life files - will present the same information in slightly different ways. Becoming - familiar with the file you are dealing with is the best way to extract the - information you need. This class provides high level summary details to - help in learning. - """ - - def __init__(self): - self._filter_info = dict( - gff_id=[0], - gff_source_type=[1, 2], - gff_source=[1], - gff_type=[2], - ) - - def _get_local_params(self, limit_info=None): - class _LocalParams: - def __init__(self): - self.jsonify = False - - params = _LocalParams() - params.limit_info = limit_info - params.filter_info = self._filter_info - return params - - @_file_or_handle - def available_limits(self, gff_handle): - """Return dictionary information on possible limits for this file. - - This returns a nested dictionary with the following structure: - - keys -- names of items to filter by - values -- dictionary with: - keys -- filter choice - value -- counts of that filter in this file - - Not a parallelized map-reduce implementation. 
- """ - cur_limits = dict() - for filter_key in self._filter_info.keys(): - cur_limits[filter_key] = collections.defaultdict(int) - for line in gff_handle: - # when we hit FASTA sequences, we are done with annotations - if line.startswith("##FASTA"): - break - # ignore empty and comment lines - if line.strip() and line.strip()[0] != "#": - parts = [p.strip() for p in line.split("\t")] - assert len(parts) >= 8, line - parts = parts[:9] - for ( - filter_key, - cur_indexes, - ) in self._filter_info.items(): - cur_id = tuple([parts[i] for i in cur_indexes]) - cur_limits[filter_key][cur_id] += 1 - # get rid of the default dicts - final_dict = dict() - for key, value_dict in cur_limits.items(): - if len(key) == 1: - key = key[0] - final_dict[key] = dict(value_dict) - gff_handle.close() - return final_dict - - @_file_or_handle - def parent_child_map(self, gff_handle): - """Provide a mapping of parent to child relationships in the file. - - Returns a dictionary of parent child relationships: - - keys -- tuple of (source, type) for each parent - values -- tuple of (source, type) as children of that parent - - Not a parallelized map-reduce implementation. - """ - # collect all of the parent and child types mapped to IDs - parent_sts = dict() - child_sts = collections.defaultdict(list) - for line in gff_handle: - # when we hit FASTA sequences, we are done with annotations - if line.startswith("##FASTA"): - break - if line.strip() and not line.startswith("#"): - line_type, line_info = _gff_line_map( - line, self._get_local_params() - )[0] - if line_type == "parent" or ( - line_type == "child" and line_info["id"] - ): - parent_sts[line_info["id"]] = ( - line_info["quals"].get("source", [""])[0], - line_info["type"], - ) - if line_type == "child": - for parent_id in line_info["quals"]["Parent"]: - child_sts[parent_id].append( - ( - line_info["quals"].get( - "source", [""] - )[0], - line_info["type"], - ) - ) - # print parent_sts, child_sts - # generate a dictionary of the unique final type relationships - pc_map = collections.defaultdict(list) - for parent_id, parent_type in parent_sts.items(): - for child_type in child_sts[parent_id]: - pc_map[parent_type].append(child_type) - pc_final_map = dict() - for ptype, ctypes in pc_map.items(): - unique_ctypes = list(set(ctypes)) - unique_ctypes.sort() - pc_final_map[ptype] = unique_ctypes - return pc_final_map diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/__init__.py --- a/jb2_GFF/__init__.py Mon Jan 29 02:34:43 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -"""Top level of GFF parsing providing shortcuts for useful classes. 
-""" -from jb2_GFF.GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple -from jb2_GFF.GFFOutput import GFF3Writer, write - -__version__ = "0.6.9" diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/__pycache__/GFFOutput.cpython-310.pyc Binary file jb2_GFF/__pycache__/GFFOutput.cpython-310.pyc has changed diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/__pycache__/GFFParser.cpython-310.pyc Binary file jb2_GFF/__pycache__/GFFParser.cpython-310.pyc has changed diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/__pycache__/__init__.cpython-310.pyc Binary file jb2_GFF/__pycache__/__init__.cpython-310.pyc has changed diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/_utils.py --- a/jb2_GFF/_utils.py Mon Jan 29 02:34:43 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -class defaultdict(dict): - """Back compatible defaultdict: - http://code.activestate.com/recipes/523034/""" - - def __init__(self, default_factory=None, *a, **kw): - if default_factory is not None and not hasattr( - default_factory, "__call__" - ): - raise TypeError("first argument must be callable") - dict.__init__(self, *a, **kw) - self.default_factory = default_factory - - def __getitem__(self, key): - try: - return dict.__getitem__(self, key) - except KeyError: - return self.__missing__(key) - - def __missing__(self, key): - if self.default_factory is None: - raise KeyError(key) - self[key] = value = self.default_factory() - return value - - def __reduce__(self): - if self.default_factory is None: - args = tuple() - else: - args = (self.default_factory,) - return type(self), args, None, None, self.items() - - def copy(self): - return self.__copy__() - - def __copy__(self): - return type(self)(self.default_factory, self) - - def __deepcopy__(self, memo): - import copy - - return type(self)( - self.default_factory, copy.deepcopy(self.items()) - ) - - def __repr__(self): - return "defaultdict(%s, %s)" % ( - self.default_factory, - dict.__repr__(self), - ) diff -r 2e6c48910819 -r bde6b1d09f7d jbrowse2.py --- a/jbrowse2.py Mon Jan 29 02:34:43 2024 +0000 +++ b/jbrowse2.py Tue Jan 30 06:05:03 2024 +0000 @@ -3,7 +3,6 @@ import argparse import binascii import datetime -import hashlib import json import logging import os @@ -23,7 +22,7 @@ TODAY = datetime.datetime.now().strftime("%Y-%m-%d") GALAXY_INFRASTRUCTURE_URL = None -JB2REL = "v2.10.0" +JB2REL = "v2.10.1" # version pinned for cloning mapped_chars = { @@ -232,7 +231,9 @@ elif "scaling" in track: if track["scaling"]["method"] == "ignore": if track["scaling"]["scheme"]["color"] != "__auto__": - trackConfig["style"]["color"] = track["scaling"]["scheme"]["color"] + trackConfig["style"]["color"] = track["scaling"]["scheme"][ + "color" + ] else: trackConfig["style"]["color"] = self.hex_from_rgb( *self._get_colours() @@ -259,13 +260,18 @@ "blue": blue, } ) - trackConfig["style"]["color"] = color_function.replace("\n", "") + trackConfig["style"]["color"] = color_function.replace( + "\n", "" + ) elif trackFormat == "gene_calls": # Default values, based on GFF3 spec min_val = 0 max_val = 1000 # Get min/max and build a scoring function since JBrowse doesn't - if scales["type"] == "automatic" or scales["type"] == "__auto__": + if ( + scales["type"] == "automatic" + or scales["type"] == "__auto__" + ): min_val, max_val = self.min_max_gff(gff3) else: min_val = scales.get("min", 0) @@ -273,7 +279,9 @@ if scheme["color"] == "__auto__": user_color = "undefined" - auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours()) + auto_color = "'%s'" % self.hex_from_rgb( + 
*self._get_colours() + ) elif scheme["color"].startswith("#"): user_color = "'%s'" % self.hex_from_rgb( *self.rgb_from_hex(scheme["color"][1:]) @@ -281,7 +289,9 @@ auto_color = "undefined" else: user_color = "undefined" - auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours()) + auto_color = "'%s'" % self.hex_from_rgb( + *self._get_colours() + ) color_function = self.COLOR_FUNCTION_TEMPLATE_QUAL.format( **{ @@ -293,7 +303,9 @@ } ) - trackConfig["style"]["color"] = color_function.replace("\n", "") + trackConfig["style"]["color"] = color_function.replace( + "\n", "" + ) return trackConfig @@ -336,40 +348,41 @@ for (key, value) in node.findall("dataset")[0].attrib.items(): metadata["dataset_%s" % key] = value - for (key, value) in node.findall("history")[0].attrib.items(): - metadata["history_%s" % key] = value - - for (key, value) in node.findall("metadata")[0].attrib.items(): - metadata["metadata_%s" % key] = value - - for (key, value) in node.findall("tool")[0].attrib.items(): - metadata["tool_%s" % key] = value + if node.findall("history"): + for (key, value) in node.findall("history")[0].attrib.items(): + metadata["history_%s" % key] = value - # Additional Mappings applied: - metadata[ - "dataset_edam_format" - ] = '{1}'.format( - metadata["dataset_edam_format"], metadata["dataset_file_ext"] - ) - metadata["history_user_email"] = '{0}'.format( - metadata["history_user_email"] - ) - metadata["hist_name"] = metadata["history_display_name"] - metadata[ - "history_display_name" - ] = '{hist_name}'.format( - galaxy=GALAXY_INFRASTRUCTURE_URL, - encoded_hist_id=metadata["history_id"], - hist_name=metadata["history_display_name"], - ) - metadata[ - "tool_tool" - ] = '{tool_id}'.format( - galaxy=GALAXY_INFRASTRUCTURE_URL, - encoded_id=metadata["dataset_id"], - tool_id=metadata["tool_tool_id"], - # tool_version=metadata['tool_tool_version'], - ) + if node.findall("metadata"): + for (key, value) in node.findall("metadata")[0].attrib.items(): + metadata["metadata_%s" % key] = value + # Additional Mappings applied: + metadata[ + "dataset_edam_format" + ] = '{1}'.format( + metadata["dataset_edam_format"], metadata["dataset_file_ext"] + ) + metadata["history_user_email"] = '{0}'.format( + metadata["history_user_email"] + ) + metadata["hist_name"] = metadata["history_display_name"] + metadata[ + "history_display_name" + ] = '{hist_name}'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_hist_id=metadata["history_id"], + hist_name=metadata["history_display_name"], + ) + if node.findall("tool"): + for (key, value) in node.findall("tool")[0].attrib.items(): + metadata["tool_%s" % key] = value + metadata[ + "tool_tool" + ] = '{tool_id}{tool_version}'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_id=metadata.get("dataset_id", ""), + tool_id=metadata.get("tool_tool_id", ""), + tool_version=metadata.get("tool_tool_version",""), + ) return metadata @@ -389,7 +402,9 @@ def subprocess_check_call(self, command, output=None): if output: - log.debug("cd %s && %s > %s", self.outdir, " ".join(command), output) + log.debug( + "cd %s && %s > %s", self.outdir, " ".join(command), output + ) subprocess.check_call(command, cwd=self.outdir, stdout=output) else: log.debug("cd %s && %s", self.outdir, " ".join(command)) @@ -468,13 +483,8 @@ self.config_json["assemblies"] = assemblies def make_assembly(self, fapath, gname): - hashData = [ - fapath, - gname, - ] - hashData = "|".join(hashData).encode("utf-8") - ghash = hashlib.md5(hashData).hexdigest() - faname = ghash + ".fa.gz" + + faname = gname + ".fa.gz" 
fadest = os.path.join(self.outdir, faname) cmd = "bgzip -i -c %s -I %s.gzi > %s && samtools faidx %s" % ( fapath, @@ -495,6 +505,7 @@ "uri": faname + ".gzi", }, } + self.genome_sequence_adapter = adapter trackDict = { "name": gname, "sequence": { @@ -604,7 +615,7 @@ "plugins": [ { "name": "MafViewer", - "url": "https://unpkg.com/browse/jbrowse-plugin-mafviewer@1.0.6/dist/jbrowse-plugin-mafviewer.umd.production.min.js", + "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js" } ] } @@ -623,7 +634,9 @@ self.subprocess_check_call(cmd) # Construct samples list # We could get this from galaxy metadata, not sure how easily. - ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE) + ps = subprocess.Popen( + ["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE + ) output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout) ps.wait() outp = output.decode("ascii") @@ -783,7 +796,9 @@ url = fname self.subprocess_check_call(["cp", data, dest]) bloc = {"uri": url} - if bam_index is not None and os.path.exists(os.path.realpath(bam_index)): + if bam_index is not None and os.path.exists( + os.path.realpath(bam_index) + ): # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest self.subprocess_check_call( ["cp", os.path.realpath(bam_index), dest + ".bai"] @@ -794,7 +809,9 @@ # => no index generated by galaxy, but there might be one next to the symlink target # this trick allows to skip the bam sorting made by galaxy if already done outside if os.path.exists(os.path.realpath(data) + ".bai"): - self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai") + self.symlink_or_copy( + os.path.realpath(data) + ".bai", dest + ".bai" + ) else: log.warn("Could not find a bam index (.bai file) for %s", data) trackDict = { @@ -823,12 +840,62 @@ self.tracksToAdd.append(trackDict) self.trackIdlist.append(tId) + def add_cram(self, data, trackData, cramOpts, cram_index=None, **kwargs): + tId = trackData["label"] + fname = "%s.cram" % trackData["label"] + dest = "%s/%s" % (self.outdir, fname) + url = fname + self.subprocess_check_call(["cp", data, dest]) + bloc = {"uri": url} + if cram_index is not None and os.path.exists( + os.path.realpath(cram_index) + ): + # most probably made by galaxy and stored in galaxy dirs, need to copy it to dest + self.subprocess_check_call( + ["cp", os.path.realpath(cram_index), dest + ".crai"] + ) + else: + # Can happen in exotic condition + # e.g. 
if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam + # => no index generated by galaxy, but there might be one next to the symlink target + # this trick allows to skip the bam sorting made by galaxy if already done outside + if os.path.exists(os.path.realpath(data) + ".crai"): + self.symlink_or_copy( + os.path.realpath(data) + ".crai", dest + ".crai" + ) + else: + log.warn( + "Could not find a cram index (.crai file) for %s", data + ) + trackDict = { + "type": "AlignmentsTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "CramAdapter", + "cramLocation": bloc, + "craiLocation": {"uri": fname + ".crai",}, + "sequenceAdapter": self.genome_sequence_adapter, + }, + "displays": [ + { + "type": "LinearAlignmentsDisplay", + "displayId": "%s-LinearAlignmentsDisplay" % tId, + }, + ], + } + style_json = self._prepare_track_style(trackDict) + trackDict["style"] = style_json + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + def add_vcf(self, data, trackData): tId = trackData["label"] - url = "%s/api/datasets/%s/display" % ( - self.giURL, - trackData["metadata"]["dataset_id"], - ) + # url = "%s/api/datasets/%s/display" % ( + # self.giURL, + # trackData["metadata"]["dataset_id"], + # ) url = "%s.vcf.gz" % tId dest = "%s/%s" % (self.outdir, url) cmd = "bgzip -c %s > %s" % (data, dest) @@ -879,7 +946,9 @@ dest, ) # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'" self.subprocess_popen(cmd) - self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"]) + self.subprocess_check_call( + ["tabix", "-f", "-p", "gff", dest + ".gz"] + ) def _sort_bed(self, data, dest): # Only index if not already done @@ -1085,28 +1154,12 @@ outputTrackConfig["key"] = track_human_label - # We add extra data to hash for the case of REST + SPARQL. - if ( - "conf" in track - and "options" in track["conf"] - and "url" in track["conf"]["options"] - ): - rest_url = track["conf"]["options"]["url"] - else: - rest_url = "" outputTrackConfig["trackset"] = track.get("trackset", {}) - # I chose to use track['category'] instead of 'category' here. This - # is intentional. This way re-running the tool on a different date - # will not generate different hashes and make comparison of outputs - # much simpler. - hashData = [ - str(dataset_path), + outputTrackConfig["label"] = "%s_%i_%s" % ( + dataset_ext, + i, track_human_label, - track["category"], - rest_url, - ] - hashData = "|".join(hashData).encode("utf-8") - outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i + ) outputTrackConfig["metadata"] = extra_metadata outputTrackConfig["name"] = track_human_label @@ -1138,17 +1191,10 @@ outputTrackConfig, ) elif dataset_ext == "bam": - real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][ - "bam_index" - ] + real_indexes = track["conf"]["options"]["pileup"][ + "bam_indices" + ]["bam_index"] if not isinstance(real_indexes, list): - # - # /path/to/a.bam.bai - # - # - # The above will result in the 'bam_index' key containing a - # string. If there are two or more indices, the container - # becomes a list. Fun! 
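
The new add_cram() mirrors add_bam(), but a JBrowse 2 CramAdapter also needs a sequenceAdapter so read sequences can be reconstructed against the reference, which is why make_assembly() above now keeps the adapter on self.genome_sequence_adapter. A rough sketch of the resulting track entry, with hypothetical file names and a BgzipFastaAdapter assumed for the reference (the adapter type itself sits outside this hunk); field names follow the diff, values are illustrative:

# Reference adapter kept by make_assembly(); the adapter type is assumed here.
genome_sequence_adapter = {
    "type": "BgzipFastaAdapter",
    "fastaLocation": {"uri": "genome.fa.gz"},
    "faiLocation": {"uri": "genome.fa.gz.fai"},
    "gziLocation": {"uri": "genome.fa.gz.gzi"},
}

# Shape of the CRAM track entry emitted by add_cram(); ids and names made up.
cram_track = {
    "type": "AlignmentsTrack",
    "trackId": "cram_0_sample",
    "name": "sample reads",
    "assemblyNames": ["genome"],
    "adapter": {
        "type": "CramAdapter",
        "cramLocation": {"uri": "cram_0_sample.cram"},
        "craiLocation": {"uri": "cram_0_sample.cram.crai"},
        # CRAM stores reads relative to the reference, so the adapter carries
        # the sequence adapter saved when the assembly was built.
        "sequenceAdapter": genome_sequence_adapter,
    },
    "displays": [
        {
            "type": "LinearAlignmentsDisplay",
            "displayId": "cram_0_sample-LinearAlignmentsDisplay",
        }
    ],
}
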
real_indexes = [real_indexes] self.add_bam( @@ -1157,6 +1203,19 @@ track["conf"]["options"]["pileup"], bam_index=real_indexes[i], ) + elif dataset_ext == "cram": + real_indexes = track["conf"]["options"]["cram"][ + "cram_indices" + ]["cram_index"] + if not isinstance(real_indexes, list): + real_indexes = [real_indexes] + + self.add_cram( + dataset_path, + outputTrackConfig, + track["conf"]["options"]["cram"], + cram_index=real_indexes[i], + ) elif dataset_ext == "blastxml": self.add_blastxml( dataset_path, @@ -1290,14 +1349,18 @@ config_json.update(self.config_json) config_data = {} - config_data["disableAnalytics"] = data.get("analytics", "false") == "true" + config_data["disableAnalytics"] = ( + data.get("analytics", "false") == "true" + ) config_data["theme"] = { "palette": { "primary": {"main": data.get("primary_color", "#0D233F")}, "secondary": {"main": data.get("secondary_color", "#721E63")}, "tertiary": {"main": data.get("tertiary_color", "#135560")}, - "quaternary": {"main": data.get("quaternary_color", "#FFB11D")}, + "quaternary": { + "main": data.get("quaternary_color", "#FFB11D") + }, }, "typography": {"fontSize": int(data.get("font_size", 10))}, } @@ -1351,9 +1414,10 @@ parser = argparse.ArgumentParser(description="", epilog="") parser.add_argument("--xml", help="Track Configuration") parser.add_argument("--outdir", help="Output directory", default="out") - parser.add_argument("--version", "-V", action="version", version="%(prog)s 2.0.1") + parser.add_argument( + "--version", "-V", action="version", version="%(prog)s 2.0.1" + ) args = parser.parse_args() - tree = ET.parse(args.xml) root = tree.getroot() @@ -1448,7 +1512,8 @@ track_conf["format"] = track.attrib["format"] if track.find("options/style"): track_conf["style"] = { - item.tag: parse_style_conf(item) for item in track.find("options/style") + item.tag: parse_style_conf(item) + for item in track.find("options/style") } if track.find("options/style_labels"): track_conf["style_labels"] = { @@ -1461,7 +1526,9 @@ track_conf["format"] = track.attrib["format"] try: # Only pertains to gff3 + blastxml. TODO? 
- track_conf["style"] = {t.tag: t.text for t in track.find("options/style")} + track_conf["style"] = { + t.tag: t.text for t in track.find("options/style") + } except TypeError: track_conf["style"] = {} pass @@ -1492,7 +1559,9 @@ "primary_color": root.find("metadata/general/primary_color").text, "secondary_color": root.find("metadata/general/secondary_color").text, "tertiary_color": root.find("metadata/general/tertiary_color").text, - "quaternary_color": root.find("metadata/general/quaternary_color").text, + "quaternary_color": root.find( + "metadata/general/quaternary_color" + ).text, "font_size": root.find("metadata/general/font_size").text, } jc.add_general_configuration(general_data) diff -r 2e6c48910819 -r bde6b1d09f7d jbrowse2.xml --- a/jbrowse2.xml Mon Jan 29 02:34:43 2024 +0000 +++ b/jbrowse2.xml Tue Jan 30 06:05:03 2024 +0000 @@ -1,4 +1,4 @@ - + genome browser macros.xml @@ -31,17 +31,20 @@ #if str($reference_genome.genome_type_select) == "indexed": - - + + + + #else - + + dname = "${reference_genome.genome.name}" /> ${track.data_format.synteny_genome} - ${track.data_format.synteny_genome.element_identifier} + ${track.data_format.synteny_genome.name} #else if str($track.data_format.data_format_select) == "hic": @@ -227,15 +230,15 @@ - + + - + @@ -291,24 +294,24 @@ + + + + - - - - @@ -371,7 +374,7 @@ - + diff -r 2e6c48910819 -r bde6b1d09f7d macros.xml --- a/macros.xml Mon Jan 29 02:34:43 2024 +0000 +++ b/macros.xml Tue Jan 30 06:05:03 2024 +0000 @@ -1,6 +1,6 @@ - 2.10.0 + 2.10.1 topic_3307 @@ -14,8 +14,8 @@ jbrowse2 - biopython - bcbio-gff + biopython + bcbio-gff samtools pyyaml tabix
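
Both the bam branch and the new cram branch in the track dispatch normalise Galaxy's index metadata before use: a single configured index arrives as a bare string, while multiple indices arrive as a list. A standalone sketch of that idiom, using the cram keys from the hunk above and a made-up options dict in place of track["conf"]["options"]:

def get_cram_indices(options):
    # Galaxy hands back a bare string when one .crai is configured and a
    # list when there are several; normalise to a list either way.
    real_indexes = options["cram"]["cram_indices"]["cram_index"]
    if not isinstance(real_indexes, list):
        real_indexes = [real_indexes]
    return real_indexes

# Made-up options dict mirroring the nested keys in the diff:
options = {"cram": {"cram_indices": {"cram_index": "/data/a.cram.crai"}}}
assert get_cram_indices(options) == ["/data/a.cram.crai"]
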