Mercurial > repos > fubar > jbrowse2
diff abjbrowse2.py @ 13:1d86925dbb4c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 873a12803692b0a84814a6dc08331d772d0e5492-dirty
| author | fubar | 
|---|---|
| date | Mon, 22 Jan 2024 11:52:19 +0000 | 
| parents | |
| children | 7c2e28e144f3 | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/abjbrowse2.py Mon Jan 22 11:52:19 2024 +0000 @@ -0,0 +1,1097 @@ +#!/usr/bin/env python +import argparse +import binascii +import datetime +import hashlib +import json +import logging +import os +import re +import shutil +import struct +import subprocess +import tempfile +import xml.etree.ElementTree as ET +from collections import defaultdict + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger('jbrowse') +TODAY = datetime.datetime.now().strftime("%Y-%m-%d") +GALAXY_INFRASTRUCTURE_URL = None + + +class ColorScaling(object): + + COLOR_FUNCTION_TEMPLATE = """ + function(feature, variableName, glyphObject, track) {{ + var score = {score}; + {opacity} + return 'rgba({red}, {green}, {blue}, ' + opacity + ')'; + }} + """ + + COLOR_FUNCTION_TEMPLATE_QUAL = r""" + function(feature, variableName, glyphObject, track) {{ + var search_up = function self(sf, attr){{ + if(sf.get(attr) !== undefined){{ + return sf.get(attr); + }} + if(sf.parent() === undefined) {{ + return; + }}else{{ + return self(sf.parent(), attr); + }} + }}; + + var search_down = function self(sf, attr){{ + if(sf.get(attr) !== undefined){{ + return sf.get(attr); + }} + if(sf.children() === undefined) {{ + return; + }}else{{ + var kids = sf.children(); + for(var child_idx in kids){{ + var x = self(kids[child_idx], attr); + if(x !== undefined){{ + return x; + }} + }} + return; + }} + }}; + + var color = ({user_spec_color} || search_up(feature, 'color') || search_down(feature, 'color') || {auto_gen_color}); + var score = (search_up(feature, 'score') || search_down(feature, 'score')); + {opacity} + if(score === undefined){{ opacity = 1; }} + var result = /^#?([a-f\d]{{2}})([a-f\d]{{2}})([a-f\d]{{2}})$/i.exec(color); + var red = parseInt(result[1], 16); + var green = parseInt(result[2], 16); + var blue = parseInt(result[3], 16); + if(isNaN(opacity) || opacity < 0){{ opacity = 0; }} + return 'rgba(' + red + ',' + green + ',' + blue + ',' + opacity + ')'; + }} + """ + + OPACITY_MATH = { + 'linear': """ + var opacity = (score - ({min})) / (({max}) - ({min})); + """, + 'logarithmic': """ + var opacity = Math.log10(score - ({min})) / Math.log10(({max}) - ({min})); + """, + 'blast': """ + var opacity = 0; + if(score == 0.0) {{ + opacity = 1; + }} else {{ + opacity = (20 - Math.log10(score)) / 180; + }} + """ + } + + BREWER_COLOUR_IDX = 0 + BREWER_COLOUR_SCHEMES = [ + (166, 206, 227), + (31, 120, 180), + (178, 223, 138), + (51, 160, 44), + (251, 154, 153), + (227, 26, 28), + (253, 191, 111), + (255, 127, 0), + (202, 178, 214), + (106, 61, 154), + (255, 255, 153), + (177, 89, 40), + (228, 26, 28), + (55, 126, 184), + (77, 175, 74), + (152, 78, 163), + (255, 127, 0), + ] + + BREWER_DIVERGING_PALLETES = { + 'BrBg': ("#543005", "#003c30"), + 'PiYg': ("#8e0152", "#276419"), + 'PRGn': ("#40004b", "#00441b"), + 'PuOr': ("#7f3b08", "#2d004b"), + 'RdBu': ("#67001f", "#053061"), + 'RdGy': ("#67001f", "#1a1a1a"), + 'RdYlBu': ("#a50026", "#313695"), + 'RdYlGn': ("#a50026", "#006837"), + 'Spectral': ("#9e0142", "#5e4fa2"), + } + + def __init__(self): + self.brewer_colour_idx = 0 + + def rgb_from_hex(self, hexstr): + # http://stackoverflow.com/questions/4296249/how-do-i-convert-a-hex-triplet-to-an-rgb-tuple-and-back + return struct.unpack('BBB', binascii.unhexlify(hexstr)) + + def min_max_gff(self, gff_file): + min_val = None + max_val = None + with open(gff_file, 'r') as handle: + for line in handle: + try: + value = float(line.split('\t')[5]) + min_val = min(value, (min_val or value)) + max_val = max(value, (max_val or value)) + + if value < min_val: + min_val = value + + if value > max_val: + max_val = value + except Exception: + pass + return min_val, max_val + + def hex_from_rgb(self, r, g, b): + return '#%02x%02x%02x' % (r, g, b) + + def _get_colours(self): + r, g, b = self.BREWER_COLOUR_SCHEMES[self.brewer_colour_idx % len(self.BREWER_COLOUR_SCHEMES)] + self.brewer_colour_idx += 1 + return r, g, b + + def parse_menus(self, track): + trackConfig = {'menuTemplate': [{}, {}, {}, {}]} + + if 'menu' in track['menus']: + menu_list = [track['menus']['menu']] + if isinstance(track['menus']['menu'], list): + menu_list = track['menus']['menu'] + + for m in menu_list: + tpl = { + 'action': m['action'], + 'label': m.get('label', '{name}'), + 'iconClass': m.get('iconClass', 'dijitIconBookmark'), + } + if 'url' in m: + tpl['url'] = m['url'] + if 'content' in m: + tpl['content'] = m['content'] + if 'title' in m: + tpl['title'] = m['title'] + + trackConfig['menuTemplate'].append(tpl) + + return trackConfig + + def parse_colours(self, track, trackFormat, gff3=None): + # Wiggle tracks have a bicolor pallete + trackConfig = {'style': {}} + if trackFormat == 'wiggle': + + trackConfig['style']['pos_color'] = track['wiggle']['color_pos'] + trackConfig['style']['neg_color'] = track['wiggle']['color_neg'] + + if trackConfig['style']['pos_color'] == '__auto__': + trackConfig['style']['neg_color'] = self.hex_from_rgb(*self._get_colours()) + trackConfig['style']['pos_color'] = self.hex_from_rgb(*self._get_colours()) + + # Wiggle tracks can change colour at a specified place + bc_pivot = track['wiggle']['bicolor_pivot'] + if bc_pivot not in ('mean', 'zero'): + # The values are either one of those two strings + # or a number + bc_pivot = float(bc_pivot) + trackConfig['bicolor_pivot'] = bc_pivot + elif 'scaling' in track: + if track['scaling']['method'] == 'ignore': + if track['scaling']['scheme']['color'] != '__auto__': + trackConfig['style']['color'] = track['scaling']['scheme']['color'] + else: + trackConfig['style']['color'] = self.hex_from_rgb(*self._get_colours()) + else: + # Scored method + algo = track['scaling']['algo'] + # linear, logarithmic, blast + scales = track['scaling']['scales'] + # type __auto__, manual (min, max) + scheme = track['scaling']['scheme'] + # scheme -> (type (opacity), color) + # ================================== + # GENE CALLS OR BLAST + # ================================== + if trackFormat == 'blast': + red, green, blue = self._get_colours() + color_function = self.COLOR_FUNCTION_TEMPLATE.format(**{ + 'score': "feature._parent.get('score')", + 'opacity': self.OPACITY_MATH['blast'], + 'red': red, + 'green': green, + 'blue': blue, + }) + trackConfig['style']['color'] = color_function.replace('\n', '') + elif trackFormat == 'gene_calls': + # Default values, based on GFF3 spec + min_val = 0 + max_val = 1000 + # Get min/max and build a scoring function since JBrowse doesn't + if scales['type'] == 'automatic' or scales['type'] == '__auto__': + min_val, max_val = self.min_max_gff(gff3) + else: + min_val = scales.get('min', 0) + max_val = scales.get('max', 1000) + + if scheme['color'] == '__auto__': + user_color = 'undefined' + auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours()) + elif scheme['color'].startswith('#'): + user_color = "'%s'" % self.hex_from_rgb(*self.rgb_from_hex(scheme['color'][1:])) + auto_color = 'undefined' + else: + user_color = 'undefined' + auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours()) + + color_function = self.COLOR_FUNCTION_TEMPLATE_QUAL.format(**{ + 'opacity': self.OPACITY_MATH[algo].format(**{'max': max_val, 'min': min_val}), + 'user_spec_color': user_color, + 'auto_gen_color': auto_color, + }) + + trackConfig['style']['color'] = color_function.replace('\n', '') + return trackConfig + + +def etree_to_dict(t): + if t is None: + return {} + + d = {t.tag: {} if t.attrib else None} + children = list(t) + if children: + dd = defaultdict(list) + for dc in map(etree_to_dict, children): + for k, v in dc.items(): + dd[k].append(v) + d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}} + if t.attrib: + d[t.tag].update(('@' + k, v) for k, v in t.attrib.items()) + if t.text: + text = t.text.strip() + if children or t.attrib: + if text: + d[t.tag]['#text'] = text + else: + d[t.tag] = text + return d + + +# score comes from feature._parent.get('score') or feature.get('score') + +INSTALLED_TO = os.path.dirname(os.path.realpath(__file__)) + + +def metadata_from_node(node): + metadata = {} + try: + if len(node.findall('dataset')) != 1: + # exit early + return metadata + except Exception: + return {} + + for (key, value) in node.findall('dataset')[0].attrib.items(): + metadata['dataset_%s' % key] = value + + for (key, value) in node.findall('history')[0].attrib.items(): + metadata['history_%s' % key] = value + + for (key, value) in node.findall('metadata')[0].attrib.items(): + metadata['metadata_%s' % key] = value + + for (key, value) in node.findall('tool')[0].attrib.items(): + metadata['tool_%s' % key] = value + + # Additional Mappings applied: + metadata['dataset_edam_format'] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(metadata['dataset_edam_format'], metadata['dataset_file_ext']) + metadata['history_user_email'] = '<a href="mailto:{0}">{0}</a>'.format(metadata['history_user_email']) + metadata['history_display_name'] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_hist_id=metadata['history_id'], + hist_name=metadata['history_display_name'] + ) + metadata['tool_tool'] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_id=metadata['dataset_id'], + tool_id=metadata['tool_tool_id'], + # tool_version=metadata['tool_tool_version'], + ) + return metadata + + +class JbrowseConnector(object): + + def __init__(self, jbrowse, outdir, genomes): + self.cs = ColorScaling() + self.jbrowse = jbrowse + self.outdir = outdir + self.genome_paths = genomes + self.tracksToIndex = [] + + # This is the id of the current assembly + self.assembly_ids = {} + self.current_assembly_id = [] + + # If upgrading, look at the existing data + self.check_existing(self.outdir) + + self.clone_jbrowse(self.jbrowse, self.outdir) + + self.process_genomes() + + def subprocess_check_call(self, command, output=None): + if output: + log.debug('cd %s && %s > %s', self.outdir, ' '.join(command), output) + subprocess.check_call(command, cwd=self.outdir, stdout=output) + else: + log.debug('cd %s && %s', self.outdir, ' '.join(command)) + subprocess.check_call(command, cwd=self.outdir) + + def subprocess_popen(self, command): + log.debug('cd %s && %s', self.outdir, command) + p = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output, err = p.communicate() + retcode = p.returncode + if retcode != 0: + log.error('cd %s && %s', self.outdir, command) + log.error(output) + log.error(err) + raise RuntimeError("Command failed with exit code %s" % (retcode)) + + def subprocess_check_output(self, command): + log.debug('cd %s && %s', self.outdir, ' '.join(command)) + return subprocess.check_output(command, cwd=self.outdir) + + def symlink_or_copy(self, src, dest): + if 'GALAXY_JBROWSE_SYMLINKS' in os.environ and bool(os.environ['GALAXY_JBROWSE_SYMLINKS']): + cmd = ['ln', '-s', src, dest] + else: + cmd = ['cp', src, dest] + + return self.subprocess_check_call(cmd) + + def symlink_or_copy_load_action(self): + if 'GALAXY_JBROWSE_SYMLINKS' in os.environ and bool(os.environ['GALAXY_JBROWSE_SYMLINKS']): + return 'symlink' + else: + return 'copy' + + def check_existing(self, destination): + existing = os.path.join(destination, 'data', "config.json") + if os.path.exists(existing): + with open(existing, 'r') as existing_conf: + conf = json.load(existing_conf) + if 'assemblies' in conf: + for assembly in conf['assemblies']: + if 'name' in assembly: + self.assembly_ids[assembly['name']] = None + + def process_genomes(self): + for genome_node in self.genome_paths: + # We only expect one input genome per run. This for loop is just + # easier to write than the alternative / catches any possible + # issues. + self.add_assembly(genome_node['path'], genome_node['label']) + + def add_assembly(self, path, label, default=True): + # Find a non-existing filename for the new genome + # (to avoid colision when upgrading an existing instance) + rel_seq_path = os.path.join('data', 'assembly') + seq_path = os.path.join(self.outdir, rel_seq_path) + fn_try = 1 + while (os.path.exists(seq_path + '.fasta') or os.path.exists(seq_path + '.fasta.gz') + or os.path.exists(seq_path + '.fasta.gz.fai') or os.path.exists(seq_path + '.fasta.gz.gzi')): + rel_seq_path = os.path.join('data', 'assembly%s' % fn_try) + seq_path = os.path.join(self.outdir, rel_seq_path) + fn_try += 1 + + # Find a non-existing label for the new genome + # (to avoid colision when upgrading an existing instance) + lab_try = 1 + uniq_label = label + while uniq_label in self.assembly_ids: + uniq_label = label + str(lab_try) + lab_try += 1 + + # Find a default scaffold to display + # TODO this may not be necessary in the future, see https://github.com/GMOD/jbrowse-components/issues/2708 + with open(path, 'r') as fa_handle: + fa_header = fa_handle.readline()[1:].strip().split(' ')[0] + + self.assembly_ids[uniq_label] = fa_header + if default: + self.current_assembly_id = uniq_label + + copied_genome = seq_path + '.fasta' + shutil.copy(path, copied_genome) + + # Compress with bgzip + cmd = ['bgzip', copied_genome] + self.subprocess_check_call(cmd) + + # FAI Index + cmd = ['samtools', 'faidx', copied_genome + '.gz'] + self.subprocess_check_call(cmd) + + self.subprocess_check_call([ + 'jbrowse', 'add-assembly', + '--load', 'inPlace', + '--name', uniq_label, + '--type', 'bgzipFasta', + '--target', os.path.join(self.outdir, 'data'), + '--skipCheck', + rel_seq_path + '.fasta.gz']) + + return uniq_label + + def text_index(self): + # Index tracks + args = [ + 'jbrowse', 'text-index', + '--target', os.path.join(self.outdir, 'data'), + '--assemblies', self.current_assembly_id, + ] + + tracks = ','.join(self.tracksToIndex) + if tracks: + args += ['--tracks', tracks] + + self.subprocess_check_call(args) + + def _blastxml_to_gff3(self, xml, min_gap=10): + gff3_unrebased = tempfile.NamedTemporaryFile(delete=False) + cmd = ['python', os.path.join(INSTALLED_TO, 'blastxml_to_gapped_gff3.py'), + '--trim', '--trim_end', '--include_seq', '--min_gap', str(min_gap), xml] + log.debug('cd %s && %s > %s', self.outdir, ' '.join(cmd), gff3_unrebased.name) + subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased) + gff3_unrebased.close() + return gff3_unrebased.name + + def _prepare_track_style(self, xml_conf): + + style_data = { + "type": "LinearBasicDisplay" + } + + if 'display' in xml_conf['style']: + style_data['type'] = xml_conf['style']['display'] + del xml_conf['style']['display'] + + style_data['displayId'] = "%s_%s" % (xml_conf['label'], style_data['type']) + + style_data.update(xml_conf['style']) + + return {'displays': [style_data]} + + def add_blastxml(self, data, trackData, blastOpts, **kwargs): + gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts['min_gap']) + + if 'parent' in blastOpts and blastOpts['parent'] != 'None': + gff3_rebased = tempfile.NamedTemporaryFile(delete=False) + cmd = ['python', os.path.join(INSTALLED_TO, 'gff3_rebase.py')] + if blastOpts.get('protein', 'false') == 'true': + cmd.append('--protein2dna') + cmd.extend([os.path.realpath(blastOpts['parent']), gff3]) + log.debug('cd %s && %s > %s', self.outdir, ' '.join(cmd), gff3_rebased.name) + subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased) + gff3_rebased.close() + + # Replace original gff3 file + shutil.copy(gff3_rebased.name, gff3) + os.unlink(gff3_rebased.name) + + rel_dest = os.path.join('data', trackData['label'] + '.gff') + dest = os.path.join(self.outdir, rel_dest) + + self._sort_gff(gff3, dest) + os.unlink(gff3) + + style_json = self._prepare_track_style(trackData) + + self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest + '.gz', config=style_json) + + def add_bigwig(self, data, trackData, wiggleOpts, **kwargs): + + rel_dest = os.path.join('data', trackData['label'] + '.bw') + dest = os.path.join(self.outdir, rel_dest) + self.symlink_or_copy(os.path.realpath(data), dest) + + style_json = self._prepare_track_style(trackData) + + self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, config=style_json) + + # Anything ending in "am" (Bam or Cram) + def add_xam(self, data, trackData, xamOpts, index=None, ext="bam", **kwargs): + + index_ext = "bai" + if ext == "cram": + index_ext = "crai" + + rel_dest = os.path.join('data', trackData['label'] + '.%s' % ext) + dest = os.path.join(self.outdir, rel_dest) + + self.symlink_or_copy(os.path.realpath(data), dest) + + if index is not None and os.path.exists(os.path.realpath(index)): + # xai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest + self.subprocess_check_call(['cp', os.path.realpath(index), dest + '.%s' % index_ext]) + else: + # Can happen in exotic condition + # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam + # => no index generated by galaxy, but there might be one next to the symlink target + # this trick allows to skip the bam sorting made by galaxy if already done outside + if os.path.exists(os.path.realpath(data) + '.%s' % index_ext): + self.symlink_or_copy(os.path.realpath(data) + '.%s' % index_ext, dest + '.%s' % index_ext) + else: + log.warn('Could not find a bam index (.%s file) for %s', (index_ext, data)) + + style_json = self._prepare_track_style(trackData) + + self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, config=style_json) + + def add_vcf(self, data, trackData, vcfOpts={}, zipped=False, **kwargs): + + if zipped: + rel_dest = os.path.join('data', trackData['label'] + '.vcf.gz') + dest = os.path.join(self.outdir, rel_dest) + shutil.copy(os.path.realpath(data), dest) + else: + rel_dest = os.path.join('data', trackData['label'] + '.vcf') + dest = os.path.join(self.outdir, rel_dest) + shutil.copy(os.path.realpath(data), dest) + + cmd = ['bgzip', dest] + self.subprocess_check_call(cmd) + cmd = ['tabix', dest + '.gz'] + self.subprocess_check_call(cmd) + + rel_dest = os.path.join('data', trackData['label'] + '.vcf.gz') + + style_json = self._prepare_track_style(trackData) + + self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, config=style_json) + + def add_gff(self, data, format, trackData, gffOpts, **kwargs): + rel_dest = os.path.join('data', trackData['label'] + '.gff') + dest = os.path.join(self.outdir, rel_dest) + + self._sort_gff(data, dest) + + style_json = self._prepare_track_style(trackData) + + self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest + '.gz', config=style_json) + + def add_bed(self, data, format, trackData, gffOpts, **kwargs): + rel_dest = os.path.join('data', trackData['label'] + '.bed') + dest = os.path.join(self.outdir, rel_dest) + + self._sort_bed(data, dest) + + style_json = self._prepare_track_style(trackData) + + self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest + '.gz', config=style_json) + + def add_paf(self, data, trackData, pafOpts, **kwargs): + rel_dest = os.path.join('data', trackData['label'] + '.paf') + dest = os.path.join(self.outdir, rel_dest) + + self.symlink_or_copy(os.path.realpath(data), dest) + + added_assembly = self.add_assembly(pafOpts['genome'], pafOpts['genome_label'], default=False) + + style_json = self._prepare_track_style(trackData) + + self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, assemblies=[self.current_assembly_id, added_assembly], config=style_json) + + def add_hic(self, data, trackData, hicOpts, **kwargs): + rel_dest = os.path.join('data', trackData['label'] + '.hic') + dest = os.path.join(self.outdir, rel_dest) + + self.symlink_or_copy(os.path.realpath(data), dest) + + style_json = self._prepare_track_style(trackData) + + self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, config=style_json) + + def add_sparql(self, url, query, query_refnames, trackData): + + json_track_data = { + "type": "FeatureTrack", + "trackId": id, + "name": trackData['label'], + "adapter": { + "type": "SPARQLAdapter", + "endpoint": { + "uri": url, + "locationType": "UriLocation" + }, + "queryTemplate": query + }, + "category": [ + trackData['category'] + ], + "assemblyNames": [ + self.current_assembly_id + ] + } + + if query_refnames: + json_track_data['adapter']['refNamesQueryTemplate']: query_refnames + + self.subprocess_check_call([ + 'jbrowse', 'add-track-json', + '--target', os.path.join(self.outdir, 'data'), + json_track_data]) + + # Doesn't work as of 1.6.4, might work in the future + # self.subprocess_check_call([ + # 'jbrowse', 'add-track', + # '--trackType', 'sparql', + # '--name', trackData['label'], + # '--category', trackData['category'], + # '--target', os.path.join(self.outdir, 'data'), + # '--trackId', id, + # '--config', '{"queryTemplate": "%s"}' % query, + # url]) + + def _add_track(self, id, label, category, path, assemblies=[], config=None): + + assemblies_opt = self.current_assembly_id + if assemblies: + assemblies_opt = ','.join(assemblies) + + cmd = [ + 'jbrowse', 'add-track', + '--load', 'inPlace', + '--name', label, + '--category', category, + '--target', os.path.join(self.outdir, 'data'), + '--trackId', id, + '--assemblyNames', assemblies_opt + ] + + if config: + cmd.append('--config') + cmd.append(json.dumps(config)) + + cmd.append(path) + + self.subprocess_check_call(cmd) + + def _sort_gff(self, data, dest): + # Only index if not already done + if not os.path.exists(dest): + cmd = "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'" % (data, dest) + self.subprocess_popen(cmd) + + self.subprocess_check_call(['bgzip', '-f', dest]) + self.subprocess_check_call(['tabix', '-f', '-p', 'gff', dest + '.gz']) + + def _sort_bed(self, data, dest): + # Only index if not already done + if not os.path.exists(dest): + cmd = ['sort', '-k1,1', '-k2,2n', data] + with open(dest, 'w') as handle: + self.subprocess_check_call(cmd, output=handle) + + self.subprocess_check_call(['bgzip', '-f', dest]) + self.subprocess_check_call(['tabix', '-f', '-p', 'bed', dest + '.gz']) + + def process_annotations(self, track): + + category = track['category'].replace('__pd__date__pd__', TODAY) + outputTrackConfig = { + 'category': category, + } + + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '#': '__pd__', + "": '__cn__' + } + + for i, (dataset_path, dataset_ext, track_human_label, extra_metadata) in enumerate(track['trackfiles']): + # Unsanitize labels (element_identifiers are always sanitized by Galaxy) + for key, value in mapped_chars.items(): + track_human_label = track_human_label.replace(value, key) + + log.info('Processing track %s / %s (%s)', category, track_human_label, dataset_ext) + outputTrackConfig['key'] = track_human_label + # We add extra data to hash for the case of REST + SPARQL. + if 'conf' in track and 'options' in track['conf'] and 'url' in track['conf']['options']: + rest_url = track['conf']['options']['url'] + else: + rest_url = '' + + # I chose to use track['category'] instead of 'category' here. This + # is intentional. This way re-running the tool on a different date + # will not generate different hashes and make comparison of outputs + # much simpler. + hashData = [str(dataset_path), track_human_label, track['category'], rest_url, self.current_assembly_id] + hashData = '|'.join(hashData).encode('utf-8') + outputTrackConfig['label'] = hashlib.md5(hashData).hexdigest() + '_%s' % i + outputTrackConfig['metadata'] = extra_metadata + + outputTrackConfig['style'] = track['style'] + + if 'menus' in track['conf']['options']: + menus = self.cs.parse_menus(track['conf']['options']) + outputTrackConfig.update(menus) + + if dataset_ext in ('gff', 'gff3'): + self.add_gff(dataset_path, dataset_ext, outputTrackConfig, + track['conf']['options']['gff']) + elif dataset_ext == 'bed': + self.add_bed(dataset_path, dataset_ext, outputTrackConfig, + track['conf']['options']['gff']) + elif dataset_ext == 'bigwig': + self.add_bigwig(dataset_path, outputTrackConfig, + track['conf']['options']['wiggle']) + elif dataset_ext == 'bam': + real_indexes = track['conf']['options']['pileup']['bam_indices']['bam_index'] + if not isinstance(real_indexes, list): + # <bam_indices> + # <bam_index>/path/to/a.bam.bai</bam_index> + # </bam_indices> + # + # The above will result in the 'bam_index' key containing a + # string. If there are two or more indices, the container + # becomes a list. Fun! + real_indexes = [real_indexes] + + self.add_xam(dataset_path, outputTrackConfig, + track['conf']['options']['pileup'], + index=real_indexes[i], ext="bam") + elif dataset_ext == 'cram': + real_indexes = track['conf']['options']['cram']['cram_indices']['cram_index'] + if not isinstance(real_indexes, list): + # <bam_indices> + # <bam_index>/path/to/a.bam.bai</bam_index> + # </bam_indices> + # + # The above will result in the 'bam_index' key containing a + # string. If there are two or more indices, the container + # becomes a list. Fun! + real_indexes = [real_indexes] + + self.add_xam(dataset_path, outputTrackConfig, + track['conf']['options']['cram'], + index=real_indexes[i], ext="cram") + elif dataset_ext == 'blastxml': + self.add_blastxml(dataset_path, outputTrackConfig, track['conf']['options']['blast']) + elif dataset_ext == 'vcf': + self.add_vcf(dataset_path, outputTrackConfig) + elif dataset_ext == 'vcf_bgzip': + self.add_vcf(dataset_path, outputTrackConfig, zipped=True) + elif dataset_ext == 'rest': + self.add_rest(track['conf']['options']['rest']['url'], outputTrackConfig) + elif dataset_ext == 'synteny': + self.add_paf(dataset_path, outputTrackConfig, + track['conf']['options']['synteny']) + elif dataset_ext == 'hic': + self.add_hic(dataset_path, outputTrackConfig, + track['conf']['options']['hic']) + elif dataset_ext == 'sparql': + sparql_query = track['conf']['options']['sparql']['query'] + for key, value in mapped_chars.items(): + sparql_query = sparql_query.replace(value, key) + sparql_query_refnames = track['conf']['options']['sparql']['query_refnames'] + for key, value in mapped_chars.items(): + sparql_query_refnames = sparql_query_refnames.replace(value, key) + self.add_sparql(track['conf']['options']['sparql']['url'], sparql_query, sparql_query_refnames, outputTrackConfig) + else: + log.warn('Do not know how to handle %s', dataset_ext) + + # Return non-human label for use in other fields + yield outputTrackConfig['label'] + + def add_default_session(self, data): + """ + Add some default session settings: set some assemblies/tracks on/off + """ + tracks_data = [] + + # TODO using the default session for now, but check out session specs in the future https://github.com/GMOD/jbrowse-components/issues/2708 + + # We need to know the track type from the config.json generated just before + config_path = os.path.join(self.outdir, 'data', 'config.json') + track_types = {} + with open(config_path, 'r') as config_file: + config_json = json.load(config_file) + + for track_conf in config_json['tracks']: + track_types[track_conf['trackId']] = track_conf['type'] + + for on_track in data['visibility']['default_on']: + # TODO several problems with this currently + # - we are forced to copy the same kind of style config as the per track config from _prepare_track_style (not exactly the same though) + # - we get an error when refreshing the page + # - this could be solved by session specs, see https://github.com/GMOD/jbrowse-components/issues/2708 + style_data = { + "type": "LinearBasicDisplay", + "height": 100 + } + + if on_track in data['style']: + if 'display' in data['style'][on_track]: + style_data['type'] = data['style'][on_track]['display'] + del data['style'][on_track]['display'] + + style_data.update(data['style'][on_track]) + + if on_track in data['style_labels']: + # TODO fix this: it should probably go in a renderer block (SvgFeatureRenderer) but still does not work + # TODO move this to per track displays? + style_data['labels'] = data['style_labels'][on_track] + + tracks_data.append({ + "type": track_types[on_track], + "configuration": on_track, + "displays": [ + style_data + ] + }) + + # The view for the assembly we're adding + view_json = { + "type": "LinearGenomeView", + "tracks": tracks_data + } + + refName = None + if data.get('defaultLocation', ''): + loc_match = re.search(r'^(\w+):(\d+)\.+(\d+)$', data['defaultLocation']) + if loc_match: + refName = loc_match.group(1) + start = int(loc_match.group(2)) + end = int(loc_match.group(3)) + elif self.assembly_ids[self.current_assembly_id] is not None: + refName = self.assembly_ids[self.current_assembly_id] + start = 0 + end = 1000000 # Booh, hard coded! waiting for https://github.com/GMOD/jbrowse-components/issues/2708 + + if refName is not None: + # TODO displayedRegions is not just zooming to the region, it hides the rest of the chromosome + view_json['displayedRegions'] = [{ + "refName": refName, + "start": start, + "end": end, + "reversed": False, + "assemblyName": self.current_assembly_id + }] + + session_name = data.get('session_name', "New session") + if not session_name: + session_name = "New session" + + # Merge with possibly existing defaultSession (if upgrading a jbrowse instance) + session_json = {} + if 'defaultSession' in config_json: + session_json = config_json['defaultSession'] + + session_json["name"] = session_name + + if 'views' not in session_json: + session_json['views'] = [] + + session_json['views'].append(view_json) + + config_json['defaultSession'] = session_json + + with open(config_path, 'w') as config_file: + json.dump(config_json, config_file, indent=2) + + def add_general_configuration(self, data): + """ + Add some general configuration to the config.json file + """ + + config_path = os.path.join(self.outdir, 'data', 'config.json') + with open(config_path, 'r') as config_file: + config_json = json.load(config_file) + + config_data = {} + + config_data['disableAnalytics'] = data.get('analytics', 'false') == 'true' + + config_data['theme'] = { + "palette": { + "primary": { + "main": data.get('primary_color', '#0D233F') + }, + "secondary": { + "main": data.get('secondary_color', '#721E63') + }, + "tertiary": { + "main": data.get('tertiary_color', '#135560') + }, + "quaternary": { + "main": data.get('quaternary_color', '#FFB11D') + }, + }, + "typography": { + "fontSize": int(data.get('font_size', 10)) + }, + } + + config_json['configuration'].update(config_data) + + with open(config_path, 'w') as config_file: + json.dump(config_json, config_file, indent=2) + + def clone_jbrowse(self, jbrowse_dir, destination): + """Clone a JBrowse directory into a destination directory. + """ + + copytree(jbrowse_dir, destination) + + try: + shutil.rmtree(os.path.join(destination, 'test_data')) + except OSError as e: + log.error("Error: %s - %s." % (e.filename, e.strerror)) + + if not os.path.exists(os.path.join(destination, 'data')): + # It can already exist if upgrading an instance + os.makedirs(os.path.join(destination, 'data')) + log.info("makedir %s" % (os.path.join(destination, 'data'))) + + os.symlink('./data/config.json', os.path.join(destination, 'config.json')) + + +def copytree(src, dst, symlinks=False, ignore=None): + for item in os.listdir(src): + s = os.path.join(src, item) + d = os.path.join(dst, item) + if os.path.isdir(s): + shutil.copytree(s, d, symlinks, ignore) + else: + shutil.copy2(s, d) + + +def parse_style_conf(item): + if 'type' in item.attrib and item.attrib['type'] in ['boolean', 'integer']: + if item.attrib['type'] == 'boolean': + return item.text in ("yes", "true", "True") + elif item.attrib['type'] == 'integer': + return int(item.text) + else: + return item.text + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="", epilog="") + parser.add_argument('xml', type=argparse.FileType('r'), help='Track Configuration') + + parser.add_argument('--jbrowse', help='Folder containing a jbrowse release') + parser.add_argument('--outdir', help='Output directory', default='out') + parser.add_argument('--version', '-V', action='version', version="%(prog)s 0.8.0") + args = parser.parse_args() + + tree = ET.parse(args.xml.name) + root = tree.getroot() + + # This should be done ASAP + GALAXY_INFRASTRUCTURE_URL = root.find('metadata/galaxyUrl').text + # Sometimes this comes as `localhost` without a protocol + if not GALAXY_INFRASTRUCTURE_URL.startswith('http'): + # so we'll prepend `http://` and hope for the best. Requests *should* + # be GET and not POST so it should redirect OK + GALAXY_INFRASTRUCTURE_URL = 'http://' + GALAXY_INFRASTRUCTURE_URL + + jc = JbrowseConnector( + jbrowse=args.jbrowse, + outdir=args.outdir, + genomes=[ + { + 'path': os.path.realpath(x.attrib['path']), + 'meta': metadata_from_node(x.find('metadata')), + 'label': x.attrib['label'] + } + for x in root.findall('metadata/genomes/genome') + ] + ) + + default_session_data = { + 'visibility': { + 'default_on': [], + 'default_off': [], + }, + 'style': {}, + 'style_labels': {} + } + + # TODO add metadata to tracks + for track in root.findall('tracks/track'): + track_conf = {} + track_conf['trackfiles'] = [] + + trackfiles = track.findall('files/trackFile') + if trackfiles: + for x in track.findall('files/trackFile'): + if trackfiles: + metadata = metadata_from_node(x.find('metadata')) + + track_conf['trackfiles'].append(( + os.path.realpath(x.attrib['path']), + x.attrib['ext'], + x.attrib['label'], + metadata + )) + else: + # For tracks without files (rest, sparql) + track_conf['trackfiles'].append(( + '', # N/A, no path for rest or sparql + track.attrib['format'], + track.find('options/label').text, + {} + )) + + track_conf['category'] = track.attrib['cat'] + track_conf['format'] = track.attrib['format'] + track_conf['style'] = {item.tag: parse_style_conf(item) for item in track.find('options/style')} + + track_conf['style'] = {item.tag: parse_style_conf(item) for item in track.find('options/style')} + + track_conf['style_labels'] = {item.tag: parse_style_conf(item) for item in track.find('options/style_labels')} + + track_conf['conf'] = etree_to_dict(track.find('options')) + keys = jc.process_annotations(track_conf) + + for key in keys: + default_session_data['visibility'][track.attrib.get('visibility', 'default_off')].append(key) + + default_session_data['style'][key] = track_conf['style'] # TODO do we need this anymore? + default_session_data['style_labels'][key] = track_conf['style_labels'] + + default_session_data['defaultLocation'] = root.find('metadata/general/defaultLocation').text + default_session_data['session_name'] = root.find('metadata/general/session_name').text + + general_data = { + 'analytics': root.find('metadata/general/analytics').text, + 'primary_color': root.find('metadata/general/primary_color').text, + 'secondary_color': root.find('metadata/general/secondary_color').text, + 'tertiary_color': root.find('metadata/general/tertiary_color').text, + 'quaternary_color': root.find('metadata/general/quaternary_color').text, + 'font_size': root.find('metadata/general/font_size').text, + } + + jc.add_default_session(default_session_data) + jc.add_general_configuration(general_data) + jc.text_index() +
