| 0 | 1 #!/usr/bin/env python | 
# TODO: accumulate all configuration for config.json, starting from the default config shipped with the JBrowse clone
|  | 3 import argparse | 
|  | 4 import datetime | 
|  | 5 import hashlib | 
|  | 6 import json | 
|  | 7 import logging | 
|  | 8 import os | 
|  | 9 import shutil | 
|  | 10 import subprocess | 
|  | 11 import tempfile | 
|  | 12 import xml.etree.ElementTree as ET | 
|  | 13 from collections import defaultdict | 
|  | 14 | 
# Module-level logging setup: everything in this script logs through the
# "jbrowse" logger at INFO level or above.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("jbrowse")
# Date stamp substituted into track categories (see process_annotations).
TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
# Base URL of the Galaxy server.  Left None here; presumably assigned at
# runtime (e.g. from CLI args in main) before JbrowseConnector is built —
# TODO confirm where it is set.
GALAXY_INFRASTRUCTURE_URL = None
# Reverse mapping used to "unsanitize" labels: Galaxy replaces special
# characters in element identifiers with these placeholder tokens, and
# process_annotations() swaps them back via replace(value, key).
mapped_chars = {
    ">": "__gt__",
    "<": "__lt__",
    "'": "__sq__",
    '"': "__dq__",
    "[": "__ob__",
    "]": "__cb__",
    "{": "__oc__",
    "}": "__cc__",
    "@": "__at__",
    "#": "__pd__",
    "": "__cn__",  # maps "__cn__" back to the empty string, i.e. deletes the token
}
|  | 32 | 
|  | 33 | 
def etree_to_dict(t):
    """Recursively convert an ElementTree element into a nested dict.

    Attributes are stored under "@<name>" keys, child elements under their
    tag names (repeated tags collapse into a list), and element text under
    "#text" — or as the value itself for a leaf element without attributes.
    A None input yields {}.
    """
    if t is None:
        return {}

    result = {t.tag: {} if t.attrib else None}
    kids = list(t)
    if kids:
        grouped = defaultdict(list)
        for child_dict in (etree_to_dict(c) for c in kids):
            for tag, payload in child_dict.items():
                grouped[tag].append(payload)
        collapsed = {}
        for tag, payloads in grouped.items():
            # A tag appearing once stays scalar; repeats become a list.
            collapsed[tag] = payloads[0] if len(payloads) == 1 else payloads
        result = {t.tag: collapsed}
    if t.attrib:
        for attr_name, attr_val in t.attrib.items():
            result[t.tag]["@" + attr_name] = attr_val
    if t.text:
        stripped = t.text.strip()
        if not kids and not t.attrib:
            # Pure leaf: the (stripped) text is the value itself.
            result[t.tag] = stripped
        elif stripped:
            result[t.tag]["#text"] = stripped
    return result
|  | 56 | 
|  | 57 | 
# Directory containing this script; used to locate the helper scripts shipped
# alongside it (convertMAF.sh, blastxml_to_gapped_gff3.py, gff3_rebase.py).
INSTALLED_TO = os.path.dirname(os.path.realpath(__file__))
|  | 59 | 
|  | 60 | 
def metadata_from_node(node):
    """Extract Galaxy dataset/history/metadata/tool attributes from an XML node.

    Returns a flat dict keyed "<section>_<attribute>" (e.g. "dataset_id",
    "history_user_email"), with a few values rewritten as HTML links into
    the Galaxy instance.  Returns {} when the node does not contain exactly
    one <dataset> child (or on any error probing it).
    """
    try:
        if len(node.findall("dataset")) != 1:
            # Zero or multiple dataset nodes: nothing to extract.
            return {}
    except Exception:
        return {}

    metadata = {}
    for section in ("dataset", "history", "metadata", "tool"):
        for attr_name, attr_value in node.findall(section)[0].attrib.items():
            metadata["%s_%s" % (section, attr_name)] = attr_value

    # Additional Mappings applied: rewrite selected values as HTML links.
    metadata[
        "dataset_edam_format"
    ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
        metadata["dataset_edam_format"], metadata["dataset_file_ext"]
    )
    metadata["history_user_email"] = '<a href="mailto:{0}">{0}</a>'.format(
        metadata["history_user_email"]
    )
    metadata["hist_name"] = metadata["history_display_name"]
    metadata[
        "history_display_name"
    ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_hist_id=metadata["history_id"],
        hist_name=metadata["history_display_name"],
    )
    metadata[
        "tool_tool"
    ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_id=metadata["dataset_id"],
        tool_id=metadata["tool_tool_id"],
    )
    return metadata
|  | 108 | 
|  | 109 | 
class JbrowseConnector(object):
    """Accumulates JBrowse2 configuration (assemblies + tracks) and writes it
    to <outdir>/config.json, optionally cloning the JBrowse code itself."""

    def __init__(self, jbrowse, outdir, genomes, standalone=None):
        """
        jbrowse    -- path of the JBrowse2 distribution to clone from
        outdir     -- output directory (created here if missing)
        genomes    -- list of genome dicts ("path" + "meta"); see process_genomes
        standalone -- "complete" clones the JBrowse tree into outdir;
                      "minimal" passes minimal=True to clone_jbrowse
                      (presumably a reduced copy — confirm); anything else
                      skips cloning entirely
        """
        self.debug = False
        # Galaxy server base URL, captured at construction time.
        self.giURL = GALAXY_INFRASTRUCTURE_URL
        self.jbrowse = jbrowse
        self.outdir = outdir
        os.makedirs(self.outdir, exist_ok=True)
        self.genome_paths = genomes
        self.standalone = standalone
        # Accumulated by the add_* methods; written out by write_config()
        # and referenced by add_default_view().
        self.trackIdlist = []
        self.tracksToAdd = []
        # Seed configuration: web-worker RPC drivers plus an empty logo entry.
        self.config_json = {
            "configuration": {
                "rpc": {
                    "defaultDriver": "WebWorkerRpcDriver",
                    "drivers": {"MainThreadRpcDriver": {}, "WebWorkerRpcDriver": {}},
                },
                "logoPath": {"locationType": "UriLocation", "uri": ""},
            }
        }
        self.config_json_file = os.path.join(outdir, "config.json")
        if standalone == "complete":
            self.clone_jbrowse(self.jbrowse, self.outdir)
        elif standalone == "minimal":
            self.clone_jbrowse(self.jbrowse, self.outdir, minimal=True)
|  | 135 | 
|  | 136     def subprocess_check_call(self, command, output=None): | 
|  | 137         if output: | 
|  | 138             if self.debug: | 
|  | 139                 log.debug("cd %s && %s >  %s", self.outdir, " ".join(command), output) | 
|  | 140             subprocess.check_call(command, cwd=self.outdir, stdout=output) | 
|  | 141         else: | 
|  | 142             log.debug("cd %s && %s", self.outdir, " ".join(command)) | 
|  | 143             subprocess.check_call(command, cwd=self.outdir) | 
|  | 144 | 
|  | 145     def subprocess_popen(self, command): | 
|  | 146         if self.debug: | 
|  | 147             log.debug("cd %s && %s", self.outdir, command) | 
|  | 148         p = subprocess.Popen( | 
|  | 149             command, | 
|  | 150             shell=True, | 
|  | 151             stdin=subprocess.PIPE, | 
|  | 152             stdout=subprocess.PIPE, | 
|  | 153             stderr=subprocess.PIPE, | 
|  | 154         ) | 
|  | 155         output, err = p.communicate() | 
|  | 156         retcode = p.returncode | 
|  | 157         if retcode != 0: | 
|  | 158             log.error("cd %s && %s", self.outdir, command) | 
|  | 159             log.error(output) | 
|  | 160             log.error(err) | 
|  | 161             raise RuntimeError("Command failed with exit code %s" % (retcode)) | 
|  | 162 | 
|  | 163     def subprocess_check_output(self, command): | 
|  | 164         if self.debug: | 
|  | 165             log.debug("cd %s && %s", self.outdir, " ".join(command)) | 
|  | 166         return subprocess.check_output(command, cwd=self.outdir) | 
|  | 167 | 
|  | 168     def _jbrowse_bin(self, command): | 
|  | 169         return os.path.realpath(os.path.join(self.jbrowse, "bin", command)) | 
|  | 170 | 
|  | 171     def symlink_or_copy(self, src, dest): | 
|  | 172         if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool( | 
|  | 173             os.environ["GALAXY_JBROWSE_SYMLINKS"] | 
|  | 174         ): | 
|  | 175             cmd = ["ln", "-s", src, dest] | 
|  | 176         else: | 
|  | 177             cmd = ["cp", src, dest] | 
|  | 178 | 
|  | 179         return self.subprocess_check_call(cmd) | 
|  | 180 | 
|  | 181     def process_genomes(self): | 
|  | 182         assemblies = [] | 
|  | 183         for i, genome_node in enumerate(self.genome_paths): | 
|  | 184             log.info("genome_node=%s" % str(genome_node)) | 
|  | 185             # We only expect one input genome per run. This for loop is just | 
|  | 186             # easier to write than the alternative / catches any possible | 
|  | 187             # issues. | 
|  | 188             genome_name = genome_node["meta"]["dataset_dname"] | 
|  | 189             dsId = genome_node["meta"]["dataset_id"] | 
| 5 | 190             fapath = genome_node["path"] | 
| 0 | 191             faname = genome_name + ".fasta" | 
|  | 192             faind = os.path.realpath(os.path.join(self.outdir, faname + ".fai")) | 
| 5 | 193             if self.standalone == "complete": | 
|  | 194                 faurl = faname | 
|  | 195                 fadest = os.path.realpath(os.path.join(self.outdir, faname)) | 
|  | 196                 cmd = ["cp", fapath, fadest] | 
|  | 197                 self.subprocess_check_call(cmd) | 
|  | 198             else: | 
|  | 199                 faurl = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId) | 
| 0 | 200             cmd = ["samtools", "faidx", fapath, "--fai-idx", faind] | 
|  | 201             self.subprocess_check_call(cmd) | 
|  | 202             trackDict = { | 
|  | 203                 "name": genome_name, | 
|  | 204                 "sequence": { | 
|  | 205                     "type": "ReferenceSequenceTrack", | 
|  | 206                     "trackId": genome_name, | 
|  | 207                     "adapter": { | 
|  | 208                         "type": "IndexedFastaAdapter", | 
|  | 209                         "fastaLocation": {"uri": faurl, "locationType": "UriLocation"}, | 
|  | 210                         "faiLocation": { | 
|  | 211                             "uri": faname + ".fai", | 
|  | 212                             "locationType": "UriLocation", | 
|  | 213                         }, | 
|  | 214                     }, | 
|  | 215                 }, | 
|  | 216             } | 
|  | 217             assemblies.append(trackDict) | 
|  | 218         self.config_json["assemblies"] = assemblies | 
|  | 219         self.genome_name = genome_name | 
|  | 220         self.genome_path = faurl | 
|  | 221         self.genome_fai_path = faname + ".fai" | 
|  | 222 | 
|  | 223     def add_default_view(self): | 
|  | 224         cmd = [ | 
|  | 225             "jbrowse", | 
|  | 226             "set-default-session", | 
|  | 227             "-s", | 
|  | 228             self.config_json_file, | 
|  | 229             "-t", | 
|  | 230             ",".join(self.trackIdlist), | 
|  | 231             "-n", | 
|  | 232             "Default", | 
|  | 233             "--target", | 
|  | 234             self.outdir, | 
|  | 235         ]  # | 
|  | 236         self.subprocess_check_call(cmd) | 
|  | 237 | 
|  | 238     def write_config(self): | 
|  | 239         with open(self.config_json_file, "w") as fp: | 
|  | 240             json.dump(self.config_json, fp) | 
|  | 241 | 
|  | 242     def add_hic(self, data, trackData): | 
|  | 243         """ | 
|  | 244         HiC adapter. | 
|  | 245         https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md | 
|  | 246         for testing locally, these work: | 
|  | 247         HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic | 
|  | 248         using hg19 reference track as a | 
|  | 249         'BgzipFastaAdapter' | 
|  | 250             fastaLocation: | 
|  | 251             uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz', | 
|  | 252             faiLocation: | 
|  | 253             uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai', | 
|  | 254             gziLocation: | 
|  | 255             uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi', | 
|  | 256         Cool will not be likely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438 | 
|  | 257         """ | 
|  | 258         log.info("#### trackData=%s" % trackData) | 
|  | 259         tId = trackData["label"] | 
| 5 | 260         dsId = trackData["metadata"]["dataset_id"] | 
| 0 | 261         url = "%s/api/datasets/%s/display?to_ext=hic " % ( | 
|  | 262             self.giURL, | 
| 5 | 263             dsId, | 
| 0 | 264         ) | 
| 5 | 265         hname = trackData["name"] | 
|  | 266         if self.standalone == "complete": | 
|  | 267             dest = os.path.realpath(os.path.join(self.outdir, hname)) | 
|  | 268             url = hname | 
|  | 269             cmd = ["cp", data, dest] | 
|  | 270             self.subprocess_check_call(cmd) | 
|  | 271         else: | 
|  | 272             url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId) | 
| 0 | 273         trackDict = { | 
|  | 274             "type": "HicTrack", | 
|  | 275             "trackId": tId, | 
| 5 | 276             "name": hname, | 
| 0 | 277             "assemblyNames": [self.genome_name], | 
|  | 278             "adapter": { | 
|  | 279                 "type": "HicAdapter", | 
|  | 280                 "hicLocation": {"uri": url, "locationType": "UriLocation"}, | 
|  | 281             }, | 
|  | 282         } | 
|  | 283         self.tracksToAdd.append(trackDict) | 
|  | 284         self.trackIdlist.append(tId) | 
|  | 285 | 
|  | 286     def add_maf(self, data, trackData): | 
|  | 287         """ | 
|  | 288         from https://github.com/cmdcolin/maf2bed | 
|  | 289         Note: Both formats start with a MAF as input, and note that your MAF file should contain the species name and chromosome name | 
|  | 290         e.g. hg38.chr1 in the sequence identifiers. | 
|  | 291         need the reference id - eg hg18, for maf2bed.pl as the first parameter | 
|  | 292         """ | 
|  | 293         mafPlugin = { | 
|  | 294             "plugins": [ | 
|  | 295                 { | 
|  | 296                     "name": "MafViewer", | 
|  | 297                     "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js", | 
|  | 298                 } | 
|  | 299             ] | 
|  | 300         } | 
|  | 301         tId = trackData["label"] | 
|  | 302         fname = "%s.bed" % tId | 
|  | 303         dest = os.path.realpath("%s/%s" % (self.outdir, fname)) | 
|  | 304         # self.symlink_or_copy(data, dest) | 
|  | 305         # Process MAF to bed-like. Need build to munge chromosomes | 
|  | 306         gname = self.genome_name | 
|  | 307         cmd = [ | 
|  | 308             "bash", | 
|  | 309             os.path.join(INSTALLED_TO, "convertMAF.sh"), | 
|  | 310             data, | 
|  | 311             gname, | 
|  | 312             INSTALLED_TO, | 
|  | 313             dest, | 
|  | 314         ] | 
|  | 315         self.subprocess_check_call(cmd) | 
|  | 316         if True or self.debug: | 
|  | 317             log.info("### convertMAF.sh called as %s" % " ".join(cmd)) | 
|  | 318         # Construct samples list | 
|  | 319         # We could get this from galaxy metadata, not sure how easily. | 
|  | 320         ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE) | 
|  | 321         output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout) | 
|  | 322         ps.wait() | 
|  | 323         outp = output.decode("ascii") | 
|  | 324         soutp = outp.split("\n") | 
|  | 325         samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")] | 
|  | 326         samples = [x.split(".")[0] for x in samp] | 
|  | 327         if self.debug: | 
|  | 328             log.info("### got samples = %s " % (samples)) | 
|  | 329         trackDict = { | 
|  | 330             "type": "MafTrack", | 
|  | 331             "trackId": tId, | 
|  | 332             "name": trackData["name"], | 
|  | 333             "adapter": { | 
|  | 334                 "type": "MafTabixAdapter", | 
|  | 335                 "samples": samples, | 
|  | 336                 "bedGzLocation": {"uri": fname + ".sorted.bed.gz"}, | 
|  | 337                 "index": { | 
|  | 338                     "location": {"uri": fname + ".sorted.bed.gz.tbi"}, | 
|  | 339                 }, | 
|  | 340             }, | 
|  | 341             "assemblyNames": [self.genome_name], | 
|  | 342         } | 
|  | 343         self.tracksToAdd.append(trackDict) | 
|  | 344         self.trackIdlist.append(tId) | 
|  | 345         if self.config_json.get("plugins", None): | 
|  | 346             self.config_json["plugins"].append(mafPlugin[0]) | 
|  | 347         else: | 
|  | 348             self.config_json.update(mafPlugin) | 
|  | 349 | 
|  | 350     def _blastxml_to_gff3(self, xml, min_gap=10): | 
|  | 351         gff3_unrebased = tempfile.NamedTemporaryFile(delete=False) | 
|  | 352         cmd = [ | 
|  | 353             "python", | 
|  | 354             os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"), | 
|  | 355             "--trim", | 
|  | 356             "--trim_end", | 
|  | 357             "--include_seq", | 
|  | 358             "--min_gap", | 
|  | 359             str(min_gap), | 
|  | 360             xml, | 
|  | 361         ] | 
|  | 362         subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased) | 
|  | 363         gff3_unrebased.close() | 
|  | 364         return gff3_unrebased.name | 
|  | 365 | 
    def add_blastxml(self, data, trackData, blastOpts, **kwargs):
        """Convert BLAST XML to (optionally rebased) GFF3 and add it as a
        tabix-indexed FeatureTrack."""
        # Convert the raw BLAST XML into gapped GFF3 (a temp file we own).
        gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"])

        if "parent" in blastOpts and blastOpts["parent"] != "None":
            # Rebase hit coordinates onto the parent GFF3's coordinate system,
            # optionally translating protein to DNA coordinates.
            gff3_rebased = tempfile.NamedTemporaryFile(delete=False)
            cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")]
            if blastOpts.get("protein", "false") == "true":
                cmd.append("--protein2dna")
            cmd.extend([os.path.realpath(blastOpts["parent"]), gff3])
            subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased)
            gff3_rebased.close()

            # Replace original gff3 file
            shutil.copy(gff3_rebased.name, gff3)
            os.unlink(gff3_rebased.name)
        url = "%s.gff3" % trackData["label"]
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        # Sort/bgzip/tabix into dest.gz; the track references the .gz name.
        self._sort_gff(gff3, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {"locationType": "UriLocation", "uri": url},
                "index": {
                    "location": {"locationType": "UriLocation", "uri": url + ".tbi"}
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)
        # The temp GFF3 is no longer needed once sorted/compressed into outdir.
        os.unlink(gff3)
|  | 409 | 
|  | 410     def add_bigwig(self, data, trackData): | 
| 5 | 411         fname = trackData["name"] | 
|  | 412         if self.standalone == "complete": | 
|  | 413             dest = os.path.realpath(os.path.join(self.outdir, fname)) | 
|  | 414             url = fname | 
|  | 415             cmd = ["cp", data, dest] | 
|  | 416             self.subprocess_check_call(cmd) | 
|  | 417         else: | 
|  | 418             dsId = trackData["metadata"]["dataset_id"] | 
|  | 419             url = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId) | 
| 0 | 420         tId = trackData["label"] | 
|  | 421         trackDict = { | 
|  | 422             "type": "QuantitativeTrack", | 
|  | 423             "trackId": tId, | 
| 5 | 424             "name": fname, | 
| 0 | 425             "assemblyNames": [ | 
|  | 426                 self.genome_name, | 
|  | 427             ], | 
|  | 428             "adapter": { | 
|  | 429                 "type": "BigWigAdapter", | 
|  | 430                 "bigWigLocation": {"locationType": "UriLocation", "uri": url}, | 
|  | 431             }, | 
|  | 432             "displays": [ | 
|  | 433                 { | 
|  | 434                     "type": "LinearWiggleDisplay", | 
|  | 435                     "displayId": "%s-LinearWiggleDisplay" % tId, | 
|  | 436                 } | 
|  | 437             ], | 
|  | 438         } | 
|  | 439         self.tracksToAdd.append(trackDict) | 
|  | 440         self.trackIdlist.append(tId) | 
|  | 441 | 
|  | 442     def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs): | 
|  | 443         tId = trackData["label"] | 
| 5 | 444         fname = "%s.bam" % trackData["label"] | 
|  | 445         dest = os.path.realpath("%s/%s" % (self.outdir, fname)) | 
|  | 446         if self.standalone == "minimal": | 
|  | 447             dsId = trackData["metadata"]["dataset_id"] | 
|  | 448             url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId) | 
|  | 449         else: | 
|  | 450             url = fname | 
|  | 451             self.symlink_or_copy(data, dest) | 
| 0 | 452         if bam_index is not None and os.path.exists(os.path.realpath(bam_index)): | 
|  | 453             # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest | 
|  | 454             self.subprocess_check_call( | 
|  | 455                 ["cp", os.path.realpath(bam_index), dest + ".bai"] | 
|  | 456             ) | 
|  | 457         else: | 
|  | 458             # Can happen in exotic condition | 
|  | 459             # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam | 
|  | 460             #      => no index generated by galaxy, but there might be one next to the symlink target | 
|  | 461             #      this trick allows to skip the bam sorting made by galaxy if already done outside | 
|  | 462             if os.path.exists(os.path.realpath(data) + ".bai"): | 
|  | 463                 self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai") | 
|  | 464             else: | 
|  | 465                 log.warn("Could not find a bam index (.bai file) for %s", data) | 
|  | 466         trackDict = { | 
|  | 467             "type": "AlignmentsTrack", | 
|  | 468             "trackId": tId, | 
|  | 469             "name": trackData["name"], | 
|  | 470             "assemblyNames": [self.genome_name], | 
|  | 471             "adapter": { | 
|  | 472                 "type": "BamAdapter", | 
|  | 473                 "bamLocation": {"locationType": "UriLocation", "uri": url}, | 
|  | 474                 "index": { | 
| 5 | 475                     "location": {"locationType": "UriLocation", "uri": fname + ".bai"} | 
| 0 | 476                 }, | 
|  | 477                 "sequenceAdapter": { | 
|  | 478                     "type": "IndexedFastaAdapter", | 
|  | 479                     "fastaLocation": { | 
|  | 480                         "locationType": "UriLocation", | 
|  | 481                         "uri": self.genome_path, | 
|  | 482                     }, | 
|  | 483                     "faiLocation": { | 
|  | 484                         "locationType": "UriLocation", | 
|  | 485                         "uri": self.genome_fai_path, | 
|  | 486                     }, | 
|  | 487                     "metadataLocation": { | 
|  | 488                         "locationType": "UriLocation", | 
|  | 489                         "uri": "/path/to/fa.metadata.yaml", | 
|  | 490                     }, | 
|  | 491                 }, | 
|  | 492             }, | 
|  | 493         } | 
|  | 494         self.tracksToAdd.append(trackDict) | 
|  | 495         self.trackIdlist.append(tId) | 
|  | 496 | 
|  | 497     def add_vcf(self, data, trackData): | 
|  | 498         tId = trackData["label"] | 
|  | 499         url = "%s/api/datasets/%s/display" % ( | 
|  | 500             self.giURL, | 
|  | 501             trackData["metadata"]["dataset_id"], | 
|  | 502         ) | 
|  | 503         url = "%s.vcf.gz" % tId | 
|  | 504         dest = os.path.realpath("%s/%s" % (self.outdir, url)) | 
|  | 505         cmd = "bgzip -c %s  > %s" % (data, dest) | 
|  | 506         self.subprocess_popen(cmd) | 
|  | 507         cmd = ["tabix", "-p", "vcf", dest] | 
|  | 508         self.subprocess_check_call(cmd) | 
|  | 509         trackDict = { | 
|  | 510             "type": "VariantTrack", | 
|  | 511             "trackId": tId, | 
|  | 512             "name": trackData["name"], | 
|  | 513             "assemblyNames": [self.genome_name], | 
|  | 514             "adapter": { | 
|  | 515                 "type": "VcfTabixAdapter", | 
|  | 516                 "vcfGzLocation": {"uri": url, "locationType": "UriLocation"}, | 
|  | 517                 "index": { | 
|  | 518                     "location": {"uri": url + ".tbi", "locationType": "UriLocation"} | 
|  | 519                 }, | 
|  | 520             }, | 
|  | 521             "displays": [ | 
|  | 522                 { | 
|  | 523                     "type": "LinearVariantDisplay", | 
|  | 524                     "displayId": "%s-LinearVariantDisplay" % tId, | 
|  | 525                 }, | 
|  | 526                 { | 
|  | 527                     "type": "ChordVariantDisplay", | 
|  | 528                     "displayId": "%s-ChordVariantDisplay" % tId, | 
|  | 529                 }, | 
|  | 530                 { | 
|  | 531                     "type": "LinearPairedArcDisplay", | 
|  | 532                     "displayId": "%s-LinearPairedArcDisplay" % tId, | 
|  | 533                 }, | 
|  | 534             ], | 
|  | 535         } | 
|  | 536         self.tracksToAdd.append(trackDict) | 
|  | 537         self.trackIdlist.append(tId) | 
|  | 538 | 
|  | 539     def _sort_gff(self, data, dest): | 
|  | 540         # Only index if not already done | 
|  | 541         if not os.path.exists(dest + ".gz"): | 
|  | 542             cmd = "jbrowse sort-gff %s | bgzip -c > %s.gz" % ( | 
|  | 543                 data, | 
|  | 544                 dest, | 
|  | 545             )  # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'" | 
|  | 546             self.subprocess_popen(cmd) | 
|  | 547             self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"]) | 
|  | 548 | 
|  | 549     def _sort_bed(self, data, dest): | 
|  | 550         # Only index if not already done | 
|  | 551         if not os.path.exists(dest): | 
| 5 | 552             cmd = "sort -k1,1 -k2,2n %s | bgzip -c > %s" % (data, dest) | 
|  | 553             self.subprocess_popen(cmd) | 
|  | 554             cmd = ["tabix", "-f", "-p", "bed", dest] | 
|  | 555             self.subprocess_check_call(cmd) | 
| 0 | 556 | 
|  | 557     def add_gff(self, data, ext, trackData): | 
|  | 558         url = "%s.%s" % (trackData["label"], ext) | 
|  | 559         dest = os.path.realpath("%s/%s" % (self.outdir, url)) | 
|  | 560         self._sort_gff(data, dest) | 
|  | 561         url = url + ".gz" | 
|  | 562         tId = trackData["label"] | 
|  | 563         trackDict = { | 
|  | 564             "type": "FeatureTrack", | 
|  | 565             "trackId": tId, | 
|  | 566             "name": trackData["name"], | 
|  | 567             "assemblyNames": [self.genome_name], | 
|  | 568             "adapter": { | 
|  | 569                 "type": "Gff3TabixAdapter", | 
|  | 570                 "gffGzLocation": {"locationType": "UriLocation", "uri": url}, | 
|  | 571                 "index": { | 
| 2 | 572                     "location": {"uri": url + ".tbi", "locationType": "UriLocation"} | 
| 0 | 573                 }, | 
|  | 574             }, | 
|  | 575             "displays": [ | 
|  | 576                 { | 
|  | 577                     "type": "LinearBasicDisplay", | 
|  | 578                     "displayId": "%s-LinearBasicDisplay" % tId, | 
|  | 579                 }, | 
|  | 580                 {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, | 
|  | 581             ], | 
|  | 582         } | 
|  | 583         self.tracksToAdd.append(trackDict) | 
|  | 584         self.trackIdlist.append(tId) | 
|  | 585 | 
|  | 586     def add_bed(self, data, ext, trackData): | 
|  | 587         url = "%s.%s" % (trackData["label"], ext) | 
| 5 | 588         dest = os.path.realpath("%s/%s.gz" % (self.outdir, url)) | 
| 0 | 589         self._sort_bed(data, dest) | 
|  | 590         tId = trackData["label"] | 
| 2 | 591         url = url + ".gz" | 
| 0 | 592         trackDict = { | 
|  | 593             "type": "FeatureTrack", | 
|  | 594             "trackId": tId, | 
|  | 595             "name": trackData["name"], | 
|  | 596             "assemblyNames": [self.genome_name], | 
|  | 597             "adapter": { | 
| 2 | 598                 "type": "BedTabixAdapter", | 
|  | 599                 "bedGzLocation": {"locationType": "UriLocation", "uri": url}, | 
|  | 600                 "index": { | 
|  | 601                     "location": {"uri": url + ".tbi", "locationType": "UriLocation"} | 
|  | 602                 }, | 
| 0 | 603             }, | 
|  | 604             "displays": [ | 
|  | 605                 { | 
|  | 606                     "type": "LinearBasicDisplay", | 
|  | 607                     "displayId": "%s-LinearBasicDisplay" % tId, | 
|  | 608                 }, | 
|  | 609                 {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, | 
|  | 610             ], | 
|  | 611         } | 
|  | 612         self.tracksToAdd.append(trackDict) | 
|  | 613         self.trackIdlist.append(tId) | 
|  | 614 | 
|  | 615     def process_annotations(self, track): | 
|  | 616         category = track["category"].replace("__pd__date__pd__", TODAY) | 
|  | 617         for i, ( | 
|  | 618             dataset_path, | 
|  | 619             dataset_ext, | 
|  | 620             track_human_label, | 
|  | 621             extra_metadata, | 
|  | 622         ) in enumerate(track["trackfiles"]): | 
|  | 623             # Unsanitize labels (element_identifiers are always sanitized by Galaxy) | 
|  | 624             for key, value in mapped_chars.items(): | 
|  | 625                 track_human_label = track_human_label.replace(value, key) | 
|  | 626             outputTrackConfig = { | 
|  | 627                 "category": category, | 
|  | 628             } | 
|  | 629             if self.debug: | 
|  | 630                 log.info( | 
|  | 631                     "Processing category = %s, track_human_label = %s", | 
|  | 632                     category, | 
|  | 633                     track_human_label, | 
|  | 634                 ) | 
|  | 635             # We add extra data to hash for the case of REST + SPARQL. | 
|  | 636             if ( | 
|  | 637                 "conf" in track | 
|  | 638                 and "options" in track["conf"] | 
|  | 639                 and "url" in track["conf"]["options"] | 
|  | 640             ): | 
|  | 641                 rest_url = track["conf"]["options"]["url"] | 
|  | 642             else: | 
|  | 643                 rest_url = "" | 
|  | 644 | 
|  | 645             # I chose to use track['category'] instead of 'category' here. This | 
|  | 646             # is intentional. This way re-running the tool on a different date | 
|  | 647             # will not generate different hashes and make comparison of outputs | 
|  | 648             # much simpler. | 
|  | 649             hashData = [ | 
|  | 650                 str(dataset_path), | 
|  | 651                 track_human_label, | 
|  | 652                 track["category"], | 
|  | 653                 rest_url, | 
|  | 654             ] | 
|  | 655             hashData = "|".join(hashData).encode("utf-8") | 
|  | 656             outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i | 
|  | 657             outputTrackConfig["metadata"] = extra_metadata | 
|  | 658             outputTrackConfig["name"] = track_human_label | 
|  | 659 | 
|  | 660             if dataset_ext in ("gff", "gff3"): | 
|  | 661                 self.add_gff( | 
|  | 662                     dataset_path, | 
|  | 663                     dataset_ext, | 
|  | 664                     outputTrackConfig, | 
|  | 665                 ) | 
|  | 666             elif dataset_ext in ("hic",): | 
|  | 667                 self.add_hic( | 
|  | 668                     dataset_path, | 
|  | 669                     outputTrackConfig, | 
|  | 670                 ) | 
|  | 671             elif dataset_ext in ("bed",): | 
|  | 672                 self.add_bed( | 
|  | 673                     dataset_path, | 
|  | 674                     dataset_ext, | 
|  | 675                     outputTrackConfig, | 
|  | 676                 ) | 
|  | 677             elif dataset_ext in ("maf",): | 
|  | 678                 self.add_maf( | 
|  | 679                     dataset_path, | 
|  | 680                     outputTrackConfig, | 
|  | 681                 ) | 
|  | 682             elif dataset_ext == "bigwig": | 
|  | 683                 self.add_bigwig( | 
|  | 684                     dataset_path, | 
|  | 685                     outputTrackConfig, | 
|  | 686                 ) | 
|  | 687             elif dataset_ext == "bam": | 
|  | 688                 real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][ | 
|  | 689                     "bam_index" | 
|  | 690                 ] | 
|  | 691                 if not isinstance(real_indexes, list): | 
|  | 692                     # <bam_indices> | 
|  | 693                     #  <bam_index>/path/to/a.bam.bai</bam_index> | 
|  | 694                     # </bam_indices> | 
|  | 695                     # | 
|  | 696                     # The above will result in the 'bam_index' key containing a | 
|  | 697                     # string. If there are two or more indices, the container | 
|  | 698                     # becomes a list. Fun! | 
|  | 699                     real_indexes = [real_indexes] | 
|  | 700 | 
|  | 701                 self.add_bam( | 
|  | 702                     dataset_path, | 
|  | 703                     outputTrackConfig, | 
|  | 704                     track["conf"]["options"]["pileup"], | 
|  | 705                     bam_index=real_indexes[i], | 
|  | 706                 ) | 
|  | 707             elif dataset_ext == "blastxml": | 
|  | 708                 self.add_blastxml( | 
|  | 709                     dataset_path, outputTrackConfig, track["conf"]["options"]["blast"] | 
|  | 710                 ) | 
|  | 711             elif dataset_ext == "vcf": | 
|  | 712                 self.add_vcf(dataset_path, outputTrackConfig) | 
|  | 713             else: | 
|  | 714                 log.warn("Do not know how to handle %s", dataset_ext) | 
|  | 715 | 
|  | 716     def clone_jbrowse(self, jbrowse_dir, destination, minimal=False): | 
|  | 717         """Clone a JBrowse directory into a destination directory.""" | 
|  | 718         cmd = ["jbrowse", "create", "-f", self.outdir] | 
|  | 719         self.subprocess_check_call(cmd) | 
|  | 720         for fn in [ | 
|  | 721             "asset-manifest.json", | 
|  | 722             "favicon.ico", | 
|  | 723             "robots.txt", | 
|  | 724             "umd_plugin.js", | 
|  | 725             "version.txt", | 
|  | 726             "test_data", | 
|  | 727         ]: | 
|  | 728             cmd = ["rm", "-rf", os.path.join(self.outdir, fn)] | 
|  | 729             self.subprocess_check_call(cmd) | 
|  | 730 | 
|  | 731 | 
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="", epilog="")
    parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration")

    parser.add_argument("--jbrowse", help="Folder containing a jbrowse release")
    parser.add_argument("--outdir", help="Output directory", default="out")
    parser.add_argument(
        "--standalone",
        choices=["complete", "minimal", "data"],
        help="Standalone mode includes a copy of JBrowse",
    )
    parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0")
    args = parser.parse_args()

    tree = ET.parse(args.xml.name)
    root = tree.getroot()

    # This should be done ASAP
    GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text
    # Sometimes this comes as `localhost` without a protocol
    if not GALAXY_INFRASTRUCTURE_URL.startswith("http"):
        # so we'll prepend `http://` and hope for the best. Requests *should*
        # be GET and not POST so it should redirect OK
        GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL

    jc = JbrowseConnector(
        jbrowse=args.jbrowse,
        outdir=args.outdir,
        genomes=[
            {
                "path": os.path.realpath(x.attrib["path"]),
                "meta": metadata_from_node(x.find("metadata")),
            }
            for x in root.findall("metadata/genomes/genome")
        ],
        standalone=args.standalone,
    )
    jc.process_genomes()

    for track in root.findall("tracks/track"):
        track_conf = {}
        track_conf["trackfiles"] = []

        # BUG FIX: an ElementTree Element with no children is falsy, so the
        # previous `if track.find(...)` test was False even when the element
        # existed; compare against None explicitly. (The old `except KeyError`
        # was dead code — Element.find() does not raise KeyError.)
        multi_bigwig_el = track.find("options/wiggle/multibigwig")
        is_multi_bigwig = (
            multi_bigwig_el is not None and multi_bigwig_el.text == "True"
        )
        multi_bigwig_paths = []

        trackfiles = track.findall("files/trackFile")
        if trackfiles:
            for x in trackfiles:
                if is_multi_bigwig:
                    multi_bigwig_paths.append(
                        (x.attrib["label"], os.path.realpath(x.attrib["path"]))
                    )
                else:
                    metadata = metadata_from_node(x.find("metadata"))
                    track_conf["dataset_id"] = metadata["dataset_id"]
                    track_conf["trackfiles"].append(
                        (
                            os.path.realpath(x.attrib["path"]),
                            x.attrib["ext"],
                            x.attrib["label"],
                            metadata,
                        )
                    )
        else:
            # For tracks without files (rest, sparql)
            track_conf["trackfiles"].append(
                (
                    "",  # N/A, no path for rest or sparql
                    track.attrib["format"],
                    track.find("options/label").text,
                    {},
                )
            )

        if is_multi_bigwig:
            # (Removed an unused `metadata = metadata_from_node(x.find(...))`
            # here — it referenced the loop variable `x`, which is unbound
            # when the track has no <trackFile> children, and its result was
            # never used.)
            track_conf["trackfiles"].append(
                (
                    multi_bigwig_paths,  # Passing an array of paths to represent as one track
                    "bigwig_multiple",
                    "MultiBigWig",  # Giving an hardcoded name for now
                    {},  # No metadata for multiple bigwig
                )
            )

        track_conf["category"] = track.attrib["cat"]
        track_conf["format"] = track.attrib["format"]
        try:
            # Only pertains to gff3 + blastxml. TODO?
            # find() returns None when <options/style> is absent; iterating
            # None raises TypeError, which we use as the "no style" signal.
            track_conf["style"] = {t.tag: t.text for t in track.find("options/style")}
        except TypeError:
            track_conf["style"] = {}
        track_conf["conf"] = etree_to_dict(track.find("options"))
        jc.process_annotations(track_conf)
        print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist)
    print(
        "###done processing, trackIdlist=",
        jc.trackIdlist,
        "config=",
        str(jc.config_json),
    )
    jc.config_json["tracks"] = jc.tracksToAdd
    jc.write_config()
    jc.add_default_view()