#!/usr/bin/env python
# Accumulate all configuration for config.json, starting from the default produced by the clone.
import argparse
import datetime
import hashlib
import json
import logging
import os
import shutil
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from collections import defaultdict

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("jbrowse")
TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
GALAXY_INFRASTRUCTURE_URL = None
mapped_chars = {
    ">": "__gt__",
    "<": "__lt__",
    "'": "__sq__",
    '"': "__dq__",
    "[": "__ob__",
    "]": "__cb__",
    "{": "__oc__",
    "}": "__cc__",
    "@": "__at__",
    "#": "__pd__",
    "": "__cn__",
}
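# These tokens are Galaxy's label-sanitization markers; process_annotations()
# replaces each token with its original character, e.g. a (hypothetical)
# sanitized label "my__sq__track__sq__" becomes "my'track'". With the
# empty-string entry, "__cn__" is simply stripped.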


def etree_to_dict(t):
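    """Recursively convert an ElementTree node into nested dicts.

    Attributes are stored under '@'-prefixed keys and text under '#text'
    when attributes or children are present. A minimal sketch with a
    hypothetical input:

        etree_to_dict(ET.fromstring('<track cat="A"><label>x</label></track>'))
        # -> roughly {'track': {'label': 'x', '@cat': 'A'}}
    """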
    if t is None:
        return {}

    d = {t.tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in map(etree_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
    if t.attrib:
        d[t.tag].update(("@" + k, v) for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            if text:
                d[t.tag]["#text"] = text
        else:
            d[t.tag] = text
    return d


INSTALLED_TO = os.path.dirname(os.path.realpath(__file__))


def metadata_from_node(node):
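    """Flatten a <metadata> node into a dict of prefixed keys.

    Child elements such as <dataset>, <history>, <metadata> and <tool> are
    expected; each attribute becomes a prefixed key, e.g. a hypothetical
    <dataset id="42"/> yields {"dataset_id": "42"}. Returns an empty dict if
    there is not exactly one <dataset> child. Some values are then rewritten
    as HTML links back to the Galaxy instance.
    """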
    metadata = {}
    try:
        if len(node.findall("dataset")) != 1:
            # exit early
            return metadata
    except Exception:
        return {}

    for (key, value) in node.findall("dataset")[0].attrib.items():
        metadata["dataset_%s" % key] = value

    for (key, value) in node.findall("history")[0].attrib.items():
        metadata["history_%s" % key] = value

    for (key, value) in node.findall("metadata")[0].attrib.items():
        metadata["metadata_%s" % key] = value

    for (key, value) in node.findall("tool")[0].attrib.items():
        metadata["tool_%s" % key] = value

    # Additional Mappings applied:
    metadata[
        "dataset_edam_format"
    ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
        metadata["dataset_edam_format"], metadata["dataset_file_ext"]
    )
    metadata["history_user_email"] = '<a href="mailto:{0}">{0}</a>'.format(
        metadata["history_user_email"]
    )
    metadata["hist_name"] = metadata["history_display_name"]
    metadata[
        "history_display_name"
    ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_hist_id=metadata["history_id"],
        hist_name=metadata["history_display_name"],
    )
    metadata[
        "tool_tool"
    ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_id=metadata["dataset_id"],
        tool_id=metadata["tool_tool_id"],
        # tool_version=metadata['tool_tool_version'],
    )
    return metadata


class JbrowseConnector(object):
    def __init__(self, jbrowse, outdir, genomes):
        self.debug = False
        self.usejson = True
        self.giURL = GALAXY_INFRASTRUCTURE_URL
        self.jbrowse = jbrowse
        self.outdir = outdir
        os.makedirs(self.outdir, exist_ok=True)
        self.genome_paths = genomes
        self.trackIdlist = []
        self.tracksToAdd = []
        self.config_json = {}
        self.config_json_file = os.path.realpath(os.path.join(outdir, "config.json"))
        self.clone_jbrowse(self.jbrowse, self.outdir)

    def subprocess_check_call(self, command, output=None):
        if output:
            if self.debug:
                log.debug("cd %s && %s >  %s", self.outdir, " ".join(command), output)
            subprocess.check_call(command, cwd=self.outdir, stdout=output)
        else:
            log.debug("cd %s && %s", self.outdir, " ".join(command))
            subprocess.check_call(command, cwd=self.outdir)

    def subprocess_popen(self, command):
        if self.debug:
            log.debug("cd %s && %s", self.outdir, command)
        p = subprocess.Popen(
            command,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        output, err = p.communicate()
        retcode = p.returncode
        if retcode != 0:
            log.error("cd %s && %s", self.outdir, command)
            log.error(output)
            log.error(err)
            raise RuntimeError("Command failed with exit code %s" % (retcode))

    def subprocess_check_output(self, command):
        if self.debug:
            log.debug("cd %s && %s", self.outdir, " ".join(command))
        return subprocess.check_output(command, cwd=self.outdir)

    def _jbrowse_bin(self, command):
        return os.path.realpath(os.path.join(self.jbrowse, "bin", command))

    def symlink_or_copy(self, src, dest):
        if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool(
            os.environ["GALAXY_JBROWSE_SYMLINKS"]
        ):
            cmd = ["ln", "-s", src, dest]
        else:
            cmd = ["cp", src, dest]

        return self.subprocess_check_call(cmd)

    def process_genomes(self):
        assemblies = []
        for i, genome_node in enumerate(self.genome_paths):
            if self.debug:
                log.info("genome_node=%s" % str(genome_node))
            genome_name = genome_node["meta"]["dataset_dname"]
            fapath = genome_node["path"]
            faname = genome_name + ".fa.gz"
            fadest = os.path.realpath(os.path.join(self.outdir, faname))
            cmd = "bgzip -i -c %s > %s && samtools faidx %s" % (
                fapath,
                fadest,
                fadest,
            )
            self.subprocess_popen(cmd)
            adapter = {
                "type": "BgzipFastaAdapter",
                "fastaLocation": {
                    "uri": faname,
                },
                "faiLocation": {
                    "uri": faname + ".fai",
                },
                "gziLocation": {
                    "uri": faname + ".gzi",
                },
            }
            trackDict = {
                "name": genome_name,
                "sequence": {
                    "type": "ReferenceSequenceTrack",
                    "trackId": genome_name,
                    "adapter": adapter,
                },
                "rendering": {"type": "DivSequenceRenderer"},
            }
            assemblies.append(trackDict)
        self.genome_name = genome_name
        if self.usejson:
            self.config_json["assemblies"] = assemblies
        else:
            cmd = [
                "jbrowse",
                "add-assembly",
                faname,
                "-t",
                "bgzipFasta",
                "-n",
                genome_name,
                "--load",
                "inPlace",
                "--faiLocation",
                faname + ".fai",
                "--gziLocation",
                faname + ".gzi",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def add_default_view(self):
        cmd = [
            "jbrowse",
            "set-default-session",
            "-s",
            self.config_json_file,
            "-t",
            ",".join(self.trackIdlist),
            "-n",
            "JBrowse2 in Galaxy",
            "--target",
            self.config_json_file,
            "-v",
            " LinearGenomeView",
        ]
        if self.debug:
            log.info("### calling set-default-session with cmd=%s" % "  ".join(cmd))
        self.subprocess_check_call(cmd)

    def write_config(self):
        with open(self.config_json_file, "w") as fp:
            json.dump(self.config_json, fp)

    def add_hic(self, data, trackData):
        """
        HiC adapter.
        https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md
        for testing locally, these work:
        HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic
        using hg19 reference track as a
        'BgzipFastaAdapter'
            fastaLocation:
            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz',
            faiLocation:
            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai',
            gziLocation:
            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi',
        Cool is unlikely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438
        """
        log.info("#### trackData=%s" % trackData)
        tId = trackData["label"]
        dsId = trackData["metadata"]["dataset_id"]
        url = "%s/api/datasets/%s/display?to_ext=hic " % (
            self.giURL,
            dsId,
        )
        hname = trackData["name"]
        dest = os.path.realpath(os.path.join(self.outdir, hname))
        url = hname
        cmd = ["cp", data, dest]
        self.subprocess_check_call(cmd)
        floc = {
            "uri": hname,
        }
        trackDict = {
            "type": "HicTrack",
            "trackId": tId,
            "name": hname,
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "HicAdapter",
                "hicLocation": floc,
            },
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "HicTrack",
                "-a",
                self.genome_name,
                "-n",
                hname,
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def add_maf(self, data, trackData):
        """
        from https://github.com/cmdcolin/maf2bed
        Note: both formats start from a MAF as input, and the MAF file must contain
        the species name and chromosome name in its sequence identifiers, e.g. hg38.chr1.
        maf2bed.pl needs the reference id (e.g. hg18) as its first parameter.
        """
        mafPlugin = {
            "plugins": [
                {
                    "name": "MafViewer",
                    "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js",
                }
            ]
        }
        tId = trackData["label"]
        fname = "%s.bed" % tId
        dest = os.path.realpath("%s/%s" % (self.outdir, fname))
        # self.symlink_or_copy(data, dest)
        # Process MAF to bed-like. Need build to munge chromosomes
        gname = self.genome_name
        cmd = [
            "bash",
            os.path.join(INSTALLED_TO, "convertMAF.sh"),
            data,
            gname,
            INSTALLED_TO,
            dest,
        ]
        self.subprocess_check_call(cmd)
        if self.debug:
            log.info("### convertMAF.sh called as %s" % " ".join(cmd))
        # Construct samples list
        # We could get this from galaxy metadata, not sure how easily.
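        # For illustration: a (hypothetical) MAF alignment line such as
        #   s hg38.chr1 1000 50 + 248956422 ACGT...
        # matches the "^s [^ ]*" grep below as "s hg38.chr1"; after splitting,
        # only the species part ("hg38") is kept for the adapter's samples list.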
        ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE)
        output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout)
        ps.wait()
        outp = output.decode("ascii")
        soutp = outp.split("\n")
        samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")]
        samples = [x.split(".")[0] for x in samp]
        if self.debug:
            log.info("### got samples = %s " % (samples))
        trackDict = {
            "type": "MafTrack",
            "trackId": tId,
            "name": trackData["name"],
            "adapter": {
                "type": "MafTabixAdapter",
                "samples": samples,
                "bedGzLocation": {
                    "uri": fname + ".sorted.bed.gz",
                },
                "index": {
                    "location": {
                        "uri": fname + ".sorted.bed.gz.tbi",
                    },
                },
            },
            "assemblyNames": [self.genome_name],
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)
        if self.config_json.get("plugins", None):
            self.config_json["plugins"].append(mafPlugin["plugins"][0])
        else:
            self.config_json.update(mafPlugin)

    def _blastxml_to_gff3(self, xml, min_gap=10):
        gff3_unrebased = tempfile.NamedTemporaryFile(delete=False)
        cmd = [
            "python",
            os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"),
            "--trim",
            "--trim_end",
            "--include_seq",
            "--min_gap",
            str(min_gap),
            xml,
        ]
        subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased)
        gff3_unrebased.close()
        return gff3_unrebased.name

    def add_blastxml(self, data, trackData, blastOpts, **kwargs):
        gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"])

        if "parent" in blastOpts and blastOpts["parent"] != "None":
            gff3_rebased = tempfile.NamedTemporaryFile(delete=False)
            cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")]
            if blastOpts.get("protein", "false") == "true":
                cmd.append("--protein2dna")
            cmd.extend([os.path.realpath(blastOpts["parent"]), gff3])
            subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased)
            gff3_rebased.close()

            # Replace original gff3 file
            shutil.copy(gff3_rebased.name, gff3)
            os.unlink(gff3_rebased.name)
        url = "%s.gff3" % trackData["label"]
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self._sort_gff(gff3, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "FeatureTrack",
                "-a",
                self.genome_name,
                "--indexFile",
                url + ".tbi",
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)
        os.unlink(gff3)

    def add_bigwig(self, data, trackData):
        url = "%s.bw" % trackData["name"]
        dest = os.path.realpath(os.path.join(self.outdir, url))
        cmd = ["cp", data, dest]
        self.subprocess_check_call(cmd)
        bwloc = {"uri": url}
        tId = trackData["label"]
        trackDict = {
            "type": "QuantitativeTrack",
            "trackId": tId,
            "name": url,
            "assemblyNames": [
                self.genome_name,
            ],
            "adapter": {
                "type": "BigWigAdapter",
                "bigWigLocation": bwloc,
            },
            "displays": [
                {
                    "type": "LinearWiggleDisplay",
                    "displayId": "%s-LinearWiggleDisplay" % tId,
                }
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "QuantitativeTrack",
                "-a",
                self.genome_name,
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs):
        tId = trackData["label"]
        fname = "%s.bam" % trackData["label"]
        dest = os.path.realpath("%s/%s" % (self.outdir, fname))
        url = fname
        self.subprocess_check_call(["cp", data, dest])
        log.info("### copied %s to %s" % (data, dest))
        bloc = {"uri": url}
        if bam_index is not None and os.path.exists(os.path.realpath(bam_index)):
            # The .bai index was most probably made by Galaxy and stored in Galaxy dirs; copy it next to dest
            self.subprocess_check_call(
                ["cp", os.path.realpath(bam_index), dest + ".bai"]
            )
        else:
            # Can happen in exotic conditions,
            # e.g. if a bam was imported as a symlink with datatype=unsorted.bam, then the datatype changed to bam
            #      => no index was generated by Galaxy, but there might be one next to the symlink target.
            #      This trick allows skipping the bam sorting done by Galaxy if it was already done outside.
            if os.path.exists(os.path.realpath(data) + ".bai"):
                self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai")
            else:
                log.warning("Could not find a bam index (.bai file) for %s", data)
        trackDict = {
            "type": "AlignmentsTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "BamAdapter",
                "bamLocation": bloc,
                "index": {
                    "location": {
                        "uri": fname + ".bai",
                    }
                },
            },
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                fname,
                "-t",
                "AlignmentsTrack",
                "-l",
                "inPlace",
                "-a",
                self.genome_name,
                "--indexFile",
                fname + ".bai",
                "-n",
                trackData["name"],
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def add_vcf(self, data, trackData):
        tId = trackData["label"]
        url = "%s/api/datasets/%s/display" % (
            self.giURL,
            trackData["metadata"]["dataset_id"],
        )
        url = "%s.vcf.gz" % tId
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        cmd = "bgzip -c %s  > %s" % (data, dest)
        self.subprocess_popen(cmd)
        cmd = ["tabix", "-p", "vcf", dest]
        self.subprocess_check_call(cmd)
        trackDict = {
            "type": "VariantTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "VcfTabixAdapter",
                "vcfGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearVariantDisplay",
                    "displayId": "%s-LinearVariantDisplay" % tId,
                },
                {
                    "type": "ChordVariantDisplay",
                    "displayId": "%s-ChordVariantDisplay" % tId,
                },
                {
                    "type": "LinearPairedArcDisplay",
                    "displayId": "%s-LinearPairedArcDisplay" % tId,
                },
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "VariantTrack",
                "-a",
                self.genome_name,
                "--indexFile",
                url + ".tbi",
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def _sort_gff(self, data, dest):
        # Only index if not already done
        if not os.path.exists(dest + ".gz"):
            cmd = "jbrowse sort-gff %s | bgzip -c > %s.gz" % (
                data,
                dest,
            )  # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'"
            self.subprocess_popen(cmd)
            self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"])

    def _sort_bed(self, data, dest):
        # Only index if not already done
        if not os.path.exists(dest):
            cmd = "sort -k1,1 -k2,2n %s | bgzip -c > %s" % (data, dest)
            self.subprocess_popen(cmd)
            cmd = ["tabix", "-f", "-p", "bed", dest]
            self.subprocess_check_call(cmd)

    def add_gff(self, data, ext, trackData):
        url = "%s.%s" % (trackData["label"], ext)
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self._sort_gff(data, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "FeatureTrack",
                "-a",
                self.genome_name,
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def add_bed(self, data, ext, trackData):
        url = "%s.%s" % (trackData["label"], ext)
        dest = os.path.realpath("%s/%s.gz" % (self.outdir, url))
        self._sort_bed(data, dest)
        tId = trackData["label"]
        url = url + ".gz"
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "BedTabixAdapter",
                "bedGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "FeatureTrack",
                "-a",
                self.genome_name,
                "--indexFile",
                url + ".tbi",
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def process_annotations(self, track):
        category = track["category"].replace("__pd__date__pd__", TODAY)
        for i, (
            dataset_path,
            dataset_ext,
            track_human_label,
            extra_metadata,
        ) in enumerate(track["trackfiles"]):
            # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
            for key, value in mapped_chars.items():
                track_human_label = track_human_label.replace(value, key)
            outputTrackConfig = {
                "category": category,
            }
            if self.debug:
                log.info(
                    "Processing category = %s, track_human_label = %s",
                    category,
                    track_human_label,
                )
            # We add extra data to hash for the case of REST + SPARQL.
            if (
                "conf" in track
                and "options" in track["conf"]
                and "url" in track["conf"]["options"]
            ):
                rest_url = track["conf"]["options"]["url"]
            else:
                rest_url = ""

            # I chose to use track['category'] instead of 'category' here. This
            # is intentional. This way re-running the tool on a different date
            # will not generate different hashes and make comparison of outputs
            # much simpler.
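            # For illustration (hypothetical values): hashing
            # "/data/x.gff3|My track|Annotations|" gives a 32-character hex
            # digest, so the generated label looks like "<md5hex>_0" and stays
            # stable across reruns of the same inputs.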
            hashData = [
                str(dataset_path),
                track_human_label,
                track["category"],
                rest_url,
            ]
            hashData = "|".join(hashData).encode("utf-8")
            outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i
            outputTrackConfig["metadata"] = extra_metadata
            outputTrackConfig["name"] = track_human_label

            if dataset_ext in ("gff", "gff3"):
                self.add_gff(
                    dataset_path,
                    dataset_ext,
                    outputTrackConfig,
                )
            elif dataset_ext in ("hic",):
                self.add_hic(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext in ("bed",):
                self.add_bed(
                    dataset_path,
                    dataset_ext,
                    outputTrackConfig,
                )
            elif dataset_ext in ("maf",):
                self.add_maf(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext == "bigwig":
                self.add_bigwig(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext == "bam":
                real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][
                    "bam_index"
                ]
                if not isinstance(real_indexes, list):
                    # <bam_indices>
                    #  <bam_index>/path/to/a.bam.bai</bam_index>
                    # </bam_indices>
                    #
                    # The above will result in the 'bam_index' key containing a
                    # string. If there are two or more indices, the container
                    # becomes a list. Fun!
                    real_indexes = [real_indexes]

                self.add_bam(
                    dataset_path,
                    outputTrackConfig,
                    track["conf"]["options"]["pileup"],
                    bam_index=real_indexes[i],
                )
            elif dataset_ext == "blastxml":
                self.add_blastxml(
                    dataset_path, outputTrackConfig, track["conf"]["options"]["blast"]
                )
            elif dataset_ext == "vcf":
                self.add_vcf(dataset_path, outputTrackConfig)
            else:
                log.warning("Do not know how to handle %s", dataset_ext)

    def clone_jbrowse(self, jbrowse_dir, destination):
        """Create a fresh JBrowse2 build in self.outdir and prune files not needed
        for serving; the jbrowse_dir/destination arguments are currently unused."""
        cmd = ["jbrowse", "create", "-f", self.outdir]
        self.subprocess_check_call(cmd)
        for fn in [
            "asset-manifest.json",
            "favicon.ico",
            "robots.txt",
            "umd_plugin.js",
            "version.txt",
            "test_data",
        ]:
            cmd = ["rm", "-rf", os.path.join(self.outdir, fn)]
            self.subprocess_check_call(cmd)
        cmd = ["cp", os.path.join(INSTALLED_TO, "servejb2.py"), self.outdir]
        self.subprocess_check_call(cmd)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="", epilog="")
    parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration")

    parser.add_argument("--jbrowse", help="Folder containing a jbrowse release")
    parser.add_argument("--outdir", help="Output directory", default="out")
    parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0")
    args = parser.parse_args()

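    # The track configuration XML is expected to look roughly like this
    # (hypothetical sketch; only the element paths queried below matter):
    #   <root>
    #     <metadata>
    #       <galaxyUrl>http://localhost:8080</galaxyUrl>
    #       <genomes>
    #         <genome path="/path/to/ref.fa"><metadata>...</metadata></genome>
    #       </genomes>
    #     </metadata>
    #     <tracks>
    #       <track cat="Annotations" format="gff3">
    #         <files><trackFile path="..." ext="gff3" label="..."><metadata>...</metadata></trackFile></files>
    #         <options>...</options>
    #       </track>
    #     </tracks>
    #   </root>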
    tree = ET.parse(args.xml.name)
    root = tree.getroot()

    # This should be done ASAP
    GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text
    # Sometimes this comes as `localhost` without a protocol
    if not GALAXY_INFRASTRUCTURE_URL.startswith("http"):
        # so we'll prepend `http://` and hope for the best. Requests *should*
        # be GET and not POST so it should redirect OK
        GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL

    jc = JbrowseConnector(
        jbrowse=args.jbrowse,
        outdir=args.outdir,
        genomes=[
            {
                "path": os.path.realpath(x.attrib["path"]),
                "meta": metadata_from_node(x.find("metadata")),
            }
            for x in root.findall("metadata/genomes/genome")
        ],
    )
    jc.process_genomes()

    for track in root.findall("tracks/track"):
        track_conf = {}
        track_conf["trackfiles"] = []

        is_multi_bigwig = False
        try:
            if track.find("options/wiggle/multibigwig") and (
                track.find("options/wiggle/multibigwig").text == "True"
            ):
                is_multi_bigwig = True
                multi_bigwig_paths = []
        except KeyError:
            pass

        trackfiles = track.findall("files/trackFile")
        if trackfiles:
            for x in track.findall("files/trackFile"):
                if is_multi_bigwig:
                    multi_bigwig_paths.append(
                        (x.attrib["label"], os.path.realpath(x.attrib["path"]))
                    )
                else:
                    if trackfiles:
                        metadata = metadata_from_node(x.find("metadata"))
                        track_conf["dataset_id"] = metadata["dataset_id"]
                        track_conf["trackfiles"].append(
                            (
                                os.path.realpath(x.attrib["path"]),
                                x.attrib["ext"],
                                x.attrib["label"],
                                metadata,
                            )
                        )
        else:
            # For tracks without files (rest, sparql)
            track_conf["trackfiles"].append(
                (
                    "",  # N/A, no path for rest or sparql
                    track.attrib["format"],
                    track.find("options/label").text,
                    {},
                )
            )

        if is_multi_bigwig:
            metadata = metadata_from_node(x.find("metadata"))

            track_conf["trackfiles"].append(
                (
                    multi_bigwig_paths,  # Passing an array of paths to represent as one track
                    "bigwig_multiple",
                    "MultiBigWig",  # Giving a hardcoded name for now
                    {},  # No metadata for multiple bigwig
                )
            )

        track_conf["category"] = track.attrib["cat"]
        track_conf["format"] = track.attrib["format"]
        try:
            # Only pertains to gff3 + blastxml. TODO?
            track_conf["style"] = {t.tag: t.text for t in track.find("options/style")}
        except TypeError:
            track_conf["style"] = {}
            pass
        track_conf["conf"] = etree_to_dict(track.find("options"))
        jc.process_annotations(track_conf)
        print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist)
    print(
        "###done processing, trackIdlist=",
        jc.trackIdlist,
        "config=",
        str(jc.config_json),
    )
    jc.config_json["tracks"] = jc.tracksToAdd
    if jc.usejson:
        jc.write_config()
    jc.add_default_view()