comparison jbrowse2/jbrowse2_jbrowsenotjson.py @ 6:88b9b105c09b draft

Uploaded
author fubar
date Fri, 05 Jan 2024 01:58:02 +0000
parents
children
comparison
equal deleted inserted replaced
5:42ca8804cd93 6:88b9b105c09b
1 #!/usr/bin/env python
2 # change to accumulating all configuration for config.json based on the default from the clone
3 import argparse
4 import datetime
5 import hashlib
6 import json
7 import logging
8 import os
9 import shutil
10 import subprocess
11 import tempfile
12 import xml.etree.ElementTree as ET
13 from collections import defaultdict
14
15 logging.basicConfig(level=logging.INFO)
16 log = logging.getLogger("jbrowse")
17 TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
18 GALAXY_INFRASTRUCTURE_URL = None
19 mapped_chars = {
20 ">": "__gt__",
21 "<": "__lt__",
22 "'": "__sq__",
23 '"': "__dq__",
24 "[": "__ob__",
25 "]": "__cb__",
26 "{": "__oc__",
27 "}": "__cc__",
28 "@": "__at__",
29 "#": "__pd__",
30 "": "__cn__",
31 }
32
33
34 def etree_to_dict(t):
35 if t is None:
36 return {}
37
38 d = {t.tag: {} if t.attrib else None}
39 children = list(t)
40 if children:
41 dd = defaultdict(list)
42 for dc in map(etree_to_dict, children):
43 for k, v in dc.items():
44 dd[k].append(v)
45 d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
46 if t.attrib:
47 d[t.tag].update(("@" + k, v) for k, v in t.attrib.items())
48 if t.text:
49 text = t.text.strip()
50 if children or t.attrib:
51 if text:
52 d[t.tag]["#text"] = text
53 else:
54 d[t.tag] = text
55 return d
56
57
58 INSTALLED_TO = os.path.dirname(os.path.realpath(__file__))
59
60
61 def metadata_from_node(node):
62 metadata = {}
63 try:
64 if len(node.findall("dataset")) != 1:
65 # exit early
66 return metadata
67 except Exception:
68 return {}
69
70 for (key, value) in node.findall("dataset")[0].attrib.items():
71 metadata["dataset_%s" % key] = value
72
73 for (key, value) in node.findall("history")[0].attrib.items():
74 metadata["history_%s" % key] = value
75
76 for (key, value) in node.findall("metadata")[0].attrib.items():
77 metadata["metadata_%s" % key] = value
78
79 for (key, value) in node.findall("tool")[0].attrib.items():
80 metadata["tool_%s" % key] = value
81
82 # Additional Mappings applied:
83 metadata[
84 "dataset_edam_format"
85 ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
86 metadata["dataset_edam_format"], metadata["dataset_file_ext"]
87 )
88 metadata["history_user_email"] = '<a href="mailto:{0}">{0}</a>'.format(
89 metadata["history_user_email"]
90 )
91 metadata["hist_name"] = metadata["history_display_name"]
92 metadata[
93 "history_display_name"
94 ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
95 galaxy=GALAXY_INFRASTRUCTURE_URL,
96 encoded_hist_id=metadata["history_id"],
97 hist_name=metadata["history_display_name"],
98 )
99 metadata[
100 "tool_tool"
101 ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format(
102 galaxy=GALAXY_INFRASTRUCTURE_URL,
103 encoded_id=metadata["dataset_id"],
104 tool_id=metadata["tool_tool_id"],
105 # tool_version=metadata['tool_tool_version'],
106 )
107 return metadata
108
109
110 class JbrowseConnector(object):
111 def __init__(self, jbrowse, outdir, genomes, standalone=None):
112 self.debug = False
113 self.usejson = True
114 self.giURL = GALAXY_INFRASTRUCTURE_URL
115 self.jbrowse = jbrowse
116 self.outdir = outdir
117 os.makedirs(self.outdir, exist_ok=True)
118 self.genome_paths = genomes
119 self.standalone = standalone
120 self.trackIdlist = []
121 self.tracksToAdd = []
122 self.config_json = {}
123 self.config_json_file = os.path.realpath(os.path.join(outdir, "config.json"))
124 if standalone == "complete":
125 self.clone_jbrowse(self.jbrowse, self.outdir)
126 elif standalone == "minimal":
127 self.clone_jbrowse(self.jbrowse, self.outdir, minimal=True)
128
129 def subprocess_check_call(self, command, output=None):
130 if output:
131 if self.debug:
132 log.debug("cd %s && %s > %s", self.outdir, " ".join(command), output)
133 subprocess.check_call(command, cwd=self.outdir, stdout=output)
134 else:
135 log.debug("cd %s && %s", self.outdir, " ".join(command))
136 subprocess.check_call(command, cwd=self.outdir)
137
138 def subprocess_popen(self, command):
139 if self.debug:
140 log.debug("cd %s && %s", self.outdir, command)
141 p = subprocess.Popen(
142 command,
143 shell=True,
144 stdin=subprocess.PIPE,
145 stdout=subprocess.PIPE,
146 stderr=subprocess.PIPE,
147 )
148 output, err = p.communicate()
149 retcode = p.returncode
150 if retcode != 0:
151 log.error("cd %s && %s", self.outdir, command)
152 log.error(output)
153 log.error(err)
154 raise RuntimeError("Command failed with exit code %s" % (retcode))
155
156 def subprocess_check_output(self, command):
157 if self.debug:
158 log.debug("cd %s && %s", self.outdir, " ".join(command))
159 return subprocess.check_output(command, cwd=self.outdir)
160
161 def _jbrowse_bin(self, command):
162 return os.path.realpath(os.path.join(self.jbrowse, "bin", command))
163
164 def symlink_or_copy(self, src, dest):
165 if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool(
166 os.environ["GALAXY_JBROWSE_SYMLINKS"]
167 ):
168 cmd = ["ln", "-s", src, dest]
169 else:
170 cmd = ["cp", src, dest]
171
172 return self.subprocess_check_call(cmd)
173
174 def process_genomes(self):
175 assemblies = []
176 for i, genome_node in enumerate(self.genome_paths):
177 log.info("genome_node=%s" % str(genome_node))
178 # We only expect one input genome per run. This for loop is just
179 # easier to write than the alternative / catches any possible
180 # issues.
181 genome_name = genome_node["meta"]["dataset_dname"]
182 dsId = genome_node["meta"]["dataset_id"]
183 fapath = genome_node["path"]
184 faname = genome_name + ".fa.gz"
185 faind = os.path.realpath(os.path.join(self.outdir, faname + ".gzi"))
186 if self.standalone == "complete":
187 fadest = os.path.realpath(os.path.join(self.outdir, faname))
188 cmd = "bgzip -i -c %s > %s && samtools faidx %s" % (
189 fapath,
190 fadest,
191 fadest,
192 )
193 self.subprocess_popen(cmd)
194 adapter = {
195 "type": "BgzipFastaAdapter",
196 "fastaLocation": {
197 "uri": faname,
198 },
199 "faiLocation": {
200 "uri": faname + ".fai",
201 },
202 "gziLocation": {
203 "uri": faname + ".gzi",
204 },
205 }
206 else:
207 faurl = "%s/api/datasets/%s/display" % (self.giURL, dsId)
208 fastalocation = {
209 "uri": faurl,
210 }
211 failocation = {
212 "uri": faname + ".fai",
213 }
214 adapter = {
215 "type": "IndexedFastaAdapter",
216 "fastaLocation": fastalocation,
217 "faiLocation": failocation,
218 }
219
220 cmd = ["samtools", "faidx", fapath, "--fai-idx", faind]
221 self.subprocess_check_call(cmd)
222 trackDict = {
223 "name": genome_name,
224 "sequence": {
225 "type": "ReferenceSequenceTrack",
226 "trackId": genome_name,
227 "adapter": adapter,
228 },
229 "rendering": {"type": "DivSequenceRenderer"},
230 }
231 assemblies.append(trackDict)
232 self.genome_name = genome_name
233 if self.usejson:
234 self.config_json["assemblies"] = assemblies
235 else:
236 cmd = [
237 "jbrowse",
238 "add-assembly",
239 faname,
240 "-t",
241 "bgzipFasta",
242 "-n",
243 genome_name,
244 "--load",
245 "inPlace",
246 "--faiLocation",
247 faname + ".fai",
248 "--gziLocation",
249 faname + ".gzi",
250 "--target",
251 self.outdir,
252 ]
253 self.subprocess_check_call(cmd)
254
255 def add_default_view(self):
256 cmd = [
257 "jbrowse",
258 "set-default-session",
259 "-s",
260 self.config_json_file,
261 "-t",
262 ",".join(self.trackIdlist),
263 "-n",
264 "JBrowse2 in Galaxy",
265 "--target",
266 self.config_json_file,
267 "-v",
268 " LinearGenomeView",
269 ]
270 if True or self.debug:
271 log.info("### calling set-default-session with cmd=%s" % " ".join(cmd))
272 self.subprocess_check_call(cmd)
273
274 def write_config(self):
275 with open(self.config_json_file, "w") as fp:
276 json.dump(self.config_json, fp)
277
278 def add_hic(self, data, trackData):
279 """
280 HiC adapter.
281 https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md
282 for testing locally, these work:
283 HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic
284 using hg19 reference track as a
285 'BgzipFastaAdapter'
286 fastaLocation:
287 uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz',
288 faiLocation:
289 uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai',
290 gziLocation:
291 uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi',
292 Cool will not be likely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438
293 """
294 log.info("#### trackData=%s" % trackData)
295 tId = trackData["label"]
296 dsId = trackData["metadata"]["dataset_id"]
297 url = "%s/api/datasets/%s/display?to_ext=hic " % (
298 self.giURL,
299 dsId,
300 )
301 hname = trackData["name"]
302 if self.standalone == "complete":
303 dest = os.path.realpath(os.path.join(self.outdir, hname))
304 url = hname
305 cmd = ["cp", data, dest]
306 self.subprocess_check_call(cmd)
307 floc = {
308 "uri": hname,
309 }
310 else:
311 url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId)
312 floc = {
313 "uri": url,
314 }
315 trackDict = {
316 "type": "HicTrack",
317 "trackId": tId,
318 "name": hname,
319 "assemblyNames": [self.genome_name],
320 "adapter": {
321 "type": "HicAdapter",
322 "hicLocation": floc,
323 },
324 }
325 if self.usejson:
326 self.tracksToAdd.append(trackDict)
327 self.trackIdlist.append(tId)
328 else:
329 cmd = [
330 "jbrowse",
331 "add-track",
332 url,
333 "-t",
334 "HicTrack",
335 "-a",
336 self.genome_name,
337 "-n",
338 hname,
339 "--load",
340 "inPlace",
341 "--target",
342 self.outdir,
343 ]
344 self.subprocess_check_call(cmd)
345
346 def add_maf(self, data, trackData):
347 """
348 from https://github.com/cmdcolin/maf2bed
349 Note: Both formats start with a MAF as input, and note that your MAF file should contain the species name and chromosome name
350 e.g. hg38.chr1 in the sequence identifiers.
351 need the reference id - eg hg18, for maf2bed.pl as the first parameter
352 """
353 mafPlugin = {
354 "plugins": [
355 {
356 "name": "MafViewer",
357 "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js",
358 }
359 ]
360 }
361 tId = trackData["label"]
362 fname = "%s.bed" % tId
363 dest = os.path.realpath("%s/%s" % (self.outdir, fname))
364 # self.symlink_or_copy(data, dest)
365 # Process MAF to bed-like. Need build to munge chromosomes
366 gname = self.genome_name
367 cmd = [
368 "bash",
369 os.path.join(INSTALLED_TO, "convertMAF.sh"),
370 data,
371 gname,
372 INSTALLED_TO,
373 dest,
374 ]
375 self.subprocess_check_call(cmd)
376 if True or self.debug:
377 log.info("### convertMAF.sh called as %s" % " ".join(cmd))
378 # Construct samples list
379 # We could get this from galaxy metadata, not sure how easily.
380 ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE)
381 output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout)
382 ps.wait()
383 outp = output.decode("ascii")
384 soutp = outp.split("\n")
385 samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")]
386 samples = [x.split(".")[0] for x in samp]
387 if self.debug:
388 log.info("### got samples = %s " % (samples))
389 trackDict = {
390 "type": "MafTrack",
391 "trackId": tId,
392 "name": trackData["name"],
393 "adapter": {
394 "type": "MafTabixAdapter",
395 "samples": samples,
396 "bedGzLocation": {
397 "uri": fname + ".sorted.bed.gz",
398 },
399 "index": {
400 "location": {
401 "uri": fname + ".sorted.bed.gz.tbi",
402 },
403 },
404 },
405 "assemblyNames": [self.genome_name],
406 }
407 self.tracksToAdd.append(trackDict)
408 self.trackIdlist.append(tId)
409 if self.config_json.get("plugins", None):
410 self.config_json["plugins"].append(mafPlugin[0])
411 else:
412 self.config_json.update(mafPlugin)
413
414 def _blastxml_to_gff3(self, xml, min_gap=10):
415 gff3_unrebased = tempfile.NamedTemporaryFile(delete=False)
416 cmd = [
417 "python",
418 os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"),
419 "--trim",
420 "--trim_end",
421 "--include_seq",
422 "--min_gap",
423 str(min_gap),
424 xml,
425 ]
426 subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased)
427 gff3_unrebased.close()
428 return gff3_unrebased.name
429
430 def add_blastxml(self, data, trackData, blastOpts, **kwargs):
431 gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"])
432
433 if "parent" in blastOpts and blastOpts["parent"] != "None":
434 gff3_rebased = tempfile.NamedTemporaryFile(delete=False)
435 cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")]
436 if blastOpts.get("protein", "false") == "true":
437 cmd.append("--protein2dna")
438 cmd.extend([os.path.realpath(blastOpts["parent"]), gff3])
439 subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased)
440 gff3_rebased.close()
441
442 # Replace original gff3 file
443 shutil.copy(gff3_rebased.name, gff3)
444 os.unlink(gff3_rebased.name)
445 url = "%s.gff3" % trackData["label"]
446 dest = os.path.realpath("%s/%s" % (self.outdir, url))
447 self._sort_gff(gff3, dest)
448 url = url + ".gz"
449 tId = trackData["label"]
450 trackDict = {
451 "type": "FeatureTrack",
452 "trackId": tId,
453 "name": trackData["name"],
454 "assemblyNames": [self.genome_name],
455 "adapter": {
456 "type": "Gff3TabixAdapter",
457 "gffGzLocation": {
458 "uri": url,
459 },
460 "index": {
461 "location": {
462 "uri": url + ".tbi",
463 }
464 },
465 },
466 "displays": [
467 {
468 "type": "LinearBasicDisplay",
469 "displayId": "%s-LinearBasicDisplay" % tId,
470 },
471 {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
472 ],
473 }
474 if self.usejson:
475 self.tracksToAdd.append(trackDict)
476 self.trackIdlist.append(tId)
477 else:
478 cmd = [
479 "jbrowse",
480 "add-track",
481 url,
482 "-t",
483 "FeatureTrack",
484 "-a",
485 self.genome_name,
486 "--indexFile",
487 url + ".tbi",
488 "-n",
489 trackData["name"],
490 "--load",
491 "inPlace",
492 "--target",
493 self.outdir,
494 ]
495 self.subprocess_check_call(cmd)
496 os.unlink(gff3)
497
498 def add_bigwig(self, data, trackData):
499 url = "%s.bw" % trackData["name"]
500 if self.standalone == "complete":
501 dest = os.path.realpath(os.path.join(self.outdir, url))
502 cmd = ["cp", data, dest]
503 self.subprocess_check_call(cmd)
504 bwloc = {"uri": url}
505 else:
506 dsId = trackData["metadata"]["dataset_id"]
507 url = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId)
508 bwloc = {"uri": url}
509 tId = trackData["label"]
510 trackDict = {
511 "type": "QuantitativeTrack",
512 "trackId": tId,
513 "name": url,
514 "assemblyNames": [
515 self.genome_name,
516 ],
517 "adapter": {
518 "type": "BigWigAdapter",
519 "bigWigLocation": bwloc,
520 },
521 "displays": [
522 {
523 "type": "LinearWiggleDisplay",
524 "displayId": "%s-LinearWiggleDisplay" % tId,
525 }
526 ],
527 }
528 if self.usejson:
529 self.tracksToAdd.append(trackDict)
530 self.trackIdlist.append(tId)
531 else:
532 cmd = [
533 "jbrowse",
534 "add-track",
535 url,
536 "-t",
537 "QuantitativeTrack",
538 "-a",
539 self.genome_name,
540 "-n",
541 trackData["name"],
542 "--load",
543 "inPlace",
544 "--target",
545 self.outdir,
546 ]
547 self.subprocess_check_call(cmd)
548
549 def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs):
550 tId = trackData["label"]
551 fname = "%s.bam" % trackData["label"]
552 dest = os.path.realpath("%s/%s" % (self.outdir, fname))
553 if self.standalone == "complete":
554 url = fname
555 self.subprocess_check_call(["cp", data, dest])
556 log.info("### copied %s to %s" % (data, dest))
557 bloc = {"uri": url}
558 else:
559 dsId = trackData["metadata"]["dataset_id"]
560 url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId)
561 bloc = {"uri": url}
562 if bam_index is not None and os.path.exists(os.path.realpath(bam_index)):
563 # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
564 self.subprocess_check_call(
565 ["cp", os.path.realpath(bam_index), dest + ".bai"]
566 )
567 else:
568 # Can happen in exotic condition
569 # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam
570 # => no index generated by galaxy, but there might be one next to the symlink target
571 # this trick allows to skip the bam sorting made by galaxy if already done outside
572 if os.path.exists(os.path.realpath(data) + ".bai"):
573 self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai")
574 else:
575 log.warn("Could not find a bam index (.bai file) for %s", data)
576 trackDict = {
577 "type": "AlignmentsTrack",
578 "trackId": tId,
579 "name": trackData["name"],
580 "assemblyNames": [self.genome_name],
581 "adapter": {
582 "type": "BamAdapter",
583 "bamLocation": bloc,
584 "index": {
585 "location": {
586 "uri": fname + ".bai",
587 }
588 },
589 },
590 }
591 if self.usejson:
592 self.tracksToAdd.append(trackDict)
593 self.trackIdlist.append(tId)
594 else:
595 cmd = [
596 "jbrowse",
597 "add-track",
598 fname,
599 "-t",
600 "AlignmentsTrack",
601 "-l",
602 "inPlace",
603 "-a",
604 self.genome_name,
605 "--indexFile",
606 fname + ".bai",
607 "-n",
608 trackData["name"],
609 "--target",
610 self.outdir,
611 ]
612 self.subprocess_check_call(cmd)
613
614 def add_vcf(self, data, trackData):
615 tId = trackData["label"]
616 url = "%s/api/datasets/%s/display" % (
617 self.giURL,
618 trackData["metadata"]["dataset_id"],
619 )
620 url = "%s.vcf.gz" % tId
621 dest = os.path.realpath("%s/%s" % (self.outdir, url))
622 cmd = "bgzip -c %s > %s" % (data, dest)
623 self.subprocess_popen(cmd)
624 cmd = ["tabix", "-p", "vcf", dest]
625 self.subprocess_check_call(cmd)
626 trackDict = {
627 "type": "VariantTrack",
628 "trackId": tId,
629 "name": trackData["name"],
630 "assemblyNames": [self.genome_name],
631 "adapter": {
632 "type": "VcfTabixAdapter",
633 "vcfGzLocation": {
634 "uri": url,
635 },
636 "index": {
637 "location": {
638 "uri": url + ".tbi",
639 }
640 },
641 },
642 "displays": [
643 {
644 "type": "LinearVariantDisplay",
645 "displayId": "%s-LinearVariantDisplay" % tId,
646 },
647 {
648 "type": "ChordVariantDisplay",
649 "displayId": "%s-ChordVariantDisplay" % tId,
650 },
651 {
652 "type": "LinearPairedArcDisplay",
653 "displayId": "%s-LinearPairedArcDisplay" % tId,
654 },
655 ],
656 }
657 if self.usejson:
658 self.tracksToAdd.append(trackDict)
659 self.trackIdlist.append(tId)
660 else:
661 cmd = [
662 "jbrowse",
663 "add-track",
664 url,
665 "-t",
666 "VariantTrack",
667 "-a",
668 self.genome_name,
669 "--indexFile",
670 url + ".tbi",
671 "-n",
672 trackData["name"],
673 "--load",
674 "inPlace",
675 "--target",
676 self.outdir,
677 ]
678 self.subprocess_check_call(cmd)
679
680 def _sort_gff(self, data, dest):
681 # Only index if not already done
682 if not os.path.exists(dest + ".gz"):
683 cmd = "jbrowse sort-gff %s | bgzip -c > %s.gz" % (
684 data,
685 dest,
686 ) # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'"
687 self.subprocess_popen(cmd)
688 self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"])
689
690 def _sort_bed(self, data, dest):
691 # Only index if not already done
692 if not os.path.exists(dest):
693 cmd = "sort -k1,1 -k2,2n %s | bgzip -c > %s" % (data, dest)
694 self.subprocess_popen(cmd)
695 cmd = ["tabix", "-f", "-p", "bed", dest]
696 self.subprocess_check_call(cmd)
697
698 def add_gff(self, data, ext, trackData):
699 url = "%s.%s" % (trackData["label"], ext)
700 dest = os.path.realpath("%s/%s" % (self.outdir, url))
701 self._sort_gff(data, dest)
702 url = url + ".gz"
703 tId = trackData["label"]
704 trackDict = {
705 "type": "FeatureTrack",
706 "trackId": tId,
707 "name": trackData["name"],
708 "assemblyNames": [self.genome_name],
709 "adapter": {
710 "type": "Gff3TabixAdapter",
711 "gffGzLocation": {
712 "uri": url,
713 },
714 "index": {
715 "location": {
716 "uri": url + ".tbi",
717 }
718 },
719 },
720 "displays": [
721 {
722 "type": "LinearBasicDisplay",
723 "displayId": "%s-LinearBasicDisplay" % tId,
724 },
725 {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
726 ],
727 }
728 if self.usejson:
729 self.tracksToAdd.append(trackDict)
730 self.trackIdlist.append(tId)
731 else:
732 cmd = [
733 "jbrowse",
734 "add-track",
735 url,
736 "-t",
737 "FeatureTrack",
738 "-a",
739 self.genome_name,
740 "-n",
741 trackData["name"],
742 "--load",
743 "inPlace",
744 "--target",
745 self.outdir,
746 ]
747 self.subprocess_check_call(cmd)
748
749 def add_bed(self, data, ext, trackData):
750 url = "%s.%s" % (trackData["label"], ext)
751 dest = os.path.realpath("%s/%s.gz" % (self.outdir, url))
752 self._sort_bed(data, dest)
753 tId = trackData["label"]
754 url = url + ".gz"
755 trackDict = {
756 "type": "FeatureTrack",
757 "trackId": tId,
758 "name": trackData["name"],
759 "assemblyNames": [self.genome_name],
760 "adapter": {
761 "type": "BedTabixAdapter",
762 "bedGzLocation": {
763 "uri": url,
764 },
765 "index": {
766 "location": {
767 "uri": url + ".tbi",
768 }
769 },
770 },
771 "displays": [
772 {
773 "type": "LinearBasicDisplay",
774 "displayId": "%s-LinearBasicDisplay" % tId,
775 },
776 {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
777 ],
778 }
779 if self.usejson:
780 self.tracksToAdd.append(trackDict)
781 self.trackIdlist.append(tId)
782 else:
783 cmd = [
784 "jbrowse",
785 "add-track",
786 url,
787 "-t",
788 "FeatureTrack",
789 "-a",
790 self.genome_name,
791 "--indexFile",
792 url + ".tbi",
793 "-n",
794 trackData["name"],
795 "--load",
796 "inPlace",
797 "--target",
798 self.outdir,
799 ]
800 self.subprocess_check_call(cmd)
801
802 def process_annotations(self, track):
803 category = track["category"].replace("__pd__date__pd__", TODAY)
804 for i, (
805 dataset_path,
806 dataset_ext,
807 track_human_label,
808 extra_metadata,
809 ) in enumerate(track["trackfiles"]):
810 # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
811 for key, value in mapped_chars.items():
812 track_human_label = track_human_label.replace(value, key)
813 outputTrackConfig = {
814 "category": category,
815 }
816 if self.debug:
817 log.info(
818 "Processing category = %s, track_human_label = %s",
819 category,
820 track_human_label,
821 )
822 # We add extra data to hash for the case of REST + SPARQL.
823 if (
824 "conf" in track
825 and "options" in track["conf"]
826 and "url" in track["conf"]["options"]
827 ):
828 rest_url = track["conf"]["options"]["url"]
829 else:
830 rest_url = ""
831
832 # I chose to use track['category'] instead of 'category' here. This
833 # is intentional. This way re-running the tool on a different date
834 # will not generate different hashes and make comparison of outputs
835 # much simpler.
836 hashData = [
837 str(dataset_path),
838 track_human_label,
839 track["category"],
840 rest_url,
841 ]
842 hashData = "|".join(hashData).encode("utf-8")
843 outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i
844 outputTrackConfig["metadata"] = extra_metadata
845 outputTrackConfig["name"] = track_human_label
846
847 if dataset_ext in ("gff", "gff3"):
848 self.add_gff(
849 dataset_path,
850 dataset_ext,
851 outputTrackConfig,
852 )
853 elif dataset_ext in ("hic",):
854 self.add_hic(
855 dataset_path,
856 outputTrackConfig,
857 )
858 elif dataset_ext in ("bed",):
859 self.add_bed(
860 dataset_path,
861 dataset_ext,
862 outputTrackConfig,
863 )
864 elif dataset_ext in ("maf",):
865 self.add_maf(
866 dataset_path,
867 outputTrackConfig,
868 )
869 elif dataset_ext == "bigwig":
870 self.add_bigwig(
871 dataset_path,
872 outputTrackConfig,
873 )
874 elif dataset_ext == "bam":
875 real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][
876 "bam_index"
877 ]
878 if not isinstance(real_indexes, list):
879 # <bam_indices>
880 # <bam_index>/path/to/a.bam.bai</bam_index>
881 # </bam_indices>
882 #
883 # The above will result in the 'bam_index' key containing a
884 # string. If there are two or more indices, the container
885 # becomes a list. Fun!
886 real_indexes = [real_indexes]
887
888 self.add_bam(
889 dataset_path,
890 outputTrackConfig,
891 track["conf"]["options"]["pileup"],
892 bam_index=real_indexes[i],
893 )
894 elif dataset_ext == "blastxml":
895 self.add_blastxml(
896 dataset_path, outputTrackConfig, track["conf"]["options"]["blast"]
897 )
898 elif dataset_ext == "vcf":
899 self.add_vcf(dataset_path, outputTrackConfig)
900 else:
901 log.warn("Do not know how to handle %s", dataset_ext)
902
903 def clone_jbrowse(self, jbrowse_dir, destination, minimal=False):
904 """Clone a JBrowse directory into a destination directory."""
905 cmd = ["jbrowse", "create", "-f", self.outdir]
906 self.subprocess_check_call(cmd)
907 for fn in [
908 "asset-manifest.json",
909 "favicon.ico",
910 "robots.txt",
911 "umd_plugin.js",
912 "version.txt",
913 "test_data",
914 ]:
915 cmd = ["rm", "-rf", os.path.join(self.outdir, fn)]
916 self.subprocess_check_call(cmd)
917
918
919 if __name__ == "__main__":
920 parser = argparse.ArgumentParser(description="", epilog="")
921 parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration")
922
923 parser.add_argument("--jbrowse", help="Folder containing a jbrowse release")
924 parser.add_argument("--outdir", help="Output directory", default="out")
925 parser.add_argument(
926 "--standalone",
927 choices=["complete", "minimal", "data"],
928 help="Standalone mode includes a copy of JBrowse",
929 )
930 parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0")
931 args = parser.parse_args()
932
933 tree = ET.parse(args.xml.name)
934 root = tree.getroot()
935
936 # This should be done ASAP
937 GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text
938 # Sometimes this comes as `localhost` without a protocol
939 if not GALAXY_INFRASTRUCTURE_URL.startswith("http"):
940 # so we'll prepend `http://` and hope for the best. Requests *should*
941 # be GET and not POST so it should redirect OK
942 GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL
943
944 jc = JbrowseConnector(
945 jbrowse=args.jbrowse,
946 outdir=args.outdir,
947 genomes=[
948 {
949 "path": os.path.realpath(x.attrib["path"]),
950 "meta": metadata_from_node(x.find("metadata")),
951 }
952 for x in root.findall("metadata/genomes/genome")
953 ],
954 standalone=args.standalone,
955 )
956 jc.process_genomes()
957
958 for track in root.findall("tracks/track"):
959 track_conf = {}
960 track_conf["trackfiles"] = []
961
962 is_multi_bigwig = False
963 try:
964 if track.find("options/wiggle/multibigwig") and (
965 track.find("options/wiggle/multibigwig").text == "True"
966 ):
967 is_multi_bigwig = True
968 multi_bigwig_paths = []
969 except KeyError:
970 pass
971
972 trackfiles = track.findall("files/trackFile")
973 if trackfiles:
974 for x in track.findall("files/trackFile"):
975 if is_multi_bigwig:
976 multi_bigwig_paths.append(
977 (x.attrib["label"], os.path.realpath(x.attrib["path"]))
978 )
979 else:
980 if trackfiles:
981 metadata = metadata_from_node(x.find("metadata"))
982 track_conf["dataset_id"] = metadata["dataset_id"]
983 track_conf["trackfiles"].append(
984 (
985 os.path.realpath(x.attrib["path"]),
986 x.attrib["ext"],
987 x.attrib["label"],
988 metadata,
989 )
990 )
991 else:
992 # For tracks without files (rest, sparql)
993 track_conf["trackfiles"].append(
994 (
995 "", # N/A, no path for rest or sparql
996 track.attrib["format"],
997 track.find("options/label").text,
998 {},
999 )
1000 )
1001
1002 if is_multi_bigwig:
1003 metadata = metadata_from_node(x.find("metadata"))
1004
1005 track_conf["trackfiles"].append(
1006 (
1007 multi_bigwig_paths, # Passing an array of paths to represent as one track
1008 "bigwig_multiple",
1009 "MultiBigWig", # Giving an hardcoded name for now
1010 {}, # No metadata for multiple bigwig
1011 )
1012 )
1013
1014 track_conf["category"] = track.attrib["cat"]
1015 track_conf["format"] = track.attrib["format"]
1016 try:
1017 # Only pertains to gff3 + blastxml. TODO?
1018 track_conf["style"] = {t.tag: t.text for t in track.find("options/style")}
1019 except TypeError:
1020 track_conf["style"] = {}
1021 pass
1022 track_conf["conf"] = etree_to_dict(track.find("options"))
1023 jc.process_annotations(track_conf)
1024 print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist)
1025 print(
1026 "###done processing, trackIdlist=",
1027 jc.trackIdlist,
1028 "config=",
1029 str(jc.config_json),
1030 )
1031 jc.config_json["tracks"] = jc.tracksToAdd
1032 if jc.usejson:
1033 jc.write_config()
1034 jc.add_default_view()