jbrowse2/jbrowse2.py @ 0:cd5d63cd0eb5 draft

Uploaded
author fubar
date Wed, 03 Jan 2024 01:36:39 +0000
children 22e3d068fdc9
#!/usr/bin/env python
# change to accumulating all configuration for config.json based on the default from the clone
import argparse
import datetime
import hashlib
import json
import logging
import os
import shutil
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from collections import defaultdict

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("jbrowse")
TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
GALAXY_INFRASTRUCTURE_URL = None
mapped_chars = {
    ">": "__gt__",
    "<": "__lt__",
    "'": "__sq__",
    '"': "__dq__",
    "[": "__ob__",
    "]": "__cb__",
    "{": "__oc__",
    "}": "__cc__",
    "@": "__at__",
    "#": "__pd__",
    "\n": "__cn__",
}
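
# Illustration (not part of the tool): Galaxy sanitizes element identifiers,
# so a label may arrive as "coding __ob__exons__cb__"; the unsanitizing loop
# in process_annotations() below restores it to "coding [exons]".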


def etree_to_dict(t):
    if t is None:
        return {}

    d = {t.tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in map(etree_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
    if t.attrib:
        d[t.tag].update(("@" + k, v) for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            if text:
                d[t.tag]["#text"] = text
        else:
            d[t.tag] = text
    return d

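# A minimal sketch of etree_to_dict() output (assumed example):
#   etree_to_dict(ET.fromstring('<track cat="default"><label>genes</label></track>'))
#   -> {"track": {"label": "genes", "@cat": "default"}}
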
57
58 INSTALLED_TO = os.path.dirname(os.path.realpath(__file__))
59
60
61 def metadata_from_node(node):
62 metadata = {}
63 try:
64 if len(node.findall("dataset")) != 1:
65 # exit early
66 return metadata
67 except Exception:
68 return {}
69
70 for (key, value) in node.findall("dataset")[0].attrib.items():
71 metadata["dataset_%s" % key] = value
72
73 for (key, value) in node.findall("history")[0].attrib.items():
74 metadata["history_%s" % key] = value
75
76 for (key, value) in node.findall("metadata")[0].attrib.items():
77 metadata["metadata_%s" % key] = value
78
79 for (key, value) in node.findall("tool")[0].attrib.items():
80 metadata["tool_%s" % key] = value
81
82 # Additional Mappings applied:
83 metadata[
84 "dataset_edam_format"
85 ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
86 metadata["dataset_edam_format"], metadata["dataset_file_ext"]
87 )
88 metadata["history_user_email"] = '<a href="mailto:{0}">{0}</a>'.format(
89 metadata["history_user_email"]
90 )
91 metadata["hist_name"] = metadata["history_display_name"]
92 metadata[
93 "history_display_name"
94 ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
95 galaxy=GALAXY_INFRASTRUCTURE_URL,
96 encoded_hist_id=metadata["history_id"],
97 hist_name=metadata["history_display_name"],
98 )
99 metadata[
100 "tool_tool"
101 ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format(
102 galaxy=GALAXY_INFRASTRUCTURE_URL,
103 encoded_id=metadata["dataset_id"],
104 tool_id=metadata["tool_tool_id"],
105 # tool_version=metadata['tool_tool_version'],
106 )
107 return metadata
108
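# For reference, metadata_from_node() expects a node shaped roughly like this
# (attribute names illustrative, taken from the keys read above):
#   <metadata>
#     <dataset id="..." dname="..." edam_format="..." file_ext="..." />
#     <history id="..." display_name="..." user_email="..." />
#     <metadata ... />
#     <tool tool_id="..." tool_version="..." />
#   </metadata>
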

class JbrowseConnector(object):
    def __init__(self, jbrowse, outdir, genomes, standalone=None):
        self.debug = False
        self.giURL = GALAXY_INFRASTRUCTURE_URL
        self.jbrowse = jbrowse
        self.outdir = outdir
        os.makedirs(self.outdir, exist_ok=True)
        self.genome_paths = genomes
        self.standalone = standalone
        self.trackIdlist = []
        self.tracksToAdd = []
        self.config_json = {
            "configuration": {
                "rpc": {
                    "defaultDriver": "WebWorkerRpcDriver",
                    "drivers": {"MainThreadRpcDriver": {}, "WebWorkerRpcDriver": {}},
                },
                "logoPath": {"locationType": "UriLocation", "uri": ""},
            }
        }
        self.config_json_file = os.path.join(outdir, "config.json")
        if standalone == "complete":
            self.clone_jbrowse(self.jbrowse, self.outdir)
        elif standalone == "minimal":
            self.clone_jbrowse(self.jbrowse, self.outdir, minimal=True)

    def subprocess_check_call(self, command, output=None):
        if output:
            if self.debug:
                log.debug("cd %s && %s > %s", self.outdir, " ".join(command), output)
            subprocess.check_call(command, cwd=self.outdir, stdout=output)
        else:
            if self.debug:
                log.debug("cd %s && %s", self.outdir, " ".join(command))
            subprocess.check_call(command, cwd=self.outdir)

    def subprocess_popen(self, command):
        if self.debug:
            log.debug("cd %s && %s", self.outdir, command)
        p = subprocess.Popen(
            command,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        output, err = p.communicate()
        retcode = p.returncode
        if retcode != 0:
            log.error("cd %s && %s", self.outdir, command)
            log.error(output)
            log.error(err)
            raise RuntimeError("Command failed with exit code %s" % (retcode))

    def subprocess_check_output(self, command):
        if self.debug:
            log.debug("cd %s && %s", self.outdir, " ".join(command))
        return subprocess.check_output(command, cwd=self.outdir)

    def _jbrowse_bin(self, command):
        return os.path.realpath(os.path.join(self.jbrowse, "bin", command))

    def symlink_or_copy(self, src, dest):
        # Symlink instead of copying when GALAXY_JBROWSE_SYMLINKS is set to a
        # non-empty value.
        if os.environ.get("GALAXY_JBROWSE_SYMLINKS"):
            cmd = ["ln", "-s", src, dest]
        else:
            cmd = ["cp", src, dest]

        return self.subprocess_check_call(cmd)

    def process_genomes(self):
        assemblies = []
        for i, genome_node in enumerate(self.genome_paths):
            log.info("genome_node=%s" % str(genome_node))
            # We only expect one input genome per run. This for loop is just
            # easier to write than the alternative / catches any possible
            # issues.
            genome_name = genome_node["meta"]["dataset_dname"]
            dsId = genome_node["meta"]["dataset_id"]
            faname = genome_name + ".fasta"
            faurl = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId)
            fapath = genome_node["path"]
            faind = os.path.realpath(os.path.join(self.outdir, faname + ".fai"))
            cmd = ["samtools", "faidx", fapath, "--fai-idx", faind]
            self.subprocess_check_call(cmd)
            trackDict = {
                "name": genome_name,
                "sequence": {
                    "type": "ReferenceSequenceTrack",
                    "trackId": genome_name,
                    "adapter": {
                        "type": "IndexedFastaAdapter",
                        "fastaLocation": {"uri": faurl, "locationType": "UriLocation"},
                        "faiLocation": {
                            "uri": faname + ".fai",
                            "locationType": "UriLocation",
                        },
                    },
                },
            }
            assemblies.append(trackDict)
        self.config_json["assemblies"] = assemblies
        # Remember the (single) genome for the track builders below.
        self.genome_name = genome_name
        self.genome_path = faurl
        self.genome_fai_path = faname + ".fai"

    def add_default_view(self):
        cmd = [
            "jbrowse",
            "set-default-session",
            "-s",
            self.config_json_file,
            "-t",
            ",".join(self.trackIdlist),
            "-n",
            "Default",
            "--target",
            self.outdir,
        ]
        self.subprocess_check_call(cmd)

    def write_config(self):
        with open(self.config_json_file, "w") as fp:
            json.dump(self.config_json, fp)

    def add_hic(self, data, trackData):
        """
        HiC adapter.
        https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md

        For testing locally, these work:
        HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic
        using the hg19 reference track as a 'BgzipFastaAdapter':
            fastaLocation:
                uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz'
            faiLocation:
                uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai'
            gziLocation:
                uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi'

        Cool is not likely to be a good fit - see the discussion at
        https://github.com/GMOD/jbrowse-components/issues/2438
        """
        log.info("#### trackData=%s" % trackData)
        tId = trackData["label"]
        url = "%s/api/datasets/%s/display?to_ext=hic" % (
            self.giURL,
            trackData["metadata"]["dataset_id"],
        )
        trackDict = {
            "type": "HicTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "HicAdapter",
                "hicLocation": {"uri": url, "locationType": "UriLocation"},
            },
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_maf(self, data, trackData):
        """
        From https://github.com/cmdcolin/maf2bed
        Note: both formats start with a MAF as input, and your MAF file should
        contain the species name and chromosome name, e.g. hg38.chr1, in the
        sequence identifiers.
        maf2bed.pl needs the reference id - e.g. hg18 - as its first parameter.
        """
        mafPlugin = {
            "plugins": [
                {
                    "name": "MafViewer",
                    "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js",
                }
            ]
        }
        tId = trackData["label"]
        fname = "%s.bed" % tId
        dest = os.path.realpath("%s/%s" % (self.outdir, fname))
        # self.symlink_or_copy(data, dest)
        # Process MAF to bed-like. Need the build name to munge chromosomes.
        gname = self.genome_name
        cmd = [
            "bash",
            os.path.join(INSTALLED_TO, "convertMAF.sh"),
            data,
            gname,
            INSTALLED_TO,
            dest,
        ]
        self.subprocess_check_call(cmd)
        log.info("### convertMAF.sh called as %s" % " ".join(cmd))
        # Construct the samples list.
        # We could get this from galaxy metadata, not sure how easily.
        ps = subprocess.Popen(["grep", "-o", "^s [^ ]*", data], stdout=subprocess.PIPE)
        output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout)
        ps.wait()
        outp = output.decode("ascii")
        soutp = outp.split("\n")
        samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")]
        samples = [x.split(".")[0] for x in samp]
        if self.debug:
            log.info("### got samples = %s " % (samples))
        trackDict = {
            "type": "MafTrack",
            "trackId": tId,
            "name": trackData["name"],
            "adapter": {
                "type": "MafTabixAdapter",
                "samples": samples,
                "bedGzLocation": {"uri": fname + ".sorted.bed.gz"},
                "index": {
                    "location": {"uri": fname + ".sorted.bed.gz.tbi"},
                },
            },
            "assemblyNames": [self.genome_name],
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)
        if self.config_json.get("plugins", None):
            self.config_json["plugins"].append(mafPlugin["plugins"][0])
        else:
            self.config_json.update(mafPlugin)

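    # Illustration (assumed input, not shipped with the tool): given MAF
    # alignment lines such as
    #     s hg38.chr1    ...
    #     s panTro4.chr1 ...
    # the grep | sort -u pipeline above yields samples == ["hg38", "panTro4"].
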
    def _blastxml_to_gff3(self, xml, min_gap=10):
        gff3_unrebased = tempfile.NamedTemporaryFile(delete=False)
        cmd = [
            "python",
            os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"),
            "--trim",
            "--trim_end",
            "--include_seq",
            "--min_gap",
            str(min_gap),
            xml,
        ]
        subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased)
        gff3_unrebased.close()
        return gff3_unrebased.name

    def add_blastxml(self, data, trackData, blastOpts, **kwargs):
        gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"])

        if "parent" in blastOpts and blastOpts["parent"] != "None":
            gff3_rebased = tempfile.NamedTemporaryFile(delete=False)
            cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")]
            if blastOpts.get("protein", "false") == "true":
                cmd.append("--protein2dna")
            cmd.extend([os.path.realpath(blastOpts["parent"]), gff3])
            subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased)
            gff3_rebased.close()

            # Replace the original gff3 file
            shutil.copy(gff3_rebased.name, gff3)
            os.unlink(gff3_rebased.name)
        url = "%s.gff3" % trackData["label"]
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self._sort_gff(gff3, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {"locationType": "UriLocation", "uri": url},
                "index": {
                    "location": {"locationType": "UriLocation", "uri": url + ".tbi"}
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)
        os.unlink(gff3)

    def add_bigwig(self, data, trackData):
        url = "%s/api/datasets/%s/display" % (
            self.giURL,
            trackData["metadata"]["dataset_id"],
        )
        tId = trackData["label"]
        trackDict = {
            "type": "QuantitativeTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [
                self.genome_name,
            ],
            "adapter": {
                "type": "BigWigAdapter",
                "bigWigLocation": {"locationType": "UriLocation", "uri": url},
            },
            "displays": [
                {
                    "type": "LinearWiggleDisplay",
                    "displayId": "%s-LinearWiggleDisplay" % tId,
                }
            ],
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs):
        tId = trackData["label"]
        url = "%s.bam" % trackData["label"]
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self.symlink_or_copy(os.path.realpath(data), dest)
        if bam_index is not None and os.path.exists(os.path.realpath(bam_index)):
            # bai most probably made by galaxy and stored in galaxy dirs;
            # need to copy it to dest
            self.subprocess_check_call(
                ["cp", os.path.realpath(bam_index), dest + ".bai"]
            )
        else:
            # Can happen in exotic conditions,
            # e.g. if a bam is imported as a symlink with datatype=unsorted.bam,
            # then the datatype is changed to bam:
            # => no index is generated by galaxy, but there might be one next
            # to the symlink target. This trick allows to skip the bam sorting
            # made by galaxy if it was already done outside.
            if os.path.exists(os.path.realpath(data) + ".bai"):
                self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai")
            else:
                log.warning("Could not find a bam index (.bai file) for %s", data)
        trackDict = {
            "type": "AlignmentsTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "BamAdapter",
                "bamLocation": {"locationType": "UriLocation", "uri": url},
                "index": {
                    "location": {"locationType": "UriLocation", "uri": url + ".bai"}
                },
                "sequenceAdapter": {
                    "type": "IndexedFastaAdapter",
                    "fastaLocation": {
                        "locationType": "UriLocation",
                        "uri": self.genome_path,
                    },
                    "faiLocation": {
                        "locationType": "UriLocation",
                        "uri": self.genome_fai_path,
                    },
                },
            },
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_vcf(self, data, trackData):
        tId = trackData["label"]
        url = "%s.vcf.gz" % tId
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        cmd = "bgzip -c %s > %s" % (data, dest)
        self.subprocess_popen(cmd)
        cmd = ["tabix", "-p", "vcf", dest]
        self.subprocess_check_call(cmd)
        trackDict = {
            "type": "VariantTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "VcfTabixAdapter",
                "vcfGzLocation": {"uri": url, "locationType": "UriLocation"},
                "index": {
                    "location": {"uri": url + ".tbi", "locationType": "UriLocation"}
                },
            },
            "displays": [
                {
                    "type": "LinearVariantDisplay",
                    "displayId": "%s-LinearVariantDisplay" % tId,
                },
                {
                    "type": "ChordVariantDisplay",
                    "displayId": "%s-ChordVariantDisplay" % tId,
                },
                {
                    "type": "LinearPairedArcDisplay",
                    "displayId": "%s-LinearPairedArcDisplay" % tId,
                },
            ],
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def _sort_gff(self, data, dest):
        # Only index if not already done
        if not os.path.exists(dest + ".gz"):
            cmd = "jbrowse sort-gff %s | bgzip -c > %s.gz" % (
                data,
                dest,
            )  # alternative: "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'"
            self.subprocess_popen(cmd)
            self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"])

    def _sort_bed(self, data, dest):
        # Only index if not already done
        if not os.path.exists(dest):
            cmd = ["sort", "-k1,1", "-k2,2n", data]
            with open(dest, "w") as handle:
                self.subprocess_check_call(cmd, output=handle)

            self.subprocess_check_call(["bgzip", "-f", dest])
            self.subprocess_check_call(["tabix", "-f", "-p", "bed", dest + ".gz"])

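    # For reference, _sort_bed() above is equivalent to this shell pipeline
    # (file names illustrative):
    #     sort -k1,1 -k2,2n in.bed > track.bed
    #     bgzip -f track.bed && tabix -f -p bed track.bed.gz
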
    def add_gff(self, data, ext, trackData):
        url = "%s.%s" % (trackData["label"], ext)
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self._sort_gff(data, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {"locationType": "UriLocation", "uri": url},
                "index": {
                    "location": {"locationType": "UriLocation", "uri": url + ".tbi"}
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_bed(self, data, ext, trackData):
        url = "%s.%s" % (trackData["label"], ext)
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self._sort_bed(data, dest)
        tId = trackData["label"]
        url = url + ".gz"
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                # _sort_bed() leaves a bgzipped, tabix-indexed bed, so use the
                # tabix adapter rather than the plain-text BedAdapter.
                "type": "BedTabixAdapter",
                "bedGzLocation": {"locationType": "UriLocation", "uri": url},
                "index": {
                    "location": {"locationType": "UriLocation", "uri": url + ".tbi"}
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def process_annotations(self, track):
        category = track["category"].replace("__pd__date__pd__", TODAY)
        for i, (
            dataset_path,
            dataset_ext,
            track_human_label,
            extra_metadata,
        ) in enumerate(track["trackfiles"]):
            # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
            for key, value in mapped_chars.items():
                track_human_label = track_human_label.replace(value, key)
            outputTrackConfig = {
                "category": category,
            }
            if self.debug:
                log.info(
                    "Processing category = %s, track_human_label = %s",
                    category,
                    track_human_label,
                )
            # We add extra data to the hash for the case of REST + SPARQL.
            if (
                "conf" in track
                and "options" in track["conf"]
                and "url" in track["conf"]["options"]
            ):
                rest_url = track["conf"]["options"]["url"]
            else:
                rest_url = ""

            # I chose to use track['category'] instead of 'category' here. This
            # is intentional. This way re-running the tool on a different date
            # will not generate different hashes and make comparison of outputs
            # much simpler.
            hashData = [
                str(dataset_path),
                track_human_label,
                track["category"],
                rest_url,
            ]
            hashData = "|".join(hashData).encode("utf-8")
            outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i
            outputTrackConfig["metadata"] = extra_metadata
            outputTrackConfig["name"] = track_human_label

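            # Example (illustrative values): hashData
            # "/data/genes.gff3|Genes|default|" always yields the same md5 hex
            # digest, so the label becomes e.g. "<md5hex>_0" and re-runs
            # produce identical track ids.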
            if dataset_ext in ("gff", "gff3"):
                self.add_gff(
                    dataset_path,
                    dataset_ext,
                    outputTrackConfig,
                )
            elif dataset_ext in ("hic",):
                self.add_hic(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext in ("bed",):
                self.add_bed(
                    dataset_path,
                    dataset_ext,
                    outputTrackConfig,
                )
            elif dataset_ext in ("maf",):
                self.add_maf(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext == "bigwig":
                self.add_bigwig(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext == "bam":
                real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][
                    "bam_index"
                ]
                if not isinstance(real_indexes, list):
                    # <bam_indices>
                    #   <bam_index>/path/to/a.bam.bai</bam_index>
                    # </bam_indices>
                    #
                    # The above will result in the 'bam_index' key containing a
                    # string. If there are two or more indices, the container
                    # becomes a list. Fun!
                    real_indexes = [real_indexes]

                self.add_bam(
                    dataset_path,
                    outputTrackConfig,
                    track["conf"]["options"]["pileup"],
                    bam_index=real_indexes[i],
                )
            elif dataset_ext == "blastxml":
                self.add_blastxml(
                    dataset_path, outputTrackConfig, track["conf"]["options"]["blast"]
                )
            elif dataset_ext == "vcf":
                self.add_vcf(dataset_path, outputTrackConfig)
            else:
                log.warning("Do not know how to handle %s", dataset_ext)

    def clone_jbrowse(self, jbrowse_dir, destination, minimal=False):
        """Build a fresh JBrowse 2 release in the output directory using
        `jbrowse create`, then prune files the Galaxy tool does not need."""
        cmd = ["jbrowse", "create", "-f", self.outdir]
        self.subprocess_check_call(cmd)
        for fn in [
            "asset-manifest.json",
            "favicon.ico",
            "robots.txt",
            "umd_plugin.js",
            "version.txt",
            "test_data",
        ]:
            cmd = ["rm", "-rf", os.path.join(self.outdir, fn)]
            self.subprocess_check_call(cmd)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="", epilog="")
    parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration")

    parser.add_argument("--jbrowse", help="Folder containing a jbrowse release")
    parser.add_argument("--outdir", help="Output directory", default="out")
    parser.add_argument(
        "--standalone",
        choices=["complete", "minimal", "data"],
        help="Standalone mode includes a copy of JBrowse",
    )
    parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0")
    args = parser.parse_args()

    tree = ET.parse(args.xml.name)
    root = tree.getroot()

    # This should be done ASAP
    GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text
    # Sometimes this comes as `localhost` without a protocol, so we'll prepend
    # `http://` and hope for the best. Requests *should* be GET and not POST,
    # so it should redirect OK.
    if not GALAXY_INFRASTRUCTURE_URL.startswith("http"):
        GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL

    jc = JbrowseConnector(
        jbrowse=args.jbrowse,
        outdir=args.outdir,
        genomes=[
            {
                "path": os.path.realpath(x.attrib["path"]),
                "meta": metadata_from_node(x.find("metadata")),
            }
            for x in root.findall("metadata/genomes/genome")
        ],
        standalone=args.standalone,
    )
    jc.process_genomes()

    for track in root.findall("tracks/track"):
        track_conf = {}
        track_conf["trackfiles"] = []

        is_multi_bigwig = False
        # Element truthiness depends on child count, so compare against None
        # explicitly here.
        multibigwig_elem = track.find("options/wiggle/multibigwig")
        if multibigwig_elem is not None and multibigwig_elem.text == "True":
            is_multi_bigwig = True
            multi_bigwig_paths = []

        trackfiles = track.findall("files/trackFile")
        if trackfiles:
            for x in trackfiles:
                if is_multi_bigwig:
                    multi_bigwig_paths.append(
                        (x.attrib["label"], os.path.realpath(x.attrib["path"]))
                    )
                else:
                    metadata = metadata_from_node(x.find("metadata"))
                    track_conf["dataset_id"] = metadata["dataset_id"]
                    track_conf["trackfiles"].append(
                        (
                            os.path.realpath(x.attrib["path"]),
                            x.attrib["ext"],
                            x.attrib["label"],
                            metadata,
                        )
                    )
        else:
            # For tracks without files (rest, sparql)
            track_conf["trackfiles"].append(
                (
                    "",  # N/A, no path for rest or sparql
                    track.attrib["format"],
                    track.find("options/label").text,
                    {},
                )
            )

        if is_multi_bigwig:
            metadata = metadata_from_node(x.find("metadata"))

            track_conf["trackfiles"].append(
                (
                    multi_bigwig_paths,  # Passing an array of paths to represent as one track
                    "bigwig_multiple",
                    "MultiBigWig",  # Giving a hardcoded name for now
                    {},  # No metadata for multiple bigwig
                )
            )

        track_conf["category"] = track.attrib["cat"]
        track_conf["format"] = track.attrib["format"]
        try:
            # Only pertains to gff3 + blastxml. TODO?
            track_conf["style"] = {t.tag: t.text for t in track.find("options/style")}
        except TypeError:
            track_conf["style"] = {}
        track_conf["conf"] = etree_to_dict(track.find("options"))
        jc.process_annotations(track_conf)
        print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist)
    print(
        "### done processing, trackIdlist=",
        jc.trackIdlist,
        "config=",
        str(jc.config_json),
    )
    jc.config_json["tracks"] = jc.tracksToAdd
    jc.write_config()
    jc.add_default_view()