#!/usr/bin/env python
# change to accumulating all configuration for config.json based on the default from the clone
import argparse
import datetime
import hashlib
import json
import logging
import os
import shutil
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from collections import defaultdict

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("jbrowse")
TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
GALAXY_INFRASTRUCTURE_URL = None
mapped_chars = {
    ">": "__gt__",
    "<": "__lt__",
    "'": "__sq__",
    '"': "__dq__",
    "[": "__ob__",
    "]": "__cb__",
    "{": "__oc__",
    "}": "__cc__",
    "@": "__at__",
    "#": "__pd__",
    "": "__cn__",
}


def etree_to_dict(t):
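    """Recursively convert an ElementTree element into nested dicts, storing
    attributes under '@'-prefixed keys and element text under '#text'."""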
    if t is None:
        return {}

    d = {t.tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in map(etree_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
    if t.attrib:
        d[t.tag].update(("@" + k, v) for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            if text:
                d[t.tag]["#text"] = text
        else:
            d[t.tag] = text
    return d


INSTALLED_TO = os.path.dirname(os.path.realpath(__file__))


def metadata_from_node(node):
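    """Flatten the dataset/history/metadata/tool attributes of a track <metadata>
    node into a single dict, adding HTML links for the EDAM format, the owning
    history and the generating tool."""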
    metadata = {}
    try:
        if len(node.findall("dataset")) != 1:
            # exit early
            return metadata
    except Exception:
        return {}

    for (key, value) in node.findall("dataset")[0].attrib.items():
        metadata["dataset_%s" % key] = value

    for (key, value) in node.findall("history")[0].attrib.items():
        metadata["history_%s" % key] = value

    for (key, value) in node.findall("metadata")[0].attrib.items():
        metadata["metadata_%s" % key] = value

    for (key, value) in node.findall("tool")[0].attrib.items():
        metadata["tool_%s" % key] = value

    # Additional Mappings applied:
    metadata[
        "dataset_edam_format"
    ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
        metadata["dataset_edam_format"], metadata["dataset_file_ext"]
    )
    metadata["history_user_email"] = '<a href="mailto:{0}">{0}</a>'.format(
        metadata["history_user_email"]
    )
    metadata["hist_name"] = metadata["history_display_name"]
    metadata[
        "history_display_name"
    ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_hist_id=metadata["history_id"],
        hist_name=metadata["history_display_name"],
    )
    metadata[
        "tool_tool"
    ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_id=metadata["dataset_id"],
        tool_id=metadata["tool_tool_id"],
        # tool_version=metadata['tool_tool_version'],
    )
    return metadata


class JbrowseConnector(object):
    def __init__(self, jbrowse, outdir, genomes, standalone=None):
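        """Prepare the output directory, record the genomes and track
        accumulators, and clone a JBrowse2 instance into outdir."""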
        self.debug = False
        self.usejson = True
        self.giURL = GALAXY_INFRASTRUCTURE_URL
        self.jbrowse = jbrowse
        self.outdir = outdir
        os.makedirs(self.outdir, exist_ok=True)
        self.genome_paths = genomes
        self.standalone = standalone
        self.trackIdlist = []
        self.tracksToAdd = []
        self.config_json = {}
        self.config_json_file = os.path.realpath(os.path.join(outdir, "config.json"))
        self.clone_jbrowse(self.jbrowse, self.outdir)

    def subprocess_check_call(self, command, output=None):
        if output:
            if self.debug:
                log.debug("cd %s && %s > %s", self.outdir, " ".join(command), output)
            subprocess.check_call(command, cwd=self.outdir, stdout=output)
        else:
            log.debug("cd %s && %s", self.outdir, " ".join(command))
            subprocess.check_call(command, cwd=self.outdir)

    def subprocess_popen(self, command):
        if self.debug:
            log.debug("cd %s && %s", self.outdir, command)
        p = subprocess.Popen(
            command,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        output, err = p.communicate()
        retcode = p.returncode
        if retcode != 0:
            log.error("cd %s && %s", self.outdir, command)
            log.error(output)
            log.error(err)
            raise RuntimeError("Command failed with exit code %s" % (retcode))

    def subprocess_check_output(self, command):
        if self.debug:
            log.debug("cd %s && %s", self.outdir, " ".join(command))
        return subprocess.check_output(command, cwd=self.outdir)

    def _jbrowse_bin(self, command):
        return os.path.realpath(os.path.join(self.jbrowse, "bin", command))

    def symlink_or_copy(self, src, dest):
        if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool(
            os.environ["GALAXY_JBROWSE_SYMLINKS"]
        ):
            cmd = ["ln", "-s", src, dest]
        else:
            cmd = ["cp", src, dest]

        return self.subprocess_check_call(cmd)

    def process_genomes(self):
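        """Register one JBrowse2 assembly per input genome: in 'complete'
        standalone mode the FASTA is bgzipped and faidx-indexed into the output
        directory, otherwise the Galaxy dataset URL is used with a locally built
        .fai index. Assemblies are accumulated in config_json or added with
        `jbrowse add-assembly`."""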
        assemblies = []
        for i, genome_node in enumerate(self.genome_paths):
            if self.debug:
                log.info("genome_node=%s" % str(genome_node))
            genome_name = genome_node["meta"]["dataset_dname"]
            dsId = genome_node["meta"]["dataset_id"]
            fapath = genome_node["path"]
            if self.standalone == "complete":
                faname = genome_name + ".fa.gz"
                fadest = os.path.realpath(os.path.join(self.outdir, faname))
                cmd = "bgzip -i -c %s > %s && samtools faidx %s" % (
                    fapath,
                    fadest,
                    fadest,
                )
                self.subprocess_popen(cmd)
                adapter = {
                    "type": "BgzipFastaAdapter",
                    "fastaLocation": {
                        "uri": faname,
                    },
                    "faiLocation": {
                        "uri": faname + ".fai",
                    },
                    "gziLocation": {
                        "uri": faname + ".gzi",
                    },
                }
            else:
                faurl = "%s/api/datasets/%s/display" % (self.giURL, dsId)
                faname = genome_name + ".fa.fai"
                fastalocation = {
                    "uri": faurl,
                }
                failocation = {
                    "uri": faname,
                }
                adapter = {
                    "type": "IndexedFastaAdapter",
                    "fastaLocation": fastalocation,
                    "faiLocation": failocation,
                }

                cmd = ["samtools", "faidx", fapath, "--fai-idx", faname]
                self.subprocess_check_call(cmd)
            trackDict = {
                "name": genome_name,
                "sequence": {
                    "type": "ReferenceSequenceTrack",
                    "trackId": genome_name,
                    "adapter": adapter,
                },
                "rendering": {"type": "DivSequenceRenderer"},
            }
            assemblies.append(trackDict)
            self.genome_name = genome_name
            if self.usejson:
                self.config_json["assemblies"] = assemblies
            else:
                if self.standalone == "complete":
                    cmd = [
                        "jbrowse",
                        "add-assembly",
                        faname,
                        "-t",
                        "bgzipFasta",
                        "-n",
                        genome_name,
                        "--load",
                        "inPlace",
                        "--faiLocation",
                        faname + ".fai",
                        "--gziLocation",
                        faname + ".gzi",
                        "--target",
                        self.outdir,
                    ]
                else:
                    cmd = [
                        "jbrowse",
                        "add-assembly",
                        faname,
                        "-t",
                        "indexedFasta",
                        "-n",
                        genome_name,
                        "--load",
                        "inPlace",
                        "--faiLocation",
                        faname + ".fai",
                        "--target",
                        self.outdir,
                    ]
                self.subprocess_check_call(cmd)

    def add_default_view(self):
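        """Run `jbrowse set-default-session` so every generated track is shown
        in a default LinearGenomeView."""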
        cmd = [
            "jbrowse",
            "set-default-session",
            "-s",
            self.config_json_file,
            "-t",
            ",".join(self.trackIdlist),
            "-n",
            "JBrowse2 in Galaxy",
            "--target",
            self.config_json_file,
            "-v",
            "LinearGenomeView",
        ]
        if True or self.debug:
            log.info("### calling set-default-session with cmd=%s" % " ".join(cmd))
        self.subprocess_check_call(cmd)

    def write_config(self):
        with open(self.config_json_file, "w") as fp:
            json.dump(self.config_json, fp)

    def add_hic(self, data, trackData):
        """
        HiC adapter.
        https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md
        for testing locally, these work:
        HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic
        using hg19 reference track as a
        'BgzipFastaAdapter'
            fastaLocation:
                uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz',
            faiLocation:
                uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai',
            gziLocation:
                uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi',
        Cool will not be likely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438
        """
        log.info("#### trackData=%s" % trackData)
        tId = trackData["label"]
        dsId = trackData["metadata"]["dataset_id"]
        url = "%s/api/datasets/%s/display?to_ext=hic" % (
            self.giURL,
            dsId,
        )
        hname = trackData["name"]
        if self.standalone == "complete":
            dest = os.path.realpath(os.path.join(self.outdir, hname))
            url = hname
            cmd = ["cp", data, dest]
            self.subprocess_check_call(cmd)
            floc = {
                "uri": hname,
            }
        else:
            url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId)
            floc = {
                "uri": url,
            }
        trackDict = {
            "type": "HicTrack",
            "trackId": tId,
            "name": hname,
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "HicAdapter",
                "hicLocation": floc,
            },
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "HicTrack",
                "-a",
                self.genome_name,
                "-n",
                hname,
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def add_maf(self, data, trackData):
        """
        from https://github.com/cmdcolin/maf2bed
        Note: both formats start from a MAF as input, and the MAF file should
        contain the species name and chromosome name (e.g. hg38.chr1) in the
        sequence identifiers.
        The reference id (e.g. hg18) is needed as the first parameter for maf2bed.pl.
        """
        mafPlugin = {
            "plugins": [
                {
                    "name": "MafViewer",
                    "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js",
                }
            ]
        }
        tId = trackData["label"]
        fname = "%s.bed" % tId
        dest = os.path.realpath("%s/%s" % (self.outdir, fname))
        # self.symlink_or_copy(data, dest)
        # Process MAF to bed-like. Need build to munge chromosomes
        gname = self.genome_name
        cmd = [
            "bash",
            os.path.join(INSTALLED_TO, "convertMAF.sh"),
            data,
            gname,
            INSTALLED_TO,
            dest,
        ]
        self.subprocess_check_call(cmd)
        if True or self.debug:
            log.info("### convertMAF.sh called as %s" % " ".join(cmd))
        # Construct samples list
        # We could get this from galaxy metadata, not sure how easily.
        ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE)
        output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout)
        ps.wait()
        outp = output.decode("ascii")
        soutp = outp.split("\n")
        samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")]
        samples = [x.split(".")[0] for x in samp]
        if self.debug:
            log.info("### got samples = %s " % (samples))
        trackDict = {
            "type": "MafTrack",
            "trackId": tId,
            "name": trackData["name"],
            "adapter": {
                "type": "MafTabixAdapter",
                "samples": samples,
                "bedGzLocation": {
                    "uri": fname + ".sorted.bed.gz",
                },
                "index": {
                    "location": {
                        "uri": fname + ".sorted.bed.gz.tbi",
                    },
                },
            },
            "assemblyNames": [self.genome_name],
        }
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)
        if self.config_json.get("plugins", None):
            self.config_json["plugins"].append(mafPlugin["plugins"][0])
        else:
            self.config_json.update(mafPlugin)

    def _blastxml_to_gff3(self, xml, min_gap=10):
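        """Convert BlastXML to gapped GFF3 with the bundled
        blastxml_to_gapped_gff3.py helper and return the path of the temporary
        output file."""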
        gff3_unrebased = tempfile.NamedTemporaryFile(delete=False)
        cmd = [
            "python",
            os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"),
            "--trim",
            "--trim_end",
            "--include_seq",
            "--min_gap",
            str(min_gap),
            xml,
        ]
        subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased)
        gff3_unrebased.close()
        return gff3_unrebased.name

    def add_blastxml(self, data, trackData, blastOpts, **kwargs):
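        """Add a BlastXML result as a Gff3Tabix FeatureTrack, optionally rebasing
        the coordinates against a parent GFF3 with gff3_rebase.py before sorting
        and indexing."""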
        gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"])

        if "parent" in blastOpts and blastOpts["parent"] != "None":
            gff3_rebased = tempfile.NamedTemporaryFile(delete=False)
            cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")]
            if blastOpts.get("protein", "false") == "true":
                cmd.append("--protein2dna")
            cmd.extend([os.path.realpath(blastOpts["parent"]), gff3])
            subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased)
            gff3_rebased.close()

            # Replace original gff3 file
            shutil.copy(gff3_rebased.name, gff3)
            os.unlink(gff3_rebased.name)
        url = "%s.gff3" % trackData["label"]
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self._sort_gff(gff3, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "FeatureTrack",
                "-a",
                self.genome_name,
                "--indexFile",
                url + ".tbi",
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)
        os.unlink(gff3)

    def add_bigwig(self, data, trackData):
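        """Add a bigWig dataset as a QuantitativeTrack backed by a BigWigAdapter,
        copying the file locally in 'complete' standalone mode or pointing at the
        Galaxy dataset URL otherwise."""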
        url = "%s.bw" % trackData["name"]
        if self.standalone == "complete":
            dest = os.path.realpath(os.path.join(self.outdir, url))
            cmd = ["cp", data, dest]
            self.subprocess_check_call(cmd)
            bwloc = {"uri": url}
        else:
            dsId = trackData["metadata"]["dataset_id"]
            url = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId)
            bwloc = {"uri": url}
        tId = trackData["label"]
        trackDict = {
            "type": "QuantitativeTrack",
            "trackId": tId,
            "name": url,
            "assemblyNames": [
                self.genome_name,
            ],
            "adapter": {
                "type": "BigWigAdapter",
                "bigWigLocation": bwloc,
            },
            "displays": [
                {
                    "type": "LinearWiggleDisplay",
                    "displayId": "%s-LinearWiggleDisplay" % tId,
                }
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "QuantitativeTrack",
                "-a",
                self.genome_name,
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs):
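        """Add a BAM dataset as an AlignmentsTrack backed by a BamAdapter,
        copying the supplied .bai index (or one found next to the data) alongside
        the BAM."""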
        tId = trackData["label"]
        fname = "%s.bam" % trackData["label"]
        dest = os.path.realpath("%s/%s" % (self.outdir, fname))
        if self.standalone == "complete":
            url = fname
            self.subprocess_check_call(["cp", data, dest])
            log.info("### copied %s to %s" % (data, dest))
            bloc = {"uri": url}
        else:
            dsId = trackData["metadata"]["dataset_id"]
            url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId)
            bloc = {"uri": url}
        if bam_index is not None and os.path.exists(os.path.realpath(bam_index)):
            # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
            self.subprocess_check_call(
                ["cp", os.path.realpath(bam_index), dest + ".bai"]
            )
        else:
            # Can happen in exotic condition
            # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam
            #      => no index generated by galaxy, but there might be one next to the symlink target
            #      this trick allows to skip the bam sorting made by galaxy if already done outside
            if os.path.exists(os.path.realpath(data) + ".bai"):
                self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai")
            else:
                log.warning("Could not find a bam index (.bai file) for %s", data)
        trackDict = {
            "type": "AlignmentsTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "BamAdapter",
                "bamLocation": bloc,
                "index": {
                    "location": {
                        "uri": fname + ".bai",
                    }
                },
            },
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                fname,
                "-t",
                "AlignmentsTrack",
                "-l",
                "inPlace",
                "-a",
                self.genome_name,
                "--indexFile",
                fname + ".bai",
                "-n",
                trackData["name"],
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def add_vcf(self, data, trackData):
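        """Add a VCF dataset as a VariantTrack: bgzip-compress it, index it with
        tabix and register a VcfTabixAdapter."""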
        tId = trackData["label"]
        url = "%s/api/datasets/%s/display" % (
            self.giURL,
            trackData["metadata"]["dataset_id"],
        )
        url = "%s.vcf.gz" % tId
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        cmd = "bgzip -c %s > %s" % (data, dest)
        self.subprocess_popen(cmd)
        cmd = ["tabix", "-p", "vcf", dest]
        self.subprocess_check_call(cmd)
        trackDict = {
            "type": "VariantTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "VcfTabixAdapter",
                "vcfGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearVariantDisplay",
                    "displayId": "%s-LinearVariantDisplay" % tId,
                },
                {
                    "type": "ChordVariantDisplay",
                    "displayId": "%s-ChordVariantDisplay" % tId,
                },
                {
                    "type": "LinearPairedArcDisplay",
                    "displayId": "%s-LinearPairedArcDisplay" % tId,
                },
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "VariantTrack",
                "-a",
                self.genome_name,
                "--indexFile",
                url + ".tbi",
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def _sort_gff(self, data, dest):
        # Only index if not already done
        if not os.path.exists(dest + ".gz"):
            cmd = "jbrowse sort-gff %s | bgzip -c > %s.gz" % (
                data,
                dest,
            )  # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'"
            self.subprocess_popen(cmd)
            self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"])

    def _sort_bed(self, data, dest):
        # Only index if not already done
        if not os.path.exists(dest):
            cmd = "sort -k1,1 -k2,2n %s | bgzip -c > %s" % (data, dest)
            self.subprocess_popen(cmd)
            cmd = ["tabix", "-f", "-p", "bed", dest]
            self.subprocess_check_call(cmd)

    def add_gff(self, data, ext, trackData):
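        """Add a GFF/GFF3 dataset as a Gff3Tabix FeatureTrack after sorting,
        bgzipping and tabix-indexing it."""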
        url = "%s.%s" % (trackData["label"], ext)
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self._sort_gff(data, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "FeatureTrack",
                "-a",
                self.genome_name,
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def add_bed(self, data, ext, trackData):
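        """Add a BED dataset as a BedTabix FeatureTrack after sorting, bgzipping
        and tabix-indexing it."""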
        url = "%s.%s" % (trackData["label"], ext)
        dest = os.path.realpath("%s/%s.gz" % (self.outdir, url))
        self._sort_bed(data, dest)
        tId = trackData["label"]
        url = url + ".gz"
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "BedTabixAdapter",
                "bedGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        if self.usejson:
            self.tracksToAdd.append(trackDict)
            self.trackIdlist.append(tId)
        else:
            cmd = [
                "jbrowse",
                "add-track",
                url,
                "-t",
                "FeatureTrack",
                "-a",
                self.genome_name,
                "--indexFile",
                url + ".tbi",
                "-n",
                trackData["name"],
                "--load",
                "inPlace",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)

    def process_annotations(self, track):
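        """Dispatch every file in a track configuration to the matching add_*
        method, deriving a stable track label from a hash of path, label,
        category and REST URL."""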
        category = track["category"].replace("__pd__date__pd__", TODAY)
        for i, (
            dataset_path,
            dataset_ext,
            track_human_label,
            extra_metadata,
        ) in enumerate(track["trackfiles"]):
            # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
            for key, value in mapped_chars.items():
                track_human_label = track_human_label.replace(value, key)
            outputTrackConfig = {
                "category": category,
            }
            if self.debug:
                log.info(
                    "Processing category = %s, track_human_label = %s",
                    category,
                    track_human_label,
                )
            # We add extra data to hash for the case of REST + SPARQL.
            if (
                "conf" in track
                and "options" in track["conf"]
                and "url" in track["conf"]["options"]
            ):
                rest_url = track["conf"]["options"]["url"]
            else:
                rest_url = ""

            # I chose to use track['category'] instead of 'category' here. This
            # is intentional. This way re-running the tool on a different date
            # will not generate different hashes and make comparison of outputs
            # much simpler.
            hashData = [
                str(dataset_path),
                track_human_label,
                track["category"],
                rest_url,
            ]
            hashData = "|".join(hashData).encode("utf-8")
            outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i
            outputTrackConfig["metadata"] = extra_metadata
            outputTrackConfig["name"] = track_human_label

            if dataset_ext in ("gff", "gff3"):
                self.add_gff(
                    dataset_path,
                    dataset_ext,
                    outputTrackConfig,
                )
            elif dataset_ext in ("hic",):
                self.add_hic(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext in ("bed",):
                self.add_bed(
                    dataset_path,
                    dataset_ext,
                    outputTrackConfig,
                )
            elif dataset_ext in ("maf",):
                self.add_maf(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext == "bigwig":
                self.add_bigwig(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext == "bam":
                real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][
                    "bam_index"
                ]
                if not isinstance(real_indexes, list):
                    # <bam_indices>
                    #  <bam_index>/path/to/a.bam.bai</bam_index>
                    # </bam_indices>
                    #
                    # The above will result in the 'bam_index' key containing a
                    # string. If there are two or more indices, the container
                    # becomes a list. Fun!
                    real_indexes = [real_indexes]

                self.add_bam(
                    dataset_path,
                    outputTrackConfig,
                    track["conf"]["options"]["pileup"],
                    bam_index=real_indexes[i],
                )
            elif dataset_ext == "blastxml":
                self.add_blastxml(
                    dataset_path, outputTrackConfig, track["conf"]["options"]["blast"]
                )
            elif dataset_ext == "vcf":
                self.add_vcf(dataset_path, outputTrackConfig)
            else:
                log.warning("Do not know how to handle %s", dataset_ext)

    def clone_jbrowse(self, jbrowse_dir, destination):
        """Create a fresh JBrowse2 application in the output directory and remove
        bundled files that are not needed to serve the instance."""
        cmd = ["jbrowse", "create", "-f", self.outdir]
        self.subprocess_check_call(cmd)
        for fn in [
            "asset-manifest.json",
            "favicon.ico",
            "robots.txt",
            "umd_plugin.js",
            "version.txt",
            "test_data",
        ]:
            cmd = ["rm", "-rf", os.path.join(self.outdir, fn)]
            self.subprocess_check_call(cmd)


|
|
932 parser = argparse.ArgumentParser(description="", epilog="")
|
|
933 parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration")
|
|
934
|
|
935 parser.add_argument("--jbrowse", help="Folder containing a jbrowse release")
|
|
936 parser.add_argument("--outdir", help="Output directory", default="out")
|
|
937 parser.add_argument(
|
|
938 "--standalone",
|
|
939 choices=["complete", "minimal", "data"],
|
|
940 help="Standalone mode includes a copy of JBrowse",
|
|
941 )
|
|
942 parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0")
|
|
943 args = parser.parse_args()
|
|
944
|
|
945 tree = ET.parse(args.xml.name)
|
|
946 root = tree.getroot()
|
|
947
|
|
948 # This should be done ASAP
|
|
949 GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text
|
|
950 # Sometimes this comes as `localhost` without a protocol
|
|
951 if not GALAXY_INFRASTRUCTURE_URL.startswith("http"):
|
|
952 # so we'll prepend `http://` and hope for the best. Requests *should*
|
|
953 # be GET and not POST so it should redirect OK
|
|
954 GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL
|
|
955
|
|
956 jc = JbrowseConnector(
|
|
957 jbrowse=args.jbrowse,
|
|
958 outdir=args.outdir,
|
|
959 genomes=[
|
|
960 {
|
|
961 "path": os.path.realpath(x.attrib["path"]),
|
|
962 "meta": metadata_from_node(x.find("metadata")),
|
|
963 }
|
|
964 for x in root.findall("metadata/genomes/genome")
|
|
965 ],
|
|
966 standalone=args.standalone,
|
|
967 )
|
|
968 jc.process_genomes()
|
|
969
|
|
970 for track in root.findall("tracks/track"):
|
|
971 track_conf = {}
|
|
972 track_conf["trackfiles"] = []
|
|
973
|
|
974 is_multi_bigwig = False
|
|
975 try:
|
|
976 if track.find("options/wiggle/multibigwig") and (
|
|
977 track.find("options/wiggle/multibigwig").text == "True"
|
|
978 ):
|
|
979 is_multi_bigwig = True
|
|
980 multi_bigwig_paths = []
|
|
981 except KeyError:
|
|
982 pass
|
|
983
|
|
984 trackfiles = track.findall("files/trackFile")
|
|
985 if trackfiles:
|
|
986 for x in track.findall("files/trackFile"):
|
|
987 if is_multi_bigwig:
|
|
988 multi_bigwig_paths.append(
|
|
989 (x.attrib["label"], os.path.realpath(x.attrib["path"]))
|
|
990 )
|
|
991 else:
|
|
992 if trackfiles:
|
|
993 metadata = metadata_from_node(x.find("metadata"))
|
|
994 track_conf["dataset_id"] = metadata["dataset_id"]
|
|
995 track_conf["trackfiles"].append(
|
|
996 (
|
|
997 os.path.realpath(x.attrib["path"]),
|
|
998 x.attrib["ext"],
|
|
999 x.attrib["label"],
|
|
1000 metadata,
|
|
1001 )
|
|
1002 )
|
|
1003 else:
|
|
1004 # For tracks without files (rest, sparql)
|
|
1005 track_conf["trackfiles"].append(
|
|
1006 (
|
|
1007 "", # N/A, no path for rest or sparql
|
|
1008 track.attrib["format"],
|
|
1009 track.find("options/label").text,
|
|
1010 {},
|
|
1011 )
|
|
1012 )
|
|
1013
|
|
1014 if is_multi_bigwig:
|
|
1015 metadata = metadata_from_node(x.find("metadata"))
|
|
1016
|
|
1017 track_conf["trackfiles"].append(
|
|
1018 (
|
|
1019 multi_bigwig_paths, # Passing an array of paths to represent as one track
|
|
1020 "bigwig_multiple",
|
|
1021 "MultiBigWig", # Giving an hardcoded name for now
|
|
1022 {}, # No metadata for multiple bigwig
|
|
1023 )
|
|
1024 )
|
|
1025
|
|
1026 track_conf["category"] = track.attrib["cat"]
|
|
1027 track_conf["format"] = track.attrib["format"]
|
|
1028 try:
|
|
1029 # Only pertains to gff3 + blastxml. TODO?
|
|
1030 track_conf["style"] = {t.tag: t.text for t in track.find("options/style")}
|
|
1031 except TypeError:
|
|
1032 track_conf["style"] = {}
|
|
1033 pass
|
|
1034 track_conf["conf"] = etree_to_dict(track.find("options"))
|
|
1035 jc.process_annotations(track_conf)
|
|
1036 print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist)
|
|
1037 print(
|
|
1038 "###done processing, trackIdlist=",
|
|
1039 jc.trackIdlist,
|
|
1040 "config=",
|
|
1041 str(jc.config_json),
|
|
1042 )
|
|
1043 jc.config_json["tracks"] = jc.tracksToAdd
    if jc.usejson:
        jc.write_config()
    jc.add_default_view()