changeset 7:234cf4490901 draft

Uploaded
author fubar
date Fri, 05 Jan 2024 04:31:35 +0000
parents 88b9b105c09b
children 1e6128ccc82b
files jbrowse2/blastxml_to_gapped_gff3.py jbrowse2/jbrowse2.py jbrowse2/jbrowse2.xml jbrowse2/servejb2.py jbrowse2/test-data/bam/merlin-sample.bam.gz
diffstat 5 files changed, 243 insertions(+), 168 deletions(-) [+]
line wrap: on
line diff
--- a/jbrowse2/blastxml_to_gapped_gff3.py	Fri Jan 05 01:58:02 2024 +0000
+++ b/jbrowse2/blastxml_to_gapped_gff3.py	Fri Jan 05 04:31:35 2024 +0000
@@ -32,7 +32,7 @@
 
         recid = record.query
         if " " in recid:
-            recid = recid[0 : recid.index(" ")]
+            recid = recid[0: recid.index(" ")]
 
         rec = SeqRecord(Seq("ACTG"), id=recid)
         for idx_hit, hit in enumerate(record.alignments):
@@ -72,7 +72,7 @@
                     qualifiers["blast_" + prop] = getattr(hsp, prop, None)
 
                 desc = hit.title.split(" >")[0]
-                qualifiers["description"] = desc[desc.index(" ") :]
+                qualifiers["description"] = desc[desc.index(" "):]
 
                 # This required a fair bit of sketching out/match to figure out
                 # the first time.
@@ -161,9 +161,9 @@
     fm = ""
     fs = ""
     for position in re.finditer("-", query):
-        fq += query[prev : position.start()]
-        fm += match[prev : position.start()]
-        fs += subject[prev : position.start()]
+        fq += query[prev: position.start()]
+        fm += match[prev: position.start()]
+        fs += subject[prev: position.start()]
         prev = position.start() + 1
     fq += query[prev:]
     fm += match[prev:]
--- a/jbrowse2/jbrowse2.py	Fri Jan 05 01:58:02 2024 +0000
+++ b/jbrowse2/jbrowse2.py	Fri Jan 05 04:31:35 2024 +0000
@@ -108,7 +108,7 @@
 
 
 class JbrowseConnector(object):
-    def __init__(self, jbrowse, outdir, genomes, standalone=None):
+    def __init__(self, jbrowse, outdir, genomes):
         self.debug = False
         self.usejson = True
         self.giURL = GALAXY_INFRASTRUCTURE_URL
@@ -116,7 +116,6 @@
         self.outdir = outdir
         os.makedirs(self.outdir, exist_ok=True)
         self.genome_paths = genomes
-        self.standalone = standalone
         self.trackIdlist = []
         self.tracksToAdd = []
         self.config_json = {}
@@ -174,46 +173,27 @@
             if self.debug:
                 log.info("genome_node=%s" % str(genome_node))
             genome_name = genome_node["meta"]["dataset_dname"]
-            dsId = genome_node["meta"]["dataset_id"]
             fapath = genome_node["path"]
-            if self.standalone == "complete":
-                faname = genome_name + ".fa.gz"
-                fadest = os.path.realpath(os.path.join(self.outdir, faname))
-                cmd = "bgzip -i -c %s > %s && samtools faidx %s" % (
-                    fapath,
-                    fadest,
-                    fadest,
-                )
-                self.subprocess_popen(cmd)
-                adapter = {
-                    "type": "BgzipFastaAdapter",
-                    "fastaLocation": {
-                        "uri": faname,
-                    },
-                    "faiLocation": {
-                        "uri": faname + ".fai",
-                    },
-                    "gziLocation": {
-                        "uri": faname + ".gzi",
-                    },
-                }
-            else:
-                faurl = "%s/api/datasets/%s/display" % (self.giURL, dsId)
-                faname = genome_name + ".fa.fai"
-                fastalocation = {
-                    "uri": faurl,
-                }
-                failocation = {
+            faname = genome_name + ".fa.gz"
+            fadest = os.path.realpath(os.path.join(self.outdir, faname))
+            cmd = "bgzip -i -c %s > %s && samtools faidx %s" % (
+                fapath,
+                fadest,
+                fadest,
+            )
+            self.subprocess_popen(cmd)
+            adapter = {
+                "type": "BgzipFastaAdapter",
+                "fastaLocation": {
                     "uri": faname,
-                }
-                adapter = {
-                    "type": "IndexedFastaAdapter",
-                    "fastaLocation": fastalocation,
-                    "faiLocation": failocation,
-                }
-
-                cmd = ["samtools", "faidx", fapath, "--fai-idx", faname]
-                self.subprocess_check_call(cmd)
+                },
+                "faiLocation": {
+                    "uri": faname + ".fai",
+                },
+                "gziLocation": {
+                    "uri": faname + ".gzi",
+                },
+            }
             trackDict = {
                 "name": genome_name,
                 "sequence": {
@@ -228,40 +208,23 @@
         if self.usejson:
             self.config_json["assemblies"] = assemblies
         else:
-            if self.standalone == "complete":
-                cmd = [
-                    "jbrowse",
-                    "add-assembly",
-                    faname,
-                    "-t",
-                    "bgzipFasta",
-                    "-n",
-                    genome_name,
-                    "--load",
-                    "inPlace",
-                    "--faiLocation",
-                    faname + ".fai",
-                    "--gziLocation",
-                    faname + ".gzi",
-                    "--target",
-                    self.outdir,
-                ]
-            else:
-                cmd = [
-                    "jbrowse",
-                    "add-assembly",
-                    faname,
-                    "-t",
-                    "indexedFasta",
-                    "-n",
-                    genome_name,
-                    "--load",
-                    "inPlace",
-                    "--faiLocation",
-                    faname + ".fai",
-                    "--target",
-                    self.outdir,
-                ]
+            cmd = [
+                "jbrowse",
+                "add-assembly",
+                faname,
+                "-t",
+                "bgzipFasta",
+                "-n",
+                genome_name,
+                "--load",
+                "inPlace",
+                "--faiLocation",
+                faname + ".fai",
+                "--gziLocation",
+                faname + ".gzi",
+                "--target",
+                self.outdir,
+            ]
             self.subprocess_check_call(cmd)
 
     def add_default_view(self):
@@ -279,7 +242,7 @@
             "-v",
             " LinearGenomeView",
         ]
-        if True or self.debug:
+        if self.debug:
             log.info("### calling set-default-session with cmd=%s" % "  ".join(cmd))
         self.subprocess_check_call(cmd)
 
@@ -311,19 +274,13 @@
             dsId,
         )
         hname = trackData["name"]
-        if self.standalone == "complete":
-            dest = os.path.realpath(os.path.join(self.outdir, hname))
-            url = hname
-            cmd = ["cp", data, dest]
-            self.subprocess_check_call(cmd)
-            floc = {
-                "uri": hname,
-            }
-        else:
-            url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId)
-            floc = {
-                "uri": url,
-            }
+        dest = os.path.realpath(os.path.join(self.outdir, hname))
+        url = hname
+        cmd = ["cp", data, dest]
+        self.subprocess_check_call(cmd)
+        floc = {
+            "uri": hname,
+        }
         trackDict = {
             "type": "HicTrack",
             "trackId": tId,
@@ -509,15 +466,10 @@
 
     def add_bigwig(self, data, trackData):
         url = "%s.bw" % trackData["name"]
-        if self.standalone == "complete":
-            dest = os.path.realpath(os.path.join(self.outdir, url))
-            cmd = ["cp", data, dest]
-            self.subprocess_check_call(cmd)
-            bwloc = {"uri": url}
-        else:
-            dsId = trackData["metadata"]["dataset_id"]
-            url = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId)
-            bwloc = {"uri": url}
+        dest = os.path.realpath(os.path.join(self.outdir, url))
+        cmd = ["cp", data, dest]
+        self.subprocess_check_call(cmd)
+        bwloc = {"uri": url}
         tId = trackData["label"]
         trackDict = {
             "type": "QuantitativeTrack",
@@ -562,15 +514,10 @@
         tId = trackData["label"]
         fname = "%s.bam" % trackData["label"]
         dest = os.path.realpath("%s/%s" % (self.outdir, fname))
-        if self.standalone == "complete":
-            url = fname
-            self.subprocess_check_call(["cp", data, dest])
-            log.info("### copied %s to %s" % (data, dest))
-            bloc = {"uri": url}
-        else:
-            dsId = trackData["metadata"]["dataset_id"]
-            url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId)
-            bloc = {"uri": url}
+        url = fname
+        self.subprocess_check_call(["cp", data, dest])
+        log.info("### copied %s to %s" % (data, dest))
+        bloc = {"uri": url}
         if bam_index is not None and os.path.exists(os.path.realpath(bam_index)):
             # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
             self.subprocess_check_call(
@@ -926,6 +873,8 @@
         ]:
             cmd = ["rm", "-rf", os.path.join(self.outdir, fn)]
             self.subprocess_check_call(cmd)
+        cmd = ['cp', os.path.join(INSTALLED_TO, "servejb2.py"), self.outdir]
+        self.subprocess_check_call(cmd)
 
 
 if __name__ == "__main__":
@@ -934,11 +883,6 @@
 
     parser.add_argument("--jbrowse", help="Folder containing a jbrowse release")
     parser.add_argument("--outdir", help="Output directory", default="out")
-    parser.add_argument(
-        "--standalone",
-        choices=["complete", "minimal", "data"],
-        help="Standalone mode includes a copy of JBrowse",
-    )
     parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0")
     args = parser.parse_args()
 
@@ -963,7 +907,6 @@
             }
             for x in root.findall("metadata/genomes/genome")
         ],
-        standalone=args.standalone,
     )
     jc.process_genomes()
 
--- a/jbrowse2/jbrowse2.xml	Fri Jan 05 01:58:02 2024 +0000
+++ b/jbrowse2/jbrowse2.xml	Fri Jan 05 04:31:35 2024 +0000
@@ -21,16 +21,11 @@
 python '$__tool_directory__/jbrowse2.py'
 
 --jbrowse \${JBROWSE_SOURCE_DIR}
---standalone '$standalone'
 
 --outdir '$output.files_path'
 '$trackxml' &&
 
-#if str($standalone) != "data":
-    cp '$output.files_path/index.html' '$output'
-#else:
-    cp '$dummyIndex' '$output'
-#end if
+cp '$output.files_path/index.html' '$output'
 
 ## Ugly testing hack since I cannot get <extra_files> to test the files I want to test. Hmph.
 #if str($uglyTestingHack) == "enabled":
@@ -212,14 +207,6 @@
             </when>
         </conditional>
 
-        <param name="standalone" label="Include all reference and track data in the JBrowse2 object" type="select"
-              help="Default is efficient but will not work offline. Including reference sequences, tracks and indexes will allow standalone viewing, at the cost of copying and moving all data" >
-            <option value="complete">Complete: Choose ONLY if need to view offline, or if history cannot be published. WARNING: produces bloated downloads storing redundant copies of all data!
-            </option>
-            <option value="minimal" selected="true">Sufficient: Uses URLs for Galaxy data. Requires internet access and a published history to download, share and view remotely.
-            </option>
-        </param>
-
         <repeat name="track_groups" title="Track Group">
             <param label="Track Category"
                 name="category"
@@ -230,12 +217,12 @@
             <repeat name="data_tracks" title="Annotation Track">
                 <conditional name="data_format" label="Track Options">
                     <param type="select" label="Track Type" name="data_format_select">
-                        <option value="blast">Blast XML</option>
-                        <option value="gene_calls">GFF/GFF3/BED Features</option>
-                        <option value="hic">HiC data (convert .cool with hicexplorer)</option>
-                        <option value="pileup">BAM Pileups</option>
-                        <option value="vcf">VCF SNPs</option>
-                        <option value="wiggle">BigWig XY</option>
+                        <option value="blast">Blast XML track - converted to GFF with actual gaps between hits</option>
+                        <option value="gene_calls" selected="true">GFF/GFF3/BED feature tracks</option>
+                        <option value="hic">HiC binary data. Existing cool format must be converted to binary hic - hic_matrix will NOT work.</option>
+                        <option value="pileup">BAM Pileup track</option>
+                        <option value="vcf">VCF SNP annotation track</option>
+                        <option value="wiggle">BigWig XY track</option>
                     </param>
                     <when value="hic">
                         <expand macro="input_conditional" label="HiC Track Data" format="hic" help="Cool files must be converted first with hicexplorer" />
@@ -316,13 +303,12 @@
         <param type="hidden" name="uglyTestingHack" value="" />
     </inputs>
     <outputs>
-        <data format="html" name="output" label="JBrowse2 on $reference_genome.genome.element_identifier - $standalone"/>
+        <data format="html" name="output" label="JBrowse2 on $reference_genome.genome.element_identifier"/>
     </outputs>
     <tests>
         <test>
             <param name="reference_genome|genome_type_select" value="history"/>
             <param name="reference_genome|genome" value="merlin.fa"/>
-                <param name="standalone" value="minimal" />
             <param name="uglyTestingHack" value="enabled" />
             <output name="output">
                 <assert_contents>
@@ -337,7 +323,6 @@
             <test>
             <param name="reference_genome|genome_type_select" value="history"/>
             <param name="reference_genome|genome" value="merlin.fa"/>
-            <param name="standalone" value="minimal" />
             <repeat name="track_groups">
                 <param name="category" value="Default" />
                 <repeat name="data_tracks">
@@ -364,7 +349,6 @@
                  <param name="genome_type_select" value="history"/>
                  <param name="genome" value="merlin.fa"/>
             </conditional>
-            <param name="standalone" value="minimal" />
             <repeat name="track_groups">
                 <param name="category" value="Auto Coloured" />
                 <repeat name="data_tracks">
@@ -388,7 +372,6 @@
         <test>
             <param name="reference_genome|genome_type_select" value="history"/>
             <param name="reference_genome|genome" value="merlin.fa"/>
-            <param name="standalone" value="minimal" />
             <param name="uglyTestingHack" value="enabled" />
             <output name="output">
                 <assert_contents>
@@ -410,6 +393,19 @@
 and detailed track styling is not yet implemented. Send code.
 JBrowse1 development has now ceased in favour of JBrowse2.
 
+Use and local viewing
+=====================
+
+A JBrowse2 history item can be opened by viewing it (the "eye" icon).
+They can also be downloaded as archives ("floppy disk" icon) to share and for local viewing.
+One extra step is required before they can be viewed. A local python web server must be started using a script included in each archive.
+Unpack the archive (tar -xvzf [filename].tgz) and the first level directory will contain a file named "servejb2.py"
+
+Assuming you have python3 installed, running
+
+*python3 servjb2.py*
+
+will serve the unarchived JBrowse2 configuration, so it can be browsed by pointing a web browser to localhost:8080
 
 Overview
 --------
@@ -436,9 +432,7 @@
 The first option you encounter is the **Fasta Sequence(s)**. This option
 now accepts multiple fasta files, allowing you to build JBrowse
 instances that contain data for multiple genomes or chrosomomes
-(generally known as "landmark features" in gff3 terminology.) Up to 30
-will be shown from the dropdown selector within JBrowse, this is a known
-issue.
+(generally known as "landmark features" in gff3 terminology.)
 
 **Track Groups** represent a set of tracks in a single category. These
 can be used to let your users understand relationships between large
@@ -449,25 +443,14 @@
 Annotation Tracks
 -----------------
 
-Within Track Groups, you have one or more **Annotation Tracks**. Each
-Annotation Track is a groups of datasets which have similar styling.
-This allows you to rapidly build up JBrowse instances without having to
-configure tracks individually. A massive improvement over previous
-versions. For example, if you have five different GFF3 files from
-various gene callers that you wish to display, you can take advantage of
-this feature to style all of them similarly.
-
 There are a few different types of tracks supported, each with their own
 set of options:
 
 GFF3/BED
 ~~~~~~~~
 
-These are your standard feature tracks. They usually highlight genes,
-mRNAs and other features of interest along a genomic region. The
-underlying tool and this help documentation focus primarily on GFF3
-data, and have not been tested extensively with other formats. Automatic
-min/max detection will fail under BED datasets.
+These are standard feature tracks. They usually highlight genes,
+mRNAs and other features of interest along a genomic region.
 
 BAM Pileups
 ~~~~~~~~~~~
@@ -475,11 +458,6 @@
 We support BAM files and can automatically generate SNP tracks based on
 that bam data.
 
-.. image:: bam.png
-
-This is *strongly discouraged* for high coverage density datasets.
-Unfortunately there are no other configuration options exposed for bam
-files.
 
 BlastXML
 ~~~~~~~~
@@ -512,12 +490,7 @@
 
 .. image:: bigwig.png
 
-**XYPlot**
 
-BigWig tracks can be displayed as a "density" plot which is a continuous
-line which varies in colour, or as an "XYplot." XYplots are preferable
-for users to visually identify specific features in a bigwig track,
-however density tracks are more visually compact.
 
 VCFs/SNPs
 ~~~~~~~~~
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jbrowse2/servejb2.py	Fri Jan 05 04:31:35 2024 +0000
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+
+# spec: simplest python web server with range support and multithreading that takes root path,
+# port and bind address as command line arguments; by default uses the current dir as webroot,
+# port 8000 and bind address of 0.0.0.0
+# borrowed from https://github.com/danvk/RangeHTTPServer
+# and reborrowed from https://gist.github.com/glowinthedark/b99900abe935e4ab4857314d647a9068
+
+
+import argparse
+import functools
+import os
+import re
+import socketserver
+import webbrowser
+from http.server import SimpleHTTPRequestHandler
+
+
+DEFAULT_PORT = 8080
+
+
+def copy_byte_range(infile, outfile, start=None, stop=None, bufsize=16 * 1024):
+    """Like shutil.copyfileobj, but only copy a range of the streams.
+
+    Both start and stop are inclusive.
+    """
+    if start is not None:
+        infile.seek(start)
+    while 1:
+        to_read = min(bufsize, stop + 1 - infile.tell() if stop else bufsize)
+        buf = infile.read(to_read)
+        if not buf:
+            break
+        outfile.write(buf)
+
+
+BYTE_RANGE_RE = re.compile(r"bytes=(\d+)-(\d+)?$")
+
+
+def parse_byte_range(byte_range):
+    """Returns the two numbers in 'bytes=123-456' or throws ValueError.
+
+    The last number or both numbers may be None.
+    """
+    if byte_range.strip() == "":
+        return None, None
+
+    m = BYTE_RANGE_RE.match(byte_range)
+    if not m:
+        raise ValueError("Invalid byte range %s" % byte_range)
+
+    first, last = [x and int(x) for x in m.groups()]
+    if last and last < first:
+        raise ValueError("Invalid byte range %s" % byte_range)
+    return first, last
+
+
+class RangeRequestHandler(SimpleHTTPRequestHandler):
+    """Adds support for HTTP 'Range' requests to SimpleHTTPRequestHandler
+
+    The approach is to:
+    - Override send_head to look for 'Range' and respond appropriately.
+    - Override copyfile to only transmit a range when requested.
+    """
+
+    def handle(self):
+        try:
+            SimpleHTTPRequestHandler.handle(self)
+        except Exception:
+            # ignored, thrown whenever the client aborts streaming (broken pipe)
+            pass
+
+    def send_head(self):
+        if "Range" not in self.headers:
+            self.range = None
+            return SimpleHTTPRequestHandler.send_head(self)
+        try:
+            self.range = parse_byte_range(self.headers["Range"])
+        except ValueError:
+            self.send_error(400, "Invalid byte range")
+            return None
+        first, last = self.range
+
+        # Mirroring SimpleHTTPServer.py here
+        path = self.translate_path(self.path)
+        f = None
+        ctype = self.guess_type(path)
+        try:
+            f = open(path, "rb")
+        except IOError:
+            self.send_error(404, "File not found")
+            return None
+
+        fs = os.fstat(f.fileno())
+        file_len = fs[6]
+        if first >= file_len:
+            self.send_error(416, "Requested Range Not Satisfiable")
+            return None
+
+        self.send_response(206)
+        self.send_header("Content-type", ctype)
+
+        if last is None or last >= file_len:
+            last = file_len - 1
+        response_length = last - first + 1
+
+        self.send_header("Content-Range", "bytes %s-%s/%s" % (first, last, file_len))
+        self.send_header("Content-Length", str(response_length))
+        self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
+        self.end_headers()
+        return f
+
+    def end_headers(self):
+        self.send_header("Accept-Ranges", "bytes")
+        return SimpleHTTPRequestHandler.end_headers(self)
+
+    def copyfile(self, source, outputfile):
+        if not self.range:
+            return SimpleHTTPRequestHandler.copyfile(self, source, outputfile)
+
+        # SimpleHTTPRequestHandler uses shutil.copyfileobj, which doesn't let
+        # you stop the copying before the end of the file.
+        start, stop = self.range  # set in send_head()
+        copy_byte_range(source, outputfile, start, stop)
+
+
+class ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
+    allow_reuse_address = True
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Simple Python Web Server with Range Support"
+    )
+    parser.add_argument(
+        "--root",
+        default=os.getcwd(),
+        help="Root path to serve files from (default: current working directory)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=DEFAULT_PORT,
+        help=f"Port to listen on (default: {DEFAULT_PORT})",
+    )
+    parser.add_argument(
+        "--bind", default="0.0.0.0", help="IP address to bind to (default: 0.0.0.0)"
+    )
+    args = parser.parse_args()
+
+    handler = functools.partial(RangeRequestHandler, directory=args.root)
+
+    webbrowser.open(f"http://{args.bind}:{args.port}")
+
+    with ThreadedTCPServer((args.bind, args.port), handler) as httpd:
+        print(
+            f"Serving HTTP on {args.bind} port {args.port} (http://{args.bind}:{args.port}/)"
+        )
+        httpd.serve_forever()
Binary file jbrowse2/test-data/bam/merlin-sample.bam.gz has changed