Repository 'translate_bed'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/translate_bed

Changeset 0:038ecf54cbec (2018-01-22)
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/translate_bed commit 383bb485120a193bcc14f88364e51356d6ede219
added:
bedutil.py
digest.py
ensembl_rest.py
macros.xml
test-data/GRCh38.1.2bit
test-data/human_transcripts.bed
test-data/human_transcripts_seq.bed
tool-data/twobit.loc.sample
tool_data_table_conf.xml.sample
translate_bed.py
translate_bed.xml
b
diff -r 000000000000 -r 038ecf54cbec bedutil.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bedutil.py Mon Jan 22 13:59:27 2018 -0500
[
b'@@ -0,0 +1,515 @@\n+#!/usr/bin/env python\n+"""\n+#\n+#------------------------------------------------------------------------------\n+#                         University of Minnesota\n+#         Copyright 2016, Regents of the University of Minnesota\n+#------------------------------------------------------------------------------\n+# Author:\n+#\n+#  James E Johnson\n+#\n+#------------------------------------------------------------------------------\n+"""\n+\n+from __future__ import print_function\n+\n+import sys\n+\n+from Bio.Seq import reverse_complement, translate\n+\n+\n+def bed_from_line(line, ensembl=False, seq_column=None):\n+    fields = line.rstrip(\'\\r\\n\').split(\'\\t\')\n+    if len(fields) < 12:\n+        return None\n+    (chrom, chromStart, chromEnd, name, score, strand,\n+     thickStart, thickEnd, itemRgb,\n+     blockCount, blockSizes, blockStarts) = fields[0:12]\n+    bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd,\n+                         name=name, score=score, strand=strand,\n+                         thickStart=thickStart, thickEnd=thickEnd,\n+                         itemRgb=itemRgb,\n+                         blockCount=blockCount,\n+                         blockSizes=blockSizes.rstrip(\',\'),\n+                         blockStarts=blockStarts.rstrip(\',\'))\n+    if seq_column is not None and -len(fields) <= seq_column < len(fields):\n+        bed_entry.seq = fields[seq_column]\n+    if ensembl and len(fields) >= 20:\n+        bed_entry.second_name = fields[12]\n+        bed_entry.cds_start_status = fields[13]\n+        bed_entry.cds_end_status = fields[14]\n+        bed_entry.exon_frames = fields[15].rstrip(\',\')\n+        bed_entry.biotype = fields[16]\n+        bed_entry.gene_name = fields[17]\n+        bed_entry.second_gene_name = fields[18]\n+        bed_entry.gene_type = fields[19]\n+    return bed_entry\n+\n+\n+def as_int_list(obj):\n+    if obj is None:\n+        return None\n+    if 
isinstance(obj, list):\n+        return [int(x) for x in obj]\n+    elif isinstance(obj, str):\n+        return [int(x) for x in obj.split(\',\')]\n+    else:  # python2 unicode?\n+        return [int(x) for x in str(obj).split(\',\')]\n+\n+\n+class BedEntry(object):\n+    def __init__(self, chrom=None, chromStart=None, chromEnd=None,\n+                 name=None, score=None, strand=None,\n+                 thickStart=None, thickEnd=None, itemRgb=None,\n+                 blockCount=None, blockSizes=None, blockStarts=None):\n+        self.chrom = chrom\n+        self.chromStart = int(chromStart)\n+        self.chromEnd = int(chromEnd)\n+        self.name = name\n+        self.score = int(score) if score is not None else 0\n+        self.strand = \'-\' if str(strand).startswith(\'-\') else \'+\'\n+        self.thickStart = int(thickStart) if thickStart else self.chromStart\n+        self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd\n+        self.itemRgb = str(itemRgb) if itemRgb is not None else r\'100,100,100\'\n+        self.blockCount = int(blockCount)\n+        self.blockSizes = as_int_list(blockSizes)\n+        self.blockStarts = as_int_list(blockStarts)\n+        self.second_name = None\n+        self.cds_start_status = None\n+        self.cds_end_status = None\n+        self.exon_frames = None\n+        self.biotype = None\n+        self.gene_name = None\n+        self.second_gene_name = None\n+        self.gene_type = None\n+        self.seq = None\n+        self.cdna = None\n+        self.pep = None\n+        # T26C\n+        self.aa_change = []\n+        # p.Trp26Cys g.<pos><ref>><alt> # g.1304573A>G\n+        self.variants = []\n+\n+    def __str__(self):\n+        return \'%s\\t%d\\t%d\\t%s\\t%d\\t%s\\t%d\\t%d\\t%s\\t%d\\t%s\\t%s\' % (\n+            self.chrom, self.chromStart, self.chromEnd,\n+            self.name, self.score, self.strand,\n+            self.thickStart, self.thickEnd, str(self.itemRgb), self.blockCount,\n+            
\',\'.join([str(x) for x in self.blockSizes]),\n+            \',\'.join([str(x) for x in self.blockStarts]))\n+\n+    def get_splice_junctions(self):\n+        spl'..b' translation.find(\'*\', junc)\n+                        tstop = stop if stop >= 0 else len(translation)\n+                    offset = (block_sum - i) % 3\n+                    trimmed = translation[tstart:tstop]\n+                    if debug:\n+                        print("frame: %d\\ttstart: %d  tstop: %d  " +\n+                              "offset: %d\\t%s" %\n+                              (i, tstart, tstop, offset, trimmed),\n+                              file=sys.stderr)\n+                    if filtering and tstart > ignore:\n+                        continue\n+                    # get genomic locations for start and end\n+                    if self.strand == \'+\':\n+                        chromStart = self.chromStart + i + (tstart * 3)\n+                        chromEnd = self.chromEnd - offset\\\n+                            - (len(translation) - tstop) * 3\n+                    else:\n+                        chromStart = self.chromStart + offset\\\n+                            + (len(translation) - tstop) * 3\n+                        chromEnd = self.chromEnd - i - (tstart * 3)\n+                    # get the blocks for this translation\n+                    (tblockCount, tblockSizes, tblockStarts) =\\\n+                        self.get_blocks(chromStart, chromEnd)\n+                    translations[i] = (chromStart, chromEnd, trimmed,\n+                                       tblockCount, tblockSizes, tblockStarts)\n+                    if debug:\n+                        print("tblockCount: %d tblockStarts: %s " +\n+                              "tblockSizes: %s" %\n+                              (tblockCount, tblockStarts, tblockSizes),\n+                              file=sys.stderr)\n+        return translations\n+\n+    def get_seq_id(self, seqtype=\'unk:unk\', 
reference=\'\', frame=None):\n+        # Ensembl fasta ID format\n+        # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT\n+        # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1\\\n+        #   gene:ENSG00000158815:transcript:ENST00000328693\\\n+        #    gene_biotype:protein_coding transcript_biotype:protein_coding\n+        frame_name = \'\'\n+        chromStart = self.chromStart\n+        chromEnd = self.chromEnd\n+        strand = 1 if self.strand == \'+\' else -1\n+        if frame is not None:\n+            block_sum = sum(self.blockSizes)\n+            offset = (block_sum - frame) % 3\n+            frame_name = \'_\' + str(frame + 1)\n+            if self.strand == \'+\':\n+                chromStart += frame\n+                chromEnd -= offset\n+            else:\n+                chromStart += offset\n+                chromEnd -= frame\n+        location = "chromosome:%s:%s:%s:%s:%s"\\\n+            % (reference, self.chrom, chromStart, chromEnd, strand)\n+        seq_id = "%s%s %s %s" % (self.name, frame_name, seqtype, location)\n+        return seq_id\n+\n+    def get_line(self, start_offset=0, end_offset=0):\n+        if start_offset or end_offset:\n+            s_offset = start_offset if start_offset else 0\n+            e_offset = end_offset if end_offset else 0\n+            if s_offset > self.chromStart:\n+                s_offset = self.chromStart\n+            chrStart = self.chromStart - s_offset\n+            chrEnd = self.chromEnd + e_offset\n+            blkSizes = self.blockSizes\n+            blkSizes[0] += s_offset\n+            blkSizes[-1] += e_offset\n+            blkStarts = self.blockStarts\n+            for i in range(1, self.blockCount):\n+                blkStarts[i] += s_offset\n+            items = [str(x) for x in [self.chrom, chrStart, chrEnd, self.name,\n+                                      self.score, self.strand, self.thickStart,\n+                                      self.thickEnd, self.itemRgb,\n+  
                                    self.blockCount,\n+                                      \',\'.join([str(x) for x in blkSizes]),\n+                                      \',\'.join([str(x) for x in blkStarts])]]\n+            return \'\\t\'.join(items) + \'\\n\'\n+        return self.line\n'
b
diff -r 000000000000 -r 038ecf54cbec digest.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/digest.py Mon Jan 22 13:59:27 2018 -0500
[
@@ -0,0 +1,162 @@
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import itertools as it
+import re
+from collections import deque
+
+
+def cleave(sequence, rule, missed_cleavages=0, min_length=None):
+    """Cleave a polypeptide sequence and return the unique peptides.
+
+    Parameters
+    ----------
+    sequence : str
+        The sequence of a polypeptide.
+
+        .. note::
+            One-letter uppercase notation is expected; some of the rules in
+            :py:data:`expasy_rules` will not behave as intended otherwise.
+
+    rule : str or compiled regex
+        A regular expression describing the cleavage site. The regex should
+        match only the residue whose C-terminal bond is cleaved; any extra
+        requirements belong in `lookaround assertions
+        <http://www.regular-expressions.info/lookaround.html>`_.
+        :py:data:`expasy_rules` holds rules for popular cleavage agents.
+    missed_cleavages : int, optional
+        Maximum number of allowed missed cleavages. Defaults to 0.
+    min_length : int or None, optional
+        Minimum peptide length; :py:const:`None` (the default) disables the
+        filter.
+
+        .. note::
+            The filter compares plain string length, which equals the
+            peptide length only for one-letter notation and not for full
+            *modX* sequences.
+
+    Returns
+    -------
+    out : set
+        The peptides with duplicates removed.
+
+    Examples
+    --------
+    >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'}
+    True
+    >>> cleave('GKGKYKCK', expasy_rules['trypsin'], 2) == \
+    {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'}
+    True
+
+    """
+    # _cleave() yields every peptide occurrence; the set drops repeats.
+    peptides = _cleave(sequence, rule, missed_cleavages, min_length)
+    return set(peptides)
+
+
+def _cleave(sequence, rule, missed_cleavages=0, min_length=None):
+    """List-returning counterpart of :py:func:`cleave`.
+
+    Takes the same parameters as :py:func:`cleave`. Peptides are appended
+    in the order they are produced and repeated peptides are kept.
+    """
+    window = missed_cleavages + 2
+    # Sliding window over the most recent cut positions; 0 is the start of
+    # the sequence.
+    sites = deque([0], maxlen=window)
+    cut_points = [match.end() for match in re.finditer(rule, sequence)]
+    # A trailing None makes the last slice run to the end of the sequence.
+    cut_points.append(None)
+    found = []
+    filled = 1
+    for cut in cut_points:
+        sites.append(cut)
+        if filled < window:
+            filled += 1
+        # Pair every earlier site still in the window with the newest one.
+        for k in range(filled - 1):
+            piece = sequence[sites[k]:sites[-1]]
+            if not piece:
+                continue
+            if min_length is not None and len(piece) < min_length:
+                continue
+            found.append(piece)
+    return found
+
+
+def num_sites(sequence, rule, **kwargs):
+    """Count the sites where `sequence` can be cleaved using `rule`
+    (e.g. the number of miscleavages in a peptide).
+
+    The count is derived from the peptide list built by :py:func:`_cleave`:
+    it is the number of peptides in that list minus one.
+
+    Parameters
+    ----------
+    sequence : str
+        The sequence of a polypeptide.
+    rule : str or compiled regex
+        A regular expression describing the cleavage site. The regex should
+        match only the residue whose C-terminal bond is cleaved; any extra
+        requirements belong in `lookaround assertions
+        <http://www.regular-expressions.info/lookaround.html>`_.
+    **kwargs
+        Forwarded unchanged to :py:func:`_cleave`, which accepts
+        `missed_cleavages` and `min_length`.
+
+    Returns
+    -------
+    out : int
+        Number of cleavage sites.
+    """
+    pieces = _cleave(sequence, rule, **kwargs)
+    return len(pieces) - 1
+
+
+# Cleavage rules keyed by lowercase enzyme/reagent name. Each value is a
+# regex source string suitable as the `rule` argument of cleave(): the end
+# of every match is used as a cut position, so a pattern consumes only the
+# residue in front of the cut and expresses its context with lookarounds.
+expasy_rules = {
+    'arg-c':         r'R',
+    'asp-n':         r'\w(?=D)',
+    'bnps-skatole': r'W',
+    'caspase 1':     r'(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])',
+    'caspase 2':     r'(?<=DVA)D(?=[^PEDQKR])',
+    'caspase 3':     r'(?<=DMQ)D(?=[^PEDQKR])',
+    'caspase 4':     r'(?<=LEV)D(?=[^PEDQKR])',
+    'caspase 5':     r'(?<=[LW]EH)D',
+    'caspase 6':     r'(?<=VE[HI])D(?=[^PEDQKR])',
+    'caspase 7':     r'(?<=DEV)D(?=[^PEDQKR])',
+    'caspase 8':     r'(?<=[IL]ET)D(?=[^PEDQKR])',
+    'caspase 9':     r'(?<=LEH)D',
+    'caspase 10':    r'(?<=IEA)D',
+    'chymotrypsin high specificity': r'([FY](?=[^P]))|(W(?=[^MP]))',
+    'chymotrypsin low specificity':
+        r'([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))',
+    'clostripain':   r'R',
+    'cnbr':          r'M',
+    'enterokinase':  r'(?<=[DE]{3})K',
+    'factor xa':     r'(?<=[AFGILTVM][DE]G)R',
+    'formic acid':   r'D',
+    'glutamyl endopeptidase': r'E',
+    'granzyme b':    r'(?<=IEP)D',
+    'hydroxylamine': r'N(?=G)',
+    'iodosobenzoic acid': r'W',
+    'lysc':          r'K',
+    'ntcb':          r'\w(?=C)',
+    # The two pepsin entries are each one pattern split over two adjacent
+    # string literals.
+    'pepsin ph1.3':  r'((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|'
+                     r'((?<=[^HKR][^P])[FLWY](?=\w[^P]))',
+    'pepsin ph2.0':  r'((?<=[^HKR][^P])[^R](?=[FL][^P]))|'
+                     r'((?<=[^HKR][^P])[FL](?=\w[^P]))',
+    'proline endopeptidase': r'(?<=[HKR])P(?=[^P])',
+    'proteinase k':  r'[AEFILTVWY]',
+    'staphylococcal peptidase i': r'(?<=[^E])E',
+    'thermolysin':   r'[^DE](?=[AFILMV])',
+    # NOTE(review): 'A' is listed twice in [AFGILTVWA]; harmless for
+    # matching, but possibly a typo - confirm against PeptideCutter.
+    'thrombin':      r'((?<=G)R(?=G))|'
+                     r'((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))',
+    # Besides the general K/R-not-before-P rule, trypsin has two
+    # alternatives that allow a cut before P: K after W and R after M.
+    'trypsin':       r'([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))'
+    }
+"""
+This dict contains regular expressions for cleavage rules of the most
+popular proteolytic enzymes. The rules were taken from the
+`PeptideCutter tool
+<http://ca.expasy.org/tools/peptidecutter/peptidecutter_enzymes.html>`_
+at Expasy.
+"""
b
diff -r 000000000000 -r 038ecf54cbec ensembl_rest.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ensembl_rest.py Mon Jan 22 13:59:27 2018 -0500
[
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+#                         University of Minnesota
+#         Copyright 2017, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#  James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import sys
+
+from time import sleep
+
+import requests
+
+
+# Base URL; ensembl_rest() appends the request path to it.
+server = "https://rest.ensembl.org"
+# Module-level default request path. The request helpers visible in this
+# module build their own local `ext` instead of reading this one.
+ext = "/info/assembly/homo_sapiens?"
+# Step used by get_transcripts_bed() to split a coordinate range into
+# several requests.
+max_region = 4000000
+# When true, ensembl_rest() echoes every request path to stderr.
+debug = False
+
+
+def ensembl_rest(ext, headers):
+    """Send a GET request to the Ensembl REST server and return the response.
+
+    A 429 reply that carries a ``Retry-After`` header is retried once after
+    sleeping for the number of seconds given in that header.
+
+    Parameters
+    ----------
+    ext : str
+        Request path (and query string) appended to the module-level
+        ``server`` URL.
+    headers : dict
+        HTTP headers passed through to ``requests.get``.
+
+    Returns
+    -------
+    requests.Response
+        The response of the request, or of the retry when one was made.
+
+    Raises
+    ------
+    requests.HTTPError
+        If the final response is not OK.
+    ValueError
+        If the ``Retry-After`` header is not a number of seconds.
+    """
+    if debug:
+        print("%s" % ext, file=sys.stderr)
+    url = server + ext
+    r = requests.get(url, headers=headers)
+    if r.status_code == 429:
+        print("response headers: %s\n" % r.headers, file=sys.stderr)
+        if 'Retry-After' in r.headers:
+            # Header values are text; sleep() only accepts a number, so the
+            # unconverted value raised TypeError instead of waiting.
+            sleep(float(r.headers['Retry-After']))
+            r = requests.get(url, headers=headers)
+    if not r.ok:
+        r.raise_for_status()
+    return r
+
+
+def get_species():
+    """Fetch the species list, print it as a table and return it.
+
+    One tab-separated line per species (name, common name, display name,
+    strain, taxon id) is written to stdout.
+
+    Returns
+    -------
+    dict
+        Maps each species name to its full record from the JSON reply.
+    """
+    columns = ('name', 'common_name', 'display_name', 'strain', 'taxon_id')
+    response = ensembl_rest("/info/species",
+                            {"Content-Type": "application/json"})
+    by_name = dict()
+    for record in response.json()['species']:
+        by_name[record['name']] = record
+        row = tuple(record[column] for column in columns)
+        print("%s\t%s\t%s\t%s\t%s" % row, file=sys.stdout)
+    return by_name
+
+
+def get_biotypes(species):
+    """Return the biotype names reported for `species`.
+
+    Entries of the JSON reply that have no 'biotype' key are skipped.
+    """
+    query = "/info/biotypes/%s?" % species
+    response = ensembl_rest(query, {"Content-Type": "application/json"})
+    return [entry['biotype'] for entry in response.json()
+            if 'biotype' in entry]
+
+
+def get_toplevel(species):
+    """Return the top-level sequence lengths of the `species` assembly.
+
+    Returns
+    -------
+    dict
+        Maps each coordinate system name to a dict of
+        sequence name -> length (int).
+    """
+    query = "/info/assembly/%s?" % species
+    response = ensembl_rest(query, {"Content-Type": "application/json"})
+    lengths = dict()
+    for region in response.json()['top_level_region']:
+        # Create the per-coordinate-system dict the first time it is seen.
+        by_name = lengths.setdefault(region['coord_system'], dict())
+        by_name[region['name']] = int(region['length'])
+    return lengths
+
+
+def get_transcripts_bed(species, refseq, start, length, strand='',
+                        params=None):
+    """Fetch the transcripts overlapping a region as BED lines.
+
+    The range from `start` to `length` is requested in consecutive pieces
+    whose boundaries are `max_region` apart, and the replies are
+    concatenated.
+
+    Parameters
+    ----------
+    species : str
+        Species name used in the request path.
+    refseq : str
+        Name of the sequence region.
+    start : int
+        First coordinate to request.
+    length : int
+        Last coordinate to request.
+    strand : str, optional
+        Text appended directly after the coordinates in the request path.
+    params : str or None, optional
+        Extra query-string text appended after ``feature=transcript;``.
+
+    Returns
+    -------
+    list
+        The lines of all non-empty replies, in request order.
+    """
+    extra = params or ''
+    headers = {"Content-Type": "text/x-bed"}
+    boundaries = list(range(start, length, max_region))
+    if not boundaries or boundaries[-1] < length:
+        boundaries.append(length)
+    lines = []
+    region_start = start
+    # boundaries[0] is `start` itself, so the piece ends begin at index 1.
+    for region_end in boundaries[1:]:
+        query = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\
+            % (species, refseq, region_start, region_end, strand, extra)
+        region_start = region_end + 1
+        text = ensembl_rest(query, headers).text
+        if text:
+            lines.extend(text.splitlines())
+    return lines
+
+
+def get_seq(id, seqtype, params=None):
+    """Fetch a sequence by identifier and return the reply text.
+
+    Parameters
+    ----------
+    id : str
+        Identifier placed in the request path.
+    seqtype : str
+        Value of the ``type`` query parameter (e.g. 'cdna', 'cds',
+        'genomic').
+    params : str or None, optional
+        Extra query-string text appended after the type.
+    """
+    query = "/sequence/id/%s?type=%s;%s" % (id, seqtype, params or '')
+    response = ensembl_rest(query, {"Content-Type": "text/plain"})
+    return response.text
+
+
+def get_cdna(id, params=None):
+    """Return the 'cdna' sequence text for `id` (see get_seq)."""
+    return get_seq(id, 'cdna', params)
+
+
+def get_cds(id, params=None):
+    """Return the 'cds' sequence text for `id` (see get_seq)."""
+    return get_seq(id, 'cds', params)
+
+
+def get_genomic(id, params=None):
+    """Return the 'genomic' sequence text for `id` (see get_seq)."""
+    return get_seq(id, 'genomic', params)
+
+
+def get_transcript_haplotypes(species, transcript):
+    """Return the decoded JSON haplotype reply for one transcript.
+
+    The request asks for aligned sequences (``aligned_sequences=1``).
+    """
+    query = "/transcript_haplotypes/%s/%s?aligned_sequences=1"\
+        % (species, transcript)
+    response = ensembl_rest(query, {"Content-Type": "application/json"})
+    return response.json()
b
diff -r 000000000000 -r 038ecf54cbec macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jan 22 13:59:27 2018 -0500
[
b'@@ -0,0 +1,125 @@\n+<macros>\n+    <xml name="bedutil_requirements">\n+        <requirement type="package" version="1.62">biopython</requirement>\n+    </xml>\n+    <xml name="ensembl_requirements">\n+        <requirement type="package" version="0.4.10">requests-cache</requirement>\n+    </xml>\n+    <xml name="twobit_requirements">\n+        <requirement type="package" version="3.1.4">twobitreader</requirement>\n+    </xml>\n+    <xml name="species_options">\n+            <option value="homo_sapiens">homo_sapiens  (Human) taxon_id: 9606</option>\n+            <option value="mus_musculus">mus_musculus  (Mouse) taxon_id: 10090</option>\n+            <option value="ailuropoda_melanoleuca">ailuropoda_melanoleuca  (Panda) taxon_id: 9646</option>\n+            <option value="anas_platyrhynchos">anas_platyrhynchos  (Duck) taxon_id: 8839</option>\n+            <option value="anolis_carolinensis">anolis_carolinensis  (Anole lizard) taxon_id: 28377</option>\n+            <option value="astyanax_mexicanus">astyanax_mexicanus  (Cave fish) taxon_id: 7994</option>\n+            <option value="bos_taurus">bos_taurus  (Cow) taxon_id: 9913</option>\n+            <option value="caenorhabditis_elegans">caenorhabditis_elegans  (Caenorhabditis elegans) taxon_id: 6239</option>\n+            <option value="callithrix_jacchus">callithrix_jacchus  (Marmoset) taxon_id: 9483</option>\n+            <option value="canis_familiaris">canis_familiaris  (Dog) taxon_id: 9615</option>\n+            <option value="carlito_syrichta">carlito_syrichta  (Tarsier) taxon_id: 1868482</option>\n+            <option value="cavia_aperea">cavia_aperea  (Brazilian guinea pig) taxon_id: 37548</option>\n+            <option value="cavia_porcellus">cavia_porcellus  (Guinea Pig) taxon_id: 10141</option>\n+            <option value="chinchilla_lanigera">chinchilla_lanigera  (Long-tailed chinchilla) taxon_id: 34839</option>\n+            <option value="chlorocebus_sabaeus">chlorocebus_sabaeus  (Vervet-AGM) 
taxon_id: 60711</option>\n+            <option value="choloepus_hoffmanni">choloepus_hoffmanni  (Sloth) taxon_id: 9358</option>\n+            <option value="ciona_intestinalis">ciona_intestinalis  (C.intestinalis) taxon_id: 7719</option>\n+            <option value="ciona_savignyi">ciona_savignyi  (C.savignyi) taxon_id: 51511</option>\n+            <option value="cricetulus_griseus_chok1gshd">cricetulus_griseus_chok1gshd  (Chinese hamster CHOK1GS) taxon_id: 10029</option>\n+            <option value="cricetulus_griseus_crigri">cricetulus_griseus_crigri  (Chinese hamster CriGri) taxon_id: 10029</option>\n+            <option value="danio_rerio">danio_rerio  (Zebrafish) taxon_id: 7955</option>\n+            <option value="dasypus_novemcinctus">dasypus_novemcinctus  (Armadillo) taxon_id: 9361</option>\n+            <option value="dipodomys_ordii">dipodomys_ordii  (Kangaroo rat) taxon_id: 10020</option>\n+            <option value="drosophila_melanogaster">drosophila_melanogaster  (Fruitfly) taxon_id: 7227</option>\n+            <option value="echinops_telfairi">echinops_telfairi  (Lesser hedgehog tenrec) taxon_id: 9371</option>\n+            <option value="equus_caballus">equus_caballus  (Horse) taxon_id: 9796</option>\n+            <option value="erinaceus_europaeus">erinaceus_europaeus  (Hedgehog) taxon_id: 9365</option>\n+            <option value="felis_catus">felis_catus  (Cat) taxon_id: 9685</option>\n+            <option value="ficedula_albicollis">ficedula_albicollis  (Flycatcher) taxon_id: 59894</option>\n+            <option value="fukomys_damarensis">fukomys_damarensis  (Damara mole rat) taxon_id: 885580</option>\n+            <option value="gadus_morhua">gadus_morhua  (Cod) taxon_id: 8049</option>\n+            <option value="gallus_gallus">gallus_gallus  (Chicken) taxon_id: 9031</option>\n+            <option value="gasterosteus_aculeatus">gasterosteus_aculeatus  (Stickleback) taxon_id: 69293</option>\n+            <option 
value="gorilla_gorilla">gorilla_gorilla  (Gorilla) taxon_id: 9595</option>\n+            '..b'n value="oreochromis_niloticus">oreochromis_niloticus  (Tilapia) taxon_id: 8128</option>\n+            <option value="ornithorhynchus_anatinus">ornithorhynchus_anatinus  (Platypus) taxon_id: 9258</option>\n+            <option value="oryctolagus_cuniculus">oryctolagus_cuniculus  (Rabbit) taxon_id: 9986</option>\n+            <option value="oryzias_latipes">oryzias_latipes  (Medaka) taxon_id: 8090</option>\n+            <option value="otolemur_garnettii">otolemur_garnettii  (Bushbaby) taxon_id: 30611</option>\n+            <option value="ovis_aries">ovis_aries  (Sheep) taxon_id: 9940</option>\n+            <option value="pan_troglodytes">pan_troglodytes  (Chimpanzee) taxon_id: 9598</option>\n+            <option value="papio_anubis">papio_anubis  (Olive baboon) taxon_id: 9555</option>\n+            <option value="pelodiscus_sinensis">pelodiscus_sinensis  (Chinese softshell turtle) taxon_id: 13735</option>\n+            <option value="peromyscus_maniculatus_bairdii">peromyscus_maniculatus_bairdii  (Northern American deer mouse) taxon_id: 230844</option>\n+            <option value="petromyzon_marinus">petromyzon_marinus  (Lamprey) taxon_id: 7757</option>\n+            <option value="poecilia_formosa">poecilia_formosa  (Amazon molly) taxon_id: 48698</option>\n+            <option value="pongo_abelii">pongo_abelii  (Orangutan) taxon_id: 9601</option>\n+            <option value="procavia_capensis">procavia_capensis  (Hyrax) taxon_id: 9813</option>\n+            <option value="pteropus_vampyrus">pteropus_vampyrus  (Megabat) taxon_id: 132908</option>\n+            <option value="rattus_norvegicus">rattus_norvegicus  (Rat) taxon_id: 10116</option>\n+            <option value="saccharomyces_cerevisiae">saccharomyces_cerevisiae  (Saccharomyces cerevisiae) taxon_id: 4932</option>\n+            <option value="sarcophilus_harrisii">sarcophilus_harrisii  (Tasmanian devil) taxon_id: 
9305</option>\n+            <option value="sorex_araneus">sorex_araneus  (Shrew) taxon_id: 42254</option>\n+            <option value="sus_scrofa">sus_scrofa  (Pig) taxon_id: 9823</option>\n+            <option value="taeniopygia_guttata">taeniopygia_guttata  (Zebra Finch) taxon_id: 59729</option>\n+            <option value="takifugu_rubripes">takifugu_rubripes  (Fugu) taxon_id: 31033</option>\n+            <option value="tetraodon_nigroviridis">tetraodon_nigroviridis  (Tetraodon) taxon_id: 99883</option>\n+            <option value="tupaia_belangeri">tupaia_belangeri  (Tree Shrew) taxon_id: 37347</option>\n+            <option value="tursiops_truncatus">tursiops_truncatus  (Dolphin) taxon_id: 9739</option>\n+            <option value="vicugna_pacos">vicugna_pacos  (Alpaca) taxon_id: 30538</option>\n+            <option value="xenopus_tropicalis">xenopus_tropicalis  (Xenopus) taxon_id: 8364</option>\n+            <option value="xiphophorus_maculatus">xiphophorus_maculatus  (Platyfish) taxon_id: 8083</option>\n+    </xml>\n+    <xml name="biotypes_help">\n+            <help><![CDATA[\n+Example biotypes: \n+protein_coding, non_coding, pseudogene, nonsense_mediated_decay, non_stop_decay, \n+translated_processed_pseudogene, transcribed_processed_pseudogene, transcribed_unitary_pseudogene, transcribed_unprocessed_pseudogene, \n+polymorphic_pseudogene, processed_pseudogene, unprocessed_pseudogene, unitary_pseudogene, processed_transcript, \n+retained_intron, ccds_gene, sense_overlapping, sense_intronic, cdna_update, antisense, \n+LRG_gene, IG_C_gene, IG_D_gene, IG_J_gene, IG_LV_gene IG_V_gene, TR_C_gene, TR_D_gene, TR_J_gene, TR_V_gene, \n+IG_pseudogene, IG_C_pseudogene, IG_D_pseudogene, IG_J_pseudogene, IG_V_pseudogene, TR_J_pseudogene, TR_V_pseudogene, TEC, \n+ribozyme, RNase_P_RNA, guide_RNA, macro_lncRNA, bidirectional_promoter_lncRNA, 3prime_overlapping_ncRNA, antisense_RNA, vaultRNA, Y_RNA, SRP_RNA, RNase_MRP_RNA, IG_C_pseudogene, lncRNA, lincRNA, miRNA, snRNA, 
sRNA, telomerase_RNA, Mt_tRNA, Mt_rRNA, scaRNA, misc_RNA, rRNA, tRNA, scRNA, snoRNA, other\n+            ]]></help>\n+    </xml>\n+</macros>\n'
b
diff -r 000000000000 -r 038ecf54cbec test-data/GRCh38.1.2bit
b
Binary file test-data/GRCh38.1.2bit has changed
b
diff -r 000000000000 -r 038ecf54cbec test-data/human_transcripts.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/human_transcripts.bed Mon Jan 22 13:59:27 2018 -0500
b
@@ -0,0 +1,19 @@
+chr1 14403 29570 ENST00000488147 1000 - 14402 14402 0,0,0 11 98,34,152,159,198,136,137,147,99,154,37 0,601,1392,2203,2454,2829,3202,3511,3864,10334,15130 WASH7P-201 none none -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 unprocessed_pseudogene ENSG00000227232 WASH7P unprocessed_pseudogene
+chr1 29553 31097 ENST00000473358 1000 + 29552 29552 0,0,0 3 486,104,122 0,1010,1422 MIR1302-2HG-202 none none -1,-1,-1 lincRNA ENSG00000243485 MIR1302-2HG lincRNA
+chr1 30266 31109 ENST00000469289 1000 + 30265 30265 0,0,0 2 401,134 0,709 MIR1302-2HG-201 none none -1,-1 lincRNA ENSG00000243485 MIR1302-2HG lincRNA
+chr1 30365 30503 ENST00000607096 1000 + 30364 30364 0,0,0 1 138 0 MIR1302-2-201 none none -1 miRNA ENSG00000284332 MIR1302-2 miRNA
+chr1 34553 36081 ENST00000417324 1000 - 34552 34552 0,0,0 3 621,205,361 0,723,1167 FAM138A-201 none none -1,-1,-1 lincRNA ENSG00000237613 FAM138A lincRNA
+chr1 35244 36073 ENST00000461467 1000 - 35243 35243 0,0,0 2 237,353 0,476 FAM138A-202 none none -1,-1 lincRNA ENSG00000237613 FAM138A lincRNA
+chr1 52472 53312 ENST00000606857 1000 + 52471 52471 0,0,0 1 840 0 AL627309.6-201 none none -1 unprocessed_pseudogene ENSG00000268020 AL627309.6 unprocessed_pseudogene
+chr1 57597 64116 ENST00000642116 1000 + 57596 57596 0,0,0 3 56,157,1201 0,1102,5318 OR4G11P-202 none none -1,-1,-1 processed_transcript ENSG00000240361 OR4G11P transcribed_unprocessed_pseudogene
+chr1 62948 63887 ENST00000492842 1000 + 62947 62947 0,0,0 1 939 0 OR4G11P-201 none none -1 transcribed_unprocessed_pseudogene ENSG00000240361 OR4G11P transcribed_unprocessed_pseudogene
+chr1 65418 71585 ENST00000641515 1000 + 69090 70008 0,0,0 3 15,54,2549 0,101,3618 OR4F5-202 cmpl cmpl -1,-1,0 protein_coding ENSG00000186092 OR4F5 protein_coding
+chr1 69054 70108 ENST00000335137 1000 + 69090 70008 0,0,0 1 1054 0 OR4F5-201 cmpl cmpl 0 protein_coding ENSG00000186092 OR4F5 protein_coding
+chr1 131024 134836 ENST00000442987 1000 + 131023 131023 0,0,0 1 3812 0 CICP27-201 none none -1 processed_pseudogene ENSG00000233750 CICP27 processed_pseudogene
+chr1 139789 140339 ENST00000493797 1000 - 139788 139788 0,0,0 2 58,265 0,285 AL627309.2-201 none none -1,-1 antisense_RNA ENSG00000239906 AL627309.2 antisense_RNA
+chr1 157783 157887 ENST00000410691 1000 - 157782 157782 0,0,0 1 104 0 RNU6-1100P-201 none none -1 snRNA ENSG00000222623 RNU6-1100P snRNA
+chr1 187890 187958 ENST00000612080 1000 - 187889 187889 0,0,0 1 68 0 MIR6859-2-201 none none -1 miRNA ENSG00000273874 MIR6859-2 miRNA
+chr1 263014 297502 ENST00000424587 1000 - 263013 263013 0,0,0 4 5190,150,105,158 0,5652,26251,34330 AP006222.1-206 none none -1,-1,-1,-1 processed_transcript ENSG00000228463 AP006222.1 transcribed_processed_pseudogene
+chr1 347981 348366 ENST00000458203 1000 - 347980 347980 0,0,0 1 385 0 RPL23AP24-201 none none -1 processed_pseudogene ENSG00000236679 RPL23AP24 processed_pseudogene
+chr1 439869 440232 ENST00000437905 1000 + 439868 439868 0,0,0 1 363 0 WBP1LP7-201 none none -1 processed_pseudogene ENSG00000269732 WBP1LP7 processed_pseudogene
+chr1 450702 451697 ENST00000426406 1000 - 450739 451678 0,0,0 1 995 0 OR4F29-201 cmpl cmpl 0 protein_coding ENSG00000284733 OR4F29 protein_coding
b
diff -r 000000000000 -r 038ecf54cbec test-data/human_transcripts_seq.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/human_transcripts_seq.bed Mon Jan 22 13:59:27 2018 -0500
b
b'@@ -0,0 +1,12 @@\n+1\t14403\t29570\tENST00000488147\t1000\t-\t14402\t14402\t0,0,0\t11\t98,34,152,159,198,136,137,147,99,154,37\t0,601,1392,2203,2454,2829,3202,3511,3864,10334,15130\tWASH7P-201\tnone\tnone\t-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1\tunprocessed_pseudogene\tENSG00000227232\tWASH7P\tunprocessed_pseudogene\tATGGGAGCCGTGTGCACGTCGGGAGCTCGGAGTGAGCGTGAGTTCCGTGCCCAGGCCCGCGACTCGGCCCGACAGGACAGCGCTCCGGGTCGACGGGGTCCTGGAGCCGCGCTCGGGGAGGGCGCAGTGGAGGGCGAGCGGCGGCGTTAGGACCCGGAGGCGCGGGCGGACTGGGGGCGGCGGGGCTAGGACCCAGCGGCTCCGGCAGAGCGGAAGCGGCGGCGGGAGCTTCCGGGAGGGCGGCTCGCAGGTGAGGAGGCGTCCGGGGCCGCGGGAAGTAGGGTCGTGGGGGCCTGGCGGGGCGAAGTAGGGGACCCGGAGGGGCTGGAGGGAGGCGGGCGGGAGGCCCGGGACCGTTCCTGACCGAGAAGCCTGCGCCAAGCTGGTGTTCCGCGGCCGCTGCCCGGTGCCCGGCTCCACTGCGAACGCCGCCGCTGGGCCCCGACCGCCCGGGAGGCGTCTTGGGCTCGCCCCGGAGCTTCCTCCCTGGAGCCGCGCCCTGCACCCGGCCTTGCCCGGCCCTAGCAGGGAAGCCAAGGCTTGTGGGGCGCAGGGACCCGGGCTCTGCGGGGTCCCGGTTCCGCCTCCCCACTCCTGCGTCTTCCCGCCCCTGCCGGGTTCTGGGAAGCCTCGCGCGGCTCTTCCGCAGCTGCTGCCCGCCCGGAGCTCCTGGTCCCTCGTAGGGGACCCCACTTCTCTGACACCGCGTTGGGTTCCCGGGGCCTACAGCGAGGCTTGTAACTCCGGGAGAGACCCTGGAGCGGGGTGTGGGAGAACGGTCTGGAGGAAGGGCTCCGAGCACTTCGAAAGTATAAACCGCGGTCCCAAAGAGGCGTGCTGTGTCTGCATTTTCCTGGGAGTGCACGGTTTACATTCTCGAAAGCAGTGCTGTCGACTAGAAATATTGAGCGATACACATGTACAAGTTTTGTCACTTAAAAAGAATTTGAAAAAACTTCATAGATGCAAAAAAAAAAAAAAACCACCATTATTAAAGAATACTTAGGTATTTGTGGAATGCATTGAAGAGTTAACAAAATGGATAGGCAGGAAATATCGCAGACCTAGAATGAATTACAGTTACCCACTGTGGAACTGAGGAGCTAGGGTTTCTCATAAAACTCCCTGATAGAAGACGACTTTTGATAAATTTTTTTTTCCGCCAACAAAATCCCCTGTCTTCTCAACTAGTTACTGTCTGTCCACTAAATAAGAGGTGGTCCGTCACTTCTTCAGATGAGCAACTACAGGCTTTTCAAAAGATAATTGCTAATCAACCCCTTTGTGCCTGGGTTTTCTTATTTGTAAAAATAGATACTACTACCTAACTCCAAAGTGTGTGGTGAAGACAAACAATTGGGGTGATGTATACTAAAGTAACGAAAGTGTTGACCACACACTACGGGCTGGTTAGTGTTAGATTCCCTTGTTTTTCCCTCAGTATCAAAAACAGATCTAATTTAGGTTTACATAAAGACAAAGTATGAAGATAAGGTGACTTACAGTTGGTACTACTAACAAAATGTTTGGGCTAAGATTTGCATTATTGCATGAAAACAACAAAACATATCAATAAATAACAAAAAGCTTGGAATTCAGACGACAGATCCAAGTCTGGGCTTGATCTCAAGCTAGTGTTTTGATGTTGAAAAAATGTTATTTGGTCTTTCTAACCCCATTTCCTT
ATGTAAAATAGGGGATGATGATAAATTCACTGATAATAAGAGTTAAATGAGATTCTTGAGGAGTCAGAATGGTTCTAACATGTGTAGGTATTATTAGCAGTCATACTGTAGCATAAGAAAATACCGTCTGCTGAAAGAGGGACAATAAAGATTATCTACATGGTCATCATTTAAAAGCTACCAGATATAGGAAGAAGGGGCCATAAAATGATAACGTTATGATGATTAATTTTGATGCTTAGGTCAGAGTCCATTCTAGGATATCTGCTGCCCAAAAACAGCAGAGACTCATTTCTTTGGAATCACAGGACGCTGAGTGAGAGGAAAGAAAAAGAAAAGAAATATTTAAGTCACATATGTGATTTCTAAAAGTAAAAAGAAACAGATGAAATTAGTGATATATTTTTAAAATCCAGTATATCCCAAATATGGTTATTTTAGCACGTAATCAATATAAAATAATAAGATATTTTACATTCTTTTTTTCTAGCCTTTGAAATTTGGTGCATATTTTACACTTATGGCACATCTCAATTCAGACTATCCACATTTCAAGTGCTCAGTGGCTGCATGTGCCTGGTGGCTACCATATTGGACAGCACAGGTCTAAGGATTTCATTCCTGCCACAAGTCCAAACTCCTAGCTTTAATTTTGAGTGTTTTTAACAAACTGGCCTCTGTTTATCATTCTTTCTTCTAGTACTTCCCCAAGGATGATTGTACCCTCAGCACTCAAGACCGCTTGCGGTTCCCCTACACACTTTTTGTTCAAGCTGTTTCTTTTACCTGGAATGCTGTCTTTGCACCTTCTTCCTGGACCTGGTTCACCCTCGTTGCCTAGGCTGGAGTGCCATGGCGCGATCTTGGCACACTGCAACCTCCACCTTCCTGGTTCAAGTGATTCTCCTTCCTCAGCCTCCCGAGTAGCGGGGATTACAGGCATGCACCACCACGCCTGGCTAATTTTGTATTTTTAGTAGAGATGGAGTTTCACCATGTTGGTCAGACTGGTCTCGAACTCCCGACCTCAGGTGATCTGCCTGCCTTCGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCGCTGCGCCCGGCCGAGAGGCACACATTCTGCTAAGAGCTTTTTCCTGACTCCCCTAACTCCAAGAGGGATTTGTCACTCCTTAGCTTTGTACCCATGACTGGAGTAGAATGAATTTAATTTGAGTTTAGTTGTTTTTGAGACTCTCCCTGGCTAGTGTAGTGTCTTATTCGTCTTTGTTGTGATCATGGCCTGCACCTAACAGATGATCAGTAGATGTTTGCAGACAGAAAGTAAACCACTCATCAGGTGTATTCAGTCCCATTCTTGAACGGGCTTGCTGCCTCCTTTTTGAGGAGATCTGTGTATGTACTCTTCTTTCACGCATATGTGTGAGCAAACACACACACACTAACAAGAAATTCATCTGAAGATGTGCACAGGAAATATCTTGCATCTTTACCCCCTTTGTGATCTTACATATGGGAGAACTGAGGCACAGAAATAAGTTAGGACAGCCAGCAAACTTGCATCAGTATAAATACAAAGAAGGGGAGGGAGGAACATGCTTGAAAGGGGTGTGCTGGTCTCAGAGGGTTGGGTTTCTCAGTTGGCTGGGCATCAGCTGGCCATGCTTTAGTTATTTGATGGGAGGAAAAATAAGTGGGAGGTGAGGAGTAACTCCTGGGCTCTGATGAGTATTCAAGGCAAGTACAGATCTGGAAAGCCTGTATGCAAAGGAGGAACTCACTGAAAAGTGCTGGCCTGAGGAGGGCAGAAGGGAGGGCTGGGGAAGCCAGCAGCGGGAGCAAAGGAGTAGGCTCCAACTGGGTGAAGATGTTGGTGTGGTGCGTTATGTAAAATATACAAATTATTATTGGAAATAACCACGTCTCAGCAGTGCTAGTTCTCAGTTTGGAGAATGGGAAATCGAAAGGATCAGATTCAGAGACGGCAACTTACTCAAGGTCACAGCATTT
TAAACCCAAGTGAAATCTCCTA'..b'GCTCATGAAAATTGTGCCCTCCATTCC\n+1\t131024\t134836\tENST00000442987\t1000\t+\t131023\t131023\t0,0,0\t1\t3812\t0\tCICP27-201\tnone\tnone\t-1\tprocessed_pseudogene\tENSG00000233750\tCICP27\tprocessed_pseudogene\tCCCCTTCTCTGGGCCCAAGCCACCTTGGCTGAGGAGGGGGCGAGGAGGTGTGAGCCCCTGCCAGGAACCCCCTGCCCGGACCAAGTGCTCGGCCCCCAGGCCTGCGTTCAGTGAGGCCTCCCGTGGCGTCAGCATGTTCGTGTGGAGGAATGTGGAAGGTCACTCTGCGGCCGTGTTCTCCTGGTACTCCATCCCCTTCCTGACCCCTCCCTGCAGCCACACGAGGCCCAGCAACCTGCCAGTCACTCAGTGGCCTCCAACCAGAGAAAACAACCTGCCAAGTTGGCAGCTGTTGCTCATGAGCGTCCACCAGGTGGGACAGGGAGTGTTGACCCTGGGCGGCCCCCTGGAGCCACCTGCCCTGAAAGCCCAGGGCCCGCAACCCCACACACTTTGGGGTTGGTGGAACCTGGTAAAAGCTCACCTCCCACCATGGAGGAGGAGCCCTGGGCCCCTCAGGGGAGTCCCTGCTGGACAGTGAGACAGAGAATGACCATGATGATGCTTTCCTCTCCATCATGTCTCCTGACACCCAGTTGCCTCTACCACTCAGATGATGTCAGGCCCAGTCCCTCAGTGCCCTGCGCAAGGAACAGGACTCATCTTCTGAGAAGGATGGACGCAGCCCCAACAAATCAGACAAGGACCACATCCGGTGGCCCATGAGTGGCGCTCATGATCTTCAGCAGGCGGCACCAGGCCCTGGCGGGGCGCACCAGGGTCACCCCAACCAGGATAACCGGACCGTCAGCCAGATGCTGAGCGAGCGGTGGTACACCCTGGGGCCCAATGAGATGCAGAAATACAACCTGGCCTTCCAGGTGAAGGTGGCCCACTTGCAACAAGGACCGAAAGAAGTCCAGCTCAGAGGCCAAGCCCACAAGCCAGGGGCTAGCAGGAGTGTAACAAGGGCTCGTGGGAGCGGAGCATATCAGAGACGGGCACTGCCACTGCCCCTGGGGTGTCCTCTGAACTCCTGTCAGTTGCAGCCCAAACACTCCAGAGCTCGGATACCAAGGAGCAGCTTCTGTGGGGCAGAACGGCTGCACACAGTCAGGGAACCTGGCTCAGCCTGGCCCAAGCCTTCTCCCACAGCGGGGTACACAGCCTGGACGGCAGGGAAATAGACCGTCAGGCACTACGGGAACTGACACAGGTGGTGTCTGGCACTGCATCATACTCTGGCCCAAAGCCTTCTACTCAGCATGGAGCTCCAGGCCACTTTGCAGCCCCTGGTGAGGGAGGTGACCCGTGGGCAGCCCTGCTGCCGCCCACGTGAGCTGCTCATTCCCAGCACATGGCCAGCGAGGTCATAGCGAGTGACGAAGAGCACACGGTCATCCATGAGGAGGAGGGGGTGATGATGTCATTGCTGATGATGGCTTTAGCACCACCGACACCGATCTCAAGTTCAAGGAGTGGGTGACCGACTGAGAGTGGGGACAACTCTGGGGAGGAGCCAGAGGGCAACAAGGGCTTTGGTGGGAAGGTATTTGCACCTGTCATTCCTTCCTCCTTTACTCCTGCCGCCCCTTGCTGGATCCTGAGCCCCCAGGGTCCCCCGATCCACCTGCAGCTTTTGGCAGTCTATGGTCACACCCTGTCCTCCTCCTACACATACTCGGATGCTTCCTCCTCAACCTTGGCACCCACCTCCTTCTTACTGGGCCCAGGAGCCTTCAAAGCCCAGGAGTCTGGTCAACGCAGCAGAGCGGGCCCCCTACGGCCCCAACCCCTGGGGATGGGGGCCCAGGGACGCCTTCCAA
GGTGGCCTGTTTCCTCCCAATGGATCCTGCCACCTTCTGGTGCAAGAGACCTGAAAGTGTGGGCGACCTGGAGCTACCAGGCTCCTCAGTCATCAGGGTCCCTCCCAACACTAAGGCTTTCCTAGGCAGGAGCTGGGCTGAGCCACCCGGGGGGCAGAGCCTGAAGAGAAACTGACTGGGCTTTCGGGGTCGGGGCAGAGGGAACCCCACGGACATGGACCCCACACTGGAGGACCCCACCGCGCCCAAATGCAAGATGAGAAGATGCTCCAGCTGCAGTCCAAAGCCCAACACCCCCAAGTGTGCCATGTGTGATGGGGACAGCTTCCCCTTTGCCTGTACAGGTGGAGAAGCCGAGGACAGGCTCAGGGAACCGGAGACCAAGAAGGCGCTGTCCTCTTCACTGCATGTACCCTGGACCAGTGCCGGCCCTGATCATGCAGCTCTTCCAGGCCCACTGCTTCTTCCTGTCCACTAGGCCACAGCCGCCCTCCAGGCCCACTATGCACACATCTTCCCCTCCAAGGTTTGTTCTGCCCCTGCCCTGACTCCCAGCCCTGTGGGGGTCCTGACCGCACCTCACCTGGCTCAGACTCTTGACGCTGCCCTGGCTGCCCCACCAGTGCTTCTGCCCGAGAGTCACGTGAGGCTGAGAGTAGGGGCAGGGGCAGCAGTGGTGCCAGTTGGGGGGCGGTCCAGTGGGAGGAGCCTCAGCCTCGCAGGCTGCTCCGTGGGACTGATGACTGCATGATCTTCTGGGCACCTCACGGATCTTCAACTGCAGGTGAAACGGATGCTGGTGGTGGGTGCAGGGCCGCTGGGAGCTGCTGCATGGTTCCCAGAGGCTGGACTGAGGCAGGTGCCAACTGAAGCTGCTGGGGCAGCATGGGCAGGATGTTCTGCACACAAACCTTGGAGAAGAAGATGTGTGCATAGCAGGTCCACTGCTGCTGCCCCTGCCCTGACTCCCAGCCCTGCCTGACCCCACCTCAACCTGCTCAGGCTCTGGCACAACCCTGGCTGCCCTGCCACTGCCTCTGCCCCAGAGTTGGTGCCTTGACAGCCTGGTTGGAAGGGGACACCCCAGCCCTGCCTCAACACCTGGGGGTCTCCATAACTAGCACAGGCAGGTGGGCAACCCCAAAGATCCCAGGACTCACAGTACCCCCTGAGAACATGGACAGTATGTGGGGGTAGCAATGGAGGGCAGGATGGTTATCTTCTCCCAGGTGAAGCCATTTAATCCTTTCAGTTTGGGACGGAGTAAGGCCTTCCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGACCGAGTCTTGCTCTGTCGCCCAGGCTGGAGTGCAGTGGTGCGATCTTGGCTCACTGCAACCTCTTCCCGCTGGGTTCACGCCATTCTCCTGCCTCAGCCTTCCGGGTAGCTAGGATTACAGGTGGACGCTACCACGTCCGGCTAATTTTTGTATTTTTAGTACAGACGGGGCTTCATCATCTTGGCCAGGCTGATTTCGATCTCCTGACATCGTGATCTGCCTGCCTCCCCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCACGCCTGGCCAAGGCCTGCTCCTCTTATCTATACCCCCTACCCCTGCAGCTGTGCCGGGGGAAAGCTGGGCAGTTTCCCTCCTCCGAGCCCCTGTACATACCATGAATTGTGGGACCTTCAGAGCTTTTCACTTTTCGGAAAATAGCTCCTGCTGGGGCTACAAGATGGAGTGTGAAGAGGGCCTTGGGCCACAGGGAGGCGCCTGTGGACTAGGGGGAGTTCATGCACCCCTTCTTTCCCCAGAGGGGCTGGACTCAGGTGAGTATGGGGGTGGGGGCTCCTGCACTTCGACACAGGCAGCAGGAGGGTTTTCTCCCCATTCCCTCTGCACTCCCAACTTGAGCTATACTTTTTAAGAAAGTGATTCACCCTGCCTTTGCCCCCTTCCCCAGAACAGAACACGTTGATCGTGGGCGATATTTTTC
ATTGTGCCAAAAAGTTGCCATGACCGTCATTAAACCTGTTTAACAC\n'
b
diff -r 000000000000 -r 038ecf54cbec tool-data/twobit.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/twobit.loc.sample Mon Jan 22 13:59:27 2018 -0500
b
@@ -0,0 +1,26 @@
+#This is a sample file distributed with Galaxy that is used by some
+#tools.  The twobit.loc file has this format (white space characters 
+#are TAB characters):
+#
+#<Build> <FullPathToFile>
+#
+#So, for example, if you had droPer1 twobit files stored in 
+#/depot/data2/galaxy/droPer1/, then the twobit.loc entry 
+#would look like this:
+#
+#droPer1 /depot/data2/galaxy/droPer1/droPer1.2bit
+#
+#and your /depot/data2/galaxy/droPer1/ directory would 
+#contain all of your twobit files (e.g.):
+#
+#-rw-rw-r--   1 nate   galaxy 48972650 2007-05-04 11:27 droPer1.2bit
+#...etc...
+#
+#Your twobit.loc file should include an entry per line for each twobit 
+#file you have stored.  For example:
+#
+#droPer1 /depot/data2/galaxy/droPer1/droPer1.2bit
+#apiMel2 /depot/data2/galaxy/apiMel2/apiMel2.2bit
+#droAna1 /depot/data2/galaxy/droAna1/droAna1.2bit
+#droAna2 /depot/data2/galaxy/droAna2/droAna2.2bit
+#...etc...
b
diff -r 000000000000 -r 038ecf54cbec tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Mon Jan 22 13:59:27 2018 -0500
b
@@ -0,0 +1,6 @@
+<tables>
+    <table name="twobit" comment_char="#">
+        <columns>value, path</columns>
+        <file path="tool-data/twobit.loc" />
+    </table>
+</tables>
b
diff -r 000000000000 -r 038ecf54cbec translate_bed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/translate_bed.py Mon Jan 22 13:59:27 2018 -0500
[
b'@@ -0,0 +1,303 @@\n+#!/usr/bin/env python\n+"""\n+#\n+#------------------------------------------------------------------------------\n+#                         University of Minnesota\n+#         Copyright 2017, Regents of the University of Minnesota\n+#------------------------------------------------------------------------------\n+# Author:\n+#\n+#  James E Johnson\n+#\n+#------------------------------------------------------------------------------\n+"""\n+\n+from __future__ import print_function\n+\n+import argparse\n+import re\n+import sys\n+\n+from Bio.Seq import translate\n+\n+from bedutil import bed_from_line\n+\n+import digest\n+\n+from ensembl_rest import get_cdna\n+\n+from twobitreader import TwoBitFile\n+\n+\n+def __main__():\n+    parser = argparse.ArgumentParser(\n+        description=\'Translate from BED\')\n+    parser.add_argument(\n+        \'input_bed\', default=None,\n+        help="BED to translate,  \'-\' for stdin")\n+    pg_seq = parser.add_argument_group(\'Genomic sequence source\')\n+    pg_seq.add_argument(\n+        \'-t\', \'--twobit\', default=None,\n+        help=\'Genome reference sequence in 2bit format\')\n+    pg_seq.add_argument(\n+        \'-c\', \'--column\', type=int, default=None,\n+        help=\'Column offset containing genomic sequence\' +\n+             \'between start and stop (-1) for last column\')\n+    pg_out = parser.add_argument_group(\'Output options\')\n+    pg_out.add_argument(\n+        \'-f\', \'--fasta\', default=None,\n+        help=\'Path to output translations.fasta\')\n+    pg_out.add_argument(\n+        \'-b\', \'--bed\', default=None,\n+        help=\'Path to output translations.bed\')\n+    pg_bed = parser.add_argument_group(\'BED filter options\')\n+    pg_bed.add_argument(\n+        \'-E\', \'--ensembl\', action=\'store_true\', default=False,\n+        help=\'Input BED is in 20 column Ensembl format\')\n+    pg_bed.add_argument(\n+        \'-R\', \'--regions\', action=\'append\', default=[],\n+   
     help=\'Filter input by regions e.g.:\'\n+             + \' X,2:20000-25000,3:100-500+\')\n+    pg_bed.add_argument(\n+        \'-B\', \'--biotypes\', action=\'append\', default=[],\n+        help=\'For Ensembl BED restrict translations to Ensembl biotypes\')\n+    pg_trans = parser.add_argument_group(\'Translation filter options\')\n+    pg_trans.add_argument(\n+        \'-m\', \'--min_length\', type=int, default=10,\n+        help=\'Minimum length of protein translation to report\')\n+    pg_trans.add_argument(\n+        \'-e\', \'--enzyme\', default=None,\n+        help=\'Digest translation with enzyme\')\n+    pg_trans.add_argument(\n+        \'-M\', \'--start_codon\', action=\'store_true\', default=False,\n+        help=\'Trim translations to methionine start_codon\')\n+    pg_trans.add_argument(\n+        \'-C\', \'--cds\', action=\'store_true\', default=False,\n+        help=\'Only translate CDS\')\n+    pg_trans.add_argument(\n+        \'-A\', \'--all\', action=\'store_true\',\n+        help=\'Include CDS protein translations \')\n+    pg_fmt = parser.add_argument_group(\'ID format options\')\n+    pg_fmt.add_argument(\n+        \'-r\', \'--reference\', default=\'\',\n+        help=\'Genome Reference Name\')\n+    pg_fmt.add_argument(\n+        \'-D\', \'--fa_db\', dest=\'fa_db\', default=None,\n+        help=\'Prefix DB identifier for fasta ID line, e.g. generic\')\n+    pg_fmt.add_argument(\n+        \'-s\', \'--fa_sep\', dest=\'fa_sep\', default=\'|\',\n+        help=\'fasta ID separator defaults to pipe char, \' +\n+             \'e.g. 
generic|ProtID|description\')\n+    pg_fmt.add_argument(\n+        \'-P\', \'--id_prefix\', default=\'\',\n+        help=\'prefix for the sequence ID\')\n+    parser.add_argument(\'-v\', \'--verbose\', action=\'store_true\', help=\'Verbose\')\n+    parser.add_argument(\'-d\', \'--debug\', action=\'store_true\', help=\'Debug\')\n+    args = parser.parse_args()\n+\n+    input_rdr = open(args.input_bed, \'r\')\\\n+        if args.input_bed != \'-\' else sys.stdin\n+    fa_wtr = open(args.fasta, \'w\')\\\n+        if args.fasta is not None and args.fasta != \'-\' else sys.stdout\n+    bed_wtr = open(args.bed, \'w\') if args.bed is not None else None\n+\n+    enzyme = digest.expasy_rules.get'..b'f args.start_codon:\n+                            m = refprot.find(\'M\')\n+                            if m < 0:\n+                                return 0\n+                            elif m > 0:\n+                                bed.trim_cds(m*3)\n+                                refprot = refprot[m:]\n+                        stop = refprot.find(\'*\')\n+                        if stop >= 0:\n+                            bed.trim_cds((stop - len(refprot)) * 3)\n+                            refprot = refprot[:stop]\n+                        if len(refprot) >= args.min_length:\n+                            write_translation(tbed, bed.name, refprot)\n+                            return 1\n+                    return 0\n+            if args.debug:\n+                print("%s\\n" % (str(bed)), file=sys.stderr)\n+                print("CDS: %s %d %d" %\n+                      (bed.strand, bed.cdna_offset_of_pos(bed.thickStart),\n+                       bed.cdna_offset_of_pos(bed.thickEnd)),\n+                      file=sys.stderr)\n+                print("refprot: %s" % str(refprot), file=sys.stderr)\n+            for offset in range(3):\n+                seqend = cdna_len - (cdna_len - offset) % 3\n+                aaseq = translate(cdna[offset:seqend])\n+                
aa_start = 0\n+                while aa_start < len(aaseq):\n+                    aa_end = aaseq.find(\'*\', aa_start)\n+                    if aa_end < 0:\n+                        aa_end = len(aaseq)\n+                    prot = aaseq[aa_start:aa_end]\n+                    if args.start_codon:\n+                        m = prot.find(\'M\')\n+                        aa_start += m if m >= 0 else aa_end\n+                        prot = aaseq[aa_start:aa_end]\n+                    if enzyme and refprot:\n+                        frags = digest._cleave(prot, enzyme)\n+                        for frag in reversed(frags):\n+                            if frag in refprot:\n+                                prot = prot[:prot.rfind(frag)]\n+                            else:\n+                                break\n+                    is_cds = refprot and prot in refprot\n+                    if args.debug:\n+                        print("is_cds: %s %s" % (str(is_cds), str(prot)),\n+                              file=sys.stderr)\n+                    if len(prot) < args.min_length:\n+                        pass\n+                    elif not args.all and is_cds:\n+                        pass\n+                    else:\n+                        tstart = aa_start*3+offset\n+                        tend = aa_end*3+offset\n+                        prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend)\n+                        tbed = bed.trim(tstart, tend)\n+                        if args.all or unique_prot(tbed, prot):\n+                            translate_count += 1\n+                            tbed.name = prot_acc\n+                            write_translation(tbed, bed.name, prot)\n+                    aa_start = aa_end + 1\n+        return translate_count\n+\n+    if input_rdr:\n+        translation_count = 0\n+        transcript_count = 0\n+        for i, bedline in enumerate(input_rdr):\n+            try:\n+                bed = bed_from_line(bedline, 
ensembl=args.ensembl,\n+                                    seq_column=args.column)\n+                if bed is None:\n+                    continue\n+                transcript_count += 1\n+                if bed.biotype and biotypea and bed.biotype not in biotypea:\n+                    continue\n+                if filter_by_regions(bed):\n+                    translation_count += translate_bed(bed)\n+            except Exception as e:\n+                print("BED format Error: line %d: %s\\n%s"\n+                      % (i, bedline, e), file=sys.stderr)\n+                break\n+        if args.debug or args.verbose:\n+            print("transcripts: %d\\ttranslations: %d"\n+                  % (transcript_count, translation_count), file=sys.stderr)\n+\n+\n+if __name__ == "__main__":\n+    __main__()\n'
b
diff -r 000000000000 -r 038ecf54cbec translate_bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/translate_bed.xml Mon Jan 22 13:59:27 2018 -0500
[
b'@@ -0,0 +1,304 @@\n+<tool id="translate_bed" name="Translate BED transcripts" version="0.1.0">\n+    <description>cDNA in 3frames or CDS</description>\n+    <macros>\n+        <import>macros.xml</import>\n+    </macros>\n+    <requirements>\n+        <expand macro="ensembl_requirements" />\n+        <expand macro="bedutil_requirements" />\n+        <expand macro="twobit_requirements" />\n+    </requirements>\n+    <stdio>\n+        <exit_code range="1:" />\n+    </stdio>\n+    <command detect_errors="aggressive"><![CDATA[\n+        python \'$__tool_directory__/translate_bed.py\'  \n+            #if $ref.ref_source == \'cached\':\n+                --twobit=\'$ref.ref_loc.fields.path\'\n+            #elif $ref.ref_source == \'history\':\n+                --twobit=\'$ref.ref_file\'\n+            #elif $ref.ref_source == \'last_column\':\n+                --column \'-1\'\n+            #elif $ref.ref_source == \'select_column\':\n+                --column ${int(str($ref.seq_column)) - 1}\n+            #elif $ref.ref_source == \'ensembl_rest\':\n+                --ensembl\n+            #end if\n+            #if $translations.translate == \'CDS\':\n+                --cds\n+            #elif $translations.translate == \'cDNA\':\n+                --all\n+            #end if \n+            $translations.start_codon\n+            #if $bed_filters.biotypes:\n+                --biotypes \'$bed_filters.biotypes\'\n+                --ensembl\n+            #end if\n+            #if $bed_filters.regions:\n+                --regions \'$bed_filters.regions\'\n+            #end if\n+            --min_length $translations.min_length\n+            #if $translations.enzyme:\n+                --enzyme \'$translations.enzyme\'\n+            #end if\n+            #if $fa_id.fa_db:\n+               --fa_db=\'$fa_id.fa_db\'\n+            #end if\n+            #if $fa_id.fa_sep:\n+               --fa_sep=\'$fa_id.fa_sep\'\n+            #end if\n+            #if $fa_id.reference:\n+           
    --reference $fa_id.reference\n+            #else:\n+               --reference ${input.metadata.dbkey}\n+            #end if\n+            #if $fa_id.id_prefix:\n+               --id_prefix \'$fa_id.id_prefix\'\n+            #end if\n+            --bed \'$translation_bed\'\n+            --fasta \'$translation_fasta\'\n+            -v\n+        $input\n+    ]]></command>\n+    <inputs>\n+        <param name="input" type="data" format="bed" label="A BED file with 12 columns" \n+               help="thickStart and thickEnd define protein coding region, blocks define exon regions"/>\n+        <conditional name="ref">\n+            <param name="ref_source" type="select" label="Source for Genomic Sequence Data">\n+                <option value="cached">Locally cached twobit</option>\n+                <option value="history">History dataset twobit</option>\n+                <option value="last_column">Last Column in the BED file</option>\n+                <option value="select_column">Select Column in the BED file</option>\n+                <option value="ensembl_rest">Retrieve sequences from Ensembl (Slow and only for Ensembl Transcripts)</option>\n+            </param>\n+            <when value="cached">\n+                <param name="ref_loc" type="select" label="Select reference 2bit file">\n+                    <options from_data_table="twobit" />\n+                </param>\n+            </when>\n+            <when value="history">\n+                <param name="ref_file" type="data" format="twobit" label="reference 2bit file" />\n+            </when>\n+            <when value="last_column"/>\n+            <when value="select_column">\n+                <param name="seq_column" type="data_column" data_ref="input" label="BED column conatining the genomic sequence"\n+                    help="unspliced genomic sequence from chromStart to chromEnd (Extract Genomic DNA)"/>\n+            </when>\n+            <when value="ensembl_rest"/>\n+        </conditional>\n+     
   <section name="bed_filters" expanded="false" title="BED Filtering Options">\n+            <param name="regions" type="text" va'..b' <assert_contents>\n+                    <has_text text="generic|test_ENST00000641515" />\n+                    <has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />\n+                </assert_contents>\n+            </output>\n+        </test>\n+        <test>\n+            <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/>\n+            <param name="ref_source" value="last_column"/>\n+            <param name="translate" value="cDNA_minus_CDS"/>\n+            <param name="biotypes" value="protein_coding"/>\n+            <param name="start_codon" value="True"/>\n+            <param name="fa_db" value="generic"/>\n+            <param name="id_prefix" value="test_"/>\n+            <output name="translation_bed">\n+                <assert_contents>\n+                    <has_text text="test_ENST00000641515" />\n+                    <has_text text="MLSKYSFANS" />\n+                    <not_has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />\n+                </assert_contents>\n+            </output>\n+            <output name="translation_fasta">\n+                <assert_contents>\n+                    <has_text text="generic|test_ENST00000641515" />\n+                    <has_text text="MLSKYSFANS" />\n+                    <not_has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />\n+                </assert_contents>\n+            </output>\n+        </test>\n+        <test>\n+            <param name="input" value="human_transcripts.bed" ftype="bed12"/>\n+            <param name="ref_source" value="history"/>\n+            <param name="ref_file" value="GRCh38.1.2bit" ftype="twobit"/>\n+            <param name="translate" value="cDNA_minus_CDS"/>\n+            <param name="regions" value="1:0-30000"/>\n+            <param name="start_codon" value="True"/>\n+            <param name="fa_db" 
value="generic"/>\n+            <param name="id_prefix" value="test_"/>\n+            <output name="translation_bed">\n+                <assert_contents>\n+                    <has_text text="test_ENST00000488147" />\n+                    <has_text text="MAPSSRAPRTLACRDAPATGSRASTAPWTSGPCRRS" />\n+                    <not_has_text text="ENST00000335137" />\n+                </assert_contents>\n+            </output>\n+            <output name="translation_fasta">\n+                <assert_contents>\n+                    <has_text text="generic|test_ENST00000488147" />\n+                    <has_text text="MAPSSRAPRTLACRDAPATGSRASTAPWTSGPCRRS" />\n+                    <not_has_text text="ENST00000335137" />\n+                </assert_contents>\n+            </output>\n+        </test>\n+    </tests>\n+    <help><![CDATA[\n+Translate transcripts from the input BED file into protein sequences.  \n+\n+The genomic sequence:\n+\n+  - may be supplied in an extra column in the BED input file\n+  - retrieved from a twobit genomic reference file  \n+  - retrieved from the Ensembl REST API for Ensembl transcripts\n+\n+\n+**INPUTS**\n+\n+  - BED file with at least the standard 12 columns\n+  - Genome reference in twobit format (optional)\n+\n+\n+**OUTPUTS**\n+\n+  - FASTA of transcript translations\n+  - BED with the genomic location of the translated protein.  
The added 13th column contains the protein sequence.\n+\n+\n+**OPTIONS**\n+\n+  - Feature translation\n+\n+    - cDNA - three frame translations of the cDNA sequences with an output for each sequence between STOP codons\n+    - CDS - three frame translations of CDS (coding sequence defined by thickStart and thickEnd in the BED file)  \n+\n+  - Translation filtering\n+\n+    - can be trimmed to a Methionine start codon\n+    - can be split into peptides by an enzyme digestion\n+    - must exceed specified minimum length\n+\n+\n+  - BED Filtering\n+\n+    - genomic regions \n+    - ensembl biotype if the BED contains the 20 columns as retrieved from the Ensembl REST API\n+  \n+\n+    ]]></help>\n+    <citations>\n+        <citation type="doi">10.1093/bioinformatics/btu613</citation>\n+        <citation type="doi">10.1093/nar/gku1010</citation>\n+    </citations>\n+</tool>\n'