Repository 'fml_gff3togtf'
hg clone https://toolshed.g2.bx.psu.edu/repos/vipints/fml_gff3togtf

Changeset 9:7d67331368f3 (2015-04-23)
Previous changeset 8:d4f9b7beb52f (2015-04-23) Next changeset 10:c42c69aa81f8 (2015-04-23)
Commit message:
fixing the new version upload manually
added:
test-data/CCDS30770.bed
test-data/CCDS30770.gff
test-data/MB7_3R.bed
test-data/MB7_3R.gff3
test-data/aceview_hs_37.gff3
test-data/aceview_hs_37.gtf
test-data/ens_mm9_chr18.gff3
test-data/ens_mm9_chr18.gtf
test-data/gencode_ens_hav.gtf
test-data/s_cerevisiae_SCU49845.gff
removed:
GFFParser.py
README
bed_to_gff.py
bed_to_gff.xml
gbk_to_gff.py
gbk_to_gff.xml
gff_fmap.py
gff_fmap.xml
gff_to_bed.py
gff_to_bed.xml
gff_to_gbk.py
gff_to_gbk.xml
gff_to_gtf.py
gff_to_gtf.xml
gffparser_bcbio.py
gtf_to_gff.py
gtf_to_gff.xml
helper.py
test-data/s_cerevisiae_SCU49845.gff3
test-data/single_parent_feature_record.gff3
tool_conf.xml.sample
b
diff -r d4f9b7beb52f -r 7d67331368f3 GFFParser.py
--- a/GFFParser.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,495 +0,0 @@\n-#!/usr/bin/env python\n-"""\n-Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.\n-\n-Requirements: \n-    Numpy :- http://numpy.org/ \n-    Scipy :- http://scipy.org/ \n-\n-Copyright (C)\t\n-\n-2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. \n-2012-2014 Memorial Sloan Kettering Cancer Center, New York City, USA.\n-"""\n-\n-import re\n-import os\n-import sys\n-import urllib\n-import numpy as np\n-import scipy.io as sio\n-from collections import defaultdict\n-import helper as utils \n-\n-def attribute_tags(col9):\n-    """ \n-    Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file \n-\n-    @args col9: attribute column from GFF file \n-    @type col9: str\n-    """\n-    info = defaultdict(list)\n-    is_gff = False\n-    \n-    if not col9:\n-        return is_gff, info\n-        \n-    # trim the line ending semi-colon  ucsc may have some white-space  \n-    col9 = col9.rstrip(\';| \')\n-    # attributes from 9th column \n-    atbs = col9.split(" ; ")\n-    if len(atbs) == 1:\n-        atbs = col9.split("; ")\n-        if len(atbs) == 1:\n-            atbs = col9.split(";")\n-    # check the GFF3 pattern which has key value pairs like:\n-    gff3_pat = re.compile("\\w+=")\n-    # sometime GTF have: gene_id uc002zkg.1;\n-    gtf_pat = re.compile("\\s?\\w+\\s")\n-\n-    key_vals = []\n-\n-    if gff3_pat.match(atbs[0]): # gff3 pattern \n-        is_gff = True\n-        key_vals = [at.split(\'=\') for at in atbs]\n-    elif gtf_pat.match(atbs[0]): # gtf pattern\n-        for at in atbs:\n-            key_vals.append(at.strip().split(" ",1))\n-    else:\n-        # to handle attribute column has only single value \n-        key_vals.append([\'ID\', atbs[0]])\n-    # get key, val items \n-    for item in key_vals:\n-        key, val = item\n-        # replace the double qoutes from feature identifier \n-  
      val = re.sub(\'"\', \'\', val)\n-        # replace the web formating place holders to plain text format \n-        info[key].extend([urllib.unquote(v) for v in val.split(\',\') if v])\n-\n-    return is_gff, info\n-                \n-def spec_features_keywd(gff_parts):\n-    """\n-    Specify the feature key word according to the GFF specifications\n-\n-    @args gff_parts: attribute field key \n-    @type gff_parts: str \n-    """\n-    for t_id in ["transcript_id", "transcriptId", "proteinId"]:\n-        try:\n-            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]\n-            break\n-        except KeyError:\n-            pass\n-    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:\n-        try:\n-            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]\n-            break\n-        except KeyError:\n-            pass\n-    ## TODO key words\n-    for flat_name in ["Transcript", "CDS"]:\n-        if gff_parts["info"].has_key(flat_name):\n-            # parents\n-            if gff_parts[\'type\'] in [flat_name] or re.search(r\'transcript\', gff_parts[\'type\'], re.IGNORECASE):\n-                if not gff_parts[\'id\']:\n-                    gff_parts[\'id\'] = gff_parts[\'info\'][flat_name][0]\n-                    #gff_parts["info"]["ID"] = [gff_parts["id"]]\n-            # children \n-            elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",\n-                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",\n-                        "start_codon"]:\n-                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]\n-            break\n-    return gff_parts\n-\n-def Parse(ga_file):\n-    """\n-    Parsing GFF/GTF file based on feature relationship, it takes the input file.\n-\n-    @args ga_file: input file name \n-    @type ga_file: str \n-    """\n-    child_map = defaultdict(list)\n-    parent_map = dict()\n-\n-    ga_handle = utils.open_file(ga_file)\n-\n-    
for rec in ga_handle:\n-        rec = rec.strip(\'\\n\\r\')\n-        \n-        # skip empty line fasta identifier and commented line\n-        if not rec '..b'lete\'] = []\n-        gene[g_cnt][\'is_complete\'] = []\n-        gene[g_cnt][\'is_correctly_gff3_referenced\'] = \'\'\n-        gene[g_cnt][\'splicegraph\'] = []\n-        g_cnt += 1 \n-\n-    ## deleting empty gene records from the main array\n-    XPFLG=0\n-    for XP, ens in enumerate(gene):\n-        if ens[0]==0:\n-            XPFLG=1\n-            break\n-    \n-    if XPFLG==1:\n-        XQC = range(XP, len(gene)+1)\n-        gene = np.delete(gene, XQC)\n-\n-    return gene \n-\n-def NonetoemptyList(XS):\n-    """\n-    Convert a None type to empty list \n-\n-    @args XS: None type \n-    @type XS: str \n-    """\n-    return [] if XS is None else XS \n-\n-def create_missing_feature_type(p_feat, c_feat):\n-    """\n-    GFF/GTF file defines only child features. This function tries to create \n-    the parent feature from the information provided in the attribute column. \n-\n-    example: \n-    chr21   hg19_knownGene  exon    9690071 9690100 0.000000        +       .       gene_id "uc002zkg.1"; transcript_id "uc002zkg.1"; \n-    chr21   hg19_knownGene  exon    9692178 9692207 0.000000        +       .       gene_id "uc021wgt.1"; transcript_id "uc021wgt.1"; \n-    chr21   hg19_knownGene  exon    9711935 9712038 0.000000        +       .       gene_id "uc011abu.2"; transcript_id "uc011abu.2"; \n-\n-    This function gets the parsed feature annotations. 
\n-    \n-    @args p_feat: Parent feature map  \n-    @type p_feat: collections defaultdict\n-    @args c_feat: Child feature map  \n-    @type c_feat: collections defaultdict\n-    """\n-\n-    child_n_map = defaultdict(list)\n-    for fid, det in c_feat.items():\n-        # get the details from grand child  \n-        GID = STRD = SCR = None\n-        SPOS, EPOS = [], [] \n-        TYP = dict()\n-        for gchild in det:\n-            GID = gchild.get(\'gene_id\', [\'\'])[0] \n-            SPOS.append(gchild.get(\'location\', [])[0]) \n-            EPOS.append(gchild.get(\'location\', [])[1]) \n-            STRD = gchild.get(\'strand\', \'\')\n-            SCR = gchild.get(\'score\', \'\')\n-            if gchild.get(\'type\', \'\') == "gene": ## gencode GTF file has this problem \n-                continue\n-            TYP[gchild.get(\'type\', \'\')] = 1\n-        SPOS.sort() \n-        EPOS.sort()\n-        \n-        # infer transcript type\n-        transcript_type = \'transcript\'\n-        transcript_type = \'mRNA\' if TYP.get(\'CDS\', \'\') or TYP.get(\'cds\', \'\') else transcript_type\n-        \n-        # gene id and transcript id are same\n-        transcript_id = fid[-1]\n-        if GID == transcript_id:\n-            transcript_id = \'Transcript:\' + str(GID)\n-        \n-        # level -1 feature type \n-        p_feat[(fid[0], fid[1], GID)] = dict( type = \'gene\',\n-                                            location = [], ## infer location based on multiple transcripts  \n-                                            strand = STRD,\n-                                            name = GID )\n-        # level -2 feature type \n-        child_n_map[(fid[0], fid[1], GID)].append(\n-                                            dict( type = transcript_type,\n-                                            location =  [SPOS[0], EPOS[-1]], \n-                                            strand = STRD, \n-                                            
score = SCR, \n-                                            ID = transcript_id,\n-                                            gene_id = \'\' ))\n-        # reorganizing the grand child\n-        for gchild in det:\n-            child_n_map[(fid[0], fid[1], transcript_id)].append(\n-                                            dict( type = gchild.get(\'type\', \'\'),\n-                                            location =  gchild.get(\'location\'),\n-                                            strand = gchild.get(\'strand\'), \n-                                            ID = gchild.get(\'ID\'),\n-                                            score = gchild.get(\'score\'),\n-                                            gene_id = \'\' ))\n-    return p_feat, child_n_map \n-\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 README
--- a/README Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,61 +0,0 @@
-A collection of tools for converting genome annotation between GTF (Gene Transfer Format), 
-BED (Browser Extensible Data) and GFF (Generic Feature Format).
-
-INTRODUCTION
-
-Several genome annotation centers provide their data in GTF, BED, GFF3, etc. This repository
-contains a few programs that mainly deal with converting between the GTF, BED and GFF3 formats.
-They are extensively tested with files from different centers like ENSEMBL, UCSC, JGI and NCBI
-AceView. Please follow the instructions below to clone these tools into your Galaxy instance.
-
-CONTENTS
-
-Tool configuration files in *.xml format. 
-
-    gtf_to_gff.xml
-    gff_to_gtf.xml
-    bed_to_gff.xml
-    gff_to_bed.xml
-    gbk_to_gff.xml
-    gff_fmap.xml
-
-Python based scripts. 
-
-    gtf_to_gff.py: convert data from GTF to valid GFF3.
-    gff_to_gtf.py: convert data from GFF3 to GTF.
-    bed_to_gff.py: convert data from a 12 column UCSC wiggle BED format to GFF3.
-    gff_to_bed.py: convert gene transcript annotation from GFF3 to UCSC wiggle 12 column BED format.
-    gbk_to_gff.py: convert data from genbank format to GFF. 
-    gff_fmap.py: find the relation between different features described in a GFF file.  
-    GFFParser.py: Parse GFF/GTF files.  
-    helper.py: Utility functions.
-
-test-data: Test data set. (move to your galaxy_root_folder/test-data/)
-    
-    You may need to move the test files into your test-data directory so Galaxy can find them.
-    To run the functional tests, use a command such as:
-
-    example:
-    sh run_functional_tests.sh -id fml_gtf2gff
-
-REQUIREMENTS
-
-    python 
-
-COMMENTS/QUESTIONS 
-
-I can be reached at vipin [at] cbio.mskcc.org 
-
-LICENSE
-
-Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society
-              2013-2014 Memorial Sloan Kettering Cancer Center
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 3 of the License, or
-(at your option) any later version.
-
-COURTESY
-
-To the Galaxy Team.
b
diff -r d4f9b7beb52f -r 7d67331368f3 bed_to_gff.py
--- a/bed_to_gff.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,70 +0,0 @@
-#!/usr/bin/env python
-"""
-Convert genome annotation data in a 12 column BED format to GFF3. 
-
-Usage: python bed_to_gff.py in.bed > out.gff
-
-Requirement:
-    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
-
-Copyright (C) 
-    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
-    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
-"""
-
-import re
-import sys
-import helper 
-
-def __main__():
-    """
-    main function 
-    """
-
-    try:
-        bed_fname = sys.argv[1]
-    except:
-        print __doc__
-        sys.exit(-1)
-
-    bed_fh = helper.open_file(bed_fname)
-
-    for line in bed_fh: 
-        line = line.strip( '\n\r' )
-
-        if not line or line[0] in  ['#']:
-            continue 
-
-        parts = line.split('\t') 
-        assert len(parts) >= 12, line
-
-        rstarts = parts[-1].split(',')
-        rstarts.pop() if rstarts[-1] == '' else rstarts
-
-        exon_lens = parts[-2].split(',')
-        exon_lens.pop() if exon_lens[-1] == '' else exon_lens
-        
-        if len(rstarts) != len(exon_lens):
-            continue # checking the consistency col 11 and col 12 
-
-        if len(rstarts) != int(parts[-3]): 
-            continue # checking the number of exons and block count are same
-        
-        if not parts[5] in ['+', '-']:
-            parts[5] = '.' # replace the unknown strand with '.' 
-
-        # bed2gff result line 
-        print '%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3])
-        print '%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3], parts[3])
-
-        st = int(parts[1])
-        for ex_cnt in range(int(parts[-3])):
-            start = st + int(rstarts[ex_cnt]) + 1
-            stop = start + int(exon_lens[ex_cnt]) - 1
-            print '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s' % (parts[0], start, stop, parts[4], parts[5], parts[3])
-
-    bed_fh.close()
-
-
-if __name__ == "__main__": 
-    __main__()
b
diff -r d4f9b7beb52f -r 7d67331368f3 bed_to_gff.xml
--- a/bed_to_gff.xml Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,89 +0,0 @@
-<tool id="fml_bed2gff" name="BED-to-GFF" version="2.0.0">
- <description>converter</description>
- <command interpreter="python">bed_to_gff.py $inf_bed  &gt; $gff_format 
- </command> 
- <inputs>
-   <param format="bed" name="inf_bed" type="data" label="Convert this query" help="Provide genome annotation in 12 column BED format."/>
-    </inputs>
-   <outputs>
-   <data format="gff3" name="gff_format" label="${tool.name} on ${on_string}: Converted" /> 
-   </outputs>
- <tests>
-        <test>
-                <param name="inf_bed" value="ccds_genes.bed" />
-                <output name="gff_format" file="ccds_genes.gff3" />
-        </test>
-        <test>
-                <param name="inf_bed" value="hs_2009.bed" />
-                <output name="gff_format" file="hs_2009.gff3" />
-        </test>
-        </tests>
-   <help>
-
-**What it does**
-
-This tool converts data from a 12 column UCSC wiggle BED format to GFF3 (scroll down for format description).
-
---------
-
-**Example**
-
-- The following data in UCSC Wiggle BED format::
-
- chr1    11873   14409   uc001aaa.3      0       +       11873   11873   0       3       354,109,1189,   0,739,1347,
-
-- Will be converted to GFF3::
-
- ##gff-version 3
- chr1    bed2gff gene    11874   14409   0       +       .       ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3
- chr1    bed2gff transcript      11874   14409   0       +       .       ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3
- chr1    bed2gff exon    11874   12227   0       +       .       Parent=uc001aaa.3
- chr1    bed2gff exon    12613   12721   0       +       .       Parent=uc001aaa.3
- chr1    bed2gff exon    13221   14409   0       +       .       Parent=uc001aaa.3
-
---------
-
-**About formats**
-
-**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
-
-The first three BED fields (required) are::
-
-    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
-    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
-    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
-
-The additional BED fields (optional) are::
-
-    4. name - The name of the BED line.
-    5. score - A score between 0 and 1000.
-    6. strand - Defines the strand - either '+' or '-'.
-    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
-    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
-    9. reserved - This should always be set to zero.
-   10. blockCount - The number of blocks (exons) in the BED line.
-   11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
-   12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
-
-**GFF3 format** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
-
-    1. seqid - Must be a chromosome or scaffold or contig.
-    2. source - The program that generated this feature.
-    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
-    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
-    5. stop - The ending position of the feature (inclusive).
-    6. score - A score between 0 and 1000. If there is no score value, enter ".".
-    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
-    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
-    9. attributes - All lines with the same group are linked together into a single item.
-
---------
-
-**Copyright**
-
-2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
-
-Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
-
- </help>
-</tool>
b
diff -r d4f9b7beb52f -r 7d67331368f3 gbk_to_gff.py
--- a/gbk_to_gff.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,213 +0,0 @@
-#!/usr/bin/env python
-"""
-Convert data from Genbank format to GFF. 
-
-Usage: 
-python gbk_to_gff.py in.gbk > out.gff 
-
-Requirements:
-    BioPython:- http://biopython.org/
-    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
-
-Copyright (C) 
-    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
-    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
-"""
-
-import os
-import re
-import sys
-import collections
-from Bio import SeqIO
-import helper 
-
-def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk):
-    """
-    Write the feature information
-    """
-
-    for gname, ginfo in genes.items():
-        line = [str(chr_id), 
-                'gbk_to_gff',
-                ginfo[3],
-                str(ginfo[0]),
-                str(ginfo[1]),
-                '.',
-                ginfo[2],
-                '.',
-                'ID=%s;Name=%s' % (str(gname), str(gname))]
-        print '\t'.join(line) 
-        ## construct the transcript line is not defined in the original file 
-        t_line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', ginfo[2], '.'] 
-
-        if not transcripts:
-            t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname)))
-
-            if exons: ## get the entire transcript region  from the defined feature
-                t_line[3] = str(exons[gname][0][0])
-                t_line[4] = str(exons[gname][0][-1])
-            elif cds:
-                t_line[3] = str(cds[gname][0][0])
-                t_line[4] = str(cds[gname][0][-1])
-            print '\t'.join(t_line) 
-
-            if exons:
-                exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon')
-
-            if cds:
-                exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS')
-                if not exons:
-                    exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon')
-
-        else: ## transcript is defined 
-            for idx in transcripts[gname]: 
-                t_line[2] = idx[3]
-                t_line[3] = str(idx[0])
-                t_line[4] = str(idx[1])
-                t_line.append('ID='+str(idx[2])+';Parent='+str(gname))
-                print '\t'.join(t_line) 
-                
-                ## feature line print call 
-                if exons:
-                    exon_line_print(t_line, exons[gname], str(idx[2]), 'exon')
-                if cds:
-                    exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS')
-                    if not exons:
-                        exon_line_print(t_line, cds[gname], str(idx[2]), 'exon')
-
-    if len(genes) == 0: ## feature entry with fragment information 
-        
-        line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', orient, '.'] 
-        fStart = fStop = None 
-
-        for eid, ex in cds.items(): 
-            fStart = ex[0][0] 
-            fStop = ex[0][-1]
-
-        for eid, ex in exons.items(): 
-            fStart = ex[0][0] 
-            fStop = ex[0][-1]
-
-        if fStart or fStart:
-
-            line[2] = 'gene'
-            line[3] = str(fStart)
-            line[4] = str(fStop)
-            line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk))
-            print "\t".join(line)
-
-            if not cds:
-                line[2] = 'transcript'
-            else:
-                line[2] = 'mRNA'
-
-            line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk)
-            print "\t".join(line)
-           
-            if exons:
-                exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
-                
-            if cds:
-                exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS')
-                if not exons:
-                    exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
-                
-            unk +=1 
-
-    return unk
-
-def exon_line_print(temp_line, trx_exons, parent, ftype):
-    """
-    Print the EXON feature line 
-    """
-
-    for ex in trx_exons:
-        temp_line[2] = ftype
-        temp_line[3] = str(ex[0])
-        temp_line[4] = str(ex[1])
-        temp_line[8] = 'Parent=%s' % parent
-        print '\t'.join(temp_line)
-
-def gbk_parse(fname):
-    """
-    Extract genome annotation recods from genbank format 
-
-    @args fname: gbk file name 
-    @type fname: str
-    """
-
-    fhand = helper.open_file(gbkfname)
-    unk = 1 
-
-    for record in SeqIO.parse(fhand, "genbank"):
-
-        gene_tags = dict()
-        tx_tags = collections.defaultdict(list) 
-        exon = collections.defaultdict(list) 
-        cds = collections.defaultdict(list) 
-        mol_type, chr_id = None, None 
-
-        for rec in record.features:
-
-            if rec.type == 'source':
-                try:
-                    mol_type = rec.qualifiers['mol_type'][0]
-                except:
-                    mol_type = '.'
-                    pass 
-                try:
-                    chr_id = rec.qualifiers['chromosome'][0]
-                except:
-                    chr_id = record.name 
-                continue 
-
-            strand='-'
-            strand='+' if rec.strand>0 else strand
-            
-            fid = None 
-            try:
-                fid = rec.qualifiers['gene'][0]
-            except:
-                pass
-
-            transcript_id = None
-            try:
-                transcript_id = rec.qualifiers['transcript_id'][0]
-            except:
-                pass 
-
-            if re.search(r'gene', rec.type):
-                gene_tags[fid] = (rec.location._start.position+1, 
-                                    rec.location._end.position, 
-                                    strand,
-                                    rec.type
-                                    )
-            elif rec.type == 'exon':
-                exon[fid].append((rec.location._start.position+1, 
-                                    rec.location._end.position))
-            elif rec.type=='CDS':
-                cds[fid].append((rec.location._start.position+1, 
-                                    rec.location._end.position))
-            else: 
-                # get all transcripts 
-                if transcript_id: 
-                    tx_tags[fid].append((rec.location._start.position+1,
-                                    rec.location._end.position, 
-                                    transcript_id,
-                                    rec.type))
-        # record extracted, generate feature table
-        unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
-        
-    fhand.close()
-
-
-if __name__=='__main__': 
-
-    try:
-        gbkfname = sys.argv[1]
-    except:
-        print __doc__
-        sys.exit(-1)
-
-    ## extract gbk records  
-    gbk_parse(gbkfname) 
b
diff -r d4f9b7beb52f -r 7d67331368f3 gbk_to_gff.xml
--- a/gbk_to_gff.xml Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,91 +0,0 @@
-<tool id="fml_gbk2gff" name="GBK-to-GFF" version="2.0.0">
-  <description>converter</description>
-   <command interpreter="python">gbk_to_gff.py $inf_gbk &gt; $gff_format
-   </command>
-   <inputs>
- <param format="gb,gbk,genbank,txt" name="inf_gbk" type="data" label="Convert this query" help="GenBank flat file format consists of an annotation section and a sequence section."/>
-   </inputs>
- <outputs>
- <data format="gff3" name="gff_format" label="${tool.name} on ${on_string}: Converted"/>
-   </outputs>
- <tests>
-        <test>
-                <param name="inf_gbk" value="s_cerevisiae_SCU49845.gbk" />
-                <output name="gff_format" file="s_cerevisiae_SCU49845.gff3" />
-        </test>
- </tests>
-   <help>
-    
-**What it does**
-
-This tool converts data from a GenBank_ flat file format to GFF (scroll down for format description).
-
-.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ 
-
-------
-
-**Example**
-
-- The following data in GenBank format::
-
-    LOCUS       NM_001202705            2406 bp    mRNA    linear   PLN 28-MAY-2011
-    DEFINITION  Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC)
-                mRNA, complete cds.
-    ACCESSION   NM_001202705
-    VERSION     NM_001202705.1  GI:334184566.........
-    FEATURES             Location/Qualifiers
-         source          1..2406
-                         /organism="Arabidopsis thaliana"
-                         /mol_type="mRNA"
-                         /db_xref="taxon:3702"........
-         gene            1..2406
-                         /gene="THIC"
-                         /locus_tag="AT2G29630"
-                         /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........
-    ORIGIN
-        1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg
-        61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat
-        121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc
-        181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca
-    //
-
-
-- Will be converted to GFF3::
-
-    ##gff-version 3
-    NM_001202705    gbk_to_gff chromosome      1       2406    .       +       1       ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705
-    NM_001202705    gbk_to_gff gene    1       2406    .       +       1       ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC
-    NM_001202705    gbk_to_gff mRNA    192     2126    .       +       1       ID=AT2G29630.t01;Parent=AT2G29630
-    NM_001202705    gbk_to_gff CDS     192     2126    .       +       1       ID=AT2G29630.p01;Parent=AT2G29630.t01
-    NM_001202705    gbk_to_gff exon    192     2126    .       +       1       Parent=AT2G29630.t01
-
-------
-
-**About formats** 
-
-**GenBank format** An example of a GenBank record may be viewed here_
-
-.. _here: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html 
-
-**GFF3** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
-
-    1. seqid - Must be a chromosome or scaffold or contig.
-    2. source - The program that generated this feature.
-    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
-    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
-    5. stop - The ending position of the feature (inclusive).
-    6. score - A score between 0 and 1000. If there is no score value, enter ".".
-    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
-    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
-    9. attributes - All lines with the same group are linked together into a single item.
-
---------
-
-**Copyright**
-
-2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
-
-Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
-
- </help>
-</tool>
b
diff -r d4f9b7beb52f -r 7d67331368f3 gff_fmap.py
--- a/gff_fmap.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,203 +0,0 @@\n-#!/usr/bin/env python\n-"""\n-GFF feature mapping program, to find the relation between different features described in a given GFF file. \n-\n-Usage: \n-python gff_fmap.py in.gff > out.txt \n-\n-Courtesy: Brad Chapman \n-    Few functions are inherited from bcbio-GFFParser program. \n-"""\n-\n-import re\n-import sys \n-import urllib\n-import collections\n-from helper import open_file\n-\n-def _gff_line_map(line):\n-    """Parses a line of GFF into a dictionary.\n-    Given an input line from a GFF file, this:\n-        - breaks it into component elements\n-        - determines the type of attribute (flat, parent, child or annotation)\n-        - generates a dictionary of GFF info \n-    """\n-    gff3_kw_pat = re.compile("\\w+=")\n-    def _split_keyvals(keyval_str):\n-        """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.\n-        GFF3 has key value pairs like:\n-          count=9;gene=amx-2;sequence=SAGE:aacggagccg\n-        GFF2 and GTF have:           \n-          Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"\n-          name "fgenesh1_pg.C_chr_1000003"; transcriptId 869\n-        """\n-        quals = collections.defaultdict(list)\n-        if keyval_str is None:\n-            return quals\n-        # ensembl GTF has a stray semi-colon at the end\n-        if keyval_str[-1] == \';\':\n-            keyval_str = keyval_str[:-1]\n-        # GFF2/GTF has a semi-colon with at least one space after it.\n-        # It can have spaces on both sides; wormbase does this.\n-        # GFF3 works with no spaces.\n-        # Split at the first one we can recognize as working\n-        parts = keyval_str.split(" ; ")\n-        if len(parts) == 1:\n-            parts = keyval_str.split("; ")\n-            if len(parts) == 1:\n-                parts = keyval_str.split(";")\n-        # check if we have GFF3 style key-vals (with =)\n-        is_gff2 = True\n-        if gff3_kw_pat.match(parts[0]):\n-            is_gff2 = 
False\n-            key_vals = [p.split(\'=\') for p in parts]\n-        # otherwise, we are separated by a space with a key as the first item\n-        else:\n-            pieces = []\n-            for p in parts:\n-                # fix misplaced semi-colons in keys in some GFF2 files\n-                if p and p[0] == \';\':\n-                    p = p[1:]\n-                pieces.append(p.strip().split(" "))\n-            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]\n-        for key, val in key_vals:\n-            # remove quotes in GFF2 files\n-            if (len(val) > 0 and val[0] == \'"\' and val[-1] == \'"\'):\n-                val = val[1:-1] \n-            if val:\n-                quals[key].extend(val.split(\',\'))\n-            # if we don\'t have a value, make this a key=True/False style\n-            # attribute\n-            else:\n-                quals[key].append(\'true\')\n-        for key, vals in quals.items():\n-            quals[key] = [urllib.unquote(v) for v in vals]\n-        return quals, is_gff2\n-\n-    def _nest_gff2_features(gff_parts):\n-        """Provide nesting of GFF2 transcript parts with transcript IDs.\n-\n-        exons and coding sequences are mapped to a parent with a transcript_id\n-        in GFF2. 
This is implemented differently at different genome centers\n-        and this function attempts to resolve that and map things to the GFF3\n-        way of doing them.\n-        """\n-        # map protein or transcript ids to a parent\n-        for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:\n-            try:\n-                gff_parts["quals"]["Parent"] = \\\n-                        gff_parts["quals"][transcript_id]\n-                break\n-            except KeyError:\n-                pass\n-        # case for WormBase GFF -- everything labelled as Transcript or CDS\n-        for flat_name in ["Transcript", "CDS"]:\n-            if gff_parts["quals"].has_key(flat_name):\n-                # parent types\n-                if gff_parts["type"] in [flat_name]:\n-                    if not gff_parts["id"]:\n-   '..b'            break\n-\n-        return gff_parts\n-\n-    line = line.strip()\n-    if line == \'\':return [(\'directive\', line)] # sometimes the blank lines will be there \n-    if line[0] == \'>\':return [(\'directive\', \'\')] # sometimes it will be a FATSA header\n-    if line[0] == "#":\n-        return [(\'directive\', line[2:])]\n-    elif line:\n-        parts = line.split(\'\\t\')\n-        if len(parts) == 1 and re.search(r\'\\w+\', parts[0]):return [(\'directive\', \'\')] ## GFF files with FASTA sequence together \n-        assert len(parts) == 9, line\n-        gff_parts = [(None if p == \'.\' else p) for p in parts]\n-        gff_info = dict()\n-            \n-        # collect all of the base qualifiers for this item\n-        quals, is_gff2 = _split_keyvals(gff_parts[8])\n-\n-        gff_info["is_gff2"] = is_gff2\n-\n-        if gff_parts[1]:quals["source"].append(gff_parts[1])\n-        gff_info[\'quals\'] = dict(quals)\n-\n-        # if we are describing a location, then we are a feature\n-        if gff_parts[3] and gff_parts[4]:\n-            gff_info[\'type\'] = gff_parts[2]\n-            gff_info[\'id\'] = 
quals.get(\'ID\', [\'\'])[0]\n-            \n-            if is_gff2:gff_info = _nest_gff2_features(gff_info)\n-            # features that have parents need to link so we can pick up\n-            # the relationship\n-            if gff_info[\'quals\'].has_key(\'Parent\'):\n-                final_key = \'child\'\n-            elif gff_info[\'id\']:\n-                final_key = \'parent\'\n-            # Handle flat features\n-            else:\n-                final_key = \'feature\'\n-        # otherwise, associate these annotations with the full record\n-        else:\n-            final_key = \'annotation\'\n-        return [(final_key, gff_info)]\n-    \n-def parent_child_id_map(gff_handle):\n-    """\n-    Provide a mapping of parent to child relationships in the file.\n-    Gives a dictionary of parent child relationships:\n-\n-    keys -- tuple of (source, type) for each parent\n-    values -- tuple of (source, type) as children of that parent\n-    """\n-    # collect all of the parent and child types mapped to IDs\n-    parent_sts = dict()\n-    child_sts = collections.defaultdict(list)\n-    for line in gff_handle:\n-        line_type, line_info = _gff_line_map(line)[0]\n-        if (line_type == \'parent\' or (line_type == \'child\' and line_info[\'id\'])):\n-            parent_sts[line_info[\'id\']] = (line_info[\'quals\'][\'source\'][0], line_info[\'type\'])\n-        if line_type == \'child\':\n-            for parent_id in line_info[\'quals\'][\'Parent\']:\n-                child_sts[parent_id].append((line_info[\'quals\'][\'source\'][0], line_info[\'type\']))\n-    gff_handle.close()\n-    # generate a dictionary of the unique final type relationships\n-    pc_map = collections.defaultdict(list)\n-    for parent_id, parent_type in parent_sts.items():\n-        for child_type in child_sts[parent_id]:\n-            pc_map[parent_type].append(child_type)\n-    pc_final_map = dict()\n-    for ptype, ctypes in pc_map.items():\n-        unique_ctypes = 
list(set(ctypes))\n-        unique_ctypes.sort()\n-        pc_final_map[ptype] = unique_ctypes\n-    # some cases the GFF file represents a single feature type \n-    if not pc_final_map:\n-        for fid, stypes in parent_sts.items():\n-            pc_final_map[stypes] = dict()\n-    # generate a report on feature id mapping in the file \n-    print \'+---------------------+---------------------------------+\'\n-    print \'| Parent feature type | Associated child feature type(s)|\'\n-    print \'+---------------------+---------------------------------+\'\n-    for key, value in pc_final_map.items():\n-        print key[0], key[1]\n-        for child_to in value:\n-            print \'\\t\\t\\t|-\',child_to[0], child_to[1]\n-        print \'+---------------------+---------------------------------+\'\n-\n-\n-if __name__==\'__main__\':\n-\n-    try:\n-        gff_file = sys.argv[1]\n-    except:\n-        print __doc__\n-        sys.exit(-1)\n-    \n-    gff_handle = open_file(gff_file)\n-    parent_child_id_map(gff_handle)\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 gff_fmap.xml
--- a/gff_fmap.xml Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,83 +0,0 @@
-<tool id="fml_gfffmap" name="GFF-map" version="2.0.0">
-  <description>features</description>
-  <command interpreter="python"> 
- gff_fmap.py $gff_input &gt; $idmapping
-    </command>
-  <inputs>
-    <param format="gff3,gff" name="gff_input" type="data" label="Query file" help="Provide genome annotation file in GFF."/>
-  </inputs>
-  <outputs>
-    <data format="txt" name="idmapping" label="${tool.name} on ${on_string}: parent child id map"/>
-  </outputs>
- <tests>
-     <test>
-             <param name="gff_input" value="Feature_ID_mapping_W.gff3" />
-             <output name="idmapping" file="Feature_ID_mapping_W.txt" />
-            </test>
-     <test>
-             <param name="gff_input" value="Aly_JGI.gff3" />
-             <output name="idmapping" file="Feature_ID_mapping_R.txt" />
-            </test>
-        </tests>
-  <help>
-
-**What it does** 
-
-GFF-map reports the relationships between features (gene, mRNA, UTRs, exon, CDS, etc.) based on their identifier mapping in a given GFF file.
-
---------
-
-**Example**
-
-- The feature ID mapping of the following GFF3 data::
-
- ##gff-version 3
- 17      protein_coding  gene    7255208 7258258 .       +       .       ID=ENSG00000213859;Name=KCTD11
- 17      protein_coding  mRNA    7255208 7258258 .       +       .       ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
- 17      protein_coding  protein     7256262 7256960 .       +       0       ID=ENSP00000328352;Name=ENSP00000328352
- 17      protein_coding  five_prime_UTR  7255208 7256261 .       +       .       Parent=ENST00000333751
- 17      protein_coding  CDS     7256262 7256960 .       +       0       Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
- 17      protein_coding  three_prime_UTR 7256961 7258258 .       +       .       Parent=ENST00000333751
- 17      protein_coding  exon    7255208 7258258 .       +       .       Parent=ENST00000333751
-
-- Will be displayed as::
-    
-    +-----------------------+---------------------------------+
-    | Parent feature type   | Associated child feature type(s)|
-    +-----------------------+---------------------------------+
-    | protein_coding gene   | protein_coding mRNA             |
-    +-----------------------+---------------------------------+
-    | protein_coding protein| protein_coding CDS              |
-    +-----------------------+---------------------------------+
-    | protein_coding mRNA   | protein_coding CDS              |
-    |                       | protein_coding exon             |
-    |                       | protein_coding five_prime_UTR   |
-    |                       | protein_coding three_prime_UTR  |
-    +-----------------------+---------------------------------+
-
---------
-
-**About formats**
-
-**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
-
-    1. seqid - Must be a chromosome or scaffold.
-    2. source - The program that generated this feature.
-    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
-    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
-    5. stop - The ending position of the feature (inclusive).
-    6. score - A score between 0 and 1000. If there is no score value, enter ".".
-    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
-    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
-    9. attributes - All lines with the same group are linked together into a single item.
-
---------
-
-**Copyright**
-
-2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
-
-Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
-
-</help>
-</tool>
b
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_bed.py
--- a/gff_to_bed.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,96 +0,0 @@
-#!/usr/bin/env python
-"""
-Convert genome annotation data in GFF/GTF to a 12 column BED format. 
-BED format typically represents the transcript models. 
-
-Usage: python gff_to_bed.py in.gff > out.bed  
-
-Requirement:
-    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py    
-
-Copyright (C) 
-    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
-    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
-"""
-
-import re
-import sys
-import GFFParser
-
-def writeBED(tinfo):
-    """
-    writing result files in bed format 
-
-    @args tinfo: list of genes 
-    @args tinfo: numpy object  
-    """
-
-    for ent1 in tinfo:
-        child_flag = False  
-
-        for idx, tid in enumerate(ent1['transcripts']):
-            child_flag = True 
-            exon_cnt = len(ent1['exons'][idx])
-            exon_len = ''
-            exon_cod = '' 
-            rel_start = None 
-            rel_stop = None 
-            for idz, ex_cod in enumerate(ent1['exons'][idx]):#check for exons of corresponding transcript  
-                exon_len += '%d,' % (ex_cod[1]-ex_cod[0]+1)
-                if idz == 0: #calculate the relative start position 
-                    exon_cod += '0,'
-                    rel_start = int(ex_cod[0])
-                    rel_stop = ex_cod[1]
-                else:
-                    exon_cod += '%d,' % (ex_cod[0]-rel_start)
-                    rel_stop = int(ex_cod[1])
-            
-            if exon_len:
-                score = '0' 
-                score = ent1['score'][0] if ent1['score'] else score
-                out_print = [ent1['chr'],
-                            str(rel_start),
-                            str(rel_stop),
-                            tid[0],
-                            score, 
-                            ent1['strand'], 
-                            str(rel_start),
-                            str(rel_stop),
-                            '0',
-                            str(exon_cnt),
-                            exon_len,
-                            exon_cod]
-                print '\t'.join(out_print)  
-        
-        if not child_flag: # file just contains only a single parent type i.e, gff3 defines only one feature type 
-            score = '0' 
-            score = ent1['score'][0] if ent1['score'] else score
-
-            out_print = [ent1['chr'], 
-                        '%d' % int(ent1['start']), 
-                        '%d' % int(ent1['stop']),
-                        ent1['name'], 
-                        score, 
-                        ent1['strand'],
-                        '%d' % int(ent1['start']), 
-                        '%d' % int(ent1['stop']),
-                        '0',
-                        '1',
-                        '%d,' % (int(ent1['stop'])-int(ent1['start'])+1), 
-                        '0,']
-
-            print '\t'.join(out_print)  
-
-    
-def __main__():
-    try:
-        query_file = sys.argv[1]
-    except:
-        print __doc__
-        sys.exit(-1)
-
-    Transcriptdb = GFFParser.Parse(query_file)  
-    writeBED(Transcriptdb)
-
-if __name__ == "__main__": 
-    __main__() 
b
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_bed.xml
--- a/gff_to_bed.xml Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,90 +0,0 @@
-<tool id="fml_gff2bed" name="GFF-to-BED" version="2.0.0">
- <description>converter</description> 
- <command interpreter="python">gff_to_bed.py $inf_gff &gt; $bed_format 
- </command> 
- <inputs>
-   <param format="gtf,gff,gff3" name="inf_gff" type="data" label="Convert this query" help="Provide genome annotation file in GFF, GTF, GFF3."/>
-    </inputs>
-   <outputs>
-   <data format="bed" name="bed_format" label="${tool.name} on ${on_string}: Converted" /> 
-   </outputs>
- <tests>
-        <test>
-                <param name="inf_gff" value="Aly_JGI.gff3" />
-                <output name="bed_format" file="Aly_JGI.bed" />
-        </test>
-        <test>
-                <param name="inf_gff" value="MB7_3R.gff3" />
-                <output name="bed_format" file="MB7_3R.bed" />
-        </test>
-        </tests>
-   <help>
-
-**What it does**
-
-This tool converts gene transcript annotation from GTF, GFF or GFF3 to the UCSC 12-column BED format.
-
---------
-
-**Example**
-
-- The following data in GFF3::
-
- ##gff-version 3
- chr1    protein_coding  gene    11874   14409   0       +       .       ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3
- chr1    protein_coding  transcript      11874   14409   0       +       .       ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3
- chr1    protein_coding  exon    11874   12227   0       +       .       Parent=uc001aaa.3
- chr1    protein_coding  exon    12613   12721   0       +       .       Parent=uc001aaa.3
- chr1    protein_coding  exon    13221   14409   0       +       .       Parent=uc001aaa.3
-
-- Will be converted to UCSC BED format::
-
- chr1    11874   14409   uc001aaa.3      0       +       11874   14409   0       3       354,109,1189,   0,739,1347,
-
---------
-
-**About formats**
-
-**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
-
-
-    1. seqid - Must be a chromosome or scaffold or contig.
-    2. source - The program that generated this feature.
-    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
-    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
-    5. stop - The ending position of the feature (inclusive).
-    6. score - A score between 0 and 1000. If there is no score value, enter ".".
-    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
-    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
-    9. attributes - All lines with the same group are linked together into a single item.
-
-**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
-
-The first three BED fields (required) are::
-
-    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
-    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
-    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
-
-The additional BED fields (optional) are::
-
-    4. name - The name of the BED line.
-    5. score - A score between 0 and 1000.
-    6. strand - Defines the strand - either '+' or '-'.
-    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
-    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
-    9. reserved - This should always be set to zero.
-   10. blockCount - The number of blocks (exons) in the BED line.
-   11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
-   12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
-
---------
-
-**Copyright**
-
-2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
-
-Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
-
- </help>
-</tool>
b
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_gbk.py
--- a/gff_to_gbk.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,54 +0,0 @@
-#!/usr/bin/env python 
-"""
-Convert data from GFF and associated genome sequence in fasta file into GenBank.
-
-Usage: 
-python gff_to_gbk.py in.gff in.fasta out.gbk 
-
-Requirements:
-    BioPython:- http://biopython.org/
-    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
-
-Copyright (C) 
-    2010-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
-    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
-"""
-
-import sys
-import helper
-import gffparser_bcbio
-
-from Bio import SeqIO
-from Bio.Alphabet import generic_dna
-
-def __main__():
-    """
-    main wrapper
-    """
-
-    try:
-        gff_fname = sys.argv[1]
-        fasta_fname = sys.argv[2]
-        gb_fname = sys.argv[3]
-    except: 
-        print __doc__
-        sys.exit(-1)
-
-    fasta_fh = helper.open_file(fasta_fname) 
-
-    fasta_rec = SeqIO.to_dict(SeqIO.parse(fasta_fh, "fasta", generic_dna))
-    fasta_fh.close()
-
-    gff_rec = gffparser_bcbio.parse(gff_fname, fasta_rec)
-    
-    try:
-        gb_fh = open(gb_fname, "w")
-    except:
-        print 'file not ready for writing %s' % gb_fname
-        sys.exit(-1)
-
-    SeqIO.write(gff_rec, gb_fh, "genbank")
-    gb_fh.close()
-
-if __name__=="__main__":
-    __main__()
b
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_gbk.xml
--- a/gff_to_gbk.xml Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,98 +0,0 @@
-<tool id="fml_gff2gbk" name="GFF-to-GBK" version="2.0.0">
-    <description>converter</description>
-   <command interpreter="python">gff_to_gbk.py $inf_gff $inf_fas $gbk_format
-   </command>
-   <inputs>
- <param format="gff,gff3" name="inf_gff" type="data" label="Convert this query" help="Genome annotation in GFF file format."/>
- <param format="fa,fasta" name="inf_fas" type="data" label="Genome Sequence" help="Genome sequence in FASTA format."/>
-   </inputs>
- <outputs>
- <data format="genbank" name="gbk_format" label="${tool.name} on ${on_string}: Converted"/>
-   </outputs>
-    <tests>
-        <test>
-            <param name="inf_gff" value="s_cerevisiae_SCU49845.gff3" />
-            <param name="inf_fas" value="s_cerevisiae_SCU49845.fasta" />
-            <output name="gbk_format" file="s_cerevisiae_SCU49845.gbk" />
-        </test>
-    </tests>
-   <help>
-
-**What it does**
-
-This tool converts annotations in GFF to GenBank_ format (scroll down for format description).
-
-.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ 
-
-------
-
-**Example**
-
-- The following data in GFF::
-
-    ##gff-version 3
-    # sequence-region NM_001202705 1 2406
-    NM_001202705    GenBank chromosome      1       2406    .       +       1       ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705;Note=Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) mRNA%2C complete cds.,REVIEWED REFSEQ;
-    NM_001202705    GenBank gene    1       2406    .       +       1       ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC;locus_tag=AT2G29630
-    NM_001202705    GenBank mRNA    192     2126    .       +       1       ID=AT2G29630.t01;Parent=AT2G29630
-    NM_001202705    GenBank CDS     192     2126    .       +       1       ID=AT2G29630.p01;Parent=AT2G29630.t01;Dbxref=GI:334184567,GeneID:817513,TAIR:AT2G29630;Name=THIC;Note=thiaminC (THIC)%3B CONTAINS InterPro DOMAIN;protein_id=NP_001189634.1;
-    NM_001202705    GenBank exon    192     2126    .       +       1       Parent=AT2G29630.t01
-    ##FASTA
-    >NM_001202705
-    AAGCCTTTCGCTTTAGGCTGCATTGGGCCGTGACAATATTCAGACGATTCAGGAGGTTCG
-    TTCCTTTTTTAAAGGACCCTAATCACTCTGAGTACCACTGACTCACTCAGTGTGCGCGAT
-
-- Will be converted to GenBank format::
-
-    LOCUS       NM_001202705            2406 bp    mRNA    linear   PLN 28-MAY-2011
-    DEFINITION  Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC)
-                mRNA, complete cds.
-    ACCESSION   NM_001202705
-    VERSION     NM_001202705.1  GI:334184566.........
-    FEATURES             Location/Qualifiers
-         source          1..2406
-                         /organism="Arabidopsis thaliana"
-                         /mol_type="mRNA"
-                         /db_xref="taxon:3702"........
-         gene            1..2406
-                         /gene="THIC"
-                         /locus_tag="AT2G29630"
-                         /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........
-    ORIGIN
-        1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg
-        61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat
-        121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc
-        181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca
-    //
-
-------
-
-**About formats** 
-
-**GFF** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields::
-
-    1. seqid - Must be a chromosome or scaffold or contig.
-    2. source - The program that generated this feature.
-    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
-    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
-    5. stop - The ending position of the feature (inclusive).
-    6. score - A score between 0 and 1000. If there is no score value, enter ".".
-    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
-    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
-    9. attributes - All lines with the same group are linked together into a single item.
-
-**GenBank format** Consists of an annotation section and a sequence section. Sample record_  
-
-.. _record: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html 
-
-
---------
-
-**Copyright**
-
-2010-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
-
-Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
-
- </help>
-</tool>
b
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_gtf.py
--- a/gff_to_gtf.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,76 +0,0 @@
-#!/usr/bin/env python 
-"""
-Program to convert data from GFF to GTF 
-
-Usage: python gff_to_gtf.py in.gff > out.gtf 
-
-Requirement:
-    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py    
-
-Copyright (C) 
-    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
-    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
-"""
-
-import re
-import sys
-import GFFParser
-
-def printGTF(tinfo):
-    """
-    writing result file in GTF format
-
-    @args tinfo: parsed object from gff file
-    @type tinfo: numpy array 
-    """
-
-    for ent1 in tinfo:
-        for idx, tid in enumerate(ent1['transcripts']):
-            
-            exons = ent1['exons'][idx]
-            cds_exons = ent1['cds_exons'][idx]
-
-            stop_codon = start_codon = ()
-
-            if ent1['strand'] == '+':
-                if cds_exons.any():
-                    start_codon = (cds_exons[0][0], cds_exons[0][0]+2) 
-                    stop_codon = (cds_exons[-1][1]-2, cds_exons[-1][1]) 
-            elif ent1['strand'] == '-':
-                if cds_exons.any():
-                    start_codon = (cds_exons[-1][1]-2, cds_exons[-1][1])
-                    stop_codon = (cds_exons[0][0], cds_exons[0][0]+2)
-            else:
-                print 'STRAND information missing - %s, skip the transcript - %s' % (ent1['strand'], tid[0]) 
-                pass 
-                
-            last_cds_cod = 0 
-            for idz, ex_cod in enumerate(exons):
-
-                print '%s\t%s\texon\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], ex_cod[0], ex_cod[1], ent1['strand'], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
-
-                if cds_exons.any():
-                    try:
-                        print '%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], cds_exons[idz][0], cds_exons[idz][1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
-                        last_cds_cod = idz 
-                    except:
-                        pass 
-
-                    if idz == 0:
-                        print '%s\t%s\tstart_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], start_codon[0], start_codon[1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
-
-            if stop_codon:
-                print '%s\t%s\tstop_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], stop_codon[0], stop_codon[1], ent1['strand'], cds_exons[last_cds_cod][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
-
-    
-if __name__ == "__main__": 
-
-    try:
-        gff_fname = sys.argv[1]
-    except:
-        print __doc__
-        sys.exit(-1)
-
-    Transcriptdb = GFFParser.Parse(gff_fname)  
-
-    printGTF(Transcriptdb) 
b
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_gtf.xml
--- a/gff_to_gtf.xml Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,88 +0,0 @@
-<tool id="fml_gff2gtf" name="GFF-to-GTF" version="2.0.0">
- <description>converter</description> 
- <command interpreter="python">gff_to_gtf.py $inf_gff3 &gt; $gtf_format
- </command> 
- <inputs>
-   <param format="gff3,gff" name="inf_gff3" type="data" label="Convert this query" help="Provide genome annotation file in GFF or GFF3."/>
-    </inputs>
-   <outputs>
-   <data format="gtf" name="gtf_format" label="${tool.name} on ${on_string}: Converted" /> 
-   </outputs>
- <tests>
-        <test>
-                <param name="inf_gff3" value="AceView_ncbi_37.gff3" />
-                <output name="gtf_format" file="AceView_gff3_to_gtf.gtf" />
-        </test>
-        <test>
-                <param name="inf_gff3" value="ENSEMBL_mm9.gff3" />
-                <output name="gtf_format" file="ENSEMBL_mm9_gff3_to_gtf.gtf" />
-        </test>
-    </tests>
-   <help>
-
-**What it does**
-
-This tool converts data from GFF3 to GTF file format (scroll down for format description).
-
---------
-
-**Example**
-
-- The following data in GFF3 format::
-
- ##gff-version 3
- 17      protein_coding  gene    7255208 7258258 .       +       .       ID=ENSG00000213859;Name=KCTD11
- 17      protein_coding  mRNA    7255208 7258258 .       +       .       ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
- 17      protein_coding  protein 7256262 7256960 .       +       .       ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751
- 17      protein_coding  five_prime_UTR  7255208 7256261 .       +       .       Parent=ENST00000333751
- 17      protein_coding  CDS     7256262 7256960 .       +       0       Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
- 17      protein_coding  three_prime_UTR 7256961 7258258 .       +       .       Parent=ENST00000333751
- 17      protein_coding  exon    7255208 7258258 .       +       .       Parent=ENST00000333751
-
-- Will be converted to GTF format::
-
- 17      protein_coding  exon    7255208 7258258 .       +       .        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
- 17      protein_coding  CDS     7256262 7256957 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352";
- 17      protein_coding  start_codon     7256262 7256264 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
- 17      protein_coding  stop_codon      7256958 7256960 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
-
---------
-
-**About formats**
-
-
-**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
-
-    1. seqid - Must be a chromosome or scaffold.
-    2. source - The program that generated this feature.
-    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
-    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
-    5. stop - The ending position of the feature (inclusive).
-    6. score - A score between 0 and 1000. If there is no score value, enter ".".
-    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
-    8. phase - If the feature is a coding exon, phase should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
-    9. attributes - All lines with the same group are linked together into a single item.
-
-
-**GTF format** Gene Transfer Format borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-separated fields::
-
-    1. seqname - The name of the sequence.
-    2. source - Indicates where the annotation came from.
-    3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon'
-    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
-    5. end - The ending position of the feature (inclusive).
-    6. score - The score field indicates a degree of confidence in the feature's existence and coordinates.
-    7. strand - Valid entries include '+', '-', or '.'
-    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base.
-    9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region.
-
---------
-
-**Copyright**
-
-2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
-
-Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
-
- </help>
-</tool>
b
diff -r d4f9b7beb52f -r 7d67331368f3 gffparser_bcbio.py
--- a/gffparser_bcbio.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,828 +0,0 @@\n-"""Parse GFF files into features attached to Biopython SeqRecord objects.\n-\n-This deals with GFF3 formatted files, a tab delimited format for storing\n-sequence features and annotations:\n-\n-http://www.sequenceontology.org/gff3.shtml\n-\n-It will also deal with older GFF versions (GTF/GFF2):\n-\n-http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml\n-http://mblab.wustl.edu/GTF22.html\n-\n-The implementation utilizes map/reduce parsing of GFF using Disco. Disco\n-(http://discoproject.org) is a Map-Reduce framework for Python utilizing\n-Erlang for parallelization. The code works on a single processor without\n-Disco using the same architecture.\n-"""\n-import os\n-import copy\n-import re\n-import collections\n-import urllib\n-import itertools\n-\n-# Make defaultdict compatible with versions of python older than 2.4\n-try:\n-    collections.defaultdict\n-except AttributeError:\n-    import _utils\n-    collections.defaultdict = _utils.defaultdict\n-\n-from Bio.Seq import Seq, UnknownSeq\n-from Bio.SeqRecord import SeqRecord\n-from Bio.SeqFeature import SeqFeature, FeatureLocation\n-from Bio import SeqIO\n-\n-def _gff_line_map(line, params):\n-    """Map part of Map-Reduce; parses a line of GFF into a dictionary.\n-\n-    Given an input line from a GFF file, this:\n-    - decides if the file passes our filtering limits\n-    - if so:\n-        - breaks it into component elements\n-        - determines the type of attribute (flat, parent, child or annotation)\n-        - generates a dictionary of GFF info which can be serialized as JSON\n-    """\n-    gff3_kw_pat = re.compile("\\w+=")\n-    def _split_keyvals(keyval_str):\n-        """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.\n-\n-        GFF3 has key value pairs like:\n-          count=9;gene=amx-2;sequence=SAGE:aacggagccg\n-        GFF2 and GTF have:           \n-          Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"\n-          name 
"fgenesh1_pg.C_chr_1000003"; transcriptId 869\n-        """\n-        quals = collections.defaultdict(list)\n-        if keyval_str is None:\n-            return quals\n-        # ensembl GTF has a stray semi-colon at the end\n-        if keyval_str[-1] == \';\':\n-            keyval_str = keyval_str[:-1]\n-        # GFF2/GTF has a semi-colon with at least one space after it.\n-        # It can have spaces on both sides; wormbase does this.\n-        # GFF3 works with no spaces.\n-        # Split at the first one we can recognize as working\n-        parts = keyval_str.split(" ; ")\n-        if len(parts) == 1:\n-            parts = keyval_str.split("; ")\n-            if len(parts) == 1:\n-                parts = keyval_str.split(";")\n-        # check if we have GFF3 style key-vals (with =)\n-        is_gff2 = True\n-        if gff3_kw_pat.match(parts[0]):\n-            is_gff2 = False\n-            key_vals = [p.split(\'=\') for p in parts]\n-        # otherwise, we are separated by a space with a key as the first item\n-        else:\n-            pieces = []\n-            for p in parts:\n-                # fix misplaced semi-colons in keys in some GFF2 files\n-                if p and p[0] == \';\':\n-                    p = p[1:]\n-                pieces.append(p.strip().split(" "))\n-            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]\n-        for item in key_vals:\n-            # standard in-spec items are key=value\n-            if len(item) == 2:\n-                key, val = item\n-            # out-of-spec files can have just key values. 
We set an empty value\n-            # which will be changed to true later to standardize.\n-            else:\n-                assert len(item) == 1, item\n-                key = item[0]\n-                val = \'\'\n-            # remove quotes in GFF2 files\n-            if (len(val) > 0 and val[0] == \'"\' and val[-1] == \'"\'):\n-                val = val[1:-1] \n-            if val:\n-                quals[key].extend([v for v in val.split(\',\') if v])\n-            # if we don\'t have a value, make this a key=True/False style\n-      '..b' the\n-    information you need. This class provides high level summary details to\n-    help in learning.\n-    """\n-    def __init__(self):\n-        self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2],\n-                gff_source = [1], gff_type = [2])\n-    \n-    def _get_local_params(self, limit_info=None):\n-        class _LocalParams:\n-            def __init__(self):\n-                self.jsonify = False\n-        params = _LocalParams()\n-        params.limit_info = limit_info\n-        params.filter_info = self._filter_info\n-        return params\n-    \n-    @_file_or_handle\n-    def available_limits(self, gff_handle):\n-        """Return dictionary information on possible limits for this file.\n-\n-        This returns a nested dictionary with the following structure:\n-        \n-        keys -- names of items to filter by\n-        values -- dictionary with:\n-            keys -- filter choice\n-            value -- counts of that filter in this file\n-\n-        Not a parallelized map-reduce implementation.\n-        """\n-        cur_limits = dict()\n-        for filter_key in self._filter_info.keys():\n-            cur_limits[filter_key] = collections.defaultdict(int)\n-        for line in gff_handle:\n-            # when we hit FASTA sequences, we are done with annotations\n-            if line.startswith("##FASTA"):\n-                break\n-            # ignore empty and comment 
lines\n-            if line.strip() and line.strip()[0] != "#":\n-                parts = [p.strip() for p in line.split(\'\\t\')]\n-                assert len(parts) == 9, line\n-                for filter_key, cur_indexes in self._filter_info.items():\n-                    cur_id = tuple([parts[i] for i in cur_indexes])\n-                    cur_limits[filter_key][cur_id] += 1\n-        # get rid of the default dicts\n-        final_dict = dict()\n-        for key, value_dict in cur_limits.items():\n-            if len(key) == 1:\n-                key = key[0]\n-            final_dict[key] = dict(value_dict)\n-        gff_handle.close()\n-        return final_dict\n-\n-    @_file_or_handle\n-    def parent_child_map(self, gff_handle):\n-        """Provide a mapping of parent to child relationships in the file.\n-\n-        Returns a dictionary of parent child relationships:\n-\n-        keys -- tuple of (source, type) for each parent\n-        values -- tuple of (source, type) as children of that parent\n-        \n-        Not a parallelized map-reduce implementation.\n-        """\n-        # collect all of the parent and child types mapped to IDs\n-        parent_sts = dict()\n-        child_sts = collections.defaultdict(list)\n-        for line in gff_handle:\n-            # when we hit FASTA sequences, we are done with annotations\n-            if line.startswith("##FASTA"):\n-                break\n-            if line.strip():\n-                line_type, line_info = _gff_line_map(line,\n-                        self._get_local_params())[0]\n-                if (line_type == \'parent\' or (line_type == \'child\' and\n-                        line_info[\'id\'])):\n-                    parent_sts[line_info[\'id\']] = (\n-                            line_info[\'quals\'][\'source\'][0], line_info[\'type\'])\n-                if line_type == \'child\':\n-                    for parent_id in line_info[\'quals\'][\'Parent\']:\n-                        
child_sts[parent_id].append((\n-                            line_info[\'quals\'][\'source\'][0], line_info[\'type\']))\n-        #print parent_sts, child_sts\n-        # generate a dictionary of the unique final type relationships\n-        pc_map = collections.defaultdict(list)\n-        for parent_id, parent_type in parent_sts.items():\n-            for child_type in child_sts[parent_id]:\n-                pc_map[parent_type].append(child_type)\n-        pc_final_map = dict()\n-        for ptype, ctypes in pc_map.items():\n-            unique_ctypes = list(set(ctypes))\n-            unique_ctypes.sort()\n-            pc_final_map[ptype] = unique_ctypes\n-        return pc_final_map\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 gtf_to_gff.py
--- a/gtf_to_gff.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,85 +0,0 @@
-#!/usr/bin/env python
-"""
-Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3].
-
-Usage: python gtf_to_gff.py in.gtf > out.gff3  
-    
-Requirement:
-    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py    
-    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
-    
-Copyright (C) 
-    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
-    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
-"""
-
-import re
-import sys
-import GFFParser
-import helper
-
-def GFFWriter(gtf_content):
-    """
-    write the feature information to GFF format
-
-    @args gtf_content: Parsed object from gtf file 
-    @type gtf_content: numpy array
-    """
-
-    print '##gff-version 3'
-
-    for ent1 in gtf_content:
-
-        chr_name = ent1['chr']
-        strand = ent1['strand']
-        start = ent1['start']
-        stop = ent1['stop']
-        source = ent1['source']
-        ID = ent1['name']
-        Name = ent1['gene_info']['Name']
-
-        Name = ID if not Name else Name 
-
-        print '%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s' % (chr_name, source, start, stop, strand, ID, Name) 
-
-        for idx, tid in enumerate(ent1['transcripts']):
-            print idx 
-            print tid 
-
-            t_start = ent1['exons'][idx][0][0]
-            t_stop = ent1['exons'][idx][-1][-1]
-            t_type = ent1['transcript_type'][idx]
-
-            utr5_exons, utr3_exons = [], [] 
-            if ent1['exons'][idx].any() and ent1['cds_exons'][idx].any():
-                utr5_exons, utr3_exons = helper.buildUTR(ent1['cds_exons'][idx], ent1['exons'][idx], strand)
-
-            print '%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s' % (chr_name, source, t_type, t_start, t_stop, strand, tid[0], ID) 
-
-            for ex_cod in utr5_exons:
-                print '%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]) 
-
-            for ex_cod in ent1['cds_exons'][idx]:
-                print '%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, ex_cod[2], tid[0]) 
-
-            for ex_cod in utr3_exons:
-                print '%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]) 
-
-            for ex_cod in ent1['exons'][idx]:
-                print '%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]) 
-            
-
-def __main__():
-
-    try:
-        gtf_fname = sys.argv[1]
-    except:
-        print __doc__
-        sys.exit(-1)
-
-    gtf_file_content = GFFParser.Parse(gtf_fname)  
-
-    GFFWriter(gtf_file_content)
-
-if __name__ == "__main__": 
-    __main__()
b
diff -r d4f9b7beb52f -r 7d67331368f3 gtf_to_gff.xml
--- a/gtf_to_gff.xml Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,94 +0,0 @@
-<tool id="fml_gtf2gff" name="GTF-to-GFF" version="2.0.0">
- <description>converter</description> 
- <command interpreter="python">gtf_to_gff.py $inf_gtf &gt; $gff3_format 
- </command> 
- <inputs>
-   <param format="gtf" name="inf_gtf" type="data" label="Convert this query" help="Provide genome annotation file in GTF."/>
-        </inputs>
-   <outputs>
-   <data format="gff3" name="gff3_format" label="${tool.name} on ${on_string}: Converted" /> 
-   </outputs>
-     <tests>
-         <test>
-                <param name="inf_gtf" value="UCSC_transcripts.gtf" />
-                <output name="gff3_format" file="UCSC_transcripts.gff3" />
-         </test>
-         <test>
-                <param name="inf_gtf" value="JGI_genes.gtf" />
-                <output name="gff3_format" file="JGI_genes.gff3" />
-         </test>
-         <test>
-                <param name="inf_gtf" value="ENSEMBL_mm9.gtf" />
-                <output name="gff3_format" file="ENSEMBL_mm9.gff3" />
-         </test>
-         <test>
-                <param name="inf_gtf" value="AceView_ncbi_37.gtf" />
-                <output name="gff3_format" file="AceView_ncbi_37.gff3" />
-         </test>
-        </tests>
-   <help>
-
-**What it does**
-
-This tool converts data from GTF to a valid GFF3 file (scroll down for format description).
-
---------
-
-**Example**
-
-- The following data in GTF format::
-
- 17      protein_coding  exon    7255208 7258258 .       +       .        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
- 17      protein_coding  CDS     7256262 7256957 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352";
- 17      protein_coding  start_codon     7256262 7256264 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
- 17      protein_coding  stop_codon      7256958 7256960 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
-
-- Will be converted to GFF3 format::
-
- ##gff-version 3
- 17      protein_coding  gene    7255208 7258258 .       +       .       ID=ENSG00000213859;Name=KCTD11
- 17      protein_coding  mRNA    7255208 7258258 .       +       .       ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
- 17      protein_coding  protein 7256262 7256960 .       +       .       ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751
- 17      protein_coding  five_prime_UTR  7255208 7256261 .       +       .       Parent=ENST00000333751
- 17      protein_coding  CDS     7256262 7256960 .       +       0       Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
- 17      protein_coding  three_prime_UTR 7256961 7258258 .       +       .       Parent=ENST00000333751
- 17      protein_coding  exon    7255208 7258258 .       +       .       Parent=ENST00000333751
-
---------
-
-**About formats**
-
-**GTF format** Gene Transfer Format borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-separated fields::
-
-    1. seqname - The name of the sequence.
-    2. source - Indicates where the annotation came from.
-    3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon'
-    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
-    5. end - The ending position of the feature (inclusive).
-    6. score - The score field indicates a degree of confidence in the feature's existence and coordinates.
-    7. strand - Valid entries include '+', '-', or '.'
-    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base.
-    9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region.
-
-**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
-
-    1. seqid - Must be a chromosome or scaffold.
-    2. source - The program that generated this feature.
-    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
-    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
-    5. stop - The ending position of the feature (inclusive).
-    6. score - A score between 0 and 1000. If there is no score value, enter ".".
-    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
-    8. phase - If the feature is a coding exon, phase should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
-    9. attributes - All lines with the same group are linked together into a single item.
-
---------
-
-**Copyright**
-
-2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
-
-Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
-
- </help>
-</tool>
b
diff -r d4f9b7beb52f -r 7d67331368f3 helper.py
--- a/helper.py Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,332 +0,0 @@\n-#!/usr/bin/env python\n-"""\n-Common utility functions\n-"""\n-\n-import os \n-import re\n-import sys \n-import gzip \n-import bz2\n-import numpy \n-\n-def init_gene():\n-    """\n-    Initializing the gene structure \n-    """\n-\n-    gene_det = [(\'id\', \'f8\'), \n-            (\'anno_id\', numpy.dtype), \n-            (\'confgenes_id\', numpy.dtype),\n-            (\'name\', \'S25\'),\n-            (\'source\', \'S25\'),\n-            (\'gene_info\', numpy.dtype),\n-            (\'alias\', \'S15\'),\n-            (\'name2\', numpy.dtype),\n-            (\'strand\', \'S2\'), \n-            (\'score\', \'S15\'), \n-            (\'chr\', \'S15\'), \n-            (\'chr_num\', numpy.dtype),\n-            (\'paralogs\', numpy.dtype),\n-            (\'start\', \'f8\'),\n-            (\'stop\', \'f8\'), \n-            (\'transcripts\', numpy.dtype),\n-            (\'transcript_type\', numpy.dtype),\n-            (\'transcript_info\', numpy.dtype),\n-            (\'transcript_status\', numpy.dtype),\n-            (\'transcript_valid\', numpy.dtype),\n-            (\'exons\', numpy.dtype),\n-            (\'exons_confirmed\', numpy.dtype),\n-            (\'cds_exons\', numpy.dtype),\n-            (\'utr5_exons\', numpy.dtype),\n-            (\'utr3_exons\', numpy.dtype),\n-            (\'tis\', numpy.dtype),\n-            (\'tis_conf\', numpy.dtype),\n-            (\'tis_info\', numpy.dtype),\n-            (\'cdsStop\', numpy.dtype),\n-            (\'cdsStop_conf\', numpy.dtype),\n-            (\'cdsStop_info\', numpy.dtype),\n-            (\'tss\', numpy.dtype),\n-            (\'tss_info\', numpy.dtype),\n-            (\'tss_conf\', numpy.dtype),\n-            (\'cleave\', numpy.dtype),\n-            (\'cleave_info\', numpy.dtype),\n-            (\'cleave_conf\', numpy.dtype),\n-            (\'polya\', numpy.dtype),\n-            (\'polya_info\', numpy.dtype),\n-            (\'polya_conf\', numpy.dtype),\n-            (\'is_alt\', \'f8\'), \n-     
       (\'is_alt_spliced\', \'f8\'), \n-            (\'is_valid\',  numpy.dtype),\n-            (\'transcript_complete\', numpy.dtype),\n-            (\'is_complete\', numpy.dtype),\n-            (\'is_correctly_gff3_referenced\', \'S5\'),\n-            (\'splicegraph\', numpy.dtype) ]\n-\n-    return gene_det\n-\n-def open_file(fname):\n-    """\n-    Open the file (supports .gz .bz2) and returns the handler\n-\n-    @args fname: input file name for reading \n-    @type fname: str\n-    """\n-\n-    try:\n-        if os.path.splitext(fname)[1] == ".gz":\n-            FH = gzip.open(fname, \'rb\')\n-        elif os.path.splitext(fname)[1] == ".bz2":\n-            FH = bz2.BZ2File(fname, \'rb\')\n-        else:\n-            FH = open(fname, \'rU\')\n-    except Exception as error:\n-        sys.exit(error)\n-\n-    return FH\n-\n-def add_CDS_phase(strand, cds):\n-    """\n-    Calculate CDS phase and add to the CDS exons\n-\n-    @args strand: feature strand information \n-    @type strand: +/- \n-    @args cds: coding exon coordinates \n-    @type cds: numpy array [[int, int, int]]\n-    """\n-\n-    cds_region, cds_flag = [], 0 \n-    if strand == \'+\':\n-        for cdspos in cds:\n-            if cds_flag == 0:\n-                cdspos = (cdspos[0], cdspos[1], 0)\n-                diff = (cdspos[1]-(cdspos[0]-1))%3\n-            else:\n-                xy = 0\n-                if diff == 0: \n-                    cdspos = (cdspos[0], cdspos[1], 0)\n-                elif diff == 1: \n-                    cdspos = (cdspos[0], cdspos[1], 2)\n-                    xy = 2\n-                elif diff == 2: \n-                    cdspos = (cdspos[0], cdspos[1], 1)\n-                    xy = 1\n-                diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3\n-            cds_region.append(cdspos)\n-            cds_flag = 1 \n-    elif strand == \'-\':\n-        cds.reverse()\n-        for cdspos in cds: \n-            if cds_flag == 0:\n-                cdspos = (cdspos[0], 
cdspos[1], 0)\n-                diff = (cdspos[1]-(cdspos[0]-1))%3\n-            else:  \n-                xy = 0 \n-                if diff == 0: \n-                    cdspos = (cdspos[0], cdspos[1], 0)\n-                elif diff == 1:\n-                  '..b"              exon_pos.append([cds_5start, utr3_end])\n-            for cds in cds_cod:\n-                exon_pos.append(cds)\n-            for utr3 in three_p_utr:\n-                exon_pos.append(utr3)\n-        else:    \n-            if jun_exon != []:\n-                five_p_utr = five_p_utr[:-1]\n-                cds_cod = cds_cod[1:]\n-            for utr5 in five_p_utr:\n-                exon_pos.append(utr5)\n-            exon_pos.append(jun_exon) if jun_exon != [] else ''\n-            jun_exon = []\n-            utr3_start, utr3_end = 0, 0\n-            if three_p_utr != []:\n-                utr3_start = three_p_utr[0][0]\n-                utr3_end = three_p_utr[0][1]\n-            cds_3start = cds_cod[-1][0]\n-            cds_3end = cds_cod[-1][1]\n-            if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:       \n-                jun_exon = [cds_3start, utr3_end]\n-            if jun_exon != []:\n-                cds_cod = cds_cod[:-1]\n-                three_p_utr = three_p_utr[1:]\n-            for cds in cds_cod:\n-                exon_pos.append(cds)\n-            exon_pos.append(jun_exon) if jun_exon != [] else ''\n-            for utr3 in three_p_utr:\n-                exon_pos.append(utr3)\n-    elif strand_p == '-':\n-        utr3_start, utr3_end = 0, 0        \n-        if three_p_utr != []:\n-            utr3_start = three_p_utr[-1][0]\n-            utr3_end = three_p_utr[-1][1]\n-        cds_3start = cds_cod[0][0]\n-        cds_3end = cds_cod[0][1]\n-        jun_exon = []\n-        if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:\n-            jun_exon = [utr3_start, cds_3end]  \n-        if len(cds_cod) == 1:    \n-            three_prime_flag = 0\n-          
  if jun_exon != []:\n-                three_p_utr = three_p_utr[:-1]\n-                three_prime_flag = 1\n-            for utr3 in three_p_utr:\n-                exon_pos.append(utr3)\n-            jun_exon = []\n-            (utr5_start, utr5_end) = (0, 0)\n-            if five_p_utr != []:\n-                utr5_start = five_p_utr[0][0]\n-                utr5_end = five_p_utr[0][1]\n-            if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:\n-                jun_exon = [cds_3start, utr5_end]\n-            five_prime_flag = 0\n-            if jun_exon != []:\n-                cds_cod = cds_cod[:-1]\n-                five_p_utr = five_p_utr[1:]\n-                five_prime_flag = 1\n-            if three_prime_flag == 1 and five_prime_flag == 1:\n-                exon_pos.append([utr3_start, utr5_end])\n-            if three_prime_flag == 1 and five_prime_flag == 0:\n-                exon_pos.append([utr3_start, cds_3end])\n-                cds_cod = cds_cod[:-1]\n-            if three_prime_flag == 0 and five_prime_flag == 1:\n-                exon_pos.append([cds_3start, utr5_end])        \n-            for cds in cds_cod:\n-                exon_pos.append(cds)\n-            for utr5 in five_p_utr:\n-                exon_pos.append(utr5)\n-        else:\n-            if jun_exon != []:\n-                three_p_utr = three_p_utr[:-1]\n-                cds_cod = cds_cod[1:]\n-            for utr3 in three_p_utr:\n-                exon_pos.append(utr3)   \n-            if jun_exon != []:\n-                exon_pos.append(jun_exon)\n-            jun_exon = []\n-            (utr5_start, utr5_end) = (0, 0)\n-            if five_p_utr != []:\n-                utr5_start = five_p_utr[0][0]\n-                utr5_end = five_p_utr[0][1]    \n-            cds_5start = cds_cod[-1][0]\n-            cds_5end = cds_cod[-1][1]\n-            if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:\n-                jun_exon = [cds_5start, utr5_end]\n-            
if jun_exon != []:\n-                cds_cod = cds_cod[:-1]\n-                five_p_utr = five_p_utr[1:]\n-            for cds in cds_cod:\n-                exon_pos.append(cds)\n-            if jun_exon != []:\n-                exon_pos.append(jun_exon)    \n-            for utr5 in five_p_utr:\n-                exon_pos.append(utr5)\n-    return exon_pos\n"
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/CCDS30770.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/CCDS30770.bed Thu Apr 23 17:57:49 2015 -0400
b
@@ -0,0 +1,20 @@
+chr1 92149295 92327088 CCDS30770.1 0 - 92149295 92327088 0 16 119,108,42,121,300,159,141,153,338,190,148,169,184,138,185,61, 0,11933,14350,24924,28504,32497,32829,35573,36154,38216,43920,46066,51037,74874,113548,177732,
+chr1 67000041 67208778 CCDS30744.1 0 + 67000041 67208778 0 25 10,64,25,72,57,55,176,12,12,25,52,86,93,75,501,128,127,60,112,156,133,203,65,165,23, 0,91488,98711,101585,105418,108451,109185,126154,133171,136636,137585,138922,142645,145319,147510,154789,155831,161075,184935,194905,199389,204976,206299,206913,208714,
+chr1 8384389 8404073 CCDS30577.1 0 + 8384389 8404073 0 8 397,93,225,728,154,177,206,267, 0,968,1488,5879,11107,13486,15163,19417,
+chr1 16767256 16785385 CCDS44067.1 0 + 16767256 16785385 0 8 14,101,105,82,109,178,76,49, 0,2870,7108,7298,8331,11076,15056,18080,
+chr1 16767256 16785491 CCDS44066.1 0 + 16767256 16785491 0 7 92,101,105,82,109,178,155, 0,2870,7108,7298,8331,11076,18080,
+chr1 16767256 16785385 CCDS173.1 0 + 16767256 16785385 0 8 92,101,105,82,109,178,76,49, 0,2870,7108,7298,8331,11076,15056,18080,
+chr1 25072044 25167428 CCDS256.1 0 + 25072044 25167428 0 6 72,110,126,107,182,165, 0,52188,68540,81456,94306,95219,
+chr1 33547850 33585783 CCDS375.1 0 + 33547850 33585783 0 9 105,174,173,135,166,163,113,215,139, 0,1704,9800,11032,12298,14457,15817,35652,37794,
+chr1 48999844 50489468 CCDS44137.1 0 - 48999844 50489468 0 14 121,27,97,163,153,112,115,90,40,217,95,125,123,34, 0,717,5469,52831,56660,100320,119164,128979,333018,511411,711597,1163140,1317223,1489590,
+chr1 100661810 100715376 CCDS767.1 0 - 100661810 100715376 0 11 168,72,192,78,167,217,122,182,76,124,51, 0,9975,10190,14439,18562,19728,22371,34478,39181,44506,53515,
+chr1 150981108 151006710 CCDS977.1 0 + 150981108 151006710 0 8 39,93,203,185,159,95,159,429, 0,9179,9834,15978,16882,18600,20153,25173,
+chr1 175914288 176176114 CCDS44279.1 0 - 175914288 176176114 0 19 18,45,161,125,118,117,82,109,144,136,115,58,77,69,120,65,98,60,407, 0,2042,41790,43135,44209,82419,98033,98557,101028,135999,140623,171471,189857,203853,217716,218674,230757,239480,261419,
+chr1 175914288 176176114 CCDS30944.1 0 - 175914288 176176114 0 20 18,45,161,125,118,117,82,109,144,136,115,58,77,60,69,120,77,98,60,407, 0,2042,41790,43135,44209,82419,98033,98557,101028,135999,140623,171471,189857,191335,203853,217716,218662,230757,239480,261419,
+chr1 184446643 184588690 CCDS1362.1 0 + 184446643 184588690 0 5 94,95,77,61,39, 0,30078,113229,120891,142008,
+chr1 226420201 226496888 CCDS1553.1 0 - 226420201 226496888 0 15 106,98,180,126,81,102,120,134,158,126,134,105,95,33,79, 0,595,843,6470,18338,33032,33712,35456,45274,53832,55163,63341,65218,68672,76608,
+chr1 1982069 2116448 CCDS37.1 0 + 1982069 2116448 0 18 71,122,90,51,86,132,82,53,189,98,87,136,88,120,80,90,116,88, 0,4810,5853,8910,84631,93579,95396,98241,100159,105364,118887,121424,121670,123266,124123,124593,133952,134291,
+chr1 2075777 2116448 CCDS41229.1 0 + 2075777 2116448 0 13 3,82,53,189,98,87,136,88,120,80,90,116,88, 0,1688,4533,6451,11656,25179,27716,27962,29558,30415,30885,40244,40583,
+chr1 2985823 3350375 CCDS44048.1 0 + 2985823 3350375 0 17 37,350,51,135,103,208,148,154,1417,85,170,78,170,175,237,175,78, 0,116865,174827,315892,327231,333531,335479,336235,342124,345303,348568,349407,356321,356791,361612,362706,364474,
+chr1 2985823 3350375 CCDS41236.1 0 + 2985823 3350375 0 17 37,350,51,135,103,208,148,154,1417,85,170,78,170,175,237,175,135, 0,116865,174827,315892,327231,333531,335479,336235,342124,345303,348568,349407,356321,356791,361612,362706,364417,
+chr1 6285139 6295971 CCDS61.1 0 - 6285139 6295971 0 5 183,218,170,89,195, 0,6822,8394,9806,10637,
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/CCDS30770.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/CCDS30770.gff Thu Apr 23 17:57:49 2015 -0400
b
b'@@ -0,0 +1,289 @@\n+chr1\tbed2gff\tgene\t92149296\t92327088\t0\t-\t.\tID=Gene:CCDS30770.1;Name=Gene:CCDS30770.1\n+chr1\tbed2gff\ttranscript\t92149296\t92327088\t0\t-\t.\tID=CCDS30770.1;Name=CCDS30770.1;Parent=Gene:CCDS30770.1\n+chr1\tbed2gff\texon\t92149296\t92149414\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92161229\t92161336\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92163646\t92163687\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92174220\t92174340\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92177800\t92178099\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92181793\t92181951\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92182125\t92182265\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92184869\t92185021\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92185450\t92185787\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92187512\t92187701\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92193216\t92193363\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92195362\t92195530\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92200333\t92200516\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92224170\t92224307\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92262844\t92263028\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92327028\t92327088\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\tgene\t67000042\t67208778\t0\t+\t.\tID=Gene:CCDS30744.1;Name=Gene:CCDS30744.1\n+chr1\tbed2gff\ttranscript\t67000042\t67208778\t0\t+\t.\tID=CCDS30744.1;Name=CCDS30744.1;Parent=Gene:CCDS30744.1\n+chr1\tbed2gff\texon\t67000042\t67000051\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67091530\t67091593\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67098753\t67098777\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67101627\t67101698\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67105460\t67105516\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67108493\t67108547\t0\t+\t.\tParent=CCDS30744.1\n+
chr1\tbed2gff\texon\t67109227\t67109402\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67126196\t67126207\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67133213\t67133224\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67136678\t67136702\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67137627\t67137678\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67138964\t67139049\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67142687\t67142779\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67145361\t67145435\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67147552\t67148052\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67154831\t67154958\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67155873\t67155999\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67161117\t67161176\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67184977\t67185088\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67194947\t67195102\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67199431\t67199563\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67205018\t67205220\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67206341\t67206405\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67206955\t67207119\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67208756\t67208778\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\tgene\t8384390\t8404073\t0\t+\t.\tID=Gene:CCDS30577.1;Name=Gene:CCDS30577.1\n+chr1\tbed2gff\ttranscript\t8384390\t8404073\t0\t+\t.\tID=CCDS30577.1;Name=CCDS30577.1;Parent=Gene:CCDS30577.1\n+chr1\tbed2gff\texon\t8384390\t8384786\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8385358\t8385450\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8385878\t8386102\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8390269\t8390996\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8395497\t8395650\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8397876\t8398052\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8399553\t839
9758\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8403807\t8404073\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\tgene\t16767257\t16785385\t0\t+\t.\tID=Gene:CCDS44067.1;Name=Gene:CCDS44067.1\n+chr1\tbed2gff\ttranscript\t16767257\t16785385\t0\t+\t.\tID=CCDS44067.1;Name=CCDS44067.1;Parent=Gene:CCDS44067.1\n+chr1\tbed2gff\texon\t16767257\t16767270\t0\t+\t.\tParent=CCDS44067.1\n+chr1\tbed2gff\texon\t16770127\t16770227\t0\t+\t.\tParent=CCDS44067.1\n+chr1\tbed2gff\texon\t16774365\t16774469\t0\t+\t.\tParent=CCDS44067.1\n+chr1\tbed2gff\texo'..b'bed2gff\texon\t2106663\t2106752\t0\t+\t.\tParent=CCDS37.1\n+chr1\tbed2gff\texon\t2116022\t2116137\t0\t+\t.\tParent=CCDS37.1\n+chr1\tbed2gff\texon\t2116361\t2116448\t0\t+\t.\tParent=CCDS37.1\n+chr1\tbed2gff\tgene\t2075778\t2116448\t0\t+\t.\tID=Gene:CCDS41229.1;Name=Gene:CCDS41229.1\n+chr1\tbed2gff\ttranscript\t2075778\t2116448\t0\t+\t.\tID=CCDS41229.1;Name=CCDS41229.1;Parent=Gene:CCDS41229.1\n+chr1\tbed2gff\texon\t2075778\t2075780\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2077466\t2077547\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2080311\t2080363\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2082229\t2082417\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2087434\t2087531\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2100957\t2101043\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2103494\t2103629\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2103740\t2103827\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2105336\t2105455\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2106193\t2106272\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2106663\t2106752\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2116022\t2116137\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2116361\t2116448\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\tgene\t2985824\t3350375\t0\t+\t.\tID=Gene:CCDS44048.1;Name=Gene:CCDS44048.1\n+chr1\tbed2gff\ttranscript\t2985824\t3350375\t0\t+\t.\tID=CCDS44048
.1;Name=CCDS44048.1;Parent=Gene:CCDS44048.1\n+chr1\tbed2gff\texon\t2985824\t2985860\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3102689\t3103038\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3160651\t3160701\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3301716\t3301850\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3313055\t3313157\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3319355\t3319562\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3321303\t3321450\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3322059\t3322212\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3327948\t3329364\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3331127\t3331211\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3334392\t3334561\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3335231\t3335308\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3342145\t3342314\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3342615\t3342789\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3347436\t3347672\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3348530\t3348704\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3350298\t3350375\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\tgene\t2985824\t3350375\t0\t+\t.\tID=Gene:CCDS41236.1;Name=Gene:CCDS41236.1\n+chr1\tbed2gff\ttranscript\t2985824\t3350375\t0\t+\t.\tID=CCDS41236.1;Name=CCDS41236.1;Parent=Gene:CCDS41236.1\n+chr1\tbed2gff\texon\t2985824\t2985860\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3102689\t3103038\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3160651\t3160701\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3301716\t3301850\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3313055\t3313157\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3319355\t3319562\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3321303\t3321450\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3322059\t3322212\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3327
948\t3329364\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3331127\t3331211\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3334392\t3334561\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3335231\t3335308\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3342145\t3342314\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3342615\t3342789\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3347436\t3347672\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3348530\t3348704\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3350241\t3350375\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\tgene\t6285140\t6295971\t0\t-\t.\tID=Gene:CCDS61.1;Name=Gene:CCDS61.1\n+chr1\tbed2gff\ttranscript\t6285140\t6295971\t0\t-\t.\tID=CCDS61.1;Name=CCDS61.1;Parent=Gene:CCDS61.1\n+chr1\tbed2gff\texon\t6285140\t6285322\t0\t-\t.\tParent=CCDS61.1\n+chr1\tbed2gff\texon\t6291962\t6292179\t0\t-\t.\tParent=CCDS61.1\n+chr1\tbed2gff\texon\t6293534\t6293703\t0\t-\t.\tParent=CCDS61.1\n+chr1\tbed2gff\texon\t6294946\t6295034\t0\t-\t.\tParent=CCDS61.1\n+chr1\tbed2gff\texon\t6295777\t6295971\t0\t-\t.\tParent=CCDS61.1\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/MB7_3R.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/MB7_3R.bed Thu Apr 23 17:57:49 2015 -0400
b
b'@@ -0,0 +1,241 @@\n+3R\t141309\t144791\tCG9778-RA\t1\t-\t141309\t144791\t0\t5\t1519,236,333,162,488,\t0,2066,2368,2760,2994,\n+3R\t226211\t227739\tCG14647.a\t1\t+\t226211\t227739\t0\t3\t649,132,417,\t0,864,1111,\n+3R\t226211\t227739\tCG14647-RB\t1\t+\t226211\t227739\t0\t3\t649,132,417,\t0,864,1111,\n+3R\t752642\t764363\tCG34306-RA\t2\t+\t752642\t764363\t0\t2\t5670,5912,\t0,5809,\n+3R\t56500\t58054.0\tCG14641.a\t31\t-\t56500\t58054.0\t0\t1\t1554,\t0,\n+3R\t56474\t58077.0\tCG14641-RA\t34\t-\t56474\t58077.0\t0\t1\t1603,\t0,\n+3R\t221814\t223609\tCG9855-RA\t1\t-\t221814\t223609\t0\t4\t710,197,529,180,\t0,767,1019,1615,\n+3R\t1045389\t1047270\tCG1116.a\t3\t+\t1045389\t1047270\t0\t6\t188,218,272,150,577,157,\t0,248,531,868,1081,1724,\n+3R\t1045389\t1047270\tCG1116-RA\t1\t+\t1045389\t1047270\t0\t6\t188,215,272,150,577,157,\t0,251,531,868,1081,1724,\n+3R\t1045389\t1047270\tCG1116-RB\t3\t+\t1045389\t1047270\t0\t5\t466,272,150,577,157,\t0,531,868,1081,1724,\n+3R\t74438\t76518\tCG14643-RA\t1\t-\t74438\t76518\t0\t4\t301,655,608,62,\t0,474,1190,2018,\n+3R\t403660\t404366.0\tCG32945-RA.3d\t31\t-\t403660\t404366.0\t0\t1\t706,\t0,\n+3R\t403660\t404368.0\tCG32945.a\t34\t-\t403660\t404368.0\t0\t1\t708,\t0,\n+3R\t736278\t771295\tCG31536.a\t2\t+\t736278\t771295\t0\t13\t529,735,233,222,244,161,75,107,116,256,1266,487,599,\t0,6840,12035,12390,12677,12975,13197,14989,15337,15519,30118,31745,34418,\n+3R\t736277\t749883\tCG31536-RC\t1\t+\t736277\t749883\t0\t7\t530,735,233,222,244,161,408,\t0,6841,12036,12391,12678,12976,13198,\n+3R\t23012\t30295\tCG12582.a\t3\t+\t23012\t30295\t0\t9\t272,135,552,97,289,480,422,381,361,\t0,446,930,1539,1714,4552,5085,6491,6922,\n+3R\t22996\t30295\tCG12582-RA\t1\t+\t22996\t30295\t0\t9\t288,135,514,97,289,480,422,381,361,\t0,462,984,1555,1730,4568,5101,6507,6938,\n+3R\t22930\t30295\tCG12582.b\t3\t+\t22930\t30295\t0\t9\t93,457,514,97,289,480,422,381,361,\t0,206,1050,1621,1796,4634,5167,6573,7004,\n+3R\t23029\t30295\tCG12582-RB\t1\t+\t23029\t30295\t0\t8\t564,51
4,97,289,480,422,381,361,\t0,951,1522,1697,4535,5068,6474,6905,\n+3R\t531867\t537915\tCG31534-RC\t3\t+\t531867\t537915\t0\t7\t220,1081,374,911,200,148,1296,\t0,880,2530,3014,4031,4470,4752,\n+3R\t531867\t537915\tCG31534-RB\t1\t+\t531867\t537915\t0\t7\t220,1081,374,911,176,105,1296,\t0,880,2530,3014,4031,4470,4752,\n+3R\t531867\t537915\tCG31534-RA\t3\t+\t531867\t537915\t0\t6\t220,1081,374,911,176,1296,\t0,880,2530,3014,4031,4752,\n+3R\t480549\t483707\tCG12001-RA\t1\t+\t480549\t483707\t0\t4\t252,1138,307,405,\t0,690,2075,2753,\n+3R\t1084192\t1084867.0\tCG14666-RA\t34\t-\t1084192\t1084867.0\t0\t1\t675,\t0,\n+3R\t576128\t598807\tCG31530-RA\t1\t-\t576128\t598807\t0\t5\t2184,169,155,869,153,\t0,2390,2633,7136,22526,\n+3R\t1058267\t1062011\tCG12005-RB\t1\t+\t1058267\t1062011\t0\t6\t181,384,350,1446,423,447,\t0,245,839,1247,2760,3297,\n+3R\t807720\t809954\tCG14662-RA\t1\t-\t807720\t809954\t0\t2\t395,1605,\t0,629,\n+3R\t1061779\t1063038\tCG10233-RA\t1\t-\t1061779\t1063038\t0\t3\t609,301,85,\t0,815,1174,\n+3R\t1062471\t1063038\tCG10233-RB\t2\t-\t1062471\t1063038\t0\t2\t424,85,\t0,482,\n+3R\t606846\t610444\tCG17387-RA.3d\t1\t-\t606846\t610444\t0\t4\t841,1809,148,290,\t0,1162,3102,3308,\n+3R\t701726\t704255\tCG14660-RA\t1\t-\t701726\t704255\t0\t3\t727,1617,57,\t0,804,2472,\n+3R\t909774\t912749.0\tCG2530.a\t31\t-\t909774\t912749.0\t0\t1\t2975,\t0,\n+3R\t909342\t912749.0\tCG2530-RA.5d\t34\t-\t909342\t912749.0\t0\t1\t3407,\t0,\n+3R\t94942\t102759\tCG9766.a\t2\t-\t94942\t102759\t0\t4\t568,205,246,56,\t0,627,888,7761,\n+3R\t94942\t103515\tCG9766-RB\t1\t-\t94942\t103515\t0\t4\t568,205,246,165,\t0,627,888,8408,\n+3R\t976629\t995849\tCG12591-RA\t1\t+\t976629\t995849\t0\t6\t379,575,1094,227,106,680,\t0,2615,14430,15955,16252,18540,\n+3R\t207031\t212741\tCG1084-RA\t1\t+\t207031\t212741\t0\t8\t220,939,1686,167,873,169,134,551,\t0,569,1560,3581,3811,4740,4970,5159,\n+3R\t204643\t206932\tCG11739-RD\t2\t+\t204643\t206932\t0\t7\t88,169,150,118,162,128,508,\t0,528,753,1006,1187,1582,1781,\n+3R
\t204400\t206932\tCG11739-RA\t2\t+\t204400\t206932\t0\t7\t331,169,150,118,162,128,508,\t0,771,996,1249,1430,1825,2024,\n+3R\t204385\t206932\tCG11739-RC\t1\t+\t204385\t206932\t0\t7\t66,169,150,118,162,128,508,\t0,786,1011,1264,1445,1840,2039,\n+3R\t205028\t206932\tCG11739-RB\t1\t+\t205028\t206932\t0\t6\t312,150,118,162,128,508,\t0,368,621,802,1197,1396,\n+3R\t612766\t620844\tCG17735-RB\t3\t-\t612766\t620844\t0\t7\t1243,202,189,1836,989,1105,1905,\t0,1329,1636,1888,3831,4913,6173,\n+3R\t656570\t657019.0\tC'..b'G32490-RP\t3\t+\t107973\t127263\t0\t7\t63,161,181,158,133,1098,1752,\t0,422,1974,12877,13316,15236,17538,\n+3R\t107624\t127263\tCG32490-RN\t3\t+\t107624\t127263\t0\t7\t101,161,181,158,133,1098,1752,\t0,771,2323,13226,13665,15585,17887,\n+3R\t107569\t127263\tCG32490.g\t3\t+\t107569\t127263\t0\t7\t68,161,181,167,133,1098,1752,\t0,826,2378,13272,13720,15640,17942,\n+3R\t107551\t127263\tCG32490-RO\t3\t+\t107551\t127263\t0\t7\t101,161,181,158,133,1098,1752,\t0,844,2396,13299,13738,15658,17960,\n+3R\t107551\t127263\tCG32490-RM\t3\t+\t107551\t127263\t0\t7\t174,161,181,158,133,1098,1752,\t0,844,2396,13299,13738,15658,17960,\n+3R\t107426\t127263\tCG32490.h\t3\t+\t107426\t127263\t0\t7\t167,161,181,167,133,1098,1752,\t0,969,2521,13415,13863,15783,18085,\n+3R\t107425\t127263\tCG32490-RC\t1\t+\t107425\t127263\t0\t7\t168,161,181,158,133,1098,1752,\t0,970,2522,13425,13864,15784,18086,\n+3R\t106272\t127263\tCG32490-RH\t3\t+\t106272\t127263\t0\t7\t272,161,181,167,133,1098,1752,\t0,2123,3675,14569,15017,16937,19239,\n+3R\t106110\t127263\tCG32490-RI\t3\t+\t106110\t127263\t0\t7\t81,161,181,167,133,1098,1752,\t0,2285,3837,14731,15179,17099,19401,\n+3R\t105905\t127263\tCG32490-RG\t3\t+\t105905\t127263\t0\t7\t173,161,181,167,133,1098,1752,\t0,2490,4042,14936,15384,17304,19606,\n+3R\t108258\t127263\tCG32490-RA\t3\t+\t108258\t127263\t0\t6\t298,181,167,133,1098,1752,\t0,1689,12583,13031,14951,17253,\n+3R\t120616\t127263\tCG32490-RJ\t1\t+\t120616\t127263\t0\t5\t50,167,133,1098,1752,\t0,22
5,673,2593,4895,\n+3R\t117669\t127263\tCG32490-RL\t3\t+\t117669\t127263\t0\t5\t125,167,133,1098,1752,\t0,3172,3620,5540,7842,\n+3R\t117459\t127263\tCG32490-RK\t3\t+\t117459\t127263\t0\t5\t532,167,133,1098,1752,\t0,3382,3830,5750,8052,\n+3R\t107426\t128309\tCG32490.i\t2\t+\t107426\t128309\t0\t6\t167,161,181,167,133,62,\t0,969,2521,13415,13863,20821,\n+3R\t110073\t128309\tCG32490-RE\t1\t+\t110073\t128309\t0\t4\t55,167,133,62,\t0,10768,11216,18174,\n+3R\t263101\t267050\tCG14650-RA\t1\t+\t263101\t267050\t0\t6\t2380,154,125,103,293,582,\t0,2437,2647,2841,3013,3367,\n+3R\t212966\t215535\tCG10520-RB\t1\t+\t212966\t215535\t0\t3\t72,912,1355,\t0,236,1214,\n+3R\t678534\t695709\tCG1133-RA\t1\t+\t678534\t695709\t0\t3\t1242,170,1535,\t0,15330,15640,\n+3R\t163481\t165640\tCG1103-RA\t1\t+\t163481\t165640\t0\t3\t164,303,875,\t0,241,1284,\n+3R\t622112\t627445\tCG14656-RA\t3\t-\t622112\t627445\t0\t3\t1307,987,511,\t0,3301,4822,\n+3R\t474944\t480360\tCG1059-RA\t1\t+\t474944\t480360\t0\t5\t545,150,1095,1354,1187,\t0,1362,1576,2760,4229,\n+3R\t358949\t359693\tCG31526-RB\t2\t+\t358949\t359693\t0\t3\t204,320,104,\t0,265,640,\n+3R\t358949\t359666\tCG31526-RA\t1\t+\t358949\t359666\t0\t2\t204,452,\t0,265,\n+3R\t44183\t45852.0\tCG31516.a\t31\t-\t44183\t45852.0\t0\t1\t1669,\t0,\n+3R\t44178\t45852.0\tCG31516-RA\t34\t-\t44178\t45852.0\t0\t1\t1674,\t0,\n+3R\t782716\t787070\tCG2016-RB\t1\t-\t782716\t787070\t0\t7\t229,153,134,122,119,80,37,\t0,295,506,2642,2909,3899,4317,\n+3R\t782716\t787070\tCG2016.a\t3\t-\t782716\t787070\t0\t6\t229,153,134,122,119,37,\t0,295,506,2642,2909,4317,\n+3R\t782716\t787070\tCG2016.b\t2\t-\t782716\t787070\t0\t6\t229,153,134,122,119,455,\t0,295,506,2642,2909,3899,\n+3R\t37504\t53244\tCG1107-RA\t3\t+\t37504\t53244\t0\t13\t30,8,124,68,200,231,140,933,410,374,962,162,772,\t0,125,5661,6464,9335,9973,10275,10475,12816,13283,13715,14747,14968,\n+3R\t46716\t53244\tCG1107-RB\t1\t+\t46716\t53244\t0\t9\t323,231,140,933,410,374,962,162,772,\t0,761,1063,1263,3604,4071,4503,5535,5756,
\n+3R\t47365\t53244\tCG1107.a\t3\t+\t47365\t53244\t0\t8\t343,140,933,410,374,962,162,772,\t0,414,614,2955,3422,3854,4886,5107,\n+3R\t92675\t94166\tCG1092.a\t1\t+\t92675\t94166\t0\t2\t252,1184,\t0,307,\n+3R\t92675\t94166\tCG1092-RA\t1\t+\t92675\t94166\t0\t2\t252,1184,\t0,307,\n+3R\t92693\t94005.0\tCG1092-RB\t34\t+\t92693\t94005.0\t0\t1\t1312,\t0,\n+3R\t953811\t955661.0\tCG12007.a\t34\t+\t953811\t955661.0\t0\t1\t1850,\t0,\n+3R\t953809\t955665.0\tCG12007-RA\t34\t+\t953809\t955665.0\t0\t1\t1856,\t0,\n+3R\t224271\t227749\tCG9853-RB\t2\t-\t224271\t227749\t0\t4\t836,235,449,306,\t0,896,1192,3172,\n+3R\t224271\t225734\tCG9853-RA\t1\t-\t224271\t225734\t0\t3\t836,235,271,\t0,896,1192,\n+3R\t261943\t263051.0\tCG9804-RA\t31\t-\t261943\t263051.0\t0\t1\t1108,\t0,\n+3R\t160819\t161237.0\tCG14645-RA\t34\t+\t160819\t161237.0\t0\t1\t418,\t0,\n+3R\t160819\t161223.0\tCG14645.a\t31\t+\t160819\t161223.0\t0\t1\t404,\t0,\n+3R\t185509\t192577\tCG1090.b\t3\t+\t185509\t192577\t0\t6\t500,231,945,976,189,907,\t0,3490,3774,4778,5914,6161,\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/MB7_3R.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/MB7_3R.gff3 Thu Apr 23 17:57:49 2015 -0400
b
b'@@ -0,0 +1,3971 @@\n+##gff-version 3\n+3R\tMB7\tgene\t361\t10200\t0\t+\t.\tID=CG12581;Name=CG12581\n+3R\tMB7\tmRNA\t361\t10200\t3\t+\t.\tID=CG12581-RB;Parent=CG12581;Name=CG12581-RB\n+3R\tMB7\texon\t361\t509\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\texon\t578\t1913\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\texon\t7784\t8649\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\texon\t9439\t10200\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\tfive_prime_UTR\t361\t509\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\tfive_prime_UTR\t578\t1114\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\tstart_codon\t1115\t1117\t0\t+\t0\tParent=CG12581-RB\n+3R\tMB7\tCDS\t1115\t1913\t0\t+\t0\tParent=CG12581-RB\n+3R\tMB7\tCDS\t7784\t8649\t0\t+\t2\tParent=CG12581-RB\n+3R\tMB7\tCDS\t9439\t9771\t0\t+\t0\tParent=CG12581-RB\n+3R\tMB7\tstop_codon\t9769\t9771\t0\t+\t0\tParent=CG12581-RB\n+3R\tMB7\tthree_prime_UTR\t9772\t10200\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\tmRNA\t380\t10200\t1\t+\t.\tID=CG12581-RA;Parent=CG12581;Name=CG12581-RA\n+3R\tMB7\texon\t380\t1913\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\texon\t7784\t8649\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\texon\t9439\t10200\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\tfive_prime_UTR\t380\t1114\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\tstart_codon\t1115\t1117\t0\t+\t0\tParent=CG12581-RA\n+3R\tMB7\tCDS\t1115\t1913\t0\t+\t0\tParent=CG12581-RA\n+3R\tMB7\tCDS\t7784\t8649\t0\t+\t2\tParent=CG12581-RA\n+3R\tMB7\tCDS\t9439\t9771\t0\t+\t0\tParent=CG12581-RA\n+3R\tMB7\tstop_codon\t9769\t9771\t0\t+\t0\tParent=CG12581-RA\n+3R\tMB7\tthree_prime_UTR\t9772\t10200\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\tgene\t15388\t16170\t0\t-\t.\tID=CG18090;Name=CG18090\n+3R\tMB7\tmRNA\t15414\t15982\t31\t-\t.\tID=CG18090.a;Parent=CG18090;Name=CG18090.a\n+3R\tMB7\texon\t15414\t15982\t0\t-\t.\tParent=CG18090.a\n+3R\tMB7\tthree_prime_UTR\t15414\t15529\t0\t-\t.\tParent=CG18090.a\n+3R\tMB7\tstop_codon\t15530\t15532\t0\t-\t0\tParent=CG18090.a\n+3R\tMB7\tCDS\t15530\t15955\t0\t-\t0\tParent=CG18090.a\n+3R\tMB7\tstart_codon\t15953\t15955\t0\t
-\t0\tParent=CG18090.a\n+3R\tMB7\tfive_prime_UTR\t15956\t15982\t0\t-\t.\tParent=CG18090.a\n+3R\tMB7\tmRNA\t15388\t16170\t34\t-\t.\tID=CG18090-RA;Parent=CG18090;Name=CG18090-RA\n+3R\tMB7\texon\t15388\t16170\t0\t-\t.\tParent=CG18090-RA\n+3R\tMB7\tthree_prime_UTR\t15388\t15529\t0\t-\t.\tParent=CG18090-RA\n+3R\tMB7\tstop_codon\t15530\t15532\t0\t-\t0\tParent=CG18090-RA\n+3R\tMB7\tCDS\t15530\t15955\t0\t-\t0\tParent=CG18090-RA\n+3R\tMB7\tstart_codon\t15953\t15955\t0\t-\t0\tParent=CG18090-RA\n+3R\tMB7\tfive_prime_UTR\t15956\t16170\t0\t-\t.\tParent=CG18090-RA\n+3R\tMB7\tgene\t17136\t21871\t0\t+\t.\tID=DMG5-MB6.chr3R.1.002.a;Name=DMG5-MB6.chr3R.1.002.a\n+3R\tMB7\tmRNA\t17136\t21871\t2\t+\t.\tID=DMG5-MB6.chr3R.1.002.a.a;Parent=DMG5-MB6.chr3R.1.002.a;Name=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t17136\t17251\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t19953\t20047\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t20114\t20599\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t20671\t21210\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t21367\t21534\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t21591\t21871\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tstart_codon\t17136\t17138\t0\t+\t0\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tCDS\t17136\t17251\t0\t+\t0\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tCDS\t19953\t20047\t0\t+\t1\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tCDS\t20114\t20599\t0\t+\t2\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tCDS\t20671\t20759\t0\t+\t2\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tstop_codon\t20757\t20759\t0\t+\t0\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tthree_prime_UTR\t20760\t21210\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tthree_prime_UTR\t21367\t21534\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tthree_prime_UTR\t21591\t21871\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tgene\t22931\t30295\t0\t+\t.\tID=CG12582;Name=CG12582\n+3R\tMB7\tmRNA\t23013\t30295\t3\t+
\t.\tID=CG12582.a;Parent=CG12582;Name=CG12582.a\n+3R\tMB7\texon\t23013\t23284\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t23459\t23593\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t23943\t24494\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t24552\t24648\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t24727\t25015\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t27565\t28044\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t28098\t28519\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t29504\t29884\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t29935\t30295\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\tfive_prime_UTR\t23013\t23284\t0\t+\t.\tParent=CG125'..b'517\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1121579\t1121685\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1121869\t1122357\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1123924\t1124211\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1125192\t1125295\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1129833\t1129904\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1138711\t1139219\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1139660\t1140027\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1148710\t1148847\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1149387\t1149566\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tthree_prime_UTR\t1098665\t1099668\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tstop_codon\t1099669\t1099671\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1099669\t1099804\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1099871\t1100040\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1100457\t1100616\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1100688\t1100809\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1118362\t1118563\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1118720\t1118882\t0\t-\t2\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1118941\t1119092\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1119784\t1119956\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1120028\t1120577\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1121363\t1121517\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1121579\t112168
5\t0\t-\t2\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1121869\t1122357\t0\t-\t2\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1123924\t1124211\t0\t-\t2\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1125192\t1125295\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1129833\t1129904\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1138711\t1139219\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1139660\t1139920\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tstart_codon\t1139918\t1139920\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tfive_prime_UTR\t1139921\t1140027\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tfive_prime_UTR\t1148710\t1148847\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tfive_prime_UTR\t1149387\t1149566\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tmRNA\t1098665\t1149566\t3\t-\t.\tID=CG32464-RU;Parent=CG32464;Name=CG32464-RU\n+3R\tMB7\texon\t1098665\t1099804\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1099871\t1100040\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1100457\t1100616\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1100688\t1100809\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1118362\t1118563\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1118720\t1118882\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1118941\t1119092\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1119784\t1119956\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1120028\t1120577\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1121363\t1121517\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1121579\t1121685\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1121869\t1122357\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1123924\t1124211\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1125192\t1125295\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1138711\t1139219\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1139660\t1140027\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1148710\t1148847\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1149387\t1149566\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\tthree_prime_UTR\t1098665\t1099668\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\tstop_codon\t1099669\t1099671\t
0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1099669\t1099804\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1099871\t1100040\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1100457\t1100616\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1100688\t1100809\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1118362\t1118563\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1118720\t1118882\t0\t-\t2\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1118941\t1119092\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1119784\t1119956\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1120028\t1120577\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1121363\t1121517\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1121579\t1121685\t0\t-\t2\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1121869\t1122357\t0\t-\t2\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1123924\t1124211\t0\t-\t2\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1125192\t1125295\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1138711\t1139219\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1139660\t1139920\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tstart_codon\t1139918\t1139920\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tfive_prime_UTR\t1139921\t1140027\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\tfive_prime_UTR\t1148710\t1148847\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\tfive_prime_UTR\t1149387\t1149566\t0\t-\t.\tParent=CG32464-RU\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/aceview_hs_37.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/aceview_hs_37.gff3 Thu Apr 23 17:57:49 2015 -0400
b
b'@@ -0,0 +1,3164 @@\n+##gff-version 3\n+7\tAceView\tgene\t34386126\t34873948\t.\t-\t.\tID=AAA1;Name=AAA1\n+7\tAceView\ttranscript\t34606334\t34797884\t.\t-\t.\tID=AAA1.jAug10;Parent=AAA1\n+7\tAceView\texon\t34606334\t34606424\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34606693\t34606763\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34609324\t34609473\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34743692\t34743811\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34797686\t34797884\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\tmRNA\t34682839\t34800803\t.\t-\t.\tID=AAA1.dAug10;Parent=AAA1\n+7\tAceView\tfive_prime_UTR\t34800803\t34800803\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\tCDS\t34682958\t34682963\t.\t-\t0\tParent=AAA1.dAug10\n+7\tAceView\tCDS\t34768349\t34768428\t.\t-\t2\tParent=AAA1.dAug10\n+7\tAceView\tCDS\t34800724\t34800802\t.\t-\t0\tParent=AAA1.dAug10\n+7\tAceView\tthree_prime_UTR\t34682839\t34682957\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\texon\t34682839\t34682963\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\texon\t34800724\t34800803\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\ttranscript\t34758474\t34873943\t.\t-\t.\tID=AAA1.hAug10;Parent=AAA1\n+7\tAceView\texon\t34758474\t34759420\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\texon\t34762896\t34763007\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\texon\t34807954\t34808052\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\texon\t34873773\t34873943\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\ttranscript\t34386126\t34797884\t.\t-\t.\tID=AAA1.eAug10;Parent=AAA1\n+7\tAceView\texon\t34386126\t34390459\t.\t-\t.\tParent=AAA1.eAug10\n+7\tAceView\texon\t34457191\t34457284\t.\t-\t.\tParent=AAA1.eAug10\n+7\tAceView\texon\t34609324\t34609473\t.\t-\t.\tParent=AAA1.eAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AA
A1.eAug10\n+7\tAceView\texon\t34797686\t34797884\t.\t-\t.\tParent=AAA1.eAug10\n+7\tAceView\tmRNA\t34386126\t34797884\t.\t-\t.\tID=AAA1.bAug10;Parent=AAA1\n+7\tAceView\tfive_prime_UTR\t34797711\t34797884\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\tCDS\t34457198\t34457284\t.\t-\t0\tParent=AAA1.bAug10\n+7\tAceView\tCDS\t34768349\t34768428\t.\t-\t2\tParent=AAA1.bAug10\n+7\tAceView\tCDS\t34797686\t34797710\t.\t-\t0\tParent=AAA1.bAug10\n+7\tAceView\tthree_prime_UTR\t34386126\t34390459\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\tthree_prime_UTR\t34457191\t34457197\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\texon\t34386126\t34390459\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\texon\t34457191\t34457284\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\texon\t34797686\t34797884\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\ttranscript\t34390034\t34800803\t.\t-\t.\tID=AAA1.iAug10;Parent=AAA1\n+7\tAceView\texon\t34390034\t34390459\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\texon\t34457191\t34457284\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\texon\t34609324\t34609473\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\texon\t34800724\t34800803\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\tmRNA\t34743462\t34800803\t.\t-\t.\tID=AAA1.cAug10;Parent=AAA1\n+7\tAceView\tfive_prime_UTR\t34800803\t34800803\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\tCDS\t34743797\t34743811\t.\t-\t0\tParent=AAA1.cAug10\n+7\tAceView\tCDS\t34768349\t34768428\t.\t-\t2\tParent=AAA1.cAug10\n+7\tAceView\tCDS\t34800724\t34800802\t.\t-\t0\tParent=AAA1.cAug10\n+7\tAceView\tthree_prime_UTR\t34743462\t34743796\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\texon\t34743462\t34743811\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\texon\t34800724\t34800803\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\ttranscript\t34758474\t34873941\t.\t-\t.\tID=AAA1.fAug10;Parent=
AAA1\n+7\tAceView\texon\t34758474\t34759420\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34760254\t34760397\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34762896\t34763007\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34800724\t34800803\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34873749\t34873941\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\tmRNA\t34607864\t34797884\t.\t-\t.\tID=AAA1.aAug10;Parent=AAA1\n+7\tAceView\tfive_prime_UTR\t34797711\t34797884\t.\t-\t.\tParent=AAA1.aAug10\n+7\tAceView\t'..b'ceView\tCDS\t219134689\t219134809\t.\t-\t0\tParent=AAMP.gAug10\n+2\tAceView\tthree_prime_UTR\t219128853\t219129331\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219128853\t219129331\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219129739\t219129897\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219130094\t219130184\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219130302\t219130405\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219130554\t219130669\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219130787\t219130870\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219131166\t219131310\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219131570\t219131709\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219132217\t219132336\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219134105\t219134257\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219134689\t219134857\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\tmRNA\t219128851\t219134882\t.\t-\t.\tID=AAMP.cAug10;Parent=AAMP\n+2\tAceView\tfive_prime_UTR\t219134810\t219134882\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219129256\t219129331\t.\t-\t1\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219129743\t219129897\t.\t-\t0\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219130094\t219130184\t.\t-\t1\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219130302\t219130405\t.\t-\t0\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219130554\t219130669\t.\t-\t2\tParent=AAMP.cAug
10\n+2\tAceView\tCDS\t219130787\t219130870\t.\t-\t2\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219131166\t219131310\t.\t-\t0\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219131570\t219131709\t.\t-\t2\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219132217\t219132336\t.\t-\t2\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219134105\t219134260\t.\t-\t2\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219134689\t219134809\t.\t-\t0\tParent=AAMP.cAug10\n+2\tAceView\tthree_prime_UTR\t219128851\t219129255\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219128851\t219129331\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219129743\t219129897\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219130094\t219130184\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219130302\t219130405\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219130554\t219130669\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219130787\t219130870\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219131166\t219131310\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219131570\t219131709\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219132217\t219132336\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219134105\t219134260\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219134689\t219134882\t.\t-\t.\tParent=AAMP.cAug10\n+3\tAceView\tgene\t151451704\t151479124\t.\t+\t.\tID=AADACL2;Name=AADACL2\n+3\tAceView\tmRNA\t151451704\t151475667\t.\t+\t.\tID=AADACL2.aAug10;Parent=AADACL2\n+3\tAceView\tfive_prime_UTR\t151451704\t151451823\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151451824\t151451961\t.\t+\t0\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151458434\t151458656\t.\t+\t0\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151461881\t151461950\t.\t+\t2\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151463297\t151463468\t.\t+\t1\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151474780\t151475382\t.\t+\t0\tParent=AADACL2.aAug10\n+3\tAceView\tthree_prime_UTR\t151475383\t151475667\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151451704\t151451961\t.\t+
\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151458434\t151458656\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151461881\t151461950\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151463297\t151463468\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151474780\t151475667\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\tmRNA\t151451704\t151479124\t.\t+\t.\tID=AADACL2.bAug10;Parent=AADACL2\n+3\tAceView\tfive_prime_UTR\t151451704\t151451948\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\tCDS\t151451949\t151451961\t.\t+\t0\tParent=AADACL2.bAug10\n+3\tAceView\tCDS\t151461881\t151461950\t.\t+\t2\tParent=AADACL2.bAug10\n+3\tAceView\tCDS\t151463297\t151463468\t.\t+\t1\tParent=AADACL2.bAug10\n+3\tAceView\tCDS\t151474780\t151475382\t.\t+\t0\tParent=AADACL2.bAug10\n+3\tAceView\tthree_prime_UTR\t151475383\t151479124\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\texon\t151451704\t151451961\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\texon\t151461881\t151461950\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\texon\t151463297\t151463468\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\texon\t151474780\t151479124\t.\t+\t.\tParent=AADACL2.bAug10\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/aceview_hs_37.gtf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/aceview_hs_37.gtf Thu Apr 23 17:57:49 2015 -0400
b
b'@@ -0,0 +1,3989 @@\n+11\tAceView\texon\t111933358\t111934981\t.\t-\t0\tgene_id 2-oxoacid_dh; Gene_type cDNA_supported; transcript_id 2-oxoacid_dh.aAug10-unspliced; exon_number 1\n+19\tAceView\tCDS\t58859154\t58859210\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; product_id A1BGAS.aAug10; exon_number 1\n+19\tAceView\texon\t58859153\t58859210\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; exon_number 1\n+19\tAceView\tintron\t58859211\t58864686\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; type gt_ag\n+19\tAceView\tCDS\t58864687\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; product_id A1BGAS.aAug10; exon_number 2\n+19\tAceView\texon\t58864687\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; exon_number 2\n+19\tAceView\tintron\t58864841\t58865079\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; type gt_ag\n+19\tAceView\tCDS\t58865080\t58865114\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; product_id A1BGAS.aAug10; exon_number 3\n+19\tAceView\texon\t58865080\t58865223\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; exon_number 3\n+19\tAceView\tstop_codon\t58865115\t58865117\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; product_id A1BGAS.aAug10;\n+19\tAceView\tintron\t58865224\t58865734\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; type gt_ag\n+19\tAceView\texon\t58865735\t58866090\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; exon_number 4\n+19\tAceView\tstart_codon\t58864404\t58864406\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10;\n+19\tAceView\tCDS\t58864404\t58864410\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; 
transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10; exon_number 1\n+19\tAceView\texon\t58862110\t58864410\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; exon_number 1\n+19\tAceView\tintron\t58864411\t58864744\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; type gt_ag\n+19\tAceView\tCDS\t58864745\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10; exon_number 2\n+19\tAceView\texon\t58864745\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; exon_number 2\n+19\tAceView\tintron\t58864841\t58865079\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; type gt_ag\n+19\tAceView\tCDS\t58865080\t58865114\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10; exon_number 3\n+19\tAceView\texon\t58865080\t58865223\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; exon_number 3\n+19\tAceView\tstop_codon\t58865115\t58865117\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10;\n+19\tAceView\tintron\t58865224\t58865734\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; type gt_ag\n+19\tAceView\texon\t58865735\t58866548\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; exon_number 4\n+19\tAceView\texon\t58859122\t58859210\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.cAug10; exon_number 1\n+19\tAceView\tintron\t58859211\t58864686\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.cAug10; type gt_ag\n+19\tAceView\texon\t58864687\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.cAug10; exon_number 2\n+19\tAceView\tintron\t58864841\t58865079\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id 
A1BGAS.cAug10; type gt_ag\n+19\tAceView\tstart_codon\t58865831\t58865833\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.cAug10; product_id A1BGAS.cAug10;\n+19\tAceView\tCDS\t58865831\t58866547\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; tran'..b'codon\t219129739\t219129741\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.gAug10; product_id AAMP.gAug10;\n+2\tAceView\tintron\t219129332\t219129738\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.gAug10; type gt_ag\n+2\tAceView\texon\t219128853\t219129331\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.gAug10; exon_number 11\n+2\tAceView\tstart_codon\t219134807\t219134809\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10;\n+2\tAceView\tCDS\t219134689\t219134809\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 1\n+2\tAceView\texon\t219134689\t219134843\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 1\n+2\tAceView\tintron\t219134258\t219134688\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219134105\t219134257\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 2\n+2\tAceView\texon\t219134105\t219134257\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 2\n+2\tAceView\tintron\t219132337\t219134104\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219132217\t219132336\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 3\n+2\tAceView\texon\t219132217\t219132336\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 
3\n+2\tAceView\tintron\t219131710\t219132216\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219131570\t219131709\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 4\n+2\tAceView\texon\t219131570\t219131709\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 4\n+2\tAceView\tintron\t219131311\t219131569\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219131166\t219131310\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 5\n+2\tAceView\texon\t219131166\t219131310\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 5\n+2\tAceView\tintron\t219130871\t219131165\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219130787\t219130870\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 6\n+2\tAceView\texon\t219130787\t219130870\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 6\n+2\tAceView\tintron\t219130670\t219130786\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219130392\t219130669\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 7\n+2\tAceView\texon\t219130302\t219130669\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 7\n+2\tAceView\tstop_codon\t219130389\t219130391\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10;\n+2\tAceView\tintron\t219130185\t219130301\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type 
gt_ag\n+2\tAceView\texon\t219130094\t219130184\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 8\n+2\tAceView\tintron\t219129898\t219130093\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\texon\t219129743\t219129897\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 9\n+2\tAceView\tintron\t219129332\t219129742\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\texon\t219128853\t219129331\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 10\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/ens_mm9_chr18.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ens_mm9_chr18.gff3 Thu Apr 23 17:57:49 2015 -0400
b
b'@@ -0,0 +1,1165 @@\n+##gff-version 3\n+18\tlincRNA\tgene\t3336414\t3366861\t.\t+\t.\tID=ENSMUSG00000091488;Name=AC124336.2\n+18\tlincRNA\ttranscript\t3336414\t3366861\t.\t+\t.\tID=ENSMUST00000171726;Parent=ENSMUSG00000091488;Name=AC124336.2-201\n+18\tlincRNA\texon\t3336414\t3337176\t.\t+\t.\tParent=ENSMUST00000171726\n+18\tlincRNA\texon\t3365925\t3366861\t.\t+\t.\tParent=ENSMUST00000171726\n+18\tprotein_coding\tgene\t9314042\t9450148\t.\t-\t.\tID=ENSMUSG00000024286;Name=Ccny\n+18\tprotein_coding\tmRNA\t9314042\t9450148\t.\t-\t.\tID=ENSMUST00000053917;Parent=ENSMUSG00000024286;Name=Ccny-201\n+18\tprotein_coding\tfive_prime_UTR\t9449670\t9450148\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9316554\t9316670\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9319407\t9319569\t.\t-\t1\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9332782\t9332948\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9345192\t9345311\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9345412\t9345469\t.\t-\t1\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9349386\t9349421\t.\t-\t1\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9353405\t9353505\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9377792\t9377826\t.\t-\t2\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9386733\t9386807\t.\t-\t2\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9449516\t9449669\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tthree_prime_UTR\t9314042\t9316553\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9314042\t9316670\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9319407\t9319569\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9332782\t9332948\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9345192\t9345311\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9345412\t9345469\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein
_coding\texon\t9349386\t9349421\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9353405\t9353505\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9377792\t9377826\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9386733\t9386807\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9449516\t9450148\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\tmRNA\t9314042\t9450148\t.\t-\t.\tID=ENSMUST00000115867;Parent=ENSMUSG00000024286;Name=Ccny-202\n+18\tprotein_coding\tfive_prime_UTR\t9449670\t9450148\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9316554\t9316670\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9319407\t9319569\t.\t-\t1\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9332782\t9332948\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9345192\t9345311\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9345412\t9345469\t.\t-\t1\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9349386\t9349421\t.\t-\t1\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9353405\t9353505\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9377792\t9377826\t.\t-\t2\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9449516\t9449669\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tthree_prime_UTR\t9314042\t9316553\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9314042\t9316670\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9319407\t9319569\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9332782\t9332948\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9345192\t9345311\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9345412\t9345469\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9349386\t9349421\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9353405\t9353505\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\
texon\t9377792\t9377826\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9449516\t9450148\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tmiRNA\tgene\t10782897\t10782983\t.\t-\t.\tID=ENSMUSG00000065399;Name=Mir133a-1\n+18\tmiRNA\ttranscript\t10782897\t10782983\t.\t-\t.\tID=ENSMUST00000083465;Parent=ENSMUSG00000065399;Name=Mir133a-1-201\n+18\tmiRNA\texon\t10782897\t10782983\t.\t-\t.\tParent=ENSMUST00000083465\n+18\tprotein_coding\tgene\t9726195\t9726668\t.\t-\t.\tID='..b'ein_coding\tCDS\t7441551\t7441636\t.\t-\t2\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7442791\t7442872\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7443972\t7444103\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7458930\t7459010\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7461636\t7461713\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7512942\t7513060\t.\t-\t2\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7561724\t7561760\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tthree_prime_UTR\t7347960\t7350962\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7347960\t7351142\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7353152\t7353295\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7355016\t7355124\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7356140\t7356233\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7379987\t7380067\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7403184\t7403354\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7439567\t7439631\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7440081\t7440277\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7440430\t7440504\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7441551\t7441636\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7442791\t7442872\t.\t-\t.\tParent=EN
SMUST00000115869\n+18\tprotein_coding\texon\t7443972\t7444103\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7458930\t7459010\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7461636\t7461713\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7512942\t7513060\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7561724\t7561870\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7626731\t7626861\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\tmRNA\t7429282\t7626861\t.\t-\t.\tID=ENSMUST00000025129;Parent=ENSMUSG00000057440;Name=Mpp7-201\n+18\tprotein_coding\tfive_prime_UTR\t7561728\t7561870\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\tfive_prime_UTR\t7626731\t7626861\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7429282\t7429322\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7430259\t7430270\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7430393\t7430413\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7439567\t7439631\t.\t-\t1\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7440081\t7440277\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7440430\t7440504\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7441551\t7441636\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7442791\t7442872\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7443972\t7444103\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7458930\t7459010\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7461636\t7461713\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7512942\t7513060\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7561724\t7561727\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7429282\t7429322\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7430259\t7430270\t.\t-\t.\tParent=ENSMUST00000025
129\n+18\tprotein_coding\texon\t7430393\t7430413\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7439567\t7439631\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7440081\t7440277\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7440430\t7440504\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7441551\t7441636\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7442791\t7442872\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7443972\t7444103\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7458930\t7459010\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7461636\t7461713\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7512942\t7513060\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7561724\t7561870\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7626731\t7626861\t.\t-\t.\tParent=ENSMUST00000025129\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/ens_mm9_chr18.gtf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ens_mm9_chr18.gtf Thu Apr 23 17:57:49 2015 -0400
b
b'@@ -0,0 +1,1066 @@\n+18\tlincRNA\texon\t11049085\t11050819\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000138373"; exon_number "1"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11051256\t11051487\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000138373"; exon_number "2"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11049929\t11050254\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000133759"; exon_number "1"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11050622\t11050819\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000133759"; exon_number "2"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11051256\t11051366\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000133759"; exon_number "3"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11052473\t11052565\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000133759"; exon_number "4"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t5162878\t5164430\t.\t-\t.\tgene_id "ENSMUSG00000085461"; transcript_id "ENSMUST00000150337"; exon_number "1"; gene_name "Gm16954"; \n+18\tlincRNA\texon\t5165286\t5165400\t.\t-\t.\tgene_id "ENSMUSG00000085461"; transcript_id "ENSMUST00000150337"; exon_number "2"; gene_name "Gm16954"; \n+18\tlincRNA\texon\t5165669\t5165729\t.\t-\t.\tgene_id "ENSMUSG00000085461"; transcript_id "ENSMUST00000150337"; exon_number "3"; gene_name "Gm16954"; \n+18\tprotein_coding\texon\t12657194\t12657637\t.\t-\t.\tgene_id "ENSMUSG00000090309"; transcript_id "ENSMUST00000172267"; exon_number "1"; gene_name "AC102131.1"; \n+18\tprotein_coding\tCDS\t12657194\t12657637\t.\t-\t0\tgene_id "ENSMUSG00000090309"; transcript_id "ENSMUST00000172267"; exon_number "1"; gene_name "AC102131.1"; \n+18\tprotein_coding\tstart_codon\t12657635\t12657637\t.\t-\t0\tgene_id "ENSMUSG00000090309"; transcript_id "ENSMUST00000172267"; exon_number "1"; gene_name "AC102131.1"; 
\n+18\tprotein_coding\tstop_codon\t12657194\t12657196\t.\t-\t0\tgene_id "ENSMUSG00000090309"; transcript_id "ENSMUST00000172267"; exon_number "1"; gene_name "AC102131.1"; \n+18\tlincRNA\texon\t11979185\t11979574\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "1"; gene_name "Gm6277"; \n+18\tlincRNA\texon\t11979624\t11980616\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "2"; gene_name "Gm6277"; \n+18\tlincRNA\texon\t11981407\t11981548\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "3"; gene_name "Gm6277"; \n+18\tlincRNA\texon\t11983673\t11983735\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "4"; gene_name "Gm6277"; \n+18\tlincRNA\texon\t11997690\t11997846\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "5"; gene_name "Gm6277"; \n+18\tmisc_RNA\texon\t3860106\t3860428\t.\t+\t.\tgene_id "ENSMUSG00000084719"; transcript_id "ENSMUST00000122770"; exon_number "1"; gene_name "7SK.69"; \n+18\tprotein_coding\texon\t11815936\t11816201\t.\t+\t.\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "1"; gene_name "Rbbp8"; \n+18\tprotein_coding\tCDS\t11819342\t11819450\t.\t+\t0\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "1"; gene_name "Rbbp8"; \n+18\tprotein_coding\tstart_codon\t11819342\t11819344\t.\t+\t0\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "1"; gene_name "Rbbp8"; \n+18\tprotein_coding\texon\t11819244\t11819450\t.\t+\t.\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "2"; gene_name "Rbbp8"; \n+18\tprotein_coding\tCDS\t11831091\t11831133\t.\t+\t2\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "2"; gene_name "Rbbp8"; \n+18\tprotein_coding\texon\t11831091\t11831133\t.\t+\t.\tgene_id "ENSMUSG00000041238"; 
transcript_id "ENSMUST00000047322"; exon_number "3"; gene_name "Rbbp8"; \n+18\tprotein_coding\tCDS\t11836104\t11836199\t.\t+\t1\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "3"; gene_name "Rbbp8"; \n+18\tprotein_coding\texon\t11836104\t11836199\t.\t+\t.\tgene_id "ENSMUSG000000'..b'ST00000067947"; exon_number "21"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10112342\t10112390\t.\t-\t1\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "21"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10116772\t10116860\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "22"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10116772\t10116860\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "22"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10119883\t10119943\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "23"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10119883\t10119943\t.\t-\t1\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "23"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10122607\t10122766\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "24"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10122607\t10122766\t.\t-\t2\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "24"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10129303\t10129394\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "25"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10129303\t10129394\t.\t-\t1\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "25"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10131528\t10131666\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "26"; gene_name "Rock1"; 
\n+18\tprotein_coding\tCDS\t10131528\t10131666\t.\t-\t2\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "26"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10132126\t10132270\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "27"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10132126\t10132270\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "27"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10134414\t10134498\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "28"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10134414\t10134498\t.\t-\t1\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "28"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10136094\t10136269\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "29"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10136094\t10136269\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "29"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10140174\t10140311\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "30"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10140174\t10140311\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "30"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10140786\t10140886\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "31"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10140786\t10140886\t.\t-\t2\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "31"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10150233\t10150314\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "32"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10150233\t10150314\t.\t-\t0\tgene_id 
"ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "32"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10181223\t10181790\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "33"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10181223\t10181315\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "33"; gene_name "Rock1"; \n+18\tprotein_coding\tstop_codon\t10066046\t10066048\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "33"; gene_name "Rock1"; \n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/gencode_ens_hav.gtf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gencode_ens_hav.gtf Thu Apr 23 17:57:49 2015 -0400
b
b'@@ -0,0 +1,50 @@\n+1\tHAVANA\tgene\t69091\t70008\t.\t+\t.\tgene_id "ENSG00000186092.4"; transcript_id "ENSG00000186092.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5"; level 2; havana_gene "OTTHUMG00000001094.1";\n+1\tHAVANA\ttranscript\t69091\t70008\t.\t+\t.\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\texon\t69091\t70008\t.\t+\t.\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1;  exon_id "ENSE00002319515.1";  level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\tCDS\t69091\t70005\t.\t+\t0\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1;  exon_id "ENSE00002319515.1";  level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\tstart_codon\t69091\t69093\t.\t+\t0\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1;  exon_id 
"ENSE00002319515.1";  level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\tstop_codon\t70006\t70008\t.\t+\t0\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1;  exon_id "ENSE00002319515.1";  level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\tUTR\t70006\t70008\t.\t+\t.\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tENSEMBL\tgene\t134901\t139379\t.\t-\t.\tgene_id "ENSG00000237683.5"; transcript_id "ENSG00000237683.5"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL627309.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL627309.1"; level 3;\n+1\tENSEMBL\ttranscript\t134901\t139379\t.\t-\t.\tgene_id "ENSG00000237683.5"; transcript_id "ENST00000423372.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL627309.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL627309.1-201"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\texon\t137621\t139379\t.\t-\t.\tgene_id "ENSG00000237683.5"; transcript_id "ENST00000423372.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL627309.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL627309.1-201"; exon_number 1;  exon_id 
"ENSE00002221580.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t138533\t139309\t.\t-\t0\tgene_id "ENSG000002'..b'gene_name "AL669831.1"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL669831.1-201"; exon_number 3;  exon_id "ENSE00003138540.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t738532\t738618\t.\t-\t0\tgene_id "ENSG00000269831.1"; transcript_id "ENST00000599533.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL669831.1"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL669831.1-201"; exon_number 3;  exon_id "ENSE00003138540.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tgene\t818043\t819983\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENSG00000269308.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2"; level 3;\n+1\tENSEMBL\ttranscript\t818043\t819983\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\texon\t818043\t818058\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 1;  exon_id "ENSE00003079649.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t818043\t818058\t.\t+\t0\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 1;  exon_id 
"ENSE00003079649.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\texon\t819496\t819513\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 2;  exon_id "ENSE00003048391.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t819496\t819513\t.\t+\t2\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 2;  exon_id "ENSE00003048391.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\texon\t819961\t819983\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 3;  exon_id "ENSE00003055565.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t819961\t819980\t.\t+\t2\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 3;  exon_id "ENSE00003055565.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tstop_codon\t819981\t819983\t.\t+\t0\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 3;  exon_id "ENSE00003055565.1";  level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tUTR\t819981\t819983\t.\t+\t.\tgene_id "ENSG00000269308.1"; 
transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; level 3; tag "basic"; tag "appris_principal";\n'
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/s_cerevisiae_SCU49845.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/s_cerevisiae_SCU49845.gff Thu Apr 23 17:57:49 2015 -0400
b
@@ -0,0 +1,8 @@
+IX gbk2gff gene 687 3158 . + . ID=AXL2;Name=AXL2
+IX gbk2gff mRNA 687 3158 . + . ID=Transcript:AXL2;Parent=AXL2
+IX gbk2gff CDS 687 3158 . + . Parent=Transcript:AXL2
+IX gbk2gff exon 687 3158 . + . Parent=Transcript:AXL2
+IX gbk2gff gene 3300 4037 . - . ID=REV7;Name=REV7
+IX gbk2gff mRNA 3300 4037 . - . ID=Transcript:REV7;Parent=REV7
+IX gbk2gff CDS 3300 4037 . - . Parent=Transcript:REV7
+IX gbk2gff exon 3300 4037 . - . Parent=Transcript:REV7
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/s_cerevisiae_SCU49845.gff3
--- a/test-data/s_cerevisiae_SCU49845.gff3 Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,8 +0,0 @@
-IX gbk_to_gff gene 687 3158 . + . ID=AXL2;Name=AXL2
-IX gbk_to_gff . 687 3158 . + . ID=Transcript:AXL2;Parent=AXL2
-IX gbk_to_gff CDS 687 3158 . + . Parent=Transcript:AXL2
-IX gbk_to_gff exon 687 3158 . + . Parent=Transcript:AXL2
-IX gbk_to_gff gene 3300 4037 . - . ID=REV7;Name=REV7
-IX gbk_to_gff . 3300 4037 . - . ID=Transcript:REV7;Parent=REV7
-IX gbk_to_gff CDS 3300 4037 . - . Parent=Transcript:REV7
-IX gbk_to_gff exon 3300 4037 . - . Parent=Transcript:REV7
b
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/single_parent_feature_record.gff3
--- a/test-data/single_parent_feature_record.gff3 Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,10 +0,0 @@
-chr1 . miRNA_primary_transcript 1380242 1380467 . - . ID=MI0031047;Alias=MI0031047;Name=gma-MIR9754
-chr1 . miRNA 1380249 1380270 . - . ID=MIMAT0036385;Alias=MIMAT0036385;Name=gma-miR9754;Derives_from=MI0031047
-chr1 . miRNA_primary_transcript 2410094 2410318 . + . ID=MI0016507;Alias=MI0016507;Name=gma-MIR4367
-chr1 . miRNA 2410242 2410263 . + . ID=MIMAT0018266;Alias=MIMAT0018266;Name=gma-miR4367;Derives_from=MI0016507
-chr1 . miRNA_primary_transcript 4792375 4792487 . - . ID=MI0021714;Alias=MI0021714;Name=gma-MIR395h
-chr1 . miRNA 4792388 4792408 . - . ID=MIMAT0024920;Alias=MIMAT0024920;Name=gma-miR395h;Derives_from=MI0021714
-chr1 . miRNA_primary_transcript 4797903 4798018 . - . ID=MI0021715;Alias=MI0021715;Name=gma-MIR395i
-chr1 . miRNA 4797916 4797936 . - . ID=MIMAT0024921;Alias=MIMAT0024921;Name=gma-miR395i;Derives_from=MI0021715
-chr1 . miRNA_primary_transcript 4810817 4810942 . - . ID=MI0021716;Alias=MI0021716;Name=gma-MIR395j
-chr1 . miRNA 4810830 4810850 . - . ID=MIMAT0024922;Alias=MIMAT0024922;Name=gma-miR395j;Derives_from=MI0021716
b
diff -r d4f9b7beb52f -r 7d67331368f3 tool_conf.xml.sample
--- a/tool_conf.xml.sample Thu Apr 23 17:51:14 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,9 +0,0 @@
-<section name="GFFtools" id="gfftools.web">
-    <tool file="GFFtools-GX/gff_to_bed.xml"/>
-    <tool file="GFFtools-GX/bed_to_gff.xml"/>
-    <tool file="GFFtools-GX/gbk_to_gff.xml"/>
-    <tool file="GFFtools-GX/gff_to_gbk.xml"/>
-    <tool file="GFFtools-GX/gff_to_gtf.xml"/>
-    <tool file="GFFtools-GX/gtf_to_gff.xml"/>
-    <tool file="GFFtools-GX/gff_fmap.xml"/>
-</section>