Previous changeset 8:d4f9b7beb52f (2015-04-23) Next changeset 10:c42c69aa81f8 (2015-04-23) |
Commit message:
fixing the new version upload manually |
added:
test-data/CCDS30770.bed test-data/CCDS30770.gff test-data/MB7_3R.bed test-data/MB7_3R.gff3 test-data/aceview_hs_37.gff3 test-data/aceview_hs_37.gtf test-data/ens_mm9_chr18.gff3 test-data/ens_mm9_chr18.gtf test-data/gencode_ens_hav.gtf test-data/s_cerevisiae_SCU49845.gff |
removed:
GFFParser.py README bed_to_gff.py bed_to_gff.xml gbk_to_gff.py gbk_to_gff.xml gff_fmap.py gff_fmap.xml gff_to_bed.py gff_to_bed.xml gff_to_gbk.py gff_to_gbk.xml gff_to_gtf.py gff_to_gtf.xml gffparser_bcbio.py gtf_to_gff.py gtf_to_gff.xml helper.py test-data/s_cerevisiae_SCU49845.gff3 test-data/single_parent_feature_record.gff3 tool_conf.xml.sample |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 GFFParser.py --- a/GFFParser.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,495 +0,0 @@\n-#!/usr/bin/env python\n-"""\n-Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.\n-\n-Requirements: \n- Numpy :- http://numpy.org/ \n- Scipy :- http://scipy.org/ \n-\n-Copyright (C)\t\n-\n-2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. \n-2012-2014 Memorial Sloan Kettering Cancer Center, New York City, USA.\n-"""\n-\n-import re\n-import os\n-import sys\n-import urllib\n-import numpy as np\n-import scipy.io as sio\n-from collections import defaultdict\n-import helper as utils \n-\n-def attribute_tags(col9):\n- """ \n- Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file \n-\n- @args col9: attribute column from GFF file \n- @type col9: str\n- """\n- info = defaultdict(list)\n- is_gff = False\n- \n- if not col9:\n- return is_gff, info\n- \n- # trim the line ending semi-colon ucsc may have some white-space \n- col9 = col9.rstrip(\';| \')\n- # attributes from 9th column \n- atbs = col9.split(" ; ")\n- if len(atbs) == 1:\n- atbs = col9.split("; ")\n- if len(atbs) == 1:\n- atbs = col9.split(";")\n- # check the GFF3 pattern which has key value pairs like:\n- gff3_pat = re.compile("\\w+=")\n- # sometime GTF have: gene_id uc002zkg.1;\n- gtf_pat = re.compile("\\s?\\w+\\s")\n-\n- key_vals = []\n-\n- if gff3_pat.match(atbs[0]): # gff3 pattern \n- is_gff = True\n- key_vals = [at.split(\'=\') for at in atbs]\n- elif gtf_pat.match(atbs[0]): # gtf pattern\n- for at in atbs:\n- key_vals.append(at.strip().split(" ",1))\n- else:\n- # to handle attribute column has only single value \n- key_vals.append([\'ID\', atbs[0]])\n- # get key, val items \n- for item in key_vals:\n- key, val = item\n- # replace the double qoutes from feature identifier \n- val = re.sub(\'"\', \'\', val)\n- # replace the web formating place holders to plain text format \n- info[key].extend([urllib.unquote(v) for v in val.split(\',\') if v])\n-\n- 
return is_gff, info\n- \n-def spec_features_keywd(gff_parts):\n- """\n- Specify the feature key word according to the GFF specifications\n-\n- @args gff_parts: attribute field key \n- @type gff_parts: str \n- """\n- for t_id in ["transcript_id", "transcriptId", "proteinId"]:\n- try:\n- gff_parts["info"]["Parent"] = gff_parts["info"][t_id]\n- break\n- except KeyError:\n- pass\n- for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:\n- try:\n- gff_parts["info"]["GParent"] = gff_parts["info"][g_id]\n- break\n- except KeyError:\n- pass\n- ## TODO key words\n- for flat_name in ["Transcript", "CDS"]:\n- if gff_parts["info"].has_key(flat_name):\n- # parents\n- if gff_parts[\'type\'] in [flat_name] or re.search(r\'transcript\', gff_parts[\'type\'], re.IGNORECASE):\n- if not gff_parts[\'id\']:\n- gff_parts[\'id\'] = gff_parts[\'info\'][flat_name][0]\n- #gff_parts["info"]["ID"] = [gff_parts["id"]]\n- # children \n- elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",\n- "coding_exon", "five_prime_UTR", "CDS", "stop_codon",\n- "start_codon"]:\n- gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]\n- break\n- return gff_parts\n-\n-def Parse(ga_file):\n- """\n- Parsing GFF/GTF file based on feature relationship, it takes the input file.\n-\n- @args ga_file: input file name \n- @type ga_file: str \n- """\n- child_map = defaultdict(list)\n- parent_map = dict()\n-\n- ga_handle = utils.open_file(ga_file)\n-\n- for rec in ga_handle:\n- rec = rec.strip(\'\\n\\r\')\n- \n- # skip empty line fasta identifier and commented line\n- if not rec '..b'lete\'] = []\n- gene[g_cnt][\'is_complete\'] = []\n- gene[g_cnt][\'is_correctly_gff3_referenced\'] = \'\'\n- gene[g_cnt][\'splicegraph\'] = []\n- g_cnt += 1 \n-\n- ## deleting empty gene records from the main array\n- XPFLG=0\n- for XP, ens in enumerate(gene):\n- if ens[0]==0:\n- XPFLG=1\n- break\n- \n- if XPFLG==1:\n- XQC = range(XP, len(gene)+1)\n- gene = np.delete(gene, XQC)\n-\n- return gene 
\n-\n-def NonetoemptyList(XS):\n- """\n- Convert a None type to empty list \n-\n- @args XS: None type \n- @type XS: str \n- """\n- return [] if XS is None else XS \n-\n-def create_missing_feature_type(p_feat, c_feat):\n- """\n- GFF/GTF file defines only child features. This function tries to create \n- the parent feature from the information provided in the attribute column. \n-\n- example: \n- chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1"; \n- chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1"; \n- chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2"; \n-\n- This function gets the parsed feature annotations. \n- \n- @args p_feat: Parent feature map \n- @type p_feat: collections defaultdict\n- @args c_feat: Child feature map \n- @type c_feat: collections defaultdict\n- """\n-\n- child_n_map = defaultdict(list)\n- for fid, det in c_feat.items():\n- # get the details from grand child \n- GID = STRD = SCR = None\n- SPOS, EPOS = [], [] \n- TYP = dict()\n- for gchild in det:\n- GID = gchild.get(\'gene_id\', [\'\'])[0] \n- SPOS.append(gchild.get(\'location\', [])[0]) \n- EPOS.append(gchild.get(\'location\', [])[1]) \n- STRD = gchild.get(\'strand\', \'\')\n- SCR = gchild.get(\'score\', \'\')\n- if gchild.get(\'type\', \'\') == "gene": ## gencode GTF file has this problem \n- continue\n- TYP[gchild.get(\'type\', \'\')] = 1\n- SPOS.sort() \n- EPOS.sort()\n- \n- # infer transcript type\n- transcript_type = \'transcript\'\n- transcript_type = \'mRNA\' if TYP.get(\'CDS\', \'\') or TYP.get(\'cds\', \'\') else transcript_type\n- \n- # gene id and transcript id are same\n- transcript_id = fid[-1]\n- if GID == transcript_id:\n- transcript_id = \'Transcript:\' + str(GID)\n- \n- # level -1 feature type \n- p_feat[(fid[0], fid[1], GID)] = dict( type = \'gene\',\n- location = [], ## infer location based on multiple 
transcripts \n- strand = STRD,\n- name = GID )\n- # level -2 feature type \n- child_n_map[(fid[0], fid[1], GID)].append(\n- dict( type = transcript_type,\n- location = [SPOS[0], EPOS[-1]], \n- strand = STRD, \n- score = SCR, \n- ID = transcript_id,\n- gene_id = \'\' ))\n- # reorganizing the grand child\n- for gchild in det:\n- child_n_map[(fid[0], fid[1], transcript_id)].append(\n- dict( type = gchild.get(\'type\', \'\'),\n- location = gchild.get(\'location\'),\n- strand = gchild.get(\'strand\'), \n- ID = gchild.get(\'ID\'),\n- score = gchild.get(\'score\'),\n- gene_id = \'\' ))\n- return p_feat, child_n_map \n-\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 README --- a/README Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,61 +0,0 @@ -A collection of tools for converting genome annotation between GTF (Gene Transfer Format), -BED (Browser Extensible Data) and GFF (Generic Feature Format). - -INTRODUCTION - -Several genome annotation centers provide their data in GTF, BED, GFF3 etc. I have few programs -they mainly deals with converting between GTF, BED and GFF3 formats. They are extensively tested -with files from different centers like ENSEMBL, UCSC, JGI and NCBI AceView. Please follow the -instructions below to clone these tools into your galaxy instance. - -CONTENTS - -Tool configuration files in *.xml format. - - gtf_to_gff.xml - gff_to_gtf.xml - bed_to_gff.xml - gff_to_bed.xml - gbk_to_gff.xml - gff_fmap.xml - -Python based scripts. - - gtf_to_gff.py: convert data from GTF to valid GFF3. - gff_to_gtf.py: convert data from GFF3 to GTF. - bed_to_gff.py: convert data from a 12 column UCSC wiggle BED format to GFF3. - gff_to_bed.py: convert gene transcript annotation from GFF3 to UCSC wiggle 12 column BED format. - gbk_to_gff.py: convert data from genbank format to GFF. - gff_fmap.py: find the relation between different features described in a GFF file. - GFFParser.py: Parse GFF/GTF files. - helper.py: Utility functions. - -test-data: Test data set. (move to your galaxy_root_folder/test-data/) - - You may need to move the test files into your test-data directory so galaxy can find them. - If you want to run the functional tests eg as: - - exmaple: - sh run_functional_tests.sh -id fml_gtf2gff - -REQUIREMENTS - - python - -COMMENTS/QUESTIONS - -I can be reached at vipin [at] cbio.mskcc.org - -LICENSE - -Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society - 2013-2014 Memorial Sloan Kettering Cancer Center - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3 of the License, or -(at your option) any later version. 
- -COURTESY - -To the Galaxy Team. |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 bed_to_gff.py --- a/bed_to_gff.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,70 +0,0 @@ -#!/usr/bin/env python -""" -Convert genome annotation data in a 12 column BED format to GFF3. - -Usage: python bed_to_gff.py in.bed > out.gff - -Requirement: - helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py - -Copyright (C) - 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. - 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. -""" - -import re -import sys -import helper - -def __main__(): - """ - main function - """ - - try: - bed_fname = sys.argv[1] - except: - print __doc__ - sys.exit(-1) - - bed_fh = helper.open_file(bed_fname) - - for line in bed_fh: - line = line.strip( '\n\r' ) - - if not line or line[0] in ['#']: - continue - - parts = line.split('\t') - assert len(parts) >= 12, line - - rstarts = parts[-1].split(',') - rstarts.pop() if rstarts[-1] == '' else rstarts - - exon_lens = parts[-2].split(',') - exon_lens.pop() if exon_lens[-1] == '' else exon_lens - - if len(rstarts) != len(exon_lens): - continue # checking the consistency col 11 and col 12 - - if len(rstarts) != int(parts[-3]): - continue # checking the number of exons and block count are same - - if not parts[5] in ['+', '-']: - parts[5] = '.' # replace the unknown strand with '.' - - # bed2gff result line - print '%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3]) - print '%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3], parts[3]) - - st = int(parts[1]) - for ex_cnt in range(int(parts[-3])): - start = st + int(rstarts[ex_cnt]) + 1 - stop = start + int(exon_lens[ex_cnt]) - 1 - print '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s' % (parts[0], start, stop, parts[4], parts[5], parts[3]) - - bed_fh.close() - - -if __name__ == "__main__": - __main__() |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 bed_to_gff.xml --- a/bed_to_gff.xml Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,89 +0,0 @@ -<tool id="fml_bed2gff" name="BED-to-GFF" version="2.0.0"> - <description>converter</description> - <command interpreter="python">bed_to_gff.py $inf_bed > $gff_format - </command> - <inputs> - <param format="bed" name="inf_bed" type="data" label="Convert this query" help="Provide genome annotation in 12 column BED format."/> - </inputs> - <outputs> - <data format="gff3" name="gff_format" label="${tool.name} on ${on_string}: Converted" /> - </outputs> - <tests> - <test> - <param name="inf_bed" value="ccds_genes.bed" /> - <output name="gff_format" file="ccds_genes.gff3" /> - </test> - <test> - <param name="inf_bed" value="hs_2009.bed" /> - <output name="gff_format" file="hs_2009.gff3" /> - </test> - </tests> - <help> - -**What it does** - -This tool converts data from a 12 column UCSC wiggle BED format to GFF3 (scroll down for format description). - --------- - -**Example** - -- The following data in UCSC Wiggle BED format:: - - chr1 11873 14409 uc001aaa.3 0 + 11873 11873 0 3 354,109,1189, 0,739,1347, - -- Will be converted to GFF3:: - - ##gff-version 3 - chr1 bed2gff gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3 - chr1 bed2gff transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3 - chr1 bed2gff exon 11874 12227 0 + . Parent=uc001aaa.3 - chr1 bed2gff exon 12613 12721 0 + . Parent=uc001aaa.3 - chr1 bed2gff exon 13221 14409 0 + . Parent=uc001aaa.3 - --------- - -**About formats** - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). 
- -The additional BED fields (optional) are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - -**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: - - 1. seqid - Must be a chromosome or scaffold or contig. - 2. source - The program that generated this feature. - 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. stop - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. attributes - All lines with the same group are linked together into a single item. 
- --------- - -**Copyright** - -2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center - -Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) - - </help> -</tool> |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gbk_to_gff.py --- a/gbk_to_gff.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,213 +0,0 @@ -#!/usr/bin/env python -""" -Convert data from Genbank format to GFF. - -Usage: -python gbk_to_gff.py in.gbk > out.gff - -Requirements: - BioPython:- http://biopython.org/ - helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py - -Copyright (C) - 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. - 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. -""" - -import os -import re -import sys -import collections -from Bio import SeqIO -import helper - -def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk): - """ - Write the feature information - """ - - for gname, ginfo in genes.items(): - line = [str(chr_id), - 'gbk_to_gff', - ginfo[3], - str(ginfo[0]), - str(ginfo[1]), - '.', - ginfo[2], - '.', - 'ID=%s;Name=%s' % (str(gname), str(gname))] - print '\t'.join(line) - ## construct the transcript line is not defined in the original file - t_line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', ginfo[2], '.'] - - if not transcripts: - t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname))) - - if exons: ## get the entire transcript region from the defined feature - t_line[3] = str(exons[gname][0][0]) - t_line[4] = str(exons[gname][0][-1]) - elif cds: - t_line[3] = str(cds[gname][0][0]) - t_line[4] = str(cds[gname][0][-1]) - print '\t'.join(t_line) - - if exons: - exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon') - - if cds: - exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS') - if not exons: - exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon') - - else: ## transcript is defined - for idx in transcripts[gname]: - t_line[2] = idx[3] - t_line[3] = str(idx[0]) - t_line[4] = str(idx[1]) - t_line.append('ID='+str(idx[2])+';Parent='+str(gname)) - print '\t'.join(t_line) - - ## feature line print call - if exons: - exon_line_print(t_line, exons[gname], str(idx[2]), 'exon') - if cds: - 
exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS') - if not exons: - exon_line_print(t_line, cds[gname], str(idx[2]), 'exon') - - if len(genes) == 0: ## feature entry with fragment information - - line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', orient, '.'] - fStart = fStop = None - - for eid, ex in cds.items(): - fStart = ex[0][0] - fStop = ex[0][-1] - - for eid, ex in exons.items(): - fStart = ex[0][0] - fStop = ex[0][-1] - - if fStart or fStart: - - line[2] = 'gene' - line[3] = str(fStart) - line[4] = str(fStop) - line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk)) - print "\t".join(line) - - if not cds: - line[2] = 'transcript' - else: - line[2] = 'mRNA' - - line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk) - print "\t".join(line) - - if exons: - exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') - - if cds: - exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS') - if not exons: - exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') - - unk +=1 - - return unk - -def exon_line_print(temp_line, trx_exons, parent, ftype): - """ - Print the EXON feature line - """ - - for ex in trx_exons: - temp_line[2] = ftype - temp_line[3] = str(ex[0]) - temp_line[4] = str(ex[1]) - temp_line[8] = 'Parent=%s' % parent - print '\t'.join(temp_line) - -def gbk_parse(fname): - """ - Extract genome annotation recods from genbank format - - @args fname: gbk file name - @type fname: str - """ - - fhand = helper.open_file(gbkfname) - unk = 1 - - for record in SeqIO.parse(fhand, "genbank"): - - gene_tags = dict() - tx_tags = collections.defaultdict(list) - exon = collections.defaultdict(list) - cds = collections.defaultdict(list) - mol_type, chr_id = None, None - - for rec in record.features: - - if rec.type == 'source': - try: - mol_type = rec.qualifiers['mol_type'][0] - except: - mol_type = '.' 
- pass - try: - chr_id = rec.qualifiers['chromosome'][0] - except: - chr_id = record.name - continue - - strand='-' - strand='+' if rec.strand>0 else strand - - fid = None - try: - fid = rec.qualifiers['gene'][0] - except: - pass - - transcript_id = None - try: - transcript_id = rec.qualifiers['transcript_id'][0] - except: - pass - - if re.search(r'gene', rec.type): - gene_tags[fid] = (rec.location._start.position+1, - rec.location._end.position, - strand, - rec.type - ) - elif rec.type == 'exon': - exon[fid].append((rec.location._start.position+1, - rec.location._end.position)) - elif rec.type=='CDS': - cds[fid].append((rec.location._start.position+1, - rec.location._end.position)) - else: - # get all transcripts - if transcript_id: - tx_tags[fid].append((rec.location._start.position+1, - rec.location._end.position, - transcript_id, - rec.type)) - # record extracted, generate feature table - unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk) - - fhand.close() - - -if __name__=='__main__': - - try: - gbkfname = sys.argv[1] - except: - print __doc__ - sys.exit(-1) - - ## extract gbk records - gbk_parse(gbkfname) |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gbk_to_gff.xml --- a/gbk_to_gff.xml Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,91 +0,0 @@ -<tool id="fml_gbk2gff" name="GBK-to-GFF" version="2.0.0"> - <description>converter</description> - <command interpreter="python">gbk_to_gff.py $inf_gbk > $gff_format - </command> - <inputs> - <param format="gb,gbk,genbank,txt" name="inf_gbk" type="data" label="Convert this query" help="GenBank flat file format consists of an annotation section and a sequence section."/> - </inputs> - <outputs> - <data format="gff3" name="gff_format" label="${tool.name} on ${on_string}: Converted"/> - </outputs> - <tests> - <test> - <param name="inf_gbk" value="s_cerevisiae_SCU49845.gbk" /> - <output name="gff_format" file="s_cerevisiae_SCU49845.gff3" /> - </test> - </tests> - <help> - -**What it does** - -This tool converts data from a GenBank_ flat file format to GFF (scroll down for format description). - -.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ - ------- - -**Example** - -- The following data in GenBank format:: - - LOCUS NM_001202705 2406 bp mRNA linear PLN 28-MAY-2011 - DEFINITION Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) - mRNA, complete cds. - ACCESSION NM_001202705 - VERSION NM_001202705.1 GI:334184566......... - FEATURES Location/Qualifiers - source 1..2406 - /organism="Arabidopsis thaliana" - /mol_type="mRNA" - /db_xref="taxon:3702"........ - gene 1..2406 - /gene="THIC" - /locus_tag="AT2G29630" - /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........ - ORIGIN - 1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg - 61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat - 121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc - 181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca - // - - -- Will be converted to GFF3:: - - ##gff-version 3 - NM_001202705 gbk_to_gff chromosome 1 2406 . + 1 ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705 - NM_001202705 gbk_to_gff gene 1 2406 . 
+ 1 ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC - NM_001202705 gbk_to_gff mRNA 192 2126 . + 1 ID=AT2G29630.t01;Parent=AT2G29630 - NM_001202705 gbk_to_gff CDS 192 2126 . + 1 ID=AT2G29630.p01;Parent=AT2G29630.t01 - NM_001202705 gbk_to_gff exon 192 2126 . + 1 Parent=AT2G29630.t01 - ------- - -**About formats** - -**GenBank format** An example of a GenBank record may be viewed here_ - -.. _here: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html - -**GFF3** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: - - 1. seqid - Must be a chromosome or scaffold or contig. - 2. source - The program that generated this feature. - 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. stop - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. attributes - All lines with the same group are linked together into a single item. - --------- - -**Copyright** - -2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center - -Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) - - </help> -</tool> |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gff_fmap.py --- a/gff_fmap.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,203 +0,0 @@\n-#!/usr/bin/env python\n-"""\n-GFF feature mapping program, to find the relation between different features described in a given GFF file. \n-\n-Usage: \n-python gff_fmap.py in.gff > out.txt \n-\n-Courtesy: Brad Chapman \n- Few functions are inherited from bcbio-GFFParser program. \n-"""\n-\n-import re\n-import sys \n-import urllib\n-import collections\n-from helper import open_file\n-\n-def _gff_line_map(line):\n- """Parses a line of GFF into a dictionary.\n- Given an input line from a GFF file, this:\n- - breaks it into component elements\n- - determines the type of attribute (flat, parent, child or annotation)\n- - generates a dictionary of GFF info \n- """\n- gff3_kw_pat = re.compile("\\w+=")\n- def _split_keyvals(keyval_str):\n- """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.\n- GFF3 has key value pairs like:\n- count=9;gene=amx-2;sequence=SAGE:aacggagccg\n- GFF2 and GTF have: \n- Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"\n- name "fgenesh1_pg.C_chr_1000003"; transcriptId 869\n- """\n- quals = collections.defaultdict(list)\n- if keyval_str is None:\n- return quals\n- # ensembl GTF has a stray semi-colon at the end\n- if keyval_str[-1] == \';\':\n- keyval_str = keyval_str[:-1]\n- # GFF2/GTF has a semi-colon with at least one space after it.\n- # It can have spaces on both sides; wormbase does this.\n- # GFF3 works with no spaces.\n- # Split at the first one we can recognize as working\n- parts = keyval_str.split(" ; ")\n- if len(parts) == 1:\n- parts = keyval_str.split("; ")\n- if len(parts) == 1:\n- parts = keyval_str.split(";")\n- # check if we have GFF3 style key-vals (with =)\n- is_gff2 = True\n- if gff3_kw_pat.match(parts[0]):\n- is_gff2 = False\n- key_vals = [p.split(\'=\') for p in parts]\n- # otherwise, we are separated by a space with a key as the first item\n- else:\n- pieces = []\n- for p in parts:\n- # fix misplaced semi-colons in keys in some GFF2 files\n- if p and p[0] == \';\':\n- p = p[1:]\n- 
pieces.append(p.strip().split(" "))\n- key_vals = [(p[0], " ".join(p[1:])) for p in pieces]\n- for key, val in key_vals:\n- # remove quotes in GFF2 files\n- if (len(val) > 0 and val[0] == \'"\' and val[-1] == \'"\'):\n- val = val[1:-1] \n- if val:\n- quals[key].extend(val.split(\',\'))\n- # if we don\'t have a value, make this a key=True/False style\n- # attribute\n- else:\n- quals[key].append(\'true\')\n- for key, vals in quals.items():\n- quals[key] = [urllib.unquote(v) for v in vals]\n- return quals, is_gff2\n-\n- def _nest_gff2_features(gff_parts):\n- """Provide nesting of GFF2 transcript parts with transcript IDs.\n-\n- exons and coding sequences are mapped to a parent with a transcript_id\n- in GFF2. This is implemented differently at different genome centers\n- and this function attempts to resolve that and map things to the GFF3\n- way of doing them.\n- """\n- # map protein or transcript ids to a parent\n- for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:\n- try:\n- gff_parts["quals"]["Parent"] = \\\n- gff_parts["quals"][transcript_id]\n- break\n- except KeyError:\n- pass\n- # case for WormBase GFF -- everything labelled as Transcript or CDS\n- for flat_name in ["Transcript", "CDS"]:\n- if gff_parts["quals"].has_key(flat_name):\n- # parent types\n- if gff_parts["type"] in [flat_name]:\n- if not gff_parts["id"]:\n- '..b' break\n-\n- return gff_parts\n-\n- line = line.strip()\n- if line == \'\':return [(\'directive\', line)] # sometimes the blank lines will be there \n- if line[0] == \'>\':return [(\'directive\', \'\')] # sometimes it will be a FATSA header\n- if line[0] == "#":\n- return [(\'directive\', line[2:])]\n- elif line:\n- parts = line.split(\'\\t\')\n- if len(parts) == 1 and re.search(r\'\\w+\', parts[0]):return [(\'directive\', \'\')] ## GFF files with FASTA sequence together \n- assert len(parts) == 9, line\n- gff_parts = [(None if p == \'.\' else p) for p in parts]\n- gff_info = dict()\n- \n- # collect all of the base 
qualifiers for this item\n- quals, is_gff2 = _split_keyvals(gff_parts[8])\n-\n- gff_info["is_gff2"] = is_gff2\n-\n- if gff_parts[1]:quals["source"].append(gff_parts[1])\n- gff_info[\'quals\'] = dict(quals)\n-\n- # if we are describing a location, then we are a feature\n- if gff_parts[3] and gff_parts[4]:\n- gff_info[\'type\'] = gff_parts[2]\n- gff_info[\'id\'] = quals.get(\'ID\', [\'\'])[0]\n- \n- if is_gff2:gff_info = _nest_gff2_features(gff_info)\n- # features that have parents need to link so we can pick up\n- # the relationship\n- if gff_info[\'quals\'].has_key(\'Parent\'):\n- final_key = \'child\'\n- elif gff_info[\'id\']:\n- final_key = \'parent\'\n- # Handle flat features\n- else:\n- final_key = \'feature\'\n- # otherwise, associate these annotations with the full record\n- else:\n- final_key = \'annotation\'\n- return [(final_key, gff_info)]\n- \n-def parent_child_id_map(gff_handle):\n- """\n- Provide a mapping of parent to child relationships in the file.\n- Gives a dictionary of parent child relationships:\n-\n- keys -- tuple of (source, type) for each parent\n- values -- tuple of (source, type) as children of that parent\n- """\n- # collect all of the parent and child types mapped to IDs\n- parent_sts = dict()\n- child_sts = collections.defaultdict(list)\n- for line in gff_handle:\n- line_type, line_info = _gff_line_map(line)[0]\n- if (line_type == \'parent\' or (line_type == \'child\' and line_info[\'id\'])):\n- parent_sts[line_info[\'id\']] = (line_info[\'quals\'][\'source\'][0], line_info[\'type\'])\n- if line_type == \'child\':\n- for parent_id in line_info[\'quals\'][\'Parent\']:\n- child_sts[parent_id].append((line_info[\'quals\'][\'source\'][0], line_info[\'type\']))\n- gff_handle.close()\n- # generate a dictionary of the unique final type relationships\n- pc_map = collections.defaultdict(list)\n- for parent_id, parent_type in parent_sts.items():\n- for child_type in child_sts[parent_id]:\n- pc_map[parent_type].append(child_type)\n- pc_final_map = 
dict()\n- for ptype, ctypes in pc_map.items():\n- unique_ctypes = list(set(ctypes))\n- unique_ctypes.sort()\n- pc_final_map[ptype] = unique_ctypes\n- # some cases the GFF file represents a single feature type \n- if not pc_final_map:\n- for fid, stypes in parent_sts.items():\n- pc_final_map[stypes] = dict()\n- # generate a report on feature id mapping in the file \n- print \'+---------------------+---------------------------------+\'\n- print \'| Parent feature type | Associated child feature type(s)|\'\n- print \'+---------------------+---------------------------------+\'\n- for key, value in pc_final_map.items():\n- print key[0], key[1]\n- for child_to in value:\n- print \'\\t\\t\\t|-\',child_to[0], child_to[1]\n- print \'+---------------------+---------------------------------+\'\n-\n-\n-if __name__==\'__main__\':\n-\n- try:\n- gff_file = sys.argv[1]\n- except:\n- print __doc__\n- sys.exit(-1)\n- \n- gff_handle = open_file(gff_file)\n- parent_child_id_map(gff_handle)\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gff_fmap.xml --- a/gff_fmap.xml Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,83 +0,0 @@ -<tool id="fml_gfffmap" name="GFF-map" version="2.0.0"> - <description>features</description> - <command interpreter="python"> - gff_fmap.py $gff_input > $idmapping - </command> - <inputs> - <param format="gff3,gff" name="gff_input" type="data" label="Query file" help="Provide genome annotation file in GFF."/> - </inputs> - <outputs> - <data format="txt" name="idmapping" label="${tool.name} on ${on_string}: parent child id map"/> - </outputs> - <tests> - <test> - <param name="gff_input" value="Feature_ID_mapping_W.gff3" /> - <output name="idmapping" file="Feature_ID_mapping_W.txt" /> - </test> - <test> - <param name="gff_input" value="Aly_JGI.gff3" /> - <output name="idmapping" file="Feature_ID_mapping_R.txt" /> - </test> - </tests> - <help> - -**What it does** - -GFF-map reports the parent-child relationships among features (gene, mRNA, UTRs, exon, CDS, etc.) based on their identifier mapping in a given GFF file. - --------- - -**Example** - -- The feature ID mapping for the following data in GFF3:: - - ##gff-version 3 - 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11 - 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859 - 17 protein_coding protein 7256262 7256960 . + 0 ID=ENSP00000328352;Name=ENSP00000328352 - 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751 - 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352 - 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751 - 17 protein_coding exon 7255208 7258258 . + . 
Parent=ENST00000333751 - -- Will be displayed as:: - - +-----------------------+---------------------------------+ - | Parent feature type | Associated child feature type(s)| - +-----------------------+---------------------------------+ - | protein_coding gene | protein_coding mRNA | - +-----------------------+---------------------------------+ - | protein_coding protein| protein_coding CDS | - +-----------------------+---------------------------------+ - | protein_coding mRNA | protein_coding CDS | - | | protein_coding exon | - | | protein_coding five_prime_UTR | - | | protein_coding three_prime_UTR | - +-----------------------+---------------------------------+ - --------- - -**About formats** - -**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: - - 1. seqid - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. stop - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. attributes - All lines with the same group are linked together into a single item. - --------- - -**Copyright** - -2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center - -Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. 
Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) - -</help> -</tool> |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_bed.py --- a/gff_to_bed.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,96 +0,0 @@ -#!/usr/bin/env python -""" -Convert genome annotation data in GFF/GTF to a 12 column BED format. -BED format typically represents the transcript models. - -Usage: python gff_to_bed.py in.gff > out.bed - -Requirement: - GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py - -Copyright (C) - 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. - 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. -""" - -import re -import sys -import GFFParser - -def writeBED(tinfo): - """ - writing result files in bed format - - @args tinfo: list of genes - @args tinfo: numpy object - """ - - for ent1 in tinfo: - child_flag = False - - for idx, tid in enumerate(ent1['transcripts']): - child_flag = True - exon_cnt = len(ent1['exons'][idx]) - exon_len = '' - exon_cod = '' - rel_start = None - rel_stop = None - for idz, ex_cod in enumerate(ent1['exons'][idx]):#check for exons of corresponding transcript - exon_len += '%d,' % (ex_cod[1]-ex_cod[0]+1) - if idz == 0: #calculate the relative start position - exon_cod += '0,' - rel_start = int(ex_cod[0]) - rel_stop = ex_cod[1] - else: - exon_cod += '%d,' % (ex_cod[0]-rel_start) - rel_stop = int(ex_cod[1]) - - if exon_len: - score = '0' - score = ent1['score'][0] if ent1['score'] else score - out_print = [ent1['chr'], - str(rel_start), - str(rel_stop), - tid[0], - score, - ent1['strand'], - str(rel_start), - str(rel_stop), - '0', - str(exon_cnt), - exon_len, - exon_cod] - print '\t'.join(out_print) - - if not child_flag: # file just contains only a single parent type i.e, gff3 defines only one feature type - score = '0' - score = ent1['score'][0] if ent1['score'] else score - - out_print = [ent1['chr'], - '%d' % int(ent1['start']), - '%d' % int(ent1['stop']), - ent1['name'], - score, - ent1['strand'], - '%d' % int(ent1['start']), - '%d' % int(ent1['stop']), - '0', - '1', - '%d,' % (int(ent1['stop'])-int(ent1['start'])+1), - '0,'] - - print 
'\t'.join(out_print) - - -def __main__(): - try: - query_file = sys.argv[1] - except: - print __doc__ - sys.exit(-1) - - Transcriptdb = GFFParser.Parse(query_file) - writeBED(Transcriptdb) - -if __name__ == "__main__": - __main__() |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_bed.xml --- a/gff_to_bed.xml Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,90 +0,0 @@ -<tool id="fml_gff2bed" name="GFF-to-BED" version="2.0.0"> - <description>converter</description> - <command interpreter="python">gff_to_bed.py $inf_gff > $bed_format - </command> - <inputs> - <param format="gtf,gff,gff3" name="inf_gff" type="data" label="Convert this query" help="Provide genome annotation file in GFF, GTF, GFF3."/> - </inputs> - <outputs> - <data format="bed" name="bed_format" label="${tool.name} on ${on_string}: Converted" /> - </outputs> - <tests> - <test> - <param name="inf_gff" value="Aly_JGI.gff3" /> - <output name="bed_format" file="Aly_JGI.bed" /> - </test> - <test> - <param name="inf_gff" value="MB7_3R.gff3" /> - <output name="bed_format" file="MB7_3R.bed" /> - </test> - </tests> - <help> - -**What it does** - -This tool converts gene transcript annotation from GTF, GFF, or GFF3 to the UCSC 12-column BED format. - --------- - -**Example** - -- The following data in GFF3:: - - ##gff-version 3 - chr1 protein_coding gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3 - chr1 protein_coding transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3 - chr1 protein_coding exon 11874 12227 0 + . Parent=uc001aaa.3 - chr1 protein_coding exon 12613 12721 0 + . Parent=uc001aaa.3 - chr1 protein_coding exon 13221 14409 0 + . Parent=uc001aaa.3 - -- Will be converted to UCSC BED format:: - - chr1 11874 14409 uc001aaa.3 0 + 11874 14409 0 3 354,109,1189, 0,739,1347, - --------- - -**About formats** - -**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: - - - 1. seqid - Must be a chromosome or scaffold or contig. - 2. source - The program that generated this feature. - 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". - 4. start - The starting position of the feature in the sequence. 
The first base is numbered 1. - 5. stop - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. attributes - All lines with the same group are linked together into a single item. - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - -The additional BED fields (optional) are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. 
- --------- - -**Copyright** - -2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center - -Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) - - </help> -</tool> |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_gbk.py --- a/gff_to_gbk.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,54 +0,0 @@ -#!/usr/bin/env python -""" -Convert data from GFF and associated genome sequence in fasta file into GenBank. - -Usage: -python gff_to_gbk.py in.gff in.fasta out.gbk - -Requirements: - BioPython:- http://biopython.org/ - helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py - -Copyright (C) - 2010-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. - 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. -""" - -import sys -import helper -import gffparser_bcbio - -from Bio import SeqIO -from Bio.Alphabet import generic_dna - -def __main__(): - """ - main wrapper - """ - - try: - gff_fname = sys.argv[1] - fasta_fname = sys.argv[2] - gb_fname = sys.argv[3] - except: - print __doc__ - sys.exit(-1) - - fasta_fh = helper.open_file(fasta_fname) - - fasta_rec = SeqIO.to_dict(SeqIO.parse(fasta_fh, "fasta", generic_dna)) - fasta_fh.close() - - gff_rec = gffparser_bcbio.parse(gff_fname, fasta_rec) - - try: - gb_fh = open(gb_fname, "w") - except: - print 'file not ready for writing %s' % gb_fname - sys.exit(-1) - - SeqIO.write(gff_rec, gb_fh, "genbank") - gb_fh.close() - -if __name__=="__main__": - __main__() |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_gbk.xml --- a/gff_to_gbk.xml Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,98 +0,0 @@ -<tool id="fml_gff2gbk" name="GFF-to-GBK" version="2.0.0"> - <description>converter</description> - <command interpreter="python">gff_to_gbk.py $inf_gff $inf_fas $gbk_format - </command> - <inputs> - <param format="gff,gff3" name="inf_gff" type="data" label="Convert this query" help="Genome annotation in GFF file format."/> - <param format="fa,fasta" name="inf_fas" type="data" label="Genome Sequence" help="Genome sequence in FASTA format."/> - </inputs> - <outputs> - <data format="genbank" name="gbk_format" label="${tool.name} on ${on_string}: Converted"/> - </outputs> - <tests> - <test> - <param name="inf_gff" value="s_cerevisiae_SCU49845.gff3" /> - <param name="inf_fas" value="s_cerevisiae_SCU49845.fasta" /> - <output name="gbk_format" file="s_cerevisiae_SCU49845.gbk" /> - </test> - </tests> - <help> - -**What it does** - -This tool converts annotations in GFF to GenBank_ format (scroll down for format description). - -.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ - ------- - -**Example** - -- The following data in GFF:: - - ##gff-version 3 - ##sequence-region NM_001202705 1 2406 - NM_001202705 GenBank chromosome 1 2406 . + 1 ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705;Note=Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) mRNA%2C complete cds.,REVIEWED REFSEQ; - NM_001202705 GenBank gene 1 2406 . + 1 ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC;locus_tag=AT2G29630 - NM_001202705 GenBank mRNA 192 2126 . + 1 ID=AT2G29630.t01;Parent=AT2G29630 - NM_001202705 GenBank CDS 192 2126 . + 1 ID=AT2G29630.p01;Parent=AT2G29630.t01;Dbxref=GI:334184567,GeneID:817513,TAIR:AT2G29630;Name=THIC;Note=thiaminC (THIC)%3B CONTAINS InterPro DOMAIN;protein_id=NP_001189634.1; - NM_001202705 GenBank exon 192 2126 . 
+ 1 Parent=AT2G29630.t01 - ##FASTA - >NM_001202705 - AAGCCTTTCGCTTTAGGCTGCATTGGGCCGTGACAATATTCAGACGATTCAGGAGGTTCG - TTCCTTTTTTAAAGGACCCTAATCACTCTGAGTACCACTGACTCACTCAGTGTGCGCGAT - -- Will be converted to GenBank format:: - - LOCUS NM_001202705 2406 bp mRNA linear PLN 28-MAY-2011 - DEFINITION Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) - mRNA, complete cds. - ACCESSION NM_001202705 - VERSION NM_001202705.1 GI:334184566......... - FEATURES Location/Qualifiers - source 1..2406 - /organism="Arabidopsis thaliana" - /mol_type="mRNA" - /db_xref="taxon:3702"........ - gene 1..2406 - /gene="THIC" - /locus_tag="AT2G29630" - /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........ - ORIGIN - 1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg - 61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat - 121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc - 181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca - // - ------- - -**About formats** - -**GFF** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: - - 1. seqid - Must be a chromosome or scaffold or contig. - 2. source - The program that generated this feature. - 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. stop - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. 
attributes - All lines with the same group are linked together into a single item. - -**GenBank format** Consists of an annotation section and a sequence section. Sample record_ - -.. _record: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html - - --------- - -**Copyright** - -2010-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center - -Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) - - </help> -</tool> |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_gtf.py --- a/gff_to_gtf.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,76 +0,0 @@ -#!/usr/bin/env python -""" -Program to convert data from GFF to GTF - -Usage: python gff_to_gtf.py in.gff > out.gtf - -Requirement: - GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py - -Copyright (C) - 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. - 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. -""" - -import re -import sys -import GFFParser - -def printGTF(tinfo): - """ - writing result file in GTF format - - @args tinfo: parsed object from gff file - @type tinfo: numpy array - """ - - for ent1 in tinfo: - for idx, tid in enumerate(ent1['transcripts']): - - exons = ent1['exons'][idx] - cds_exons = ent1['cds_exons'][idx] - - stop_codon = start_codon = () - - if ent1['strand'] == '+': - if cds_exons.any(): - start_codon = (cds_exons[0][0], cds_exons[0][0]+2) - stop_codon = (cds_exons[-1][1]-2, cds_exons[-1][1]) - elif ent1['strand'] == '-': - if cds_exons.any(): - start_codon = (cds_exons[-1][1]-2, cds_exons[-1][1]) - stop_codon = (cds_exons[0][0], cds_exons[0][0]+2) - else: - print 'STRAND information missing - %s, skip the transcript - %s' % (ent1['strand'], tid[0]) - pass - - last_cds_cod = 0 - for idz, ex_cod in enumerate(exons): - - print '%s\t%s\texon\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], ex_cod[0], ex_cod[1], ent1['strand'], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name']) - - if cds_exons.any(): - try: - print '%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], cds_exons[idz][0], cds_exons[idz][1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name']) - last_cds_cod = idz - except: - pass - - if idz == 0: - print '%s\t%s\tstart_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], 
ent1['source'], start_codon[0], start_codon[1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name']) - - if stop_codon: - print '%s\t%s\tstop_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], stop_codon[0], stop_codon[1], ent1['strand'], cds_exons[last_cds_cod][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name']) - - -if __name__ == "__main__": - - try: - gff_fname = sys.argv[1] - except: - print __doc__ - sys.exit(-1) - - Transcriptdb = GFFParser.Parse(gff_fname) - - printGTF(Transcriptdb) |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gff_to_gtf.xml --- a/gff_to_gtf.xml Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,88 +0,0 @@ -<tool id="fml_gff2gtf" name="GFF-to-GTF" version="2.0.0"> - <description>converter</description> - <command interpreter="python">gff_to_gtf.py $inf_gff3 > $gtf_format - </command> - <inputs> - <param format="gff3,gff" name="inf_gff3" type="data" label="Convert this query" help="Provide genome annotation file in GFF or GFF3."/> - </inputs> - <outputs> - <data format="gtf" name="gtf_format" label="${tool.name} on ${on_string}: Converted" /> - </outputs> - <tests> - <test> - <param name="inf_gff3" value="AceView_ncbi_37.gff3" /> - <output name="gtf_format" file="AceView_gff3_to_gtf.gtf" /> - </test> - <test> - <param name="inf_gff3" value="ENSEMBL_mm9.gff3" /> - <output name="gtf_format" file="ENSEMBL_mm9_gff3_to_gtf.gtf" /> - </test> - </tests> - <help> - -**What it does** - -This tool converts data from GFF3 to GTF file format (scroll down for format description). - --------- - -**Example** - -- The following data in GFF3 format:: - - ##gff-version 3 - 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11 - 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859 - 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751 - 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751 - 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352 - 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751 - 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751 - -- Will be converted to GTF format:: - - 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; - 17 protein_coding CDS 7256262 7256957 . 
+ 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352"; - 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; - 17 protein_coding stop_codon 7256958 7256960 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; - --------- - -**About formats** - - -**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: - - 1. seqid - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. stop - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. attributes - All lines with the same group are linked together into a single item. - - -**GTF format** Gene Transfer Format, it borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-separated fields:: - - 1. seqname - The name of the sequence. - 2. source - Indicates where the annotation came from. - 3. feature - The name of the feature type. 
The following feature types are required: 'CDS', 'start_codon' and 'stop_codon' - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates. - 7. strand - Valid entries include '+', '-', or '.' - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. - 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region. - --------- - -**Copyright** - -2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center - -Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) - - </help> -</tool> |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gffparser_bcbio.py --- a/gffparser_bcbio.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,828 +0,0 @@\n-"""Parse GFF files into features attached to Biopython SeqRecord objects.\n-\n-This deals with GFF3 formatted files, a tab delimited format for storing\n-sequence features and annotations:\n-\n-http://www.sequenceontology.org/gff3.shtml\n-\n-It will also deal with older GFF versions (GTF/GFF2):\n-\n-http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml\n-http://mblab.wustl.edu/GTF22.html\n-\n-The implementation utilizes map/reduce parsing of GFF using Disco. Disco\n-(http://discoproject.org) is a Map-Reduce framework for Python utilizing\n-Erlang for parallelization. The code works on a single processor without\n-Disco using the same architecture.\n-"""\n-import os\n-import copy\n-import re\n-import collections\n-import urllib\n-import itertools\n-\n-# Make defaultdict compatible with versions of python older than 2.4\n-try:\n- collections.defaultdict\n-except AttributeError:\n- import _utils\n- collections.defaultdict = _utils.defaultdict\n-\n-from Bio.Seq import Seq, UnknownSeq\n-from Bio.SeqRecord import SeqRecord\n-from Bio.SeqFeature import SeqFeature, FeatureLocation\n-from Bio import SeqIO\n-\n-def _gff_line_map(line, params):\n- """Map part of Map-Reduce; parses a line of GFF into a dictionary.\n-\n- Given an input line from a GFF file, this:\n- - decides if the file passes our filtering limits\n- - if so:\n- - breaks it into component elements\n- - determines the type of attribute (flat, parent, child or annotation)\n- - generates a dictionary of GFF info which can be serialized as JSON\n- """\n- gff3_kw_pat = re.compile("\\w+=")\n- def _split_keyvals(keyval_str):\n- """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.\n-\n- GFF3 has key value pairs like:\n- count=9;gene=amx-2;sequence=SAGE:aacggagccg\n- GFF2 and GTF have: \n- Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"\n- name "fgenesh1_pg.C_chr_1000003"; transcriptId 869\n- """\n- quals = collections.defaultdict(list)\n- if keyval_str is None:\n- 
return quals\n- # ensembl GTF has a stray semi-colon at the end\n- if keyval_str[-1] == \';\':\n- keyval_str = keyval_str[:-1]\n- # GFF2/GTF has a semi-colon with at least one space after it.\n- # It can have spaces on both sides; wormbase does this.\n- # GFF3 works with no spaces.\n- # Split at the first one we can recognize as working\n- parts = keyval_str.split(" ; ")\n- if len(parts) == 1:\n- parts = keyval_str.split("; ")\n- if len(parts) == 1:\n- parts = keyval_str.split(";")\n- # check if we have GFF3 style key-vals (with =)\n- is_gff2 = True\n- if gff3_kw_pat.match(parts[0]):\n- is_gff2 = False\n- key_vals = [p.split(\'=\') for p in parts]\n- # otherwise, we are separated by a space with a key as the first item\n- else:\n- pieces = []\n- for p in parts:\n- # fix misplaced semi-colons in keys in some GFF2 files\n- if p and p[0] == \';\':\n- p = p[1:]\n- pieces.append(p.strip().split(" "))\n- key_vals = [(p[0], " ".join(p[1:])) for p in pieces]\n- for item in key_vals:\n- # standard in-spec items are key=value\n- if len(item) == 2:\n- key, val = item\n- # out-of-spec files can have just key values. We set an empty value\n- # which will be changed to true later to standardize.\n- else:\n- assert len(item) == 1, item\n- key = item[0]\n- val = \'\'\n- # remove quotes in GFF2 files\n- if (len(val) > 0 and val[0] == \'"\' and val[-1] == \'"\'):\n- val = val[1:-1] \n- if val:\n- quals[key].extend([v for v in val.split(\',\') if v])\n- # if we don\'t have a value, make this a key=True/False style\n- '..b' the\n- information you need. 
This class provides high level summary details to\n- help in learning.\n- """\n- def __init__(self):\n- self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2],\n- gff_source = [1], gff_type = [2])\n- \n- def _get_local_params(self, limit_info=None):\n- class _LocalParams:\n- def __init__(self):\n- self.jsonify = False\n- params = _LocalParams()\n- params.limit_info = limit_info\n- params.filter_info = self._filter_info\n- return params\n- \n- @_file_or_handle\n- def available_limits(self, gff_handle):\n- """Return dictionary information on possible limits for this file.\n-\n- This returns a nested dictionary with the following structure:\n- \n- keys -- names of items to filter by\n- values -- dictionary with:\n- keys -- filter choice\n- value -- counts of that filter in this file\n-\n- Not a parallelized map-reduce implementation.\n- """\n- cur_limits = dict()\n- for filter_key in self._filter_info.keys():\n- cur_limits[filter_key] = collections.defaultdict(int)\n- for line in gff_handle:\n- # when we hit FASTA sequences, we are done with annotations\n- if line.startswith("##FASTA"):\n- break\n- # ignore empty and comment lines\n- if line.strip() and line.strip()[0] != "#":\n- parts = [p.strip() for p in line.split(\'\\t\')]\n- assert len(parts) == 9, line\n- for filter_key, cur_indexes in self._filter_info.items():\n- cur_id = tuple([parts[i] for i in cur_indexes])\n- cur_limits[filter_key][cur_id] += 1\n- # get rid of the default dicts\n- final_dict = dict()\n- for key, value_dict in cur_limits.items():\n- if len(key) == 1:\n- key = key[0]\n- final_dict[key] = dict(value_dict)\n- gff_handle.close()\n- return final_dict\n-\n- @_file_or_handle\n- def parent_child_map(self, gff_handle):\n- """Provide a mapping of parent to child relationships in the file.\n-\n- Returns a dictionary of parent child relationships:\n-\n- keys -- tuple of (source, type) for each parent\n- values -- tuple of (source, type) as children of that parent\n- \n- Not a parallelized 
map-reduce implementation.\n- """\n- # collect all of the parent and child types mapped to IDs\n- parent_sts = dict()\n- child_sts = collections.defaultdict(list)\n- for line in gff_handle:\n- # when we hit FASTA sequences, we are done with annotations\n- if line.startswith("##FASTA"):\n- break\n- if line.strip():\n- line_type, line_info = _gff_line_map(line,\n- self._get_local_params())[0]\n- if (line_type == \'parent\' or (line_type == \'child\' and\n- line_info[\'id\'])):\n- parent_sts[line_info[\'id\']] = (\n- line_info[\'quals\'][\'source\'][0], line_info[\'type\'])\n- if line_type == \'child\':\n- for parent_id in line_info[\'quals\'][\'Parent\']:\n- child_sts[parent_id].append((\n- line_info[\'quals\'][\'source\'][0], line_info[\'type\']))\n- #print parent_sts, child_sts\n- # generate a dictionary of the unique final type relationships\n- pc_map = collections.defaultdict(list)\n- for parent_id, parent_type in parent_sts.items():\n- for child_type in child_sts[parent_id]:\n- pc_map[parent_type].append(child_type)\n- pc_final_map = dict()\n- for ptype, ctypes in pc_map.items():\n- unique_ctypes = list(set(ctypes))\n- unique_ctypes.sort()\n- pc_final_map[ptype] = unique_ctypes\n- return pc_final_map\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gtf_to_gff.py --- a/gtf_to_gff.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
#!/usr/bin/env python
"""
Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3].

Usage: python gtf_to_gff.py in.gtf > out.gff3

Requirement:
    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
    helper.py   : https://github.com/vipints/GFFtools-GX/blob/master/helper.py

Copyright (C)
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
"""

import re
import sys
import GFFParser
import helper


def GFFWriter(gtf_content):
    """
    Write the parsed feature information to standard output in GFF3 format.

    For every gene record one ``gene`` line is written, followed, for each of
    its transcripts, by the transcript line, its 5' UTR segments, CDS
    segments, 3' UTR segments and exons.

    Standard output IS the GFF3 document (the tool is run as
    ``gtf_to_gff.py in.gtf > out.gff3``), so nothing except GFF3 lines may be
    printed from this function.

    @args gtf_content: Parsed object from gtf file
    @type gtf_content: numpy array
    """

    # single-argument print(...) behaves identically on Python 2 and 3
    print('##gff-version 3')

    for ent1 in gtf_content:

        chr_name = ent1['chr']
        strand = ent1['strand']
        start = ent1['start']
        stop = ent1['stop']
        source = ent1['source']
        ID = ent1['name']
        Name = ent1['gene_info']['Name']

        # fall back to the gene identifier when no gene name is available
        Name = ID if not Name else Name

        print('%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s' % (chr_name, source, start, stop, strand, ID, Name))

        for idx, tid in enumerate(ent1['transcripts']):

            # transcript span: start of the first exon to end of the last exon
            t_start = ent1['exons'][idx][0][0]
            t_stop = ent1['exons'][idx][-1][-1]
            t_type = ent1['transcript_type'][idx]

            # UTRs can only be derived when both exon and CDS coordinates exist
            utr5_exons, utr3_exons = [], []
            if ent1['exons'][idx].any() and ent1['cds_exons'][idx].any():
                utr5_exons, utr3_exons = helper.buildUTR(ent1['cds_exons'][idx], ent1['exons'][idx], strand)

            print('%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s' % (chr_name, source, t_type, t_start, t_stop, strand, tid[0], ID))

            for ex_cod in utr5_exons:
                print('%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]))

            # the third CDS column is written into the GFF3 phase field
            for ex_cod in ent1['cds_exons'][idx]:
                print('%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, ex_cod[2], tid[0]))

            for ex_cod in utr3_exons:
                print('%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]))

            for ex_cod in ent1['exons'][idx]:
                print('%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]))


def __main__():
    """
    Command line entry point: parse the GTF file named by the first argument
    and write its GFF3 representation to standard output.

    Prints the module usage text and exits with status -1 when no input file
    name was supplied.
    """

    try:
        gtf_fname = sys.argv[1]
    except IndexError:
        # only a missing argument is expected here; anything else should surface
        print(__doc__)
        sys.exit(-1)

    gtf_file_content = GFFParser.Parse(gtf_fname)

    GFFWriter(gtf_file_content)


if __name__ == "__main__":
    __main__()
b |
diff -r d4f9b7beb52f -r 7d67331368f3 gtf_to_gff.xml --- a/gtf_to_gff.xml Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,94 +0,0 @@ -<tool id="fml_gtf2gff" name="GTF-to-GFF" version="2.0.0"> - <description>converter</description> - <command interpreter="python">gtf_to_gff.py $inf_gtf > $gff3_format - </command> - <inputs> - <param format="gtf" name="inf_gtf" type="data" label="Convert this query" help="Provide genome annotation file in GTF."/> - </inputs> - <outputs> - <data format="gff3" name="gff3_format" label="${tool.name} on ${on_string}: Converted" /> - </outputs> - <tests> - <test> - <param name="inf_gtf" value="UCSC_transcripts.gtf" /> - <output name="gff3_format" file="UCSC_transcripts.gff3" /> - </test> - <test> - <param name="inf_gtf" value="JGI_genes.gtf" /> - <output name="gff3_format" file="JGI_genes.gff3" /> - </test> - <test> - <param name="inf_gtf" value="ENSEMBL_mm9.gtf" /> - <output name="gff3_format" file="ENSEMBL_mm9.gff3" /> - </test> - <test> - <param name="inf_gtf" value="AceView_ncbi_37.gtf" /> - <output name="gff3_format" file="AceView_ncbi_37.gff3" /> - </test> - </tests> - <help> - -**What it does** - -This tool converts data from GTF to a valid GFF3 file (scroll down for format description). - --------- - -**Example** - -- The following data in GTF format:: - - 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; - 17 protein_coding CDS 7256262 7256957 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352"; - 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; - 17 protein_coding stop_codon 7256958 7256960 . 
+ 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; - -- Will be converted to GFF3 format:: - - ##gff-version 3 - 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11 - 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859 - 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751 - 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751 - 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352 - 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751 - 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751 - --------- - -**About formats** - -**GTF format** Gene Transfer Format, it borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-separated fields:: - - 1. seqname - The name of the sequence. - 2. source - Indicates where the annotation came from. - 3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon' - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates. - 7. strand - Valid entries include '+', '-', or '.' - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. - 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region. - -**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. 
GFF3 lines have nine tab-separated fields:: - - 1. seqid - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. stop - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. attributes - All lines with the same group are linked together into a single item. - --------- - -**Copyright** - -2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center - -Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) - - </help> -</tool> |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 helper.py --- a/helper.py Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,332 +0,0 @@\n-#!/usr/bin/env python\n-"""\n-Common utility functions\n-"""\n-\n-import os \n-import re\n-import sys \n-import gzip \n-import bz2\n-import numpy \n-\n-def init_gene():\n- """\n- Initializing the gene structure \n- """\n-\n- gene_det = [(\'id\', \'f8\'), \n- (\'anno_id\', numpy.dtype), \n- (\'confgenes_id\', numpy.dtype),\n- (\'name\', \'S25\'),\n- (\'source\', \'S25\'),\n- (\'gene_info\', numpy.dtype),\n- (\'alias\', \'S15\'),\n- (\'name2\', numpy.dtype),\n- (\'strand\', \'S2\'), \n- (\'score\', \'S15\'), \n- (\'chr\', \'S15\'), \n- (\'chr_num\', numpy.dtype),\n- (\'paralogs\', numpy.dtype),\n- (\'start\', \'f8\'),\n- (\'stop\', \'f8\'), \n- (\'transcripts\', numpy.dtype),\n- (\'transcript_type\', numpy.dtype),\n- (\'transcript_info\', numpy.dtype),\n- (\'transcript_status\', numpy.dtype),\n- (\'transcript_valid\', numpy.dtype),\n- (\'exons\', numpy.dtype),\n- (\'exons_confirmed\', numpy.dtype),\n- (\'cds_exons\', numpy.dtype),\n- (\'utr5_exons\', numpy.dtype),\n- (\'utr3_exons\', numpy.dtype),\n- (\'tis\', numpy.dtype),\n- (\'tis_conf\', numpy.dtype),\n- (\'tis_info\', numpy.dtype),\n- (\'cdsStop\', numpy.dtype),\n- (\'cdsStop_conf\', numpy.dtype),\n- (\'cdsStop_info\', numpy.dtype),\n- (\'tss\', numpy.dtype),\n- (\'tss_info\', numpy.dtype),\n- (\'tss_conf\', numpy.dtype),\n- (\'cleave\', numpy.dtype),\n- (\'cleave_info\', numpy.dtype),\n- (\'cleave_conf\', numpy.dtype),\n- (\'polya\', numpy.dtype),\n- (\'polya_info\', numpy.dtype),\n- (\'polya_conf\', numpy.dtype),\n- (\'is_alt\', \'f8\'), \n- (\'is_alt_spliced\', \'f8\'), \n- (\'is_valid\', numpy.dtype),\n- (\'transcript_complete\', numpy.dtype),\n- (\'is_complete\', numpy.dtype),\n- (\'is_correctly_gff3_referenced\', \'S5\'),\n- (\'splicegraph\', numpy.dtype) ]\n-\n- return gene_det\n-\n-def open_file(fname):\n- """\n- Open the file (supports .gz .bz2) and returns the handler\n-\n- @args fname: input file name for reading \n- @type fname: str\n- """\n-\n- try:\n- if 
os.path.splitext(fname)[1] == ".gz":\n- FH = gzip.open(fname, \'rb\')\n- elif os.path.splitext(fname)[1] == ".bz2":\n- FH = bz2.BZ2File(fname, \'rb\')\n- else:\n- FH = open(fname, \'rU\')\n- except Exception as error:\n- sys.exit(error)\n-\n- return FH\n-\n-def add_CDS_phase(strand, cds):\n- """\n- Calculate CDS phase and add to the CDS exons\n-\n- @args strand: feature strand information \n- @type strand: +/- \n- @args cds: coding exon coordinates \n- @type cds: numpy array [[int, int, int]]\n- """\n-\n- cds_region, cds_flag = [], 0 \n- if strand == \'+\':\n- for cdspos in cds:\n- if cds_flag == 0:\n- cdspos = (cdspos[0], cdspos[1], 0)\n- diff = (cdspos[1]-(cdspos[0]-1))%3\n- else:\n- xy = 0\n- if diff == 0: \n- cdspos = (cdspos[0], cdspos[1], 0)\n- elif diff == 1: \n- cdspos = (cdspos[0], cdspos[1], 2)\n- xy = 2\n- elif diff == 2: \n- cdspos = (cdspos[0], cdspos[1], 1)\n- xy = 1\n- diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3\n- cds_region.append(cdspos)\n- cds_flag = 1 \n- elif strand == \'-\':\n- cds.reverse()\n- for cdspos in cds: \n- if cds_flag == 0:\n- cdspos = (cdspos[0], cdspos[1], 0)\n- diff = (cdspos[1]-(cdspos[0]-1))%3\n- else: \n- xy = 0 \n- if diff == 0: \n- cdspos = (cdspos[0], cdspos[1], 0)\n- elif diff == 1:\n- '..b" exon_pos.append([cds_5start, utr3_end])\n- for cds in cds_cod:\n- exon_pos.append(cds)\n- for utr3 in three_p_utr:\n- exon_pos.append(utr3)\n- else: \n- if jun_exon != []:\n- five_p_utr = five_p_utr[:-1]\n- cds_cod = cds_cod[1:]\n- for utr5 in five_p_utr:\n- exon_pos.append(utr5)\n- exon_pos.append(jun_exon) if jun_exon != [] else ''\n- jun_exon = []\n- utr3_start, utr3_end = 0, 0\n- if three_p_utr != []:\n- utr3_start = three_p_utr[0][0]\n- utr3_end = three_p_utr[0][1]\n- cds_3start = cds_cod[-1][0]\n- cds_3end = cds_cod[-1][1]\n- if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1: \n- jun_exon = [cds_3start, utr3_end]\n- if jun_exon != []:\n- cds_cod = cds_cod[:-1]\n- three_p_utr = three_p_utr[1:]\n- for cds in cds_cod:\n- 
exon_pos.append(cds)\n- exon_pos.append(jun_exon) if jun_exon != [] else ''\n- for utr3 in three_p_utr:\n- exon_pos.append(utr3)\n- elif strand_p == '-':\n- utr3_start, utr3_end = 0, 0 \n- if three_p_utr != []:\n- utr3_start = three_p_utr[-1][0]\n- utr3_end = three_p_utr[-1][1]\n- cds_3start = cds_cod[0][0]\n- cds_3end = cds_cod[0][1]\n- jun_exon = []\n- if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:\n- jun_exon = [utr3_start, cds_3end] \n- if len(cds_cod) == 1: \n- three_prime_flag = 0\n- if jun_exon != []:\n- three_p_utr = three_p_utr[:-1]\n- three_prime_flag = 1\n- for utr3 in three_p_utr:\n- exon_pos.append(utr3)\n- jun_exon = []\n- (utr5_start, utr5_end) = (0, 0)\n- if five_p_utr != []:\n- utr5_start = five_p_utr[0][0]\n- utr5_end = five_p_utr[0][1]\n- if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:\n- jun_exon = [cds_3start, utr5_end]\n- five_prime_flag = 0\n- if jun_exon != []:\n- cds_cod = cds_cod[:-1]\n- five_p_utr = five_p_utr[1:]\n- five_prime_flag = 1\n- if three_prime_flag == 1 and five_prime_flag == 1:\n- exon_pos.append([utr3_start, utr5_end])\n- if three_prime_flag == 1 and five_prime_flag == 0:\n- exon_pos.append([utr3_start, cds_3end])\n- cds_cod = cds_cod[:-1]\n- if three_prime_flag == 0 and five_prime_flag == 1:\n- exon_pos.append([cds_3start, utr5_end]) \n- for cds in cds_cod:\n- exon_pos.append(cds)\n- for utr5 in five_p_utr:\n- exon_pos.append(utr5)\n- else:\n- if jun_exon != []:\n- three_p_utr = three_p_utr[:-1]\n- cds_cod = cds_cod[1:]\n- for utr3 in three_p_utr:\n- exon_pos.append(utr3) \n- if jun_exon != []:\n- exon_pos.append(jun_exon)\n- jun_exon = []\n- (utr5_start, utr5_end) = (0, 0)\n- if five_p_utr != []:\n- utr5_start = five_p_utr[0][0]\n- utr5_end = five_p_utr[0][1] \n- cds_5start = cds_cod[-1][0]\n- cds_5end = cds_cod[-1][1]\n- if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:\n- jun_exon = [cds_5start, utr5_end]\n- if jun_exon != []:\n- cds_cod = cds_cod[:-1]\n- five_p_utr = five_p_utr[1:]\n- for cds in 
cds_cod:\n- exon_pos.append(cds)\n- if jun_exon != []:\n- exon_pos.append(jun_exon) \n- for utr5 in five_p_utr:\n- exon_pos.append(utr5)\n- return exon_pos\n" |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/CCDS30770.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/CCDS30770.bed Thu Apr 23 17:57:49 2015 -0400 |
b |
@@ -0,0 +1,20 @@ +chr1 92149295 92327088 CCDS30770.1 0 - 92149295 92327088 0 16 119,108,42,121,300,159,141,153,338,190,148,169,184,138,185,61, 0,11933,14350,24924,28504,32497,32829,35573,36154,38216,43920,46066,51037,74874,113548,177732, +chr1 67000041 67208778 CCDS30744.1 0 + 67000041 67208778 0 25 10,64,25,72,57,55,176,12,12,25,52,86,93,75,501,128,127,60,112,156,133,203,65,165,23, 0,91488,98711,101585,105418,108451,109185,126154,133171,136636,137585,138922,142645,145319,147510,154789,155831,161075,184935,194905,199389,204976,206299,206913,208714, +chr1 8384389 8404073 CCDS30577.1 0 + 8384389 8404073 0 8 397,93,225,728,154,177,206,267, 0,968,1488,5879,11107,13486,15163,19417, +chr1 16767256 16785385 CCDS44067.1 0 + 16767256 16785385 0 8 14,101,105,82,109,178,76,49, 0,2870,7108,7298,8331,11076,15056,18080, +chr1 16767256 16785491 CCDS44066.1 0 + 16767256 16785491 0 7 92,101,105,82,109,178,155, 0,2870,7108,7298,8331,11076,18080, +chr1 16767256 16785385 CCDS173.1 0 + 16767256 16785385 0 8 92,101,105,82,109,178,76,49, 0,2870,7108,7298,8331,11076,15056,18080, +chr1 25072044 25167428 CCDS256.1 0 + 25072044 25167428 0 6 72,110,126,107,182,165, 0,52188,68540,81456,94306,95219, +chr1 33547850 33585783 CCDS375.1 0 + 33547850 33585783 0 9 105,174,173,135,166,163,113,215,139, 0,1704,9800,11032,12298,14457,15817,35652,37794, +chr1 48999844 50489468 CCDS44137.1 0 - 48999844 50489468 0 14 121,27,97,163,153,112,115,90,40,217,95,125,123,34, 0,717,5469,52831,56660,100320,119164,128979,333018,511411,711597,1163140,1317223,1489590, +chr1 100661810 100715376 CCDS767.1 0 - 100661810 100715376 0 11 168,72,192,78,167,217,122,182,76,124,51, 0,9975,10190,14439,18562,19728,22371,34478,39181,44506,53515, +chr1 150981108 151006710 CCDS977.1 0 + 150981108 151006710 0 8 39,93,203,185,159,95,159,429, 0,9179,9834,15978,16882,18600,20153,25173, +chr1 175914288 176176114 CCDS44279.1 0 - 175914288 176176114 0 19 18,45,161,125,118,117,82,109,144,136,115,58,77,69,120,65,98,60,407, 
0,2042,41790,43135,44209,82419,98033,98557,101028,135999,140623,171471,189857,203853,217716,218674,230757,239480,261419, +chr1 175914288 176176114 CCDS30944.1 0 - 175914288 176176114 0 20 18,45,161,125,118,117,82,109,144,136,115,58,77,60,69,120,77,98,60,407, 0,2042,41790,43135,44209,82419,98033,98557,101028,135999,140623,171471,189857,191335,203853,217716,218662,230757,239480,261419, +chr1 184446643 184588690 CCDS1362.1 0 + 184446643 184588690 0 5 94,95,77,61,39, 0,30078,113229,120891,142008, +chr1 226420201 226496888 CCDS1553.1 0 - 226420201 226496888 0 15 106,98,180,126,81,102,120,134,158,126,134,105,95,33,79, 0,595,843,6470,18338,33032,33712,35456,45274,53832,55163,63341,65218,68672,76608, +chr1 1982069 2116448 CCDS37.1 0 + 1982069 2116448 0 18 71,122,90,51,86,132,82,53,189,98,87,136,88,120,80,90,116,88, 0,4810,5853,8910,84631,93579,95396,98241,100159,105364,118887,121424,121670,123266,124123,124593,133952,134291, +chr1 2075777 2116448 CCDS41229.1 0 + 2075777 2116448 0 13 3,82,53,189,98,87,136,88,120,80,90,116,88, 0,1688,4533,6451,11656,25179,27716,27962,29558,30415,30885,40244,40583, +chr1 2985823 3350375 CCDS44048.1 0 + 2985823 3350375 0 17 37,350,51,135,103,208,148,154,1417,85,170,78,170,175,237,175,78, 0,116865,174827,315892,327231,333531,335479,336235,342124,345303,348568,349407,356321,356791,361612,362706,364474, +chr1 2985823 3350375 CCDS41236.1 0 + 2985823 3350375 0 17 37,350,51,135,103,208,148,154,1417,85,170,78,170,175,237,175,135, 0,116865,174827,315892,327231,333531,335479,336235,342124,345303,348568,349407,356321,356791,361612,362706,364417, +chr1 6285139 6295971 CCDS61.1 0 - 6285139 6295971 0 5 183,218,170,89,195, 0,6822,8394,9806,10637, |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/CCDS30770.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/CCDS30770.gff Thu Apr 23 17:57:49 2015 -0400 |
b |
b'@@ -0,0 +1,289 @@\n+chr1\tbed2gff\tgene\t92149296\t92327088\t0\t-\t.\tID=Gene:CCDS30770.1;Name=Gene:CCDS30770.1\n+chr1\tbed2gff\ttranscript\t92149296\t92327088\t0\t-\t.\tID=CCDS30770.1;Name=CCDS30770.1;Parent=Gene:CCDS30770.1\n+chr1\tbed2gff\texon\t92149296\t92149414\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92161229\t92161336\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92163646\t92163687\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92174220\t92174340\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92177800\t92178099\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92181793\t92181951\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92182125\t92182265\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92184869\t92185021\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92185450\t92185787\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92187512\t92187701\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92193216\t92193363\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92195362\t92195530\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92200333\t92200516\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92224170\t92224307\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92262844\t92263028\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\texon\t92327028\t92327088\t0\t-\t.\tParent=CCDS30770.1\n+chr1\tbed2gff\tgene\t67000042\t67208778\t0\t+\t.\tID=Gene:CCDS30744.1;Name=Gene:CCDS30744.1\n+chr1\tbed2gff\ttranscript\t67000042\t67208778\t0\t+\t.\tID=CCDS30744.1;Name=CCDS30744.1;Parent=Gene:CCDS30744.1\n+chr1\tbed2gff\texon\t67000042\t67000051\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67091530\t67091593\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67098753\t67098777\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67101627\t67101698\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67105460\t67105516\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67108493\t67108547\t0\t+\t.\tParent=CCDS30744.1\n+
chr1\tbed2gff\texon\t67109227\t67109402\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67126196\t67126207\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67133213\t67133224\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67136678\t67136702\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67137627\t67137678\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67138964\t67139049\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67142687\t67142779\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67145361\t67145435\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67147552\t67148052\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67154831\t67154958\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67155873\t67155999\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67161117\t67161176\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67184977\t67185088\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67194947\t67195102\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67199431\t67199563\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67205018\t67205220\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67206341\t67206405\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67206955\t67207119\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\texon\t67208756\t67208778\t0\t+\t.\tParent=CCDS30744.1\n+chr1\tbed2gff\tgene\t8384390\t8404073\t0\t+\t.\tID=Gene:CCDS30577.1;Name=Gene:CCDS30577.1\n+chr1\tbed2gff\ttranscript\t8384390\t8404073\t0\t+\t.\tID=CCDS30577.1;Name=CCDS30577.1;Parent=Gene:CCDS30577.1\n+chr1\tbed2gff\texon\t8384390\t8384786\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8385358\t8385450\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8385878\t8386102\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8390269\t8390996\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8395497\t8395650\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8397876\t8398052\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8399553\t839
9758\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\texon\t8403807\t8404073\t0\t+\t.\tParent=CCDS30577.1\n+chr1\tbed2gff\tgene\t16767257\t16785385\t0\t+\t.\tID=Gene:CCDS44067.1;Name=Gene:CCDS44067.1\n+chr1\tbed2gff\ttranscript\t16767257\t16785385\t0\t+\t.\tID=CCDS44067.1;Name=CCDS44067.1;Parent=Gene:CCDS44067.1\n+chr1\tbed2gff\texon\t16767257\t16767270\t0\t+\t.\tParent=CCDS44067.1\n+chr1\tbed2gff\texon\t16770127\t16770227\t0\t+\t.\tParent=CCDS44067.1\n+chr1\tbed2gff\texon\t16774365\t16774469\t0\t+\t.\tParent=CCDS44067.1\n+chr1\tbed2gff\texo'..b'bed2gff\texon\t2106663\t2106752\t0\t+\t.\tParent=CCDS37.1\n+chr1\tbed2gff\texon\t2116022\t2116137\t0\t+\t.\tParent=CCDS37.1\n+chr1\tbed2gff\texon\t2116361\t2116448\t0\t+\t.\tParent=CCDS37.1\n+chr1\tbed2gff\tgene\t2075778\t2116448\t0\t+\t.\tID=Gene:CCDS41229.1;Name=Gene:CCDS41229.1\n+chr1\tbed2gff\ttranscript\t2075778\t2116448\t0\t+\t.\tID=CCDS41229.1;Name=CCDS41229.1;Parent=Gene:CCDS41229.1\n+chr1\tbed2gff\texon\t2075778\t2075780\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2077466\t2077547\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2080311\t2080363\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2082229\t2082417\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2087434\t2087531\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2100957\t2101043\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2103494\t2103629\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2103740\t2103827\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2105336\t2105455\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2106193\t2106272\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2106663\t2106752\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2116022\t2116137\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\texon\t2116361\t2116448\t0\t+\t.\tParent=CCDS41229.1\n+chr1\tbed2gff\tgene\t2985824\t3350375\t0\t+\t.\tID=Gene:CCDS44048.1;Name=Gene:CCDS44048.1\n+chr1\tbed2gff\ttranscript\t2985824\t3350375\t0\t+\t.\tID=CCDS44048
.1;Name=CCDS44048.1;Parent=Gene:CCDS44048.1\n+chr1\tbed2gff\texon\t2985824\t2985860\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3102689\t3103038\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3160651\t3160701\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3301716\t3301850\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3313055\t3313157\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3319355\t3319562\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3321303\t3321450\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3322059\t3322212\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3327948\t3329364\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3331127\t3331211\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3334392\t3334561\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3335231\t3335308\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3342145\t3342314\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3342615\t3342789\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3347436\t3347672\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3348530\t3348704\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\texon\t3350298\t3350375\t0\t+\t.\tParent=CCDS44048.1\n+chr1\tbed2gff\tgene\t2985824\t3350375\t0\t+\t.\tID=Gene:CCDS41236.1;Name=Gene:CCDS41236.1\n+chr1\tbed2gff\ttranscript\t2985824\t3350375\t0\t+\t.\tID=CCDS41236.1;Name=CCDS41236.1;Parent=Gene:CCDS41236.1\n+chr1\tbed2gff\texon\t2985824\t2985860\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3102689\t3103038\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3160651\t3160701\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3301716\t3301850\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3313055\t3313157\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3319355\t3319562\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3321303\t3321450\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3322059\t3322212\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3327
948\t3329364\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3331127\t3331211\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3334392\t3334561\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3335231\t3335308\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3342145\t3342314\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3342615\t3342789\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3347436\t3347672\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3348530\t3348704\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\texon\t3350241\t3350375\t0\t+\t.\tParent=CCDS41236.1\n+chr1\tbed2gff\tgene\t6285140\t6295971\t0\t-\t.\tID=Gene:CCDS61.1;Name=Gene:CCDS61.1\n+chr1\tbed2gff\ttranscript\t6285140\t6295971\t0\t-\t.\tID=CCDS61.1;Name=CCDS61.1;Parent=Gene:CCDS61.1\n+chr1\tbed2gff\texon\t6285140\t6285322\t0\t-\t.\tParent=CCDS61.1\n+chr1\tbed2gff\texon\t6291962\t6292179\t0\t-\t.\tParent=CCDS61.1\n+chr1\tbed2gff\texon\t6293534\t6293703\t0\t-\t.\tParent=CCDS61.1\n+chr1\tbed2gff\texon\t6294946\t6295034\t0\t-\t.\tParent=CCDS61.1\n+chr1\tbed2gff\texon\t6295777\t6295971\t0\t-\t.\tParent=CCDS61.1\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/MB7_3R.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/MB7_3R.bed Thu Apr 23 17:57:49 2015 -0400 |
b |
b'@@ -0,0 +1,241 @@\n+3R\t141309\t144791\tCG9778-RA\t1\t-\t141309\t144791\t0\t5\t1519,236,333,162,488,\t0,2066,2368,2760,2994,\n+3R\t226211\t227739\tCG14647.a\t1\t+\t226211\t227739\t0\t3\t649,132,417,\t0,864,1111,\n+3R\t226211\t227739\tCG14647-RB\t1\t+\t226211\t227739\t0\t3\t649,132,417,\t0,864,1111,\n+3R\t752642\t764363\tCG34306-RA\t2\t+\t752642\t764363\t0\t2\t5670,5912,\t0,5809,\n+3R\t56500\t58054.0\tCG14641.a\t31\t-\t56500\t58054.0\t0\t1\t1554,\t0,\n+3R\t56474\t58077.0\tCG14641-RA\t34\t-\t56474\t58077.0\t0\t1\t1603,\t0,\n+3R\t221814\t223609\tCG9855-RA\t1\t-\t221814\t223609\t0\t4\t710,197,529,180,\t0,767,1019,1615,\n+3R\t1045389\t1047270\tCG1116.a\t3\t+\t1045389\t1047270\t0\t6\t188,218,272,150,577,157,\t0,248,531,868,1081,1724,\n+3R\t1045389\t1047270\tCG1116-RA\t1\t+\t1045389\t1047270\t0\t6\t188,215,272,150,577,157,\t0,251,531,868,1081,1724,\n+3R\t1045389\t1047270\tCG1116-RB\t3\t+\t1045389\t1047270\t0\t5\t466,272,150,577,157,\t0,531,868,1081,1724,\n+3R\t74438\t76518\tCG14643-RA\t1\t-\t74438\t76518\t0\t4\t301,655,608,62,\t0,474,1190,2018,\n+3R\t403660\t404366.0\tCG32945-RA.3d\t31\t-\t403660\t404366.0\t0\t1\t706,\t0,\n+3R\t403660\t404368.0\tCG32945.a\t34\t-\t403660\t404368.0\t0\t1\t708,\t0,\n+3R\t736278\t771295\tCG31536.a\t2\t+\t736278\t771295\t0\t13\t529,735,233,222,244,161,75,107,116,256,1266,487,599,\t0,6840,12035,12390,12677,12975,13197,14989,15337,15519,30118,31745,34418,\n+3R\t736277\t749883\tCG31536-RC\t1\t+\t736277\t749883\t0\t7\t530,735,233,222,244,161,408,\t0,6841,12036,12391,12678,12976,13198,\n+3R\t23012\t30295\tCG12582.a\t3\t+\t23012\t30295\t0\t9\t272,135,552,97,289,480,422,381,361,\t0,446,930,1539,1714,4552,5085,6491,6922,\n+3R\t22996\t30295\tCG12582-RA\t1\t+\t22996\t30295\t0\t9\t288,135,514,97,289,480,422,381,361,\t0,462,984,1555,1730,4568,5101,6507,6938,\n+3R\t22930\t30295\tCG12582.b\t3\t+\t22930\t30295\t0\t9\t93,457,514,97,289,480,422,381,361,\t0,206,1050,1621,1796,4634,5167,6573,7004,\n+3R\t23029\t30295\tCG12582-RB\t1\t+\t23029\t30295\t0\t8\t564,51
4,97,289,480,422,381,361,\t0,951,1522,1697,4535,5068,6474,6905,\n+3R\t531867\t537915\tCG31534-RC\t3\t+\t531867\t537915\t0\t7\t220,1081,374,911,200,148,1296,\t0,880,2530,3014,4031,4470,4752,\n+3R\t531867\t537915\tCG31534-RB\t1\t+\t531867\t537915\t0\t7\t220,1081,374,911,176,105,1296,\t0,880,2530,3014,4031,4470,4752,\n+3R\t531867\t537915\tCG31534-RA\t3\t+\t531867\t537915\t0\t6\t220,1081,374,911,176,1296,\t0,880,2530,3014,4031,4752,\n+3R\t480549\t483707\tCG12001-RA\t1\t+\t480549\t483707\t0\t4\t252,1138,307,405,\t0,690,2075,2753,\n+3R\t1084192\t1084867.0\tCG14666-RA\t34\t-\t1084192\t1084867.0\t0\t1\t675,\t0,\n+3R\t576128\t598807\tCG31530-RA\t1\t-\t576128\t598807\t0\t5\t2184,169,155,869,153,\t0,2390,2633,7136,22526,\n+3R\t1058267\t1062011\tCG12005-RB\t1\t+\t1058267\t1062011\t0\t6\t181,384,350,1446,423,447,\t0,245,839,1247,2760,3297,\n+3R\t807720\t809954\tCG14662-RA\t1\t-\t807720\t809954\t0\t2\t395,1605,\t0,629,\n+3R\t1061779\t1063038\tCG10233-RA\t1\t-\t1061779\t1063038\t0\t3\t609,301,85,\t0,815,1174,\n+3R\t1062471\t1063038\tCG10233-RB\t2\t-\t1062471\t1063038\t0\t2\t424,85,\t0,482,\n+3R\t606846\t610444\tCG17387-RA.3d\t1\t-\t606846\t610444\t0\t4\t841,1809,148,290,\t0,1162,3102,3308,\n+3R\t701726\t704255\tCG14660-RA\t1\t-\t701726\t704255\t0\t3\t727,1617,57,\t0,804,2472,\n+3R\t909774\t912749.0\tCG2530.a\t31\t-\t909774\t912749.0\t0\t1\t2975,\t0,\n+3R\t909342\t912749.0\tCG2530-RA.5d\t34\t-\t909342\t912749.0\t0\t1\t3407,\t0,\n+3R\t94942\t102759\tCG9766.a\t2\t-\t94942\t102759\t0\t4\t568,205,246,56,\t0,627,888,7761,\n+3R\t94942\t103515\tCG9766-RB\t1\t-\t94942\t103515\t0\t4\t568,205,246,165,\t0,627,888,8408,\n+3R\t976629\t995849\tCG12591-RA\t1\t+\t976629\t995849\t0\t6\t379,575,1094,227,106,680,\t0,2615,14430,15955,16252,18540,\n+3R\t207031\t212741\tCG1084-RA\t1\t+\t207031\t212741\t0\t8\t220,939,1686,167,873,169,134,551,\t0,569,1560,3581,3811,4740,4970,5159,\n+3R\t204643\t206932\tCG11739-RD\t2\t+\t204643\t206932\t0\t7\t88,169,150,118,162,128,508,\t0,528,753,1006,1187,1582,1781,\n+3R
\t204400\t206932\tCG11739-RA\t2\t+\t204400\t206932\t0\t7\t331,169,150,118,162,128,508,\t0,771,996,1249,1430,1825,2024,\n+3R\t204385\t206932\tCG11739-RC\t1\t+\t204385\t206932\t0\t7\t66,169,150,118,162,128,508,\t0,786,1011,1264,1445,1840,2039,\n+3R\t205028\t206932\tCG11739-RB\t1\t+\t205028\t206932\t0\t6\t312,150,118,162,128,508,\t0,368,621,802,1197,1396,\n+3R\t612766\t620844\tCG17735-RB\t3\t-\t612766\t620844\t0\t7\t1243,202,189,1836,989,1105,1905,\t0,1329,1636,1888,3831,4913,6173,\n+3R\t656570\t657019.0\tC'..b'G32490-RP\t3\t+\t107973\t127263\t0\t7\t63,161,181,158,133,1098,1752,\t0,422,1974,12877,13316,15236,17538,\n+3R\t107624\t127263\tCG32490-RN\t3\t+\t107624\t127263\t0\t7\t101,161,181,158,133,1098,1752,\t0,771,2323,13226,13665,15585,17887,\n+3R\t107569\t127263\tCG32490.g\t3\t+\t107569\t127263\t0\t7\t68,161,181,167,133,1098,1752,\t0,826,2378,13272,13720,15640,17942,\n+3R\t107551\t127263\tCG32490-RO\t3\t+\t107551\t127263\t0\t7\t101,161,181,158,133,1098,1752,\t0,844,2396,13299,13738,15658,17960,\n+3R\t107551\t127263\tCG32490-RM\t3\t+\t107551\t127263\t0\t7\t174,161,181,158,133,1098,1752,\t0,844,2396,13299,13738,15658,17960,\n+3R\t107426\t127263\tCG32490.h\t3\t+\t107426\t127263\t0\t7\t167,161,181,167,133,1098,1752,\t0,969,2521,13415,13863,15783,18085,\n+3R\t107425\t127263\tCG32490-RC\t1\t+\t107425\t127263\t0\t7\t168,161,181,158,133,1098,1752,\t0,970,2522,13425,13864,15784,18086,\n+3R\t106272\t127263\tCG32490-RH\t3\t+\t106272\t127263\t0\t7\t272,161,181,167,133,1098,1752,\t0,2123,3675,14569,15017,16937,19239,\n+3R\t106110\t127263\tCG32490-RI\t3\t+\t106110\t127263\t0\t7\t81,161,181,167,133,1098,1752,\t0,2285,3837,14731,15179,17099,19401,\n+3R\t105905\t127263\tCG32490-RG\t3\t+\t105905\t127263\t0\t7\t173,161,181,167,133,1098,1752,\t0,2490,4042,14936,15384,17304,19606,\n+3R\t108258\t127263\tCG32490-RA\t3\t+\t108258\t127263\t0\t6\t298,181,167,133,1098,1752,\t0,1689,12583,13031,14951,17253,\n+3R\t120616\t127263\tCG32490-RJ\t1\t+\t120616\t127263\t0\t5\t50,167,133,1098,1752,\t0,22
5,673,2593,4895,\n+3R\t117669\t127263\tCG32490-RL\t3\t+\t117669\t127263\t0\t5\t125,167,133,1098,1752,\t0,3172,3620,5540,7842,\n+3R\t117459\t127263\tCG32490-RK\t3\t+\t117459\t127263\t0\t5\t532,167,133,1098,1752,\t0,3382,3830,5750,8052,\n+3R\t107426\t128309\tCG32490.i\t2\t+\t107426\t128309\t0\t6\t167,161,181,167,133,62,\t0,969,2521,13415,13863,20821,\n+3R\t110073\t128309\tCG32490-RE\t1\t+\t110073\t128309\t0\t4\t55,167,133,62,\t0,10768,11216,18174,\n+3R\t263101\t267050\tCG14650-RA\t1\t+\t263101\t267050\t0\t6\t2380,154,125,103,293,582,\t0,2437,2647,2841,3013,3367,\n+3R\t212966\t215535\tCG10520-RB\t1\t+\t212966\t215535\t0\t3\t72,912,1355,\t0,236,1214,\n+3R\t678534\t695709\tCG1133-RA\t1\t+\t678534\t695709\t0\t3\t1242,170,1535,\t0,15330,15640,\n+3R\t163481\t165640\tCG1103-RA\t1\t+\t163481\t165640\t0\t3\t164,303,875,\t0,241,1284,\n+3R\t622112\t627445\tCG14656-RA\t3\t-\t622112\t627445\t0\t3\t1307,987,511,\t0,3301,4822,\n+3R\t474944\t480360\tCG1059-RA\t1\t+\t474944\t480360\t0\t5\t545,150,1095,1354,1187,\t0,1362,1576,2760,4229,\n+3R\t358949\t359693\tCG31526-RB\t2\t+\t358949\t359693\t0\t3\t204,320,104,\t0,265,640,\n+3R\t358949\t359666\tCG31526-RA\t1\t+\t358949\t359666\t0\t2\t204,452,\t0,265,\n+3R\t44183\t45852.0\tCG31516.a\t31\t-\t44183\t45852.0\t0\t1\t1669,\t0,\n+3R\t44178\t45852.0\tCG31516-RA\t34\t-\t44178\t45852.0\t0\t1\t1674,\t0,\n+3R\t782716\t787070\tCG2016-RB\t1\t-\t782716\t787070\t0\t7\t229,153,134,122,119,80,37,\t0,295,506,2642,2909,3899,4317,\n+3R\t782716\t787070\tCG2016.a\t3\t-\t782716\t787070\t0\t6\t229,153,134,122,119,37,\t0,295,506,2642,2909,4317,\n+3R\t782716\t787070\tCG2016.b\t2\t-\t782716\t787070\t0\t6\t229,153,134,122,119,455,\t0,295,506,2642,2909,3899,\n+3R\t37504\t53244\tCG1107-RA\t3\t+\t37504\t53244\t0\t13\t30,8,124,68,200,231,140,933,410,374,962,162,772,\t0,125,5661,6464,9335,9973,10275,10475,12816,13283,13715,14747,14968,\n+3R\t46716\t53244\tCG1107-RB\t1\t+\t46716\t53244\t0\t9\t323,231,140,933,410,374,962,162,772,\t0,761,1063,1263,3604,4071,4503,5535,5756,
\n+3R\t47365\t53244\tCG1107.a\t3\t+\t47365\t53244\t0\t8\t343,140,933,410,374,962,162,772,\t0,414,614,2955,3422,3854,4886,5107,\n+3R\t92675\t94166\tCG1092.a\t1\t+\t92675\t94166\t0\t2\t252,1184,\t0,307,\n+3R\t92675\t94166\tCG1092-RA\t1\t+\t92675\t94166\t0\t2\t252,1184,\t0,307,\n+3R\t92693\t94005.0\tCG1092-RB\t34\t+\t92693\t94005.0\t0\t1\t1312,\t0,\n+3R\t953811\t955661.0\tCG12007.a\t34\t+\t953811\t955661.0\t0\t1\t1850,\t0,\n+3R\t953809\t955665.0\tCG12007-RA\t34\t+\t953809\t955665.0\t0\t1\t1856,\t0,\n+3R\t224271\t227749\tCG9853-RB\t2\t-\t224271\t227749\t0\t4\t836,235,449,306,\t0,896,1192,3172,\n+3R\t224271\t225734\tCG9853-RA\t1\t-\t224271\t225734\t0\t3\t836,235,271,\t0,896,1192,\n+3R\t261943\t263051.0\tCG9804-RA\t31\t-\t261943\t263051.0\t0\t1\t1108,\t0,\n+3R\t160819\t161237.0\tCG14645-RA\t34\t+\t160819\t161237.0\t0\t1\t418,\t0,\n+3R\t160819\t161223.0\tCG14645.a\t31\t+\t160819\t161223.0\t0\t1\t404,\t0,\n+3R\t185509\t192577\tCG1090.b\t3\t+\t185509\t192577\t0\t6\t500,231,945,976,189,907,\t0,3490,3774,4778,5914,6161,\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/MB7_3R.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/MB7_3R.gff3 Thu Apr 23 17:57:49 2015 -0400 |
b |
b'@@ -0,0 +1,3971 @@\n+##gff-version 3\n+3R\tMB7\tgene\t361\t10200\t0\t+\t.\tID=CG12581;Name=CG12581\n+3R\tMB7\tmRNA\t361\t10200\t3\t+\t.\tID=CG12581-RB;Parent=CG12581;Name=CG12581-RB\n+3R\tMB7\texon\t361\t509\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\texon\t578\t1913\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\texon\t7784\t8649\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\texon\t9439\t10200\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\tfive_prime_UTR\t361\t509\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\tfive_prime_UTR\t578\t1114\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\tstart_codon\t1115\t1117\t0\t+\t0\tParent=CG12581-RB\n+3R\tMB7\tCDS\t1115\t1913\t0\t+\t0\tParent=CG12581-RB\n+3R\tMB7\tCDS\t7784\t8649\t0\t+\t2\tParent=CG12581-RB\n+3R\tMB7\tCDS\t9439\t9771\t0\t+\t0\tParent=CG12581-RB\n+3R\tMB7\tstop_codon\t9769\t9771\t0\t+\t0\tParent=CG12581-RB\n+3R\tMB7\tthree_prime_UTR\t9772\t10200\t0\t+\t.\tParent=CG12581-RB\n+3R\tMB7\tmRNA\t380\t10200\t1\t+\t.\tID=CG12581-RA;Parent=CG12581;Name=CG12581-RA\n+3R\tMB7\texon\t380\t1913\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\texon\t7784\t8649\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\texon\t9439\t10200\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\tfive_prime_UTR\t380\t1114\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\tstart_codon\t1115\t1117\t0\t+\t0\tParent=CG12581-RA\n+3R\tMB7\tCDS\t1115\t1913\t0\t+\t0\tParent=CG12581-RA\n+3R\tMB7\tCDS\t7784\t8649\t0\t+\t2\tParent=CG12581-RA\n+3R\tMB7\tCDS\t9439\t9771\t0\t+\t0\tParent=CG12581-RA\n+3R\tMB7\tstop_codon\t9769\t9771\t0\t+\t0\tParent=CG12581-RA\n+3R\tMB7\tthree_prime_UTR\t9772\t10200\t0\t+\t.\tParent=CG12581-RA\n+3R\tMB7\tgene\t15388\t16170\t0\t-\t.\tID=CG18090;Name=CG18090\n+3R\tMB7\tmRNA\t15414\t15982\t31\t-\t.\tID=CG18090.a;Parent=CG18090;Name=CG18090.a\n+3R\tMB7\texon\t15414\t15982\t0\t-\t.\tParent=CG18090.a\n+3R\tMB7\tthree_prime_UTR\t15414\t15529\t0\t-\t.\tParent=CG18090.a\n+3R\tMB7\tstop_codon\t15530\t15532\t0\t-\t0\tParent=CG18090.a\n+3R\tMB7\tCDS\t15530\t15955\t0\t-\t0\tParent=CG18090.a\n+3R\tMB7\tstart_codon\t15953\t15955\t0\t
-\t0\tParent=CG18090.a\n+3R\tMB7\tfive_prime_UTR\t15956\t15982\t0\t-\t.\tParent=CG18090.a\n+3R\tMB7\tmRNA\t15388\t16170\t34\t-\t.\tID=CG18090-RA;Parent=CG18090;Name=CG18090-RA\n+3R\tMB7\texon\t15388\t16170\t0\t-\t.\tParent=CG18090-RA\n+3R\tMB7\tthree_prime_UTR\t15388\t15529\t0\t-\t.\tParent=CG18090-RA\n+3R\tMB7\tstop_codon\t15530\t15532\t0\t-\t0\tParent=CG18090-RA\n+3R\tMB7\tCDS\t15530\t15955\t0\t-\t0\tParent=CG18090-RA\n+3R\tMB7\tstart_codon\t15953\t15955\t0\t-\t0\tParent=CG18090-RA\n+3R\tMB7\tfive_prime_UTR\t15956\t16170\t0\t-\t.\tParent=CG18090-RA\n+3R\tMB7\tgene\t17136\t21871\t0\t+\t.\tID=DMG5-MB6.chr3R.1.002.a;Name=DMG5-MB6.chr3R.1.002.a\n+3R\tMB7\tmRNA\t17136\t21871\t2\t+\t.\tID=DMG5-MB6.chr3R.1.002.a.a;Parent=DMG5-MB6.chr3R.1.002.a;Name=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t17136\t17251\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t19953\t20047\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t20114\t20599\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t20671\t21210\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t21367\t21534\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\texon\t21591\t21871\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tstart_codon\t17136\t17138\t0\t+\t0\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tCDS\t17136\t17251\t0\t+\t0\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tCDS\t19953\t20047\t0\t+\t1\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tCDS\t20114\t20599\t0\t+\t2\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tCDS\t20671\t20759\t0\t+\t2\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tstop_codon\t20757\t20759\t0\t+\t0\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tthree_prime_UTR\t20760\t21210\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tthree_prime_UTR\t21367\t21534\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tthree_prime_UTR\t21591\t21871\t0\t+\t.\tParent=DMG5-MB6.chr3R.1.002.a.a\n+3R\tMB7\tgene\t22931\t30295\t0\t+\t.\tID=CG12582;Name=CG12582\n+3R\tMB7\tmRNA\t23013\t30295\t3\t+
\t.\tID=CG12582.a;Parent=CG12582;Name=CG12582.a\n+3R\tMB7\texon\t23013\t23284\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t23459\t23593\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t23943\t24494\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t24552\t24648\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t24727\t25015\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t27565\t28044\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t28098\t28519\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t29504\t29884\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\texon\t29935\t30295\t0\t+\t.\tParent=CG12582.a\n+3R\tMB7\tfive_prime_UTR\t23013\t23284\t0\t+\t.\tParent=CG125'..b'517\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1121579\t1121685\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1121869\t1122357\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1123924\t1124211\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1125192\t1125295\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1129833\t1129904\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1138711\t1139219\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1139660\t1140027\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1148710\t1148847\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\texon\t1149387\t1149566\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tthree_prime_UTR\t1098665\t1099668\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tstop_codon\t1099669\t1099671\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1099669\t1099804\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1099871\t1100040\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1100457\t1100616\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1100688\t1100809\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1118362\t1118563\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1118720\t1118882\t0\t-\t2\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1118941\t1119092\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1119784\t1119956\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1120028\t1120577\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1121363\t1121517\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1121579\t112168
5\t0\t-\t2\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1121869\t1122357\t0\t-\t2\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1123924\t1124211\t0\t-\t2\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1125192\t1125295\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1129833\t1129904\t0\t-\t1\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1138711\t1139219\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tCDS\t1139660\t1139920\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tstart_codon\t1139918\t1139920\t0\t-\t0\tParent=CG32464-RB\n+3R\tMB7\tfive_prime_UTR\t1139921\t1140027\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tfive_prime_UTR\t1148710\t1148847\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tfive_prime_UTR\t1149387\t1149566\t0\t-\t.\tParent=CG32464-RB\n+3R\tMB7\tmRNA\t1098665\t1149566\t3\t-\t.\tID=CG32464-RU;Parent=CG32464;Name=CG32464-RU\n+3R\tMB7\texon\t1098665\t1099804\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1099871\t1100040\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1100457\t1100616\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1100688\t1100809\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1118362\t1118563\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1118720\t1118882\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1118941\t1119092\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1119784\t1119956\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1120028\t1120577\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1121363\t1121517\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1121579\t1121685\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1121869\t1122357\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1123924\t1124211\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1125192\t1125295\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1138711\t1139219\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1139660\t1140027\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1148710\t1148847\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\texon\t1149387\t1149566\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\tthree_prime_UTR\t1098665\t1099668\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\tstop_codon\t1099669\t1099671\t
0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1099669\t1099804\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1099871\t1100040\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1100457\t1100616\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1100688\t1100809\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1118362\t1118563\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1118720\t1118882\t0\t-\t2\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1118941\t1119092\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1119784\t1119956\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1120028\t1120577\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1121363\t1121517\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1121579\t1121685\t0\t-\t2\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1121869\t1122357\t0\t-\t2\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1123924\t1124211\t0\t-\t2\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1125192\t1125295\t0\t-\t1\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1138711\t1139219\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tCDS\t1139660\t1139920\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tstart_codon\t1139918\t1139920\t0\t-\t0\tParent=CG32464-RU\n+3R\tMB7\tfive_prime_UTR\t1139921\t1140027\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\tfive_prime_UTR\t1148710\t1148847\t0\t-\t.\tParent=CG32464-RU\n+3R\tMB7\tfive_prime_UTR\t1149387\t1149566\t0\t-\t.\tParent=CG32464-RU\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/aceview_hs_37.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/aceview_hs_37.gff3 Thu Apr 23 17:57:49 2015 -0400 |
b |
b'@@ -0,0 +1,3164 @@\n+##gff-version 3\n+7\tAceView\tgene\t34386126\t34873948\t.\t-\t.\tID=AAA1;Name=AAA1\n+7\tAceView\ttranscript\t34606334\t34797884\t.\t-\t.\tID=AAA1.jAug10;Parent=AAA1\n+7\tAceView\texon\t34606334\t34606424\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34606693\t34606763\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34609324\t34609473\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34743692\t34743811\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\texon\t34797686\t34797884\t.\t-\t.\tParent=AAA1.jAug10\n+7\tAceView\tmRNA\t34682839\t34800803\t.\t-\t.\tID=AAA1.dAug10;Parent=AAA1\n+7\tAceView\tfive_prime_UTR\t34800803\t34800803\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\tCDS\t34682958\t34682963\t.\t-\t0\tParent=AAA1.dAug10\n+7\tAceView\tCDS\t34768349\t34768428\t.\t-\t2\tParent=AAA1.dAug10\n+7\tAceView\tCDS\t34800724\t34800802\t.\t-\t0\tParent=AAA1.dAug10\n+7\tAceView\tthree_prime_UTR\t34682839\t34682957\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\texon\t34682839\t34682963\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\texon\t34800724\t34800803\t.\t-\t.\tParent=AAA1.dAug10\n+7\tAceView\ttranscript\t34758474\t34873943\t.\t-\t.\tID=AAA1.hAug10;Parent=AAA1\n+7\tAceView\texon\t34758474\t34759420\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\texon\t34762896\t34763007\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\texon\t34807954\t34808052\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\texon\t34873773\t34873943\t.\t-\t.\tParent=AAA1.hAug10\n+7\tAceView\ttranscript\t34386126\t34797884\t.\t-\t.\tID=AAA1.eAug10;Parent=AAA1\n+7\tAceView\texon\t34386126\t34390459\t.\t-\t.\tParent=AAA1.eAug10\n+7\tAceView\texon\t34457191\t34457284\t.\t-\t.\tParent=AAA1.eAug10\n+7\tAceView\texon\t34609324\t34609473\t.\t-\t.\tParent=AAA1.eAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AA
A1.eAug10\n+7\tAceView\texon\t34797686\t34797884\t.\t-\t.\tParent=AAA1.eAug10\n+7\tAceView\tmRNA\t34386126\t34797884\t.\t-\t.\tID=AAA1.bAug10;Parent=AAA1\n+7\tAceView\tfive_prime_UTR\t34797711\t34797884\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\tCDS\t34457198\t34457284\t.\t-\t0\tParent=AAA1.bAug10\n+7\tAceView\tCDS\t34768349\t34768428\t.\t-\t2\tParent=AAA1.bAug10\n+7\tAceView\tCDS\t34797686\t34797710\t.\t-\t0\tParent=AAA1.bAug10\n+7\tAceView\tthree_prime_UTR\t34386126\t34390459\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\tthree_prime_UTR\t34457191\t34457197\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\texon\t34386126\t34390459\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\texon\t34457191\t34457284\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\texon\t34797686\t34797884\t.\t-\t.\tParent=AAA1.bAug10\n+7\tAceView\ttranscript\t34390034\t34800803\t.\t-\t.\tID=AAA1.iAug10;Parent=AAA1\n+7\tAceView\texon\t34390034\t34390459\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\texon\t34457191\t34457284\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\texon\t34609324\t34609473\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\texon\t34800724\t34800803\t.\t-\t.\tParent=AAA1.iAug10\n+7\tAceView\tmRNA\t34743462\t34800803\t.\t-\t.\tID=AAA1.cAug10;Parent=AAA1\n+7\tAceView\tfive_prime_UTR\t34800803\t34800803\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\tCDS\t34743797\t34743811\t.\t-\t0\tParent=AAA1.cAug10\n+7\tAceView\tCDS\t34768349\t34768428\t.\t-\t2\tParent=AAA1.cAug10\n+7\tAceView\tCDS\t34800724\t34800802\t.\t-\t0\tParent=AAA1.cAug10\n+7\tAceView\tthree_prime_UTR\t34743462\t34743796\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\texon\t34743462\t34743811\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\texon\t34800724\t34800803\t.\t-\t.\tParent=AAA1.cAug10\n+7\tAceView\ttranscript\t34758474\t34873941\t.\t-\t.\tID=AAA1.fAug10;Parent=
AAA1\n+7\tAceView\texon\t34758474\t34759420\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34760254\t34760397\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34762896\t34763007\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34768349\t34768428\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34800724\t34800803\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\texon\t34873749\t34873941\t.\t-\t.\tParent=AAA1.fAug10\n+7\tAceView\tmRNA\t34607864\t34797884\t.\t-\t.\tID=AAA1.aAug10;Parent=AAA1\n+7\tAceView\tfive_prime_UTR\t34797711\t34797884\t.\t-\t.\tParent=AAA1.aAug10\n+7\tAceView\t'..b'ceView\tCDS\t219134689\t219134809\t.\t-\t0\tParent=AAMP.gAug10\n+2\tAceView\tthree_prime_UTR\t219128853\t219129331\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219128853\t219129331\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219129739\t219129897\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219130094\t219130184\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219130302\t219130405\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219130554\t219130669\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219130787\t219130870\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219131166\t219131310\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219131570\t219131709\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219132217\t219132336\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219134105\t219134257\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\texon\t219134689\t219134857\t.\t-\t.\tParent=AAMP.gAug10\n+2\tAceView\tmRNA\t219128851\t219134882\t.\t-\t.\tID=AAMP.cAug10;Parent=AAMP\n+2\tAceView\tfive_prime_UTR\t219134810\t219134882\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219129256\t219129331\t.\t-\t1\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219129743\t219129897\t.\t-\t0\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219130094\t219130184\t.\t-\t1\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219130302\t219130405\t.\t-\t0\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219130554\t219130669\t.\t-\t2\tParent=AAMP.cAug
10\n+2\tAceView\tCDS\t219130787\t219130870\t.\t-\t2\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219131166\t219131310\t.\t-\t0\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219131570\t219131709\t.\t-\t2\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219132217\t219132336\t.\t-\t2\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219134105\t219134260\t.\t-\t2\tParent=AAMP.cAug10\n+2\tAceView\tCDS\t219134689\t219134809\t.\t-\t0\tParent=AAMP.cAug10\n+2\tAceView\tthree_prime_UTR\t219128851\t219129255\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219128851\t219129331\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219129743\t219129897\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219130094\t219130184\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219130302\t219130405\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219130554\t219130669\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219130787\t219130870\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219131166\t219131310\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219131570\t219131709\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219132217\t219132336\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219134105\t219134260\t.\t-\t.\tParent=AAMP.cAug10\n+2\tAceView\texon\t219134689\t219134882\t.\t-\t.\tParent=AAMP.cAug10\n+3\tAceView\tgene\t151451704\t151479124\t.\t+\t.\tID=AADACL2;Name=AADACL2\n+3\tAceView\tmRNA\t151451704\t151475667\t.\t+\t.\tID=AADACL2.aAug10;Parent=AADACL2\n+3\tAceView\tfive_prime_UTR\t151451704\t151451823\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151451824\t151451961\t.\t+\t0\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151458434\t151458656\t.\t+\t0\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151461881\t151461950\t.\t+\t2\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151463297\t151463468\t.\t+\t1\tParent=AADACL2.aAug10\n+3\tAceView\tCDS\t151474780\t151475382\t.\t+\t0\tParent=AADACL2.aAug10\n+3\tAceView\tthree_prime_UTR\t151475383\t151475667\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151451704\t151451961\t.\t+
\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151458434\t151458656\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151461881\t151461950\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151463297\t151463468\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\texon\t151474780\t151475667\t.\t+\t.\tParent=AADACL2.aAug10\n+3\tAceView\tmRNA\t151451704\t151479124\t.\t+\t.\tID=AADACL2.bAug10;Parent=AADACL2\n+3\tAceView\tfive_prime_UTR\t151451704\t151451948\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\tCDS\t151451949\t151451961\t.\t+\t0\tParent=AADACL2.bAug10\n+3\tAceView\tCDS\t151461881\t151461950\t.\t+\t2\tParent=AADACL2.bAug10\n+3\tAceView\tCDS\t151463297\t151463468\t.\t+\t1\tParent=AADACL2.bAug10\n+3\tAceView\tCDS\t151474780\t151475382\t.\t+\t0\tParent=AADACL2.bAug10\n+3\tAceView\tthree_prime_UTR\t151475383\t151479124\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\texon\t151451704\t151451961\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\texon\t151461881\t151461950\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\texon\t151463297\t151463468\t.\t+\t.\tParent=AADACL2.bAug10\n+3\tAceView\texon\t151474780\t151479124\t.\t+\t.\tParent=AADACL2.bAug10\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/aceview_hs_37.gtf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/aceview_hs_37.gtf Thu Apr 23 17:57:49 2015 -0400 |
b |
b'@@ -0,0 +1,3989 @@\n+11\tAceView\texon\t111933358\t111934981\t.\t-\t0\tgene_id 2-oxoacid_dh; Gene_type cDNA_supported; transcript_id 2-oxoacid_dh.aAug10-unspliced; exon_number 1\n+19\tAceView\tCDS\t58859154\t58859210\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; product_id A1BGAS.aAug10; exon_number 1\n+19\tAceView\texon\t58859153\t58859210\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; exon_number 1\n+19\tAceView\tintron\t58859211\t58864686\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; type gt_ag\n+19\tAceView\tCDS\t58864687\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; product_id A1BGAS.aAug10; exon_number 2\n+19\tAceView\texon\t58864687\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; exon_number 2\n+19\tAceView\tintron\t58864841\t58865079\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; type gt_ag\n+19\tAceView\tCDS\t58865080\t58865114\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; product_id A1BGAS.aAug10; exon_number 3\n+19\tAceView\texon\t58865080\t58865223\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; exon_number 3\n+19\tAceView\tstop_codon\t58865115\t58865117\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; product_id A1BGAS.aAug10;\n+19\tAceView\tintron\t58865224\t58865734\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; type gt_ag\n+19\tAceView\texon\t58865735\t58866090\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.aAug10; exon_number 4\n+19\tAceView\tstart_codon\t58864404\t58864406\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10;\n+19\tAceView\tCDS\t58864404\t58864410\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; 
transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10; exon_number 1\n+19\tAceView\texon\t58862110\t58864410\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; exon_number 1\n+19\tAceView\tintron\t58864411\t58864744\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; type gt_ag\n+19\tAceView\tCDS\t58864745\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10; exon_number 2\n+19\tAceView\texon\t58864745\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; exon_number 2\n+19\tAceView\tintron\t58864841\t58865079\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; type gt_ag\n+19\tAceView\tCDS\t58865080\t58865114\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10; exon_number 3\n+19\tAceView\texon\t58865080\t58865223\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; exon_number 3\n+19\tAceView\tstop_codon\t58865115\t58865117\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; product_id A1BGAS.bAug10;\n+19\tAceView\tintron\t58865224\t58865734\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; type gt_ag\n+19\tAceView\texon\t58865735\t58866548\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.bAug10; exon_number 4\n+19\tAceView\texon\t58859122\t58859210\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.cAug10; exon_number 1\n+19\tAceView\tintron\t58859211\t58864686\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.cAug10; type gt_ag\n+19\tAceView\texon\t58864687\t58864840\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.cAug10; exon_number 2\n+19\tAceView\tintron\t58864841\t58865079\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id 
A1BGAS.cAug10; type gt_ag\n+19\tAceView\tstart_codon\t58865831\t58865833\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; transcript_id A1BGAS.cAug10; product_id A1BGAS.cAug10;\n+19\tAceView\tCDS\t58865831\t58866547\t.\t+\t0\tgene_id A1BGAS; Gene_type cDNA_supported; tran'..b'codon\t219129739\t219129741\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.gAug10; product_id AAMP.gAug10;\n+2\tAceView\tintron\t219129332\t219129738\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.gAug10; type gt_ag\n+2\tAceView\texon\t219128853\t219129331\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.gAug10; exon_number 11\n+2\tAceView\tstart_codon\t219134807\t219134809\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10;\n+2\tAceView\tCDS\t219134689\t219134809\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 1\n+2\tAceView\texon\t219134689\t219134843\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 1\n+2\tAceView\tintron\t219134258\t219134688\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219134105\t219134257\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 2\n+2\tAceView\texon\t219134105\t219134257\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 2\n+2\tAceView\tintron\t219132337\t219134104\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219132217\t219132336\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 3\n+2\tAceView\texon\t219132217\t219132336\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 
3\n+2\tAceView\tintron\t219131710\t219132216\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219131570\t219131709\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 4\n+2\tAceView\texon\t219131570\t219131709\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 4\n+2\tAceView\tintron\t219131311\t219131569\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219131166\t219131310\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 5\n+2\tAceView\texon\t219131166\t219131310\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 5\n+2\tAceView\tintron\t219130871\t219131165\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219130787\t219130870\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 6\n+2\tAceView\texon\t219130787\t219130870\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 6\n+2\tAceView\tintron\t219130670\t219130786\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\tCDS\t219130392\t219130669\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10; exon_number 7\n+2\tAceView\texon\t219130302\t219130669\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 7\n+2\tAceView\tstop_codon\t219130389\t219130391\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; product_id AAMP.hAug10;\n+2\tAceView\tintron\t219130185\t219130301\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type 
gt_ag\n+2\tAceView\texon\t219130094\t219130184\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 8\n+2\tAceView\tintron\t219129898\t219130093\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\texon\t219129743\t219129897\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 9\n+2\tAceView\tintron\t219129332\t219129742\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; type gt_ag\n+2\tAceView\texon\t219128853\t219129331\t.\t-\t0\tgene_id AAMP; Gene_type cDNA_supported; transcript_id AAMP.hAug10; exon_number 10\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/ens_mm9_chr18.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ens_mm9_chr18.gff3 Thu Apr 23 17:57:49 2015 -0400 |
b |
b'@@ -0,0 +1,1165 @@\n+##gff-version 3\n+18\tlincRNA\tgene\t3336414\t3366861\t.\t+\t.\tID=ENSMUSG00000091488;Name=AC124336.2\n+18\tlincRNA\ttranscript\t3336414\t3366861\t.\t+\t.\tID=ENSMUST00000171726;Parent=ENSMUSG00000091488;Name=AC124336.2-201\n+18\tlincRNA\texon\t3336414\t3337176\t.\t+\t.\tParent=ENSMUST00000171726\n+18\tlincRNA\texon\t3365925\t3366861\t.\t+\t.\tParent=ENSMUST00000171726\n+18\tprotein_coding\tgene\t9314042\t9450148\t.\t-\t.\tID=ENSMUSG00000024286;Name=Ccny\n+18\tprotein_coding\tmRNA\t9314042\t9450148\t.\t-\t.\tID=ENSMUST00000053917;Parent=ENSMUSG00000024286;Name=Ccny-201\n+18\tprotein_coding\tfive_prime_UTR\t9449670\t9450148\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9316554\t9316670\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9319407\t9319569\t.\t-\t1\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9332782\t9332948\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9345192\t9345311\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9345412\t9345469\t.\t-\t1\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9349386\t9349421\t.\t-\t1\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9353405\t9353505\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9377792\t9377826\t.\t-\t2\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9386733\t9386807\t.\t-\t2\tParent=ENSMUST00000053917\n+18\tprotein_coding\tCDS\t9449516\t9449669\t.\t-\t0\tParent=ENSMUST00000053917\n+18\tprotein_coding\tthree_prime_UTR\t9314042\t9316553\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9314042\t9316670\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9319407\t9319569\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9332782\t9332948\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9345192\t9345311\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9345412\t9345469\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein
_coding\texon\t9349386\t9349421\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9353405\t9353505\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9377792\t9377826\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9386733\t9386807\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\texon\t9449516\t9450148\t.\t-\t.\tParent=ENSMUST00000053917\n+18\tprotein_coding\tmRNA\t9314042\t9450148\t.\t-\t.\tID=ENSMUST00000115867;Parent=ENSMUSG00000024286;Name=Ccny-202\n+18\tprotein_coding\tfive_prime_UTR\t9449670\t9450148\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9316554\t9316670\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9319407\t9319569\t.\t-\t1\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9332782\t9332948\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9345192\t9345311\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9345412\t9345469\t.\t-\t1\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9349386\t9349421\t.\t-\t1\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9353405\t9353505\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9377792\t9377826\t.\t-\t2\tParent=ENSMUST00000115867\n+18\tprotein_coding\tCDS\t9449516\t9449669\t.\t-\t0\tParent=ENSMUST00000115867\n+18\tprotein_coding\tthree_prime_UTR\t9314042\t9316553\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9314042\t9316670\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9319407\t9319569\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9332782\t9332948\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9345192\t9345311\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9345412\t9345469\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9349386\t9349421\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9353405\t9353505\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\
texon\t9377792\t9377826\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tprotein_coding\texon\t9449516\t9450148\t.\t-\t.\tParent=ENSMUST00000115867\n+18\tmiRNA\tgene\t10782897\t10782983\t.\t-\t.\tID=ENSMUSG00000065399;Name=Mir133a-1\n+18\tmiRNA\ttranscript\t10782897\t10782983\t.\t-\t.\tID=ENSMUST00000083465;Parent=ENSMUSG00000065399;Name=Mir133a-1-201\n+18\tmiRNA\texon\t10782897\t10782983\t.\t-\t.\tParent=ENSMUST00000083465\n+18\tprotein_coding\tgene\t9726195\t9726668\t.\t-\t.\tID='..b'ein_coding\tCDS\t7441551\t7441636\t.\t-\t2\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7442791\t7442872\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7443972\t7444103\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7458930\t7459010\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7461636\t7461713\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7512942\t7513060\t.\t-\t2\tParent=ENSMUST00000115869\n+18\tprotein_coding\tCDS\t7561724\t7561760\t.\t-\t0\tParent=ENSMUST00000115869\n+18\tprotein_coding\tthree_prime_UTR\t7347960\t7350962\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7347960\t7351142\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7353152\t7353295\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7355016\t7355124\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7356140\t7356233\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7379987\t7380067\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7403184\t7403354\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7439567\t7439631\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7440081\t7440277\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7440430\t7440504\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7441551\t7441636\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7442791\t7442872\t.\t-\t.\tParent=EN
SMUST00000115869\n+18\tprotein_coding\texon\t7443972\t7444103\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7458930\t7459010\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7461636\t7461713\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7512942\t7513060\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7561724\t7561870\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\texon\t7626731\t7626861\t.\t-\t.\tParent=ENSMUST00000115869\n+18\tprotein_coding\tmRNA\t7429282\t7626861\t.\t-\t.\tID=ENSMUST00000025129;Parent=ENSMUSG00000057440;Name=Mpp7-201\n+18\tprotein_coding\tfive_prime_UTR\t7561728\t7561870\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\tfive_prime_UTR\t7626731\t7626861\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7429282\t7429322\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7430259\t7430270\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7430393\t7430413\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7439567\t7439631\t.\t-\t1\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7440081\t7440277\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7440430\t7440504\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7441551\t7441636\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7442791\t7442872\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7443972\t7444103\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7458930\t7459010\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7461636\t7461713\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7512942\t7513060\t.\t-\t2\tParent=ENSMUST00000025129\n+18\tprotein_coding\tCDS\t7561724\t7561727\t.\t-\t0\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7429282\t7429322\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7430259\t7430270\t.\t-\t.\tParent=ENSMUST00000025
129\n+18\tprotein_coding\texon\t7430393\t7430413\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7439567\t7439631\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7440081\t7440277\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7440430\t7440504\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7441551\t7441636\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7442791\t7442872\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7443972\t7444103\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7458930\t7459010\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7461636\t7461713\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7512942\t7513060\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7561724\t7561870\t.\t-\t.\tParent=ENSMUST00000025129\n+18\tprotein_coding\texon\t7626731\t7626861\t.\t-\t.\tParent=ENSMUST00000025129\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/ens_mm9_chr18.gtf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ens_mm9_chr18.gtf Thu Apr 23 17:57:49 2015 -0400 |
b |
b'@@ -0,0 +1,1066 @@\n+18\tlincRNA\texon\t11049085\t11050819\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000138373"; exon_number "1"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11051256\t11051487\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000138373"; exon_number "2"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11049929\t11050254\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000133759"; exon_number "1"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11050622\t11050819\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000133759"; exon_number "2"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11051256\t11051366\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000133759"; exon_number "3"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t11052473\t11052565\t.\t-\t.\tgene_id "ENSMUSG00000087274"; transcript_id "ENSMUST00000133759"; exon_number "4"; gene_name "1010001N08Rik"; \n+18\tlincRNA\texon\t5162878\t5164430\t.\t-\t.\tgene_id "ENSMUSG00000085461"; transcript_id "ENSMUST00000150337"; exon_number "1"; gene_name "Gm16954"; \n+18\tlincRNA\texon\t5165286\t5165400\t.\t-\t.\tgene_id "ENSMUSG00000085461"; transcript_id "ENSMUST00000150337"; exon_number "2"; gene_name "Gm16954"; \n+18\tlincRNA\texon\t5165669\t5165729\t.\t-\t.\tgene_id "ENSMUSG00000085461"; transcript_id "ENSMUST00000150337"; exon_number "3"; gene_name "Gm16954"; \n+18\tprotein_coding\texon\t12657194\t12657637\t.\t-\t.\tgene_id "ENSMUSG00000090309"; transcript_id "ENSMUST00000172267"; exon_number "1"; gene_name "AC102131.1"; \n+18\tprotein_coding\tCDS\t12657194\t12657637\t.\t-\t0\tgene_id "ENSMUSG00000090309"; transcript_id "ENSMUST00000172267"; exon_number "1"; gene_name "AC102131.1"; \n+18\tprotein_coding\tstart_codon\t12657635\t12657637\t.\t-\t0\tgene_id "ENSMUSG00000090309"; transcript_id "ENSMUST00000172267"; exon_number "1"; gene_name "AC102131.1"; 
\n+18\tprotein_coding\tstop_codon\t12657194\t12657196\t.\t-\t0\tgene_id "ENSMUSG00000090309"; transcript_id "ENSMUST00000172267"; exon_number "1"; gene_name "AC102131.1"; \n+18\tlincRNA\texon\t11979185\t11979574\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "1"; gene_name "Gm6277"; \n+18\tlincRNA\texon\t11979624\t11980616\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "2"; gene_name "Gm6277"; \n+18\tlincRNA\texon\t11981407\t11981548\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "3"; gene_name "Gm6277"; \n+18\tlincRNA\texon\t11983673\t11983735\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "4"; gene_name "Gm6277"; \n+18\tlincRNA\texon\t11997690\t11997846\t.\t-\t.\tgene_id "ENSMUSG00000087420"; transcript_id "ENSMUST00000129627"; exon_number "5"; gene_name "Gm6277"; \n+18\tmisc_RNA\texon\t3860106\t3860428\t.\t+\t.\tgene_id "ENSMUSG00000084719"; transcript_id "ENSMUST00000122770"; exon_number "1"; gene_name "7SK.69"; \n+18\tprotein_coding\texon\t11815936\t11816201\t.\t+\t.\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "1"; gene_name "Rbbp8"; \n+18\tprotein_coding\tCDS\t11819342\t11819450\t.\t+\t0\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "1"; gene_name "Rbbp8"; \n+18\tprotein_coding\tstart_codon\t11819342\t11819344\t.\t+\t0\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "1"; gene_name "Rbbp8"; \n+18\tprotein_coding\texon\t11819244\t11819450\t.\t+\t.\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "2"; gene_name "Rbbp8"; \n+18\tprotein_coding\tCDS\t11831091\t11831133\t.\t+\t2\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "2"; gene_name "Rbbp8"; \n+18\tprotein_coding\texon\t11831091\t11831133\t.\t+\t.\tgene_id "ENSMUSG00000041238"; 
transcript_id "ENSMUST00000047322"; exon_number "3"; gene_name "Rbbp8"; \n+18\tprotein_coding\tCDS\t11836104\t11836199\t.\t+\t1\tgene_id "ENSMUSG00000041238"; transcript_id "ENSMUST00000047322"; exon_number "3"; gene_name "Rbbp8"; \n+18\tprotein_coding\texon\t11836104\t11836199\t.\t+\t.\tgene_id "ENSMUSG000000'..b'ST00000067947"; exon_number "21"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10112342\t10112390\t.\t-\t1\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "21"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10116772\t10116860\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "22"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10116772\t10116860\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "22"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10119883\t10119943\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "23"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10119883\t10119943\t.\t-\t1\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "23"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10122607\t10122766\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "24"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10122607\t10122766\t.\t-\t2\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "24"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10129303\t10129394\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "25"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10129303\t10129394\t.\t-\t1\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "25"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10131528\t10131666\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "26"; gene_name "Rock1"; 
\n+18\tprotein_coding\tCDS\t10131528\t10131666\t.\t-\t2\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "26"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10132126\t10132270\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "27"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10132126\t10132270\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "27"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10134414\t10134498\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "28"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10134414\t10134498\t.\t-\t1\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "28"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10136094\t10136269\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "29"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10136094\t10136269\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "29"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10140174\t10140311\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "30"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10140174\t10140311\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "30"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10140786\t10140886\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "31"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10140786\t10140886\t.\t-\t2\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "31"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10150233\t10150314\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "32"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10150233\t10150314\t.\t-\t0\tgene_id 
"ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "32"; gene_name "Rock1"; \n+18\tprotein_coding\texon\t10181223\t10181790\t.\t-\t.\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "33"; gene_name "Rock1"; \n+18\tprotein_coding\tCDS\t10181223\t10181315\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "33"; gene_name "Rock1"; \n+18\tprotein_coding\tstop_codon\t10066046\t10066048\t.\t-\t0\tgene_id "ENSMUSG00000024290"; transcript_id "ENSMUST00000067947"; exon_number "33"; gene_name "Rock1"; \n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/gencode_ens_hav.gtf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gencode_ens_hav.gtf Thu Apr 23 17:57:49 2015 -0400 |
b |
b'@@ -0,0 +1,50 @@\n+1\tHAVANA\tgene\t69091\t70008\t.\t+\t.\tgene_id "ENSG00000186092.4"; transcript_id "ENSG00000186092.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5"; level 2; havana_gene "OTTHUMG00000001094.1";\n+1\tHAVANA\ttranscript\t69091\t70008\t.\t+\t.\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\texon\t69091\t70008\t.\t+\t.\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1; exon_id "ENSE00002319515.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\tCDS\t69091\t70005\t.\t+\t0\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1; exon_id "ENSE00002319515.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\tstart_codon\t69091\t69093\t.\t+\t0\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1; exon_id "ENSE00002319515.1"; 
level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\tstop_codon\t70006\t70008\t.\t+\t0\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1; exon_id "ENSE00002319515.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tHAVANA\tUTR\t70006\t70008\t.\t+\t.\tgene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";\n+1\tENSEMBL\tgene\t134901\t139379\t.\t-\t.\tgene_id "ENSG00000237683.5"; transcript_id "ENSG00000237683.5"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL627309.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL627309.1"; level 3;\n+1\tENSEMBL\ttranscript\t134901\t139379\t.\t-\t.\tgene_id "ENSG00000237683.5"; transcript_id "ENST00000423372.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL627309.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL627309.1-201"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\texon\t137621\t139379\t.\t-\t.\tgene_id "ENSG00000237683.5"; transcript_id "ENST00000423372.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL627309.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL627309.1-201"; exon_number 1; exon_id "ENSE00002221580.1"; level 3; tag 
"basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t138533\t139309\t.\t-\t0\tgene_id "ENSG000002'..b'gene_name "AL669831.1"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL669831.1-201"; exon_number 3; exon_id "ENSE00003138540.1"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t738532\t738618\t.\t-\t0\tgene_id "ENSG00000269831.1"; transcript_id "ENST00000599533.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL669831.1"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL669831.1-201"; exon_number 3; exon_id "ENSE00003138540.1"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tgene\t818043\t819983\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENSG00000269308.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2"; level 3;\n+1\tENSEMBL\ttranscript\t818043\t819983\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\texon\t818043\t818058\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 1; exon_id "ENSE00003079649.1"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t818043\t818058\t.\t+\t0\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 1; exon_id "ENSE00003079649.1"; level 3; tag "basic"; tag 
"appris_principal";\n+1\tENSEMBL\texon\t819496\t819513\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 2; exon_id "ENSE00003048391.1"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t819496\t819513\t.\t+\t2\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 2; exon_id "ENSE00003048391.1"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\texon\t819961\t819983\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 3; exon_id "ENSE00003055565.1"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tCDS\t819961\t819980\t.\t+\t2\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 3; exon_id "ENSE00003055565.1"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tstop_codon\t819981\t819983\t.\t+\t0\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; exon_number 3; exon_id "ENSE00003055565.1"; level 3; tag "basic"; tag "appris_principal";\n+1\tENSEMBL\tUTR\t819981\t819983\t.\t+\t.\tgene_id "ENSG00000269308.1"; transcript_id "ENST00000594233.1"; gene_type "protein_coding"; 
gene_status "NOVEL"; gene_name "AL645608.2"; transcript_type "protein_coding"; transcript_status "NOVEL"; transcript_name "AL645608.2-201"; level 3; tag "basic"; tag "appris_principal";\n' |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/s_cerevisiae_SCU49845.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/s_cerevisiae_SCU49845.gff Thu Apr 23 17:57:49 2015 -0400 |
b |
@@ -0,0 +1,8 @@ +IX gbk2gff gene 687 3158 . + . ID=AXL2;Name=AXL2 +IX gbk2gff mRNA 687 3158 . + . ID=Transcript:AXL2;Parent=AXL2 +IX gbk2gff CDS 687 3158 . + . Parent=Transcript:AXL2 +IX gbk2gff exon 687 3158 . + . Parent=Transcript:AXL2 +IX gbk2gff gene 3300 4037 . - . ID=REV7;Name=REV7 +IX gbk2gff mRNA 3300 4037 . - . ID=Transcript:REV7;Parent=REV7 +IX gbk2gff CDS 3300 4037 . - . Parent=Transcript:REV7 +IX gbk2gff exon 3300 4037 . - . Parent=Transcript:REV7 |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/s_cerevisiae_SCU49845.gff3 --- a/test-data/s_cerevisiae_SCU49845.gff3 Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,8 +0,0 @@ -IX gbk_to_gff gene 687 3158 . + . ID=AXL2;Name=AXL2 -IX gbk_to_gff . 687 3158 . + . ID=Transcript:AXL2;Parent=AXL2 -IX gbk_to_gff CDS 687 3158 . + . Parent=Transcript:AXL2 -IX gbk_to_gff exon 687 3158 . + . Parent=Transcript:AXL2 -IX gbk_to_gff gene 3300 4037 . - . ID=REV7;Name=REV7 -IX gbk_to_gff . 3300 4037 . - . ID=Transcript:REV7;Parent=REV7 -IX gbk_to_gff CDS 3300 4037 . - . Parent=Transcript:REV7 -IX gbk_to_gff exon 3300 4037 . - . Parent=Transcript:REV7 |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 test-data/single_parent_feature_record.gff3 --- a/test-data/single_parent_feature_record.gff3 Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,10 +0,0 @@ -chr1 . miRNA_primary_transcript 1380242 1380467 . - . ID=MI0031047;Alias=MI0031047;Name=gma-MIR9754 -chr1 . miRNA 1380249 1380270 . - . ID=MIMAT0036385;Alias=MIMAT0036385;Name=gma-miR9754;Derives_from=MI0031047 -chr1 . miRNA_primary_transcript 2410094 2410318 . + . ID=MI0016507;Alias=MI0016507;Name=gma-MIR4367 -chr1 . miRNA 2410242 2410263 . + . ID=MIMAT0018266;Alias=MIMAT0018266;Name=gma-miR4367;Derives_from=MI0016507 -chr1 . miRNA_primary_transcript 4792375 4792487 . - . ID=MI0021714;Alias=MI0021714;Name=gma-MIR395h -chr1 . miRNA 4792388 4792408 . - . ID=MIMAT0024920;Alias=MIMAT0024920;Name=gma-miR395h;Derives_from=MI0021714 -chr1 . miRNA_primary_transcript 4797903 4798018 . - . ID=MI0021715;Alias=MI0021715;Name=gma-MIR395i -chr1 . miRNA 4797916 4797936 . - . ID=MIMAT0024921;Alias=MIMAT0024921;Name=gma-miR395i;Derives_from=MI0021715 -chr1 . miRNA_primary_transcript 4810817 4810942 . - . ID=MI0021716;Alias=MI0021716;Name=gma-MIR395j -chr1 . miRNA 4810830 4810850 . - . ID=MIMAT0024922;Alias=MIMAT0024922;Name=gma-miR395j;Derives_from=MI0021716 |
b |
diff -r d4f9b7beb52f -r 7d67331368f3 tool_conf.xml.sample --- a/tool_conf.xml.sample Thu Apr 23 17:51:14 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,9 +0,0 @@ -<section name="GFFtools" id="gfftools.web"> - <tool file="GFFtools-GX/gff_to_bed.xml"/> - <tool file="GFFtools-GX/bed_to_gff.xml"/> - <tool file="GFFtools-GX/gbk_to_gff.xml"/> - <tool file="GFFtools-GX/gff_to_gbk.xml"/> - <tool file="GFFtools-GX/gff_to_gtf.xml"/> - <tool file="GFFtools-GX/gtf_to_gff.xml"/> - <tool file="GFFtools-GX/gff_fmap.xml"/> -</section> |