Previous changeset 4:619e0fcd9126 (2014-06-11) Next changeset 6:154887a3d92f (2015-04-23) |
Commit message:
Uploaded |
added:
GFFParser.py README bed_to_gff.py bed_to_gff.xml gbk_to_gff.py gbk_to_gff.xml gff_fmap.py gff_fmap.xml gff_to_bed.py gff_to_bed.xml gff_to_gbk.py gff_to_gbk.xml gff_to_gtf.py gff_to_gtf.xml gffparser_bcbio.py gtf_to_gff.py gtf_to_gff.xml helper.py test-data/s_cerevisiae_SCU49845.gbk test-data/s_cerevisiae_SCU49845.gff3 test-data/single_parent_feature_record.gff3 tool_conf.xml.sample |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 GFFParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GFFParser.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
b'@@ -0,0 +1,495 @@\n+#!/usr/bin/env python\n+"""\n+Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.\n+\n+Requirements: \n+ Numpy :- http://numpy.org/ \n+ Scipy :- http://scipy.org/ \n+\n+Copyright (C)\t\n+\n+2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. \n+2012-2014 Memorial Sloan Kettering Cancer Center, New York City, USA.\n+"""\n+\n+import re\n+import os\n+import sys\n+import urllib\n+import numpy as np\n+import scipy.io as sio\n+from collections import defaultdict\n+import helper as utils \n+\n+def attribute_tags(col9):\n+ """ \n+ Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file \n+\n+ @args col9: attribute column from GFF file \n+ @type col9: str\n+ """\n+ info = defaultdict(list)\n+ is_gff = False\n+ \n+ if not col9:\n+ return is_gff, info\n+ \n+ # trim the line ending semi-colon ucsc may have some white-space \n+ col9 = col9.rstrip(\';| \')\n+ # attributes from 9th column \n+ atbs = col9.split(" ; ")\n+ if len(atbs) == 1:\n+ atbs = col9.split("; ")\n+ if len(atbs) == 1:\n+ atbs = col9.split(";")\n+ # check the GFF3 pattern which has key value pairs like:\n+ gff3_pat = re.compile("\\w+=")\n+ # sometime GTF have: gene_id uc002zkg.1;\n+ gtf_pat = re.compile("\\s?\\w+\\s")\n+\n+ key_vals = []\n+\n+ if gff3_pat.match(atbs[0]): # gff3 pattern \n+ is_gff = True\n+ key_vals = [at.split(\'=\') for at in atbs]\n+ elif gtf_pat.match(atbs[0]): # gtf pattern\n+ for at in atbs:\n+ key_vals.append(at.strip().split(" ",1))\n+ else:\n+ # to handle attribute column has only single value \n+ key_vals.append([\'ID\', atbs[0]])\n+ # get key, val items \n+ for item in key_vals:\n+ key, val = item\n+ # replace the double qoutes from feature identifier \n+ val = re.sub(\'"\', \'\', val)\n+ # replace the web formating place holders to plain text format \n+ info[key].extend([urllib.unquote(v) for v in val.split(\',\') if v])\n+\n+ 
return is_gff, info\n+ \n+def spec_features_keywd(gff_parts):\n+ """\n+ Specify the feature key word according to the GFF specifications\n+\n+ @args gff_parts: attribute field key \n+ @type gff_parts: str \n+ """\n+ for t_id in ["transcript_id", "transcriptId", "proteinId"]:\n+ try:\n+ gff_parts["info"]["Parent"] = gff_parts["info"][t_id]\n+ break\n+ except KeyError:\n+ pass\n+ for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:\n+ try:\n+ gff_parts["info"]["GParent"] = gff_parts["info"][g_id]\n+ break\n+ except KeyError:\n+ pass\n+ ## TODO key words\n+ for flat_name in ["Transcript", "CDS"]:\n+ if gff_parts["info"].has_key(flat_name):\n+ # parents\n+ if gff_parts[\'type\'] in [flat_name] or re.search(r\'transcript\', gff_parts[\'type\'], re.IGNORECASE):\n+ if not gff_parts[\'id\']:\n+ gff_parts[\'id\'] = gff_parts[\'info\'][flat_name][0]\n+ #gff_parts["info"]["ID"] = [gff_parts["id"]]\n+ # children \n+ elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",\n+ "coding_exon", "five_prime_UTR", "CDS", "stop_codon",\n+ "start_codon"]:\n+ gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]\n+ break\n+ return gff_parts\n+\n+def Parse(ga_file):\n+ """\n+ Parsing GFF/GTF file based on feature relationship, it takes the input file.\n+\n+ @args ga_file: input file name \n+ @type ga_file: str \n+ """\n+ child_map = defaultdict(list)\n+ parent_map = dict()\n+\n+ ga_handle = utils.open_file(ga_file)\n+\n+ for rec in ga_handle:\n+ rec = rec.strip(\'\\n\\r\')\n+ \n+ # skip empty line fasta identifier and commented line\n+ if not rec '..b'lete\'] = []\n+ gene[g_cnt][\'is_complete\'] = []\n+ gene[g_cnt][\'is_correctly_gff3_referenced\'] = \'\'\n+ gene[g_cnt][\'splicegraph\'] = []\n+ g_cnt += 1 \n+\n+ ## deleting empty gene records from the main array\n+ XPFLG=0\n+ for XP, ens in enumerate(gene):\n+ if ens[0]==0:\n+ XPFLG=1\n+ break\n+ \n+ if XPFLG==1:\n+ XQC = range(XP, len(gene)+1)\n+ gene = np.delete(gene, XQC)\n+\n+ return gene 
def NonetoemptyList(XS):
    """
    Hand back an empty list when the argument is None; any other value is
    returned unchanged.

    @args XS: value that may be None
    @type XS: object
    """
    if XS is None:
        return []
    return XS


def create_missing_feature_type(p_feat, c_feat):
    """
    Some GFF/GTF files list only child features. This function synthesizes
    the missing gene and transcript level records from the values stored
    with each child feature.

    example:
    chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
    chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
    chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2";

    For every key of c_feat one 'gene' entry is written into p_feat, and a
    new child map receives one transcript entry (filed under the gene key)
    plus a copy of every child (filed under the transcript key).

    @args p_feat: Parent feature map, updated in place
    @type p_feat: dict
    @args c_feat: Child feature map; each key is a tuple whose first two
                  items are reused in the new keys and whose last item is
                  taken as the transcript id
    @type c_feat: collections defaultdict
    @return: the updated p_feat and the newly built child map
    """
    regrouped = defaultdict(list)

    for feat_key, members in c_feat.items():
        # values are overwritten on every pass, so the last child wins
        gene_name = strand = score = None
        starts = []
        ends = []
        seen_types = set()

        for member in members:
            gene_name = member.get('gene_id', [''])[0]
            starts.append(member.get('location', [])[0])
            ends.append(member.get('location', [])[1])
            strand = member.get('strand', '')
            score = member.get('score', '')
            member_type = member.get('type', '')
            # gencode GTF files carry 'gene' rows among the children;
            # they do not count when inferring the transcript type
            if member_type != "gene":
                seen_types.add(member_type)

        starts = sorted(starts)
        ends = sorted(ends)

        # a coding child promotes the transcript to mRNA
        if 'CDS' in seen_types or 'cds' in seen_types:
            tx_type = 'mRNA'
        else:
            tx_type = 'transcript'

        # keep gene and transcript identifiers distinct
        tx_id = feat_key[-1]
        if gene_name == tx_id:
            tx_id = 'Transcript:' + str(gene_name)

        gene_key = (feat_key[0], feat_key[1], gene_name)

        # gene level record; its location is left empty here
        p_feat[gene_key] = dict(type='gene',
                                location=[],
                                strand=strand,
                                name=gene_name)

        # transcript level record spanning smallest start to largest end
        regrouped[gene_key].append(
            dict(type=tx_type,
                 location=[starts[0], ends[-1]],
                 strand=strand,
                 score=score,
                 ID=tx_id,
                 gene_id=''))

        # re-file every child under the transcript key
        tx_key = (feat_key[0], feat_key[1], tx_id)
        for member in members:
            regrouped[tx_key].append(
                dict(type=member.get('type', ''),
                     location=member.get('location'),
                     strand=member.get('strand'),
                     ID=member.get('ID'),
                     score=member.get('score'),
                     gene_id=''))

    return p_feat, regrouped
b |
diff -r 619e0fcd9126 -r 6e589f267c14 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Nov 04 12:15:19 2014 -0500 |
[ |
@@ -0,0 +1,61 @@ +A collection of tools for converting genome annotation between GTF (Gene Transfer Format), +BED (Browser Extensible Data) and GFF (Generic Feature Format). + +INTRODUCTION + +Several genome annotation centers provide their data in GTF, BED, GFF3 etc. I have a few programs +that mainly deal with converting between GTF, BED and GFF3 formats. They are extensively tested +with files from different centers like ENSEMBL, UCSC, JGI and NCBI AceView. Please follow the +instructions below to clone these tools into your galaxy instance. + +CONTENTS + +Tool configuration files in *.xml format. + + gtf_to_gff.xml + gff_to_gtf.xml + bed_to_gff.xml + gff_to_bed.xml + gbk_to_gff.xml + gff_fmap.xml + +Python based scripts. + + gtf_to_gff.py: convert data from GTF to valid GFF3. + gff_to_gtf.py: convert data from GFF3 to GTF. + bed_to_gff.py: convert data from a 12 column UCSC wiggle BED format to GFF3. + gff_to_bed.py: convert gene transcript annotation from GFF3 to UCSC wiggle 12 column BED format. + gbk_to_gff.py: convert data from genbank format to GFF. + gff_fmap.py: find the relation between different features described in a GFF file. + GFFParser.py: Parse GFF/GTF files. + helper.py: Utility functions. + +test-data: Test data set. (move to your galaxy_root_folder/test-data/) + + You may need to move the test files into your test-data directory so galaxy can find them. + The functional tests can then be run as shown below: + + example: + sh run_functional_tests.sh -id fml_gtf2gff + +REQUIREMENTS + + python + +COMMENTS/QUESTIONS + +I can be reached at vipin [at] cbio.mskcc.org + +LICENSE + +Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society + 2013-2014 Memorial Sloan Kettering Cancer Center + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. 
+ +COURTESY + +To the Galaxy Team. |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 bed_to_gff.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bed_to_gff.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
#!/usr/bin/env python
"""
Convert genome annotation data in a 12 column BED format to GFF3.

Usage: python bed_to_gff.py in.bed > out.gff

Requirement:
    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py

Copyright (C)
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
"""

import re
import sys
import helper


def _split_blocks(field):
    """Split a comma separated BED block column, dropping the empty item left by a trailing comma."""
    values = field.split(',')
    if values[-1] == '':
        values.pop()
    return values


def _bed_record_to_gff(parts):
    """
    Turn the columns of one BED12 record into GFF3 rows.

    @args parts: tab separated columns of a BED line (at least 12)
    @type parts: list
    @return: gene, transcript and exon rows; an empty list when the block
             columns (10, 11, 12) disagree with each other
    """
    # Columns are addressed by position, not from the end of the line, so
    # records carrying extra columns after the 12th are still read correctly.
    rstarts = _split_blocks(parts[11])
    exon_lens = _split_blocks(parts[10])

    if len(rstarts) != len(exon_lens):
        return []  # col 11 and col 12 are inconsistent

    if len(rstarts) != int(parts[9]):
        return []  # block count does not match the number of blocks

    chrom, name, score = parts[0], parts[3], parts[4]
    strand = parts[5] if parts[5] in ('+', '-') else '.'  # unknown strand becomes '.'
    chrom_start = int(parts[1])  # BED is 0-based, GFF is 1-based

    rows = [
        '%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s'
        % (chrom, chrom_start + 1, parts[2], score, strand, name, name),
        '%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s'
        % (chrom, chrom_start + 1, parts[2], score, strand, name, name, name),
    ]

    for rel_start, length in zip(rstarts, exon_lens):
        start = chrom_start + int(rel_start) + 1
        stop = start + int(length) - 1
        rows.append('%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s'
                    % (chrom, start, stop, score, strand, name))
    return rows


def __main__():
    """
    Read the BED file named on the command line and print GFF3 to stdout.

    Empty lines, '#' comments and UCSC 'track'/'browser' header lines are
    skipped. A data line with fewer than 12 columns raises ValueError.
    """

    try:
        bed_fname = sys.argv[1]
    except IndexError:
        print(__doc__)
        sys.exit(-1)

    bed_fh = helper.open_file(bed_fname)

    try:
        for line in bed_fh:
            line = line.strip('\n\r')

            if not line or line[0] == '#':
                continue

            # display directives of UCSC custom tracks are not annotation
            if line.startswith(('track ', 'browser ')):
                continue

            parts = line.split('\t')
            if len(parts) < 12:
                # an explicit check survives `python -O`, unlike assert
                raise ValueError('expected at least 12 tab separated BED columns: %r' % line)

            for row in _bed_record_to_gff(parts):
                print(row)
    finally:
        # release the handle even when a malformed line aborts the run
        bed_fh.close()


if __name__ == "__main__":
    __main__()
b |
diff -r 619e0fcd9126 -r 6e589f267c14 bed_to_gff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bed_to_gff.xml Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,89 @@ +<tool id="fml_bed2gff" name="BED-to-GFF" version="2.0.0"> + <description>converter</description> + <command interpreter="python">bed_to_gff.py $inf_bed > $gff_format + </command> + <inputs> + <param format="bed" name="inf_bed" type="data" label="Convert this query" help="Provide genome annotation in 12 column BED format."/> + </inputs> + <outputs> + <data format="gff3" name="gff_format" label="${tool.name} on ${on_string}: Converted" /> + </outputs> + <tests> + <test> + <param name="inf_bed" value="ccds_genes.bed" /> + <output name="gff_format" file="ccds_genes.gff3" /> + </test> + <test> + <param name="inf_bed" value="hs_2009.bed" /> + <output name="gff_format" file="hs_2009.gff3" /> + </test> + </tests> + <help> + +**What it does** + +This tool converts data from a 12 column UCSC wiggle BED format to GFF3 (scroll down for format description). + +-------- + +**Example** + +- The following data in UCSC Wiggle BED format:: + + chr1 11873 14409 uc001aaa.3 0 + 11873 11873 0 3 354,109,1189, 0,739,1347, + +- Will be converted to GFF3:: + + ##gff-version 3 + chr1 bed2gff gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3 + chr1 bed2gff transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3 + chr1 bed2gff exon 11874 12227 0 + . Parent=uc001aaa.3 + chr1 bed2gff exon 12613 12721 0 + . Parent=uc001aaa.3 + chr1 bed2gff exon 13221 14409 0 + . Parent=uc001aaa.3 + +-------- + +**About formats** + +**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: + +The first three BED fields (required) are:: + + 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). + 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) + 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). 
+ +The additional BED fields (optional) are:: + + 4. name - The name of the BED line. + 5. score - A score between 0 and 1000. + 6. strand - Defines the strand - either '+' or '-'. + 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. + 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. + 9. reserved - This should always be set to zero. + 10. blockCount - The number of blocks (exons) in the BED line. + 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. + 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold or contig. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. 
+ +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + </help> +</tool> |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gbk_to_gff.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gbk_to_gff.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
#!/usr/bin/env python
"""
Convert data from Genbank format to GFF.

Usage:
python gbk_to_gff.py in.gbk > out.gff

Requirements:
    BioPython:- http://biopython.org/
    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py

Copyright (C)
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
"""

import os
import re
import sys
import collections
from Bio import SeqIO
import helper


def _feature_span(intervals):
    """Return (smallest start, largest end) over a non-empty list of (start, end) tuples."""
    return min(iv[0] for iv in intervals), max(iv[-1] for iv in intervals)


def _first_qualifier(feature, key, default):
    """Return the first value of a feature qualifier, or default when it is absent or empty."""
    values = feature.qualifiers.get(key)
    return values[0] if values else default


def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk):
    """
    Write the feature information of one record to stdout as GFF rows.

    @args chr_id: value for the seqid column
    @args source: used as the feature type of a synthesized transcript row
    @args orient: strand used when the record has no gene feature
    @args genes: gene id -> (start, stop, strand, feature type)
    @args transcripts: gene id -> list of (start, stop, transcript id, feature type)
    @args cds: gene id -> list of (start, stop)
    @args exons: gene id -> list of (start, stop)
    @args unk: running counter used to name genes of gene-less records
    @return: the counter, incremented when an 'Unknown_Gene' was written
    """

    for gname, ginfo in genes.items():
        gene_id = str(gname)
        line = [str(chr_id),
                'gbk_to_gff',
                ginfo[3],
                str(ginfo[0]),
                str(ginfo[1]),
                '.',
                ginfo[2],
                '.',
                'ID=%s;Name=%s' % (gene_id, gene_id)]
        print('\t'.join(line))

        # .get() keeps a missing gene from inserting keys into the defaultdicts
        gene_exons = exons.get(gname, [])
        gene_cds = cds.get(gname, [])

        if not transcripts:
            ## construct the transcript line, it is not defined in the original file
            if gene_exons:  ## span of all exons, not only the first one
                t_start, t_stop = _feature_span(gene_exons)
            elif gene_cds:
                t_start, t_stop = _feature_span(gene_cds)
            else:
                # nothing below the gene: reuse the gene coordinates
                # (the int placeholders used before made join() fail)
                t_start, t_stop = ginfo[0], ginfo[1]

            tx_id = 'Transcript:' + gene_id
            t_line = [str(chr_id), 'gbk_to_gff', source, str(t_start), str(t_stop),
                      '.', ginfo[2], '.',
                      'ID=%s;Parent=%s' % (tx_id, gene_id)]
            print('\t'.join(t_line))

            if exons:
                exon_line_print(t_line, gene_exons, tx_id, 'exon')

            if cds:
                exon_line_print(t_line, gene_cds, tx_id, 'CDS')
                if not exons:
                    exon_line_print(t_line, gene_cds, tx_id, 'exon')

        else:  ## transcript is defined
            for idx in transcripts.get(gname, []):
                # a fresh row per transcript; appending to a shared list
                # added one more column for every further transcript
                t_line = [str(chr_id), 'gbk_to_gff', idx[3], str(idx[0]), str(idx[1]),
                          '.', ginfo[2], '.',
                          'ID=' + str(idx[2]) + ';Parent=' + gene_id]
                print('\t'.join(t_line))

                ## feature line print call
                if exons:
                    exon_line_print(t_line, gene_exons, str(idx[2]), 'exon')
                if cds:
                    exon_line_print(t_line, gene_cds, str(idx[2]), 'CDS')
                    if not exons:
                        exon_line_print(t_line, gene_cds, str(idx[2]), 'exon')

    if len(genes) == 0:  ## feature entry with fragment information

        # the last non-empty group decides the span; exons override CDS
        span = None
        for intervals in list(cds.values()) + list(exons.values()):
            if intervals:
                span = _feature_span(intervals)

        if span is not None:
            gene_id = 'Unknown_Gene_' + str(unk)
            tx_id = 'Unknown_Transcript_' + str(unk)

            line = [str(chr_id), 'gbk_to_gff', 'gene', str(span[0]), str(span[1]),
                    '.', orient, '.',
                    'ID=' + gene_id + ';Name=' + gene_id]
            print("\t".join(line))

            line[2] = 'mRNA' if cds else 'transcript'
            line[8] = 'ID=' + tx_id + ';Parent=' + gene_id
            print("\t".join(line))

            # features without a /gene qualifier are filed under the key None
            if exons:
                # exon rows come from the exon map (they used to be read from cds)
                exon_line_print(line, exons.get(None, []), tx_id, 'exon')

            if cds:
                exon_line_print(line, cds.get(None, []), tx_id, 'CDS')
                if not exons:
                    exon_line_print(line, cds.get(None, []), tx_id, 'exon')

            unk += 1

    return unk


def exon_line_print(temp_line, trx_exons, parent, ftype):
    """
    Print one GFF row per (start, stop) pair of a transcript.

    @args temp_line: row whose first eight columns supply seqid, source,
                     score, strand and phase; it is not modified
    @args trx_exons: list of (start, stop) tuples
    @args parent: id written into the Parent attribute
    @args ftype: feature type of the printed rows, e.g. 'exon' or 'CDS'
    """

    for ex in trx_exons:
        row = list(temp_line[:8]) + ['Parent=%s' % parent]
        row[2] = ftype
        row[3] = str(ex[0])
        row[4] = str(ex[1])
        print('\t'.join(row))


def gbk_parse(fname):
    """
    Extract genome annotation records from genbank format

    @args fname: gbk file name
    @type fname: str
    """

    # open the file that was passed in, not the script level global
    fhand = helper.open_file(fname)
    unk = 1

    try:
        for record in SeqIO.parse(fhand, "genbank"):

            gene_tags = dict()
            tx_tags = collections.defaultdict(list)
            exon = collections.defaultdict(list)
            cds = collections.defaultdict(list)
            # defaults used when the record has no 'source' feature
            mol_type, chr_id = '.', record.name
            strand = '.'

            for rec in record.features:

                if rec.type == 'source':
                    mol_type = _first_qualifier(rec, 'mol_type', '.')
                    chr_id = _first_qualifier(rec, 'chromosome', record.name)
                    continue

                # a missing strand is reported as '-', as before
                strand = '+' if (rec.strand is not None and rec.strand > 0) else '-'

                fid = _first_qualifier(rec, 'gene', None)
                transcript_id = _first_qualifier(rec, 'transcript_id', None)

                # NOTE(review): _start/_end are private Biopython attributes;
                # confirm they exist in the installed Biopython version
                if re.search(r'gene', rec.type):
                    gene_tags[fid] = (rec.location._start.position+1,
                                      rec.location._end.position,
                                      strand,
                                      rec.type
                                      )
                elif rec.type == 'exon':
                    exon[fid].append((rec.location._start.position+1,
                                      rec.location._end.position))
                elif rec.type == 'CDS':
                    cds[fid].append((rec.location._start.position+1,
                                     rec.location._end.position))
                else:
                    # get all transcripts
                    if transcript_id:
                        tx_tags[fid].append((rec.location._start.position+1,
                                             rec.location._end.position,
                                             transcript_id,
                                             rec.type))
            # record extracted, generate feature table
            unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
    finally:
        fhand.close()


if __name__=='__main__':

    try:
        gbkfname = sys.argv[1]
    except IndexError:
        print(__doc__)
        sys.exit(-1)

    ## extract gbk records
    gbk_parse(gbkfname)
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gbk_to_gff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gbk_to_gff.xml Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,91 @@ +<tool id="fml_gbk2gff" name="GBK-to-GFF" version="2.0.0"> + <description>converter</description> + <command interpreter="python">gbk_to_gff.py $inf_gbk > $gff_format + </command> + <inputs> + <param format="gb,gbk,genbank,txt" name="inf_gbk" type="data" label="Convert this query" help="GenBank flat file format consists of an annotation section and a sequence section."/> + </inputs> + <outputs> + <data format="gff3" name="gff_format" label="${tool.name} on ${on_string}: Converted"/> + </outputs> + <tests> + <test> + <param name="inf_gbk" value="s_cerevisiae_SCU49845.gbk" /> + <output name="gff_format" file="s_cerevisiae_SCU49845.gff3" /> + </test> + </tests> + <help> + +**What it does** + +This tool converts data from a GenBank_ flat file format to GFF (scroll down for format description). + +.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ + +------ + +**Example** + +- The following data in GenBank format:: + + LOCUS NM_001202705 2406 bp mRNA linear PLN 28-MAY-2011 + DEFINITION Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) + mRNA, complete cds. + ACCESSION NM_001202705 + VERSION NM_001202705.1 GI:334184566......... + FEATURES Location/Qualifiers + source 1..2406 + /organism="Arabidopsis thaliana" + /mol_type="mRNA" + /db_xref="taxon:3702"........ + gene 1..2406 + /gene="THIC" + /locus_tag="AT2G29630" + /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........ + ORIGIN + 1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg + 61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat + 121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc + 181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca + // + + +- Will be converted to GFF3:: + + ##gff-version 3 + NM_001202705 gbk_to_gff chromosome 1 2406 . + 1 ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705 + NM_001202705 gbk_to_gff gene 1 2406 . 
+ 1 ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC + NM_001202705 gbk_to_gff mRNA 192 2126 . + 1 ID=AT2G29630.t01;Parent=AT2G29630 + NM_001202705 gbk_to_gff CDS 192 2126 . + 1 ID=AT2G29630.p01;Parent=AT2G29630.t01 + NM_001202705 gbk_to_gff exon 192 2126 . + 1 Parent=AT2G29630.t01 + +------ + +**About formats** + +**GenBank format** An example of a GenBank record may be viewed here_ + +.. _here: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html + +**GFF3** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold or contig. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + </help> +</tool> |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gff_fmap.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_fmap.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
b'@@ -0,0 +1,203 @@\n+#!/usr/bin/env python\n+"""\n+GFF feature mapping program, to find the relation between different features described in a given GFF file. \n+\n+Usage: \n+python gff_fmap.py in.gff > out.txt \n+\n+Courtesy: Brad Chapman \n+ Few functions are inherited from bcbio-GFFParser program. \n+"""\n+\n+import re\n+import sys \n+import urllib\n+import collections\n+from helper import open_file\n+\n+def _gff_line_map(line):\n+ """Parses a line of GFF into a dictionary.\n+ Given an input line from a GFF file, this:\n+ - breaks it into component elements\n+ - determines the type of attribute (flat, parent, child or annotation)\n+ - generates a dictionary of GFF info \n+ """\n+ gff3_kw_pat = re.compile("\\w+=")\n+ def _split_keyvals(keyval_str):\n+ """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.\n+ GFF3 has key value pairs like:\n+ count=9;gene=amx-2;sequence=SAGE:aacggagccg\n+ GFF2 and GTF have: \n+ Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"\n+ name "fgenesh1_pg.C_chr_1000003"; transcriptId 869\n+ """\n+ quals = collections.defaultdict(list)\n+ if keyval_str is None:\n+ return quals\n+ # ensembl GTF has a stray semi-colon at the end\n+ if keyval_str[-1] == \';\':\n+ keyval_str = keyval_str[:-1]\n+ # GFF2/GTF has a semi-colon with at least one space after it.\n+ # It can have spaces on both sides; wormbase does this.\n+ # GFF3 works with no spaces.\n+ # Split at the first one we can recognize as working\n+ parts = keyval_str.split(" ; ")\n+ if len(parts) == 1:\n+ parts = keyval_str.split("; ")\n+ if len(parts) == 1:\n+ parts = keyval_str.split(";")\n+ # check if we have GFF3 style key-vals (with =)\n+ is_gff2 = True\n+ if gff3_kw_pat.match(parts[0]):\n+ is_gff2 = False\n+ key_vals = [p.split(\'=\') for p in parts]\n+ # otherwise, we are separated by a space with a key as the first item\n+ else:\n+ pieces = []\n+ for p in parts:\n+ # fix misplaced semi-colons in keys in some GFF2 files\n+ if p and p[0] == \';\':\n+ p = p[1:]\n+ 
pieces.append(p.strip().split(" "))\n+ key_vals = [(p[0], " ".join(p[1:])) for p in pieces]\n+ for key, val in key_vals:\n+ # remove quotes in GFF2 files\n+ if (len(val) > 0 and val[0] == \'"\' and val[-1] == \'"\'):\n+ val = val[1:-1] \n+ if val:\n+ quals[key].extend(val.split(\',\'))\n+ # if we don\'t have a value, make this a key=True/False style\n+ # attribute\n+ else:\n+ quals[key].append(\'true\')\n+ for key, vals in quals.items():\n+ quals[key] = [urllib.unquote(v) for v in vals]\n+ return quals, is_gff2\n+\n+ def _nest_gff2_features(gff_parts):\n+ """Provide nesting of GFF2 transcript parts with transcript IDs.\n+\n+ exons and coding sequences are mapped to a parent with a transcript_id\n+ in GFF2. This is implemented differently at different genome centers\n+ and this function attempts to resolve that and map things to the GFF3\n+ way of doing them.\n+ """\n+ # map protein or transcript ids to a parent\n+ for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:\n+ try:\n+ gff_parts["quals"]["Parent"] = \\\n+ gff_parts["quals"][transcript_id]\n+ break\n+ except KeyError:\n+ pass\n+ # case for WormBase GFF -- everything labelled as Transcript or CDS\n+ for flat_name in ["Transcript", "CDS"]:\n+ if gff_parts["quals"].has_key(flat_name):\n+ # parent types\n+ if gff_parts["type"] in [flat_name]:\n+ if not gff_parts["id"]:\n+ '..b' break\n+\n+ return gff_parts\n+\n+ line = line.strip()\n+ if line == \'\':return [(\'directive\', line)] # sometimes the blank lines will be there \n+ if line[0] == \'>\':return [(\'directive\', \'\')] # sometimes it will be a FATSA header\n+ if line[0] == "#":\n+ return [(\'directive\', line[2:])]\n+ elif line:\n+ parts = line.split(\'\\t\')\n+ if len(parts) == 1 and re.search(r\'\\w+\', parts[0]):return [(\'directive\', \'\')] ## GFF files with FASTA sequence together \n+ assert len(parts) == 9, line\n+ gff_parts = [(None if p == \'.\' else p) for p in parts]\n+ gff_info = dict()\n+ \n+ # collect all of the base 
qualifiers for this item\n+ quals, is_gff2 = _split_keyvals(gff_parts[8])\n+\n+ gff_info["is_gff2"] = is_gff2\n+\n+ if gff_parts[1]:quals["source"].append(gff_parts[1])\n+ gff_info[\'quals\'] = dict(quals)\n+\n+ # if we are describing a location, then we are a feature\n+ if gff_parts[3] and gff_parts[4]:\n+ gff_info[\'type\'] = gff_parts[2]\n+ gff_info[\'id\'] = quals.get(\'ID\', [\'\'])[0]\n+ \n+ if is_gff2:gff_info = _nest_gff2_features(gff_info)\n+ # features that have parents need to link so we can pick up\n+ # the relationship\n+ if gff_info[\'quals\'].has_key(\'Parent\'):\n+ final_key = \'child\'\n+ elif gff_info[\'id\']:\n+ final_key = \'parent\'\n+ # Handle flat features\n+ else:\n+ final_key = \'feature\'\n+ # otherwise, associate these annotations with the full record\n+ else:\n+ final_key = \'annotation\'\n+ return [(final_key, gff_info)]\n+ \n+def parent_child_id_map(gff_handle):\n+ """\n+ Provide a mapping of parent to child relationships in the file.\n+ Gives a dictionary of parent child relationships:\n+\n+ keys -- tuple of (source, type) for each parent\n+ values -- tuple of (source, type) as children of that parent\n+ """\n+ # collect all of the parent and child types mapped to IDs\n+ parent_sts = dict()\n+ child_sts = collections.defaultdict(list)\n+ for line in gff_handle:\n+ line_type, line_info = _gff_line_map(line)[0]\n+ if (line_type == \'parent\' or (line_type == \'child\' and line_info[\'id\'])):\n+ parent_sts[line_info[\'id\']] = (line_info[\'quals\'][\'source\'][0], line_info[\'type\'])\n+ if line_type == \'child\':\n+ for parent_id in line_info[\'quals\'][\'Parent\']:\n+ child_sts[parent_id].append((line_info[\'quals\'][\'source\'][0], line_info[\'type\']))\n+ gff_handle.close()\n+ # generate a dictionary of the unique final type relationships\n+ pc_map = collections.defaultdict(list)\n+ for parent_id, parent_type in parent_sts.items():\n+ for child_type in child_sts[parent_id]:\n+ pc_map[parent_type].append(child_type)\n+ pc_final_map = 
dict()\n+ for ptype, ctypes in pc_map.items():\n+ unique_ctypes = list(set(ctypes))\n+ unique_ctypes.sort()\n+ pc_final_map[ptype] = unique_ctypes\n+ # some cases the GFF file represents a single feature type \n+ if not pc_final_map:\n+ for fid, stypes in parent_sts.items():\n+ pc_final_map[stypes] = dict()\n+ # generate a report on feature id mapping in the file \n+ print \'+---------------------+---------------------------------+\'\n+ print \'| Parent feature type | Associated child feature type(s)|\'\n+ print \'+---------------------+---------------------------------+\'\n+ for key, value in pc_final_map.items():\n+ print key[0], key[1]\n+ for child_to in value:\n+ print \'\\t\\t\\t|-\',child_to[0], child_to[1]\n+ print \'+---------------------+---------------------------------+\'\n+\n+\n+if __name__==\'__main__\':\n+\n+ try:\n+ gff_file = sys.argv[1]\n+ except:\n+ print __doc__\n+ sys.exit(-1)\n+ \n+ gff_handle = open_file(gff_file)\n+ parent_child_id_map(gff_handle)\n' |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gff_fmap.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_fmap.xml Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,83 @@ +<tool id="fml_gfffmap" name="GFF-map" version="2.0.0"> + <description>features</description> + <command interpreter="python"> + gff_fmap.py $gff_input > $idmapping + </command> + <inputs> + <param format="gff3,gff" name="gff_input" type="data" label="Query file" help="Provide genome annotation file in GFF."/> + </inputs> + <outputs> + <data format="txt" name="idmapping" label="${tool.name} on ${on_string}: parent child id map"/> + </outputs> + <tests> + <test> + <param name="gff_input" value="Feature_ID_mapping_W.gff3" /> + <output name="idmapping" file="Feature_ID_mapping_W.txt" /> + </test> + <test> + <param name="gff_input" value="Aly_JGI.gff3" /> + <output name="idmapping" file="Feature_ID_mapping_R.txt" /> + </test> + </tests> + <help> + +**What it does** + +GFF-map provides the features (gene, mRNA, UTR's, exon, CDS etc) relationship based on their identifier mapping in a given GFF file. + +-------- + +**Example** + +- The features ID mapping in the following data in GFF3:: + + ##gff-version 3 + 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11 + 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859 + 17 protein_coding protein 7256262 7256960 . + 0 ID=ENSP00000328352;Name=ENSP00000328352 + 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751 + 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352 + 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751 + 17 protein_coding exon 7255208 7258258 . + . 
Parent=ENST00000333751 + +- Will be displayed as:: + + +-----------------------+---------------------------------+ + | Parent feature type | Associated child feature type(s)| + +-----------------------+---------------------------------+ + | protein_coding gene | protein_coding mRNA | + +-----------------------+---------------------------------+ + | protein_coding protein| protein_coding CDS | + +-----------------------+---------------------------------+ + | protein_coding mRNA | protein_coding CDS | + | | protein_coding exon | + | | protein_coding five_prime_UTR | + | | protein_coding three_prime_UTR | + +-----------------------+---------------------------------+ + +-------- + +**About formats** + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. 
Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + +</help> +</tool> |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_bed.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_bed.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
#!/usr/bin/env python
"""
Convert genome annotation data in GFF/GTF to a 12 column BED format.
BED format typically represents the transcript models.

Usage: python gff_to_bed.py in.gff > out.bed

Requirement:
    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py

Copyright (C)
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
"""

import re
import sys


def _bed_score(ent1):
    """Return the BED score column of a gene record, '0' when no score is stored."""
    return str(ent1['score'][0]) if ent1['score'] else '0'


def writeBED(tinfo):
    """
    Write the gene models to standard output in 12 column BED format.

    One line is printed per transcript that has exons. A gene record
    without any transcript is printed as a single-block line built from
    the gene's own start/stop.

    GFF coordinates are 1-based and inclusive while BED is 0-based and
    half-open: every start is therefore shifted down by one, ends are kept,
    and a block size stays ``end - start + 1``.

    @args tinfo: gene records as returned by GFFParser.Parse
    @type tinfo: numpy object
    """
    for ent1 in tinfo:
        child_flag = False

        for idx, tid in enumerate(ent1['transcripts']):
            child_flag = True
            exons = ent1['exons'][idx]
            # a transcript without exon coordinates cannot be drawn as blocks
            if not len(exons):
                continue

            # 1-based inclusive start -> 0-based start; the end is unchanged
            rel_start = int(exons[0][0]) - 1
            rel_stop = int(exons[-1][1])
            exon_len = ''.join('%d,' % (ex_cod[1] - ex_cod[0] + 1) for ex_cod in exons)
            # block starts are offsets from chromStart, so the first one is 0
            exon_cod = ''.join('%d,' % (ex_cod[0] - 1 - rel_start) for ex_cod in exons)

            out_print = [ent1['chr'],
                         str(rel_start),
                         str(rel_stop),
                         tid[0],
                         _bed_score(ent1),
                         ent1['strand'],
                         str(rel_start),
                         str(rel_stop),
                         '0',
                         str(len(exons)),
                         exon_len,
                         exon_cod]
            print('\t'.join(out_print))

        if not child_flag:
            # file just contains only a single parent type i.e, gff3 defines
            # only one feature type: report the feature itself as one block
            start = int(ent1['start']) - 1
            stop = int(ent1['stop'])

            out_print = [ent1['chr'],
                         '%d' % start,
                         '%d' % stop,
                         ent1['name'],
                         _bed_score(ent1),
                         ent1['strand'],
                         '%d' % start,
                         '%d' % stop,
                         '0',
                         '1',
                         '%d,' % (stop - start),
                         '0,']
            print('\t'.join(out_print))


def __main__():
    """
    Parse the annotation file named on the command line and print it as BED.
    """
    try:
        query_file = sys.argv[1]
    except IndexError:
        # no input file given: show the usage text
        print(__doc__)
        sys.exit(-1)

    # imported here so that writeBED can be used without pulling in the
    # parser module and its numpy/scipy requirements
    import GFFParser

    Transcriptdb = GFFParser.Parse(query_file)
    writeBED(Transcriptdb)


if __name__ == "__main__":
    __main__()
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_bed.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_bed.xml Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,90 @@ +<tool id="fml_gff2bed" name="GFF-to-BED" version="2.0.0"> + <description>converter</description> + <command interpreter="python">gff_to_bed.py $inf_gff > $bed_format + </command> + <inputs> + <param format="gtf,gff,gff3" name="inf_gff" type="data" label="Convert this query" help="Provide genome annotation file in GFF, GTF, GFF3."/> + </inputs> + <outputs> + <data format="bed" name="bed_format" label="${tool.name} on ${on_string}: Converted" /> + </outputs> + <tests> + <test> + <param name="inf_gff" value="Aly_JGI.gff3" /> + <output name="bed_format" file="Aly_JGI.bed" /> + </test> + <test> + <param name="inf_gff" value="MB7_3R.gff3" /> + <output name="bed_format" file="MB7_3R.bed" /> + </test> + </tests> + <help> + +**What it does** + +This tool converts gene transcript annotation from GTF or GFF or GFF3 to UCSC wiggle 12 column BED format. + +-------- + +**Example** + +- The following data in GFF3:: + + ##gff-version 3 + chr1 protein_coding gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3 + chr1 protein_coding transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3 + chr1 protein_coding exon 11874 12227 0 + . Parent=uc001aaa.3 + chr1 protein_coding exon 12613 12721 0 + . Parent=uc001aaa.3 + chr1 protein_coding exon 13221 14409 0 + . Parent=uc001aaa.3 + +- Will be converted to UCSC Wiggle BED format:: + + chr1 11874 14409 uc001aaa.3 0 + 11874 14409 0 3 354,109,1189, 0,739,1347, + +-------- + +**About formats** + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + + 1. seqid - Must be a chromosome or scaffold or contig. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. 
The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: + +The first three BED fields (required) are:: + + 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). + 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) + 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). + +The additional BED fields (optional) are:: + + 4. name - The name of the BED line. + 5. score - A score between 0 and 1000. + 6. strand - Defines the strand - either '+' or '-'. + 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. + 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. + 9. reserved - This should always be set to zero. + 10. blockCount - The number of blocks (exons) in the BED line. + 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. + 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. 
+ +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + </help> +</tool> |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gbk.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_gbk.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
#!/usr/bin/env python
"""
Convert data from GFF and associated genome sequence in fasta file into GenBank.

Usage:
python gff_to_gbk.py in.gff in.fasta out.gbk

Requirements:
    BioPython:- http://biopython.org/
    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py

Copyright (C)
    2010-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
"""

import sys
import helper
import gffparser_bcbio

from Bio import SeqIO
from Bio.Alphabet import generic_dna


def __main__():
    """
    main wrapper

    Reads the GFF file and the FASTA file named on the command line and
    writes the combined records to the third argument in GenBank format.
    Exits with -1 when arguments are missing or the output file cannot be
    opened.
    """
    try:
        gff_fname = sys.argv[1]
        fasta_fname = sys.argv[2]
        gb_fname = sys.argv[3]
    except IndexError:
        # fewer than three arguments: show the usage text
        print(__doc__)
        sys.exit(-1)

    fasta_fh = helper.open_file(fasta_fname)
    try:
        # to_dict consumes the whole parser, so the handle can be closed after
        fasta_rec = SeqIO.to_dict(SeqIO.parse(fasta_fh, "fasta", generic_dna))
    finally:
        fasta_fh.close()

    gff_rec = gffparser_bcbio.parse(gff_fname, fasta_rec)

    try:
        gb_fh = open(gb_fname, "w")
    except IOError:
        print('file not ready for writing %s' % gb_fname)
        sys.exit(-1)

    try:
        SeqIO.write(gff_rec, gb_fh, "genbank")
    finally:
        # close the output even when writing a record fails
        gb_fh.close()


if __name__ == "__main__":
    __main__()
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gbk.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_gbk.xml Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,98 @@ +<tool id="fml_gff2gbk" name="GFF-to-GBK" version="2.0.0"> + <description>converter</description> + <command interpreter="python">gff_to_gbk.py $inf_gff $inf_fas $gbk_format + </command> + <inputs> + <param format="gff,gff3" name="inf_gff" type="data" label="Convert this query" help="Genome annotation in GFF file format."/> + <param format="fa,fasta" name="inf_fas" type="data" label="Genome Sequence" help="Genome sequence in FASTA format."/> + </inputs> + <outputs> + <data format="genbank" name="gbk_format" label="${tool.name} on ${on_string}: Converted"/> + </outputs> + <tests> + <test> + <param name="inf_gff" value="s_cerevisiae_SCU49845.gff3" /> + <param name="inf_fas" value="s_cerevisiae_SCU49845.fasta" /> + <output name="gbk_format" file="s_cerevisiae_SCU49845.gbk" /> + </test> + </tests> + <help> + +**What it does** + +This tool converts annotations in GFF to GenBank_ format (scroll down for format description). + +.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ + +------ + +**Example** + +- The following data in GFF:: + + ##gff-version 3 + # sequence-region NM_001202705 1 2406 + NM_001202705 GenBank chromosome 1 2406 . + 1 ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705;Note=Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) mRNA%2C complete cds.,REVIEWED REFSEQ; + NM_001202705 GenBank gene 1 2406 . + 1 ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC;locus_tag=AT2G29630 + NM_001202705 GenBank mRNA 192 2126 . + 1 ID=AT2G29630.t01;Parent=AT2G29630 + NM_001202705 GenBank CDS 192 2126 . + 1 ID=AT2G29630.p01;Parent=AT2G29630.t01;Dbxref=GI:334184567,GeneID:817513,TAIR:AT2G29630;Name=THIC;Note=thiaminC (THIC)%3B CONTAINS InterPro DOMAIN;rotein_id=NP_001189634.1; + NM_001202705 GenBank exon 192 2126 . 
+ 1 Parent=AT2G29630.t01 + ##FASTA + >NM_001202705 + AAGCCTTTCGCTTTAGGCTGCATTGGGCCGTGACAATATTCAGACGATTCAGGAGGTTCG + TTCCTTTTTTAAAGGACCCTAATCACTCTGAGTACCACTGACTCACTCAGTGTGCGCGAT + +- Will be converted to GenBank format:: + + LOCUS NM_001202705 2406 bp mRNA linear PLN 28-MAY-2011 + DEFINITION Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) + mRNA, complete cds. + ACCESSION NM_001202705 + VERSION NM_001202705.1 GI:334184566......... + FEATURES Location/Qualifiers + source 1..2406 + /organism="Arabidopsis thaliana" + /mol_type="mRNA" + /db_xref="taxon:3702"........ + gene 1..2406 + /gene="THIC" + /locus_tag="AT2G29630" + /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........ + ORIGIN + 1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg + 61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat + 121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc + 181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca + // + +------ + +**About formats** + +**GFF** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold or contig. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. 
attributes - All lines with the same group are linked together into a single item. + +**GenBank format** Consists of an annotation section and a sequence section. Sample record_ + +.. _record: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html + + +-------- + +**Copyright** + +2010-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + </help> +</tool> |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gtf.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_gtf.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
#!/usr/bin/env python
"""
Program to convert data from GFF to GTF

Usage: python gff_to_gtf.py in.gff > out.gtf

Requirement:
    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py

Copyright (C)
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
"""

import re
import sys


def _gtf_line(ent1, tid, feature, start, stop, frame, exon_number):
    """Format one GTF record; frame is the ready-made text of column 8."""
    return '%s\t%s\t%s\t%d\t%d\t.\t%s\t%s\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (
        ent1['chr'], ent1['source'], feature, start, stop, ent1['strand'],
        frame, ent1['name'], tid, exon_number, ent1['gene_info']['Name'])


def printGTF(tinfo):
    """
    Write the parsed gene models to standard output in GTF format.

    For every transcript the exon records are printed in order, each
    followed by the CDS record stored at the same index when there is one.
    The start_codon record follows the first exon and the stop_codon record
    closes the transcript. Transcripts whose strand is neither '+' nor '-'
    are reported in a '#' comment line and skipped.

    @args tinfo: parsed object from gff file
    @type tinfo: numpy array
    """
    for ent1 in tinfo:
        for idx, tid in enumerate(ent1['transcripts']):

            exons = ent1['exons'][idx]
            cds_exons = ent1['cds_exons'][idx]
            has_cds = cds_exons.any()

            stop_codon = start_codon = ()

            if ent1['strand'] == '+':
                if has_cds:
                    start_codon = (cds_exons[0][0], cds_exons[0][0] + 2)
                    stop_codon = (cds_exons[-1][1] - 2, cds_exons[-1][1])
            elif ent1['strand'] == '-':
                if has_cds:
                    start_codon = (cds_exons[-1][1] - 2, cds_exons[-1][1])
                    stop_codon = (cds_exons[0][0], cds_exons[0][0] + 2)
            else:
                # reported as a comment so the output stays parseable GTF;
                # codon positions cannot be derived without a strand
                print('# STRAND information missing - %s, skip the transcript - %s' % (ent1['strand'], tid[0]))
                continue

            last_cds_cod = 0
            for idz, ex_cod in enumerate(exons):

                print(_gtf_line(ent1, tid[0], 'exon', ex_cod[0], ex_cod[1], '.', idz + 1))

                if not has_cds:
                    continue

                try:
                    cds_cod = cds_exons[idz]
                    cds_line = _gtf_line(ent1, tid[0], 'CDS', cds_cod[0], cds_cod[1],
                                         '%d' % cds_cod[2], idz + 1)
                except (IndexError, TypeError, ValueError):
                    # no usable CDS segment at this exon index (e.g. fewer
                    # CDS segments than exons): emit the exon only
                    pass
                else:
                    print(cds_line)
                    last_cds_cod = idz

                if idz == 0:
                    print(_gtf_line(ent1, tid[0], 'start_codon', start_codon[0], start_codon[1],
                                    '%d' % cds_exons[idz][2], idz + 1))

            if stop_codon:
                # numbered after the last exon of the transcript
                print(_gtf_line(ent1, tid[0], 'stop_codon', stop_codon[0], stop_codon[1],
                                '%d' % cds_exons[last_cds_cod][2], len(exons)))


if __name__ == "__main__":

    try:
        gff_fname = sys.argv[1]
    except IndexError:
        # no input file given: show the usage text
        print(__doc__)
        sys.exit(-1)

    # imported here so that printGTF can be used without pulling in the
    # parser module and its numpy/scipy requirements
    import GFFParser

    Transcriptdb = GFFParser.Parse(gff_fname)

    printGTF(Transcriptdb)
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gtf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_gtf.xml Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,88 @@ +<tool id="fml_gff2gtf" name="GFF-to-GTF" version="2.0.0"> + <description>converter</description> + <command interpreter="python">gff_to_gtf.py $inf_gff3 > $gtf_format + </command> + <inputs> + <param format="gff3,gff" name="inf_gff3" type="data" label="Convert this query" help="Provide genome annotation file in GFF or GFF3."/> + </inputs> + <outputs> + <data format="gtf" name="gtf_format" label="${tool.name} on ${on_string}: Converted" /> + </outputs> + <tests> + <test> + <param name="inf_gff3" value="AceView_ncbi_37.gff3" /> + <output name="gtf_format" file="AceView_gff3_to_gtf.gtf" /> + </test> + <test> + <param name="inf_gff3" value="ENSEMBL_mm9.gff3" /> + <output name="gtf_format" file="ENSEMBL_mm9_gff3_to_gtf.gtf" /> + </test> + </tests> + <help> + +**What it does** + +This tool converts data from GFF3 to GTF file format (scroll down for format description). + +-------- + +**Example** + +- The following data in GFF3 format:: + + ##gff-version 3 + 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11 + 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859 + 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751 + 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751 + 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352 + 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751 + 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751 + +- Will be converted to GTF format:: + + 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + 17 protein_coding CDS 7256262 7256957 . 
+ 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352"; + 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + 17 protein_coding stop_codon 7256958 7256960 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + +-------- + +**About formats** + + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + + +**GTF format** Gene Transfer Format borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-separated fields:: + + 1. seqname - The name of the sequence. + 2. source - Indicates where the annotation came from. + 3. feature - The name of the feature type. 
The following feature types are required: 'CDS', 'start_codon' and 'stop_codon' + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates. + 7. strand - Valid entries include '+', '-', or '.' + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. + 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + </help> +</tool> |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gffparser_bcbio.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gffparser_bcbio.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
b'@@ -0,0 +1,828 @@\n+"""Parse GFF files into features attached to Biopython SeqRecord objects.\n+\n+This deals with GFF3 formatted files, a tab delimited format for storing\n+sequence features and annotations:\n+\n+http://www.sequenceontology.org/gff3.shtml\n+\n+It will also deal with older GFF versions (GTF/GFF2):\n+\n+http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml\n+http://mblab.wustl.edu/GTF22.html\n+\n+The implementation utilizes map/reduce parsing of GFF using Disco. Disco\n+(http://discoproject.org) is a Map-Reduce framework for Python utilizing\n+Erlang for parallelization. The code works on a single processor without\n+Disco using the same architecture.\n+"""\n+import os\n+import copy\n+import re\n+import collections\n+import urllib\n+import itertools\n+\n+# Make defaultdict compatible with versions of python older than 2.4\n+try:\n+ collections.defaultdict\n+except AttributeError:\n+ import _utils\n+ collections.defaultdict = _utils.defaultdict\n+\n+from Bio.Seq import Seq, UnknownSeq\n+from Bio.SeqRecord import SeqRecord\n+from Bio.SeqFeature import SeqFeature, FeatureLocation\n+from Bio import SeqIO\n+\n+def _gff_line_map(line, params):\n+ """Map part of Map-Reduce; parses a line of GFF into a dictionary.\n+\n+ Given an input line from a GFF file, this:\n+ - decides if the file passes our filtering limits\n+ - if so:\n+ - breaks it into component elements\n+ - determines the type of attribute (flat, parent, child or annotation)\n+ - generates a dictionary of GFF info which can be serialized as JSON\n+ """\n+ gff3_kw_pat = re.compile("\\w+=")\n+ def _split_keyvals(keyval_str):\n+ """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.\n+\n+ GFF3 has key value pairs like:\n+ count=9;gene=amx-2;sequence=SAGE:aacggagccg\n+ GFF2 and GTF have: \n+ Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"\n+ name "fgenesh1_pg.C_chr_1000003"; transcriptId 869\n+ """\n+ quals = collections.defaultdict(list)\n+ if keyval_str is None:\n+ 
return quals\n+ # ensembl GTF has a stray semi-colon at the end\n+ if keyval_str[-1] == \';\':\n+ keyval_str = keyval_str[:-1]\n+ # GFF2/GTF has a semi-colon with at least one space after it.\n+ # It can have spaces on both sides; wormbase does this.\n+ # GFF3 works with no spaces.\n+ # Split at the first one we can recognize as working\n+ parts = keyval_str.split(" ; ")\n+ if len(parts) == 1:\n+ parts = keyval_str.split("; ")\n+ if len(parts) == 1:\n+ parts = keyval_str.split(";")\n+ # check if we have GFF3 style key-vals (with =)\n+ is_gff2 = True\n+ if gff3_kw_pat.match(parts[0]):\n+ is_gff2 = False\n+ key_vals = [p.split(\'=\') for p in parts]\n+ # otherwise, we are separated by a space with a key as the first item\n+ else:\n+ pieces = []\n+ for p in parts:\n+ # fix misplaced semi-colons in keys in some GFF2 files\n+ if p and p[0] == \';\':\n+ p = p[1:]\n+ pieces.append(p.strip().split(" "))\n+ key_vals = [(p[0], " ".join(p[1:])) for p in pieces]\n+ for item in key_vals:\n+ # standard in-spec items are key=value\n+ if len(item) == 2:\n+ key, val = item\n+ # out-of-spec files can have just key values. We set an empty value\n+ # which will be changed to true later to standardize.\n+ else:\n+ assert len(item) == 1, item\n+ key = item[0]\n+ val = \'\'\n+ # remove quotes in GFF2 files\n+ if (len(val) > 0 and val[0] == \'"\' and val[-1] == \'"\'):\n+ val = val[1:-1] \n+ if val:\n+ quals[key].extend([v for v in val.split(\',\') if v])\n+ # if we don\'t have a value, make this a key=True/False style\n+ '..b' the\n+ information you need. 
This class provides high level summary details to\n+ help in learning.\n+ """\n+ def __init__(self):\n+ self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2],\n+ gff_source = [1], gff_type = [2])\n+ \n+ def _get_local_params(self, limit_info=None):\n+ class _LocalParams:\n+ def __init__(self):\n+ self.jsonify = False\n+ params = _LocalParams()\n+ params.limit_info = limit_info\n+ params.filter_info = self._filter_info\n+ return params\n+ \n+ @_file_or_handle\n+ def available_limits(self, gff_handle):\n+ """Return dictionary information on possible limits for this file.\n+\n+ This returns a nested dictionary with the following structure:\n+ \n+ keys -- names of items to filter by\n+ values -- dictionary with:\n+ keys -- filter choice\n+ value -- counts of that filter in this file\n+\n+ Not a parallelized map-reduce implementation.\n+ """\n+ cur_limits = dict()\n+ for filter_key in self._filter_info.keys():\n+ cur_limits[filter_key] = collections.defaultdict(int)\n+ for line in gff_handle:\n+ # when we hit FASTA sequences, we are done with annotations\n+ if line.startswith("##FASTA"):\n+ break\n+ # ignore empty and comment lines\n+ if line.strip() and line.strip()[0] != "#":\n+ parts = [p.strip() for p in line.split(\'\\t\')]\n+ assert len(parts) == 9, line\n+ for filter_key, cur_indexes in self._filter_info.items():\n+ cur_id = tuple([parts[i] for i in cur_indexes])\n+ cur_limits[filter_key][cur_id] += 1\n+ # get rid of the default dicts\n+ final_dict = dict()\n+ for key, value_dict in cur_limits.items():\n+ if len(key) == 1:\n+ key = key[0]\n+ final_dict[key] = dict(value_dict)\n+ gff_handle.close()\n+ return final_dict\n+\n+ @_file_or_handle\n+ def parent_child_map(self, gff_handle):\n+ """Provide a mapping of parent to child relationships in the file.\n+\n+ Returns a dictionary of parent child relationships:\n+\n+ keys -- tuple of (source, type) for each parent\n+ values -- tuple of (source, type) as children of that parent\n+ \n+ Not a parallelized 
map-reduce implementation.\n+ """\n+ # collect all of the parent and child types mapped to IDs\n+ parent_sts = dict()\n+ child_sts = collections.defaultdict(list)\n+ for line in gff_handle:\n+ # when we hit FASTA sequences, we are done with annotations\n+ if line.startswith("##FASTA"):\n+ break\n+ if line.strip():\n+ line_type, line_info = _gff_line_map(line,\n+ self._get_local_params())[0]\n+ if (line_type == \'parent\' or (line_type == \'child\' and\n+ line_info[\'id\'])):\n+ parent_sts[line_info[\'id\']] = (\n+ line_info[\'quals\'][\'source\'][0], line_info[\'type\'])\n+ if line_type == \'child\':\n+ for parent_id in line_info[\'quals\'][\'Parent\']:\n+ child_sts[parent_id].append((\n+ line_info[\'quals\'][\'source\'][0], line_info[\'type\']))\n+ #print parent_sts, child_sts\n+ # generate a dictionary of the unique final type relationships\n+ pc_map = collections.defaultdict(list)\n+ for parent_id, parent_type in parent_sts.items():\n+ for child_type in child_sts[parent_id]:\n+ pc_map[parent_type].append(child_type)\n+ pc_final_map = dict()\n+ for ptype, ctypes in pc_map.items():\n+ unique_ctypes = list(set(ctypes))\n+ unique_ctypes.sort()\n+ pc_final_map[ptype] = unique_ctypes\n+ return pc_final_map\n' |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gtf_to_gff.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gtf_to_gff.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
#!/usr/bin/env python
"""
Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3].

Usage: python gtf_to_gff.py in.gtf > out.gff3

Requirement:
    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
    helper.py   : https://github.com/vipints/GFFtools-GX/blob/master/helper.py

Copyright (C)
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
"""

import re
import sys
import GFFParser
import helper


def GFFWriter(gtf_content):
    """
    Write the feature information to standard output in GFF3 format.

    For every gene record a `gene` line is printed, followed, for each of
    its transcripts, by the transcript line and its five_prime_UTR, CDS,
    three_prime_UTR and exon lines (in that order).

    Only feature lines are written: standard output is the GFF3 file
    (see the usage line of this module), so no diagnostics may be printed.

    @args gtf_content: Parsed object from gtf file
    @type gtf_content: numpy array
    """

    # NOTE: every print call takes exactly one parenthesised argument, which
    # behaves identically under the Python 2 print statement and the
    # Python 3 print function.
    print('##gff-version 3')

    for ent1 in gtf_content:

        chr_name = ent1['chr']
        strand = ent1['strand']
        start = ent1['start']
        stop = ent1['stop']
        source = ent1['source']
        ID = ent1['name']
        Name = ent1['gene_info']['Name']

        # fall back to the gene identifier when no gene name is recorded
        Name = ID if not Name else Name

        print('%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s' % (chr_name, source, start, stop, strand, ID, Name))

        for idx, tid in enumerate(ent1['transcripts']):
            exons = ent1['exons'][idx]
            cds_exons = ent1['cds_exons'][idx]

            # transcript span: start of the first exon to end of the last exon
            t_start = exons[0][0]
            t_stop = exons[-1][-1]
            t_type = ent1['transcript_type'][idx]

            # UTRs are derived only when both exon and CDS coordinates exist
            utr5_exons, utr3_exons = [], []
            if exons.any() and cds_exons.any():
                utr5_exons, utr3_exons = helper.buildUTR(cds_exons, exons, strand)

            print('%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s' % (chr_name, source, t_type, t_start, t_stop, strand, tid[0], ID))

            for ex_cod in utr5_exons:
                print('%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]))

            # the third CDS value is written to the phase column
            for ex_cod in cds_exons:
                print('%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, ex_cod[2], tid[0]))

            for ex_cod in utr3_exons:
                print('%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]))

            for ex_cod in exons:
                print('%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]))


def __main__():
    """
    Parse the GTF file named as the first command line argument and print
    its content as GFF3; print the usage text and exit with -1 when the
    argument is missing.
    """

    try:
        gtf_fname = sys.argv[1]
    except IndexError:
        # no input file given on the command line
        print(__doc__)
        sys.exit(-1)

    gtf_file_content = GFFParser.Parse(gtf_fname)

    GFFWriter(gtf_file_content)


if __name__ == "__main__":
    __main__()
b |
diff -r 619e0fcd9126 -r 6e589f267c14 gtf_to_gff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gtf_to_gff.xml Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,94 @@ +<tool id="fml_gtf2gff" name="GTF-to-GFF" version="2.0.0"> + <description>converter</description> + <command interpreter="python">gtf_to_gff.py $inf_gtf > $gff3_format + </command> + <inputs> + <param format="gtf" name="inf_gtf" type="data" label="Convert this query" help="Provide genome annotation file in GTF."/> + </inputs> + <outputs> + <data format="gff3" name="gff3_format" label="${tool.name} on ${on_string}: Converted" /> + </outputs> + <tests> + <test> + <param name="inf_gtf" value="UCSC_transcripts.gtf" /> + <output name="gff3_format" file="UCSC_transcripts.gff3" /> + </test> + <test> + <param name="inf_gtf" value="JGI_genes.gtf" /> + <output name="gff3_format" file="JGI_genes.gff3" /> + </test> + <test> + <param name="inf_gtf" value="ENSEMBL_mm9.gtf" /> + <output name="gff3_format" file="ENSEMBL_mm9.gff3" /> + </test> + <test> + <param name="inf_gtf" value="AceView_ncbi_37.gtf" /> + <output name="gff3_format" file="AceView_ncbi_37.gff3" /> + </test> + </tests> + <help> + +**What it does** + +This tool converts data from GTF to a valid GFF3 file (scroll down for format description). + +-------- + +**Example** + +- The following data in GTF format:: + + 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + 17 protein_coding CDS 7256262 7256957 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352"; + 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + 17 protein_coding stop_codon 7256958 7256960 . 
+ 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + +- Will be converted to GFF3 format:: + + ##gff-version 3 + 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11 + 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859 + 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751 + 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751 + 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352 + 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751 + 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751 + +-------- + +**About formats** + +**GTF format** Gene Transfer Format, it borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-separated fields:: + + 1. seqname - The name of the sequence. + 2. source - Indicates where the annotation came from. + 3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon' + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates. + 7. strand - Valid entries include '+', '-', or '.' + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. + 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region. + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences.
GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + </help> +</tool> |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 helper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/helper.py Tue Nov 04 12:15:19 2014 -0500 |
[ |
b'@@ -0,0 +1,332 @@\n+#!/usr/bin/env python\n+"""\n+Common utility functions\n+"""\n+\n+import os \n+import re\n+import sys \n+import gzip \n+import bz2\n+import numpy \n+\n+def init_gene():\n+ """\n+ Initializing the gene structure \n+ """\n+\n+ gene_det = [(\'id\', \'f8\'), \n+ (\'anno_id\', numpy.dtype), \n+ (\'confgenes_id\', numpy.dtype),\n+ (\'name\', \'S25\'),\n+ (\'source\', \'S25\'),\n+ (\'gene_info\', numpy.dtype),\n+ (\'alias\', \'S15\'),\n+ (\'name2\', numpy.dtype),\n+ (\'strand\', \'S2\'), \n+ (\'score\', \'S15\'), \n+ (\'chr\', \'S15\'), \n+ (\'chr_num\', numpy.dtype),\n+ (\'paralogs\', numpy.dtype),\n+ (\'start\', \'f8\'),\n+ (\'stop\', \'f8\'), \n+ (\'transcripts\', numpy.dtype),\n+ (\'transcript_type\', numpy.dtype),\n+ (\'transcript_info\', numpy.dtype),\n+ (\'transcript_status\', numpy.dtype),\n+ (\'transcript_valid\', numpy.dtype),\n+ (\'exons\', numpy.dtype),\n+ (\'exons_confirmed\', numpy.dtype),\n+ (\'cds_exons\', numpy.dtype),\n+ (\'utr5_exons\', numpy.dtype),\n+ (\'utr3_exons\', numpy.dtype),\n+ (\'tis\', numpy.dtype),\n+ (\'tis_conf\', numpy.dtype),\n+ (\'tis_info\', numpy.dtype),\n+ (\'cdsStop\', numpy.dtype),\n+ (\'cdsStop_conf\', numpy.dtype),\n+ (\'cdsStop_info\', numpy.dtype),\n+ (\'tss\', numpy.dtype),\n+ (\'tss_info\', numpy.dtype),\n+ (\'tss_conf\', numpy.dtype),\n+ (\'cleave\', numpy.dtype),\n+ (\'cleave_info\', numpy.dtype),\n+ (\'cleave_conf\', numpy.dtype),\n+ (\'polya\', numpy.dtype),\n+ (\'polya_info\', numpy.dtype),\n+ (\'polya_conf\', numpy.dtype),\n+ (\'is_alt\', \'f8\'), \n+ (\'is_alt_spliced\', \'f8\'), \n+ (\'is_valid\', numpy.dtype),\n+ (\'transcript_complete\', numpy.dtype),\n+ (\'is_complete\', numpy.dtype),\n+ (\'is_correctly_gff3_referenced\', \'S5\'),\n+ (\'splicegraph\', numpy.dtype) ]\n+\n+ return gene_det\n+\n+def open_file(fname):\n+ """\n+ Open the file (supports .gz .bz2) and returns the handler\n+\n+ @args fname: input file name for reading \n+ @type fname: str\n+ """\n+\n+ try:\n+ if 
os.path.splitext(fname)[1] == ".gz":\n+ FH = gzip.open(fname, \'rb\')\n+ elif os.path.splitext(fname)[1] == ".bz2":\n+ FH = bz2.BZ2File(fname, \'rb\')\n+ else:\n+ FH = open(fname, \'rU\')\n+ except Exception as error:\n+ sys.exit(error)\n+\n+ return FH\n+\n+def add_CDS_phase(strand, cds):\n+ """\n+ Calculate CDS phase and add to the CDS exons\n+\n+ @args strand: feature strand information \n+ @type strand: +/- \n+ @args cds: coding exon coordinates \n+ @type cds: numpy array [[int, int, int]]\n+ """\n+\n+ cds_region, cds_flag = [], 0 \n+ if strand == \'+\':\n+ for cdspos in cds:\n+ if cds_flag == 0:\n+ cdspos = (cdspos[0], cdspos[1], 0)\n+ diff = (cdspos[1]-(cdspos[0]-1))%3\n+ else:\n+ xy = 0\n+ if diff == 0: \n+ cdspos = (cdspos[0], cdspos[1], 0)\n+ elif diff == 1: \n+ cdspos = (cdspos[0], cdspos[1], 2)\n+ xy = 2\n+ elif diff == 2: \n+ cdspos = (cdspos[0], cdspos[1], 1)\n+ xy = 1\n+ diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3\n+ cds_region.append(cdspos)\n+ cds_flag = 1 \n+ elif strand == \'-\':\n+ cds.reverse()\n+ for cdspos in cds: \n+ if cds_flag == 0:\n+ cdspos = (cdspos[0], cdspos[1], 0)\n+ diff = (cdspos[1]-(cdspos[0]-1))%3\n+ else: \n+ xy = 0 \n+ if diff == 0: \n+ cdspos = (cdspos[0], cdspos[1], 0)\n+ elif diff == 1:\n+ '..b" exon_pos.append([cds_5start, utr3_end])\n+ for cds in cds_cod:\n+ exon_pos.append(cds)\n+ for utr3 in three_p_utr:\n+ exon_pos.append(utr3)\n+ else: \n+ if jun_exon != []:\n+ five_p_utr = five_p_utr[:-1]\n+ cds_cod = cds_cod[1:]\n+ for utr5 in five_p_utr:\n+ exon_pos.append(utr5)\n+ exon_pos.append(jun_exon) if jun_exon != [] else ''\n+ jun_exon = []\n+ utr3_start, utr3_end = 0, 0\n+ if three_p_utr != []:\n+ utr3_start = three_p_utr[0][0]\n+ utr3_end = three_p_utr[0][1]\n+ cds_3start = cds_cod[-1][0]\n+ cds_3end = cds_cod[-1][1]\n+ if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1: \n+ jun_exon = [cds_3start, utr3_end]\n+ if jun_exon != []:\n+ cds_cod = cds_cod[:-1]\n+ three_p_utr = three_p_utr[1:]\n+ for cds in cds_cod:\n+ 
exon_pos.append(cds)\n+ exon_pos.append(jun_exon) if jun_exon != [] else ''\n+ for utr3 in three_p_utr:\n+ exon_pos.append(utr3)\n+ elif strand_p == '-':\n+ utr3_start, utr3_end = 0, 0 \n+ if three_p_utr != []:\n+ utr3_start = three_p_utr[-1][0]\n+ utr3_end = three_p_utr[-1][1]\n+ cds_3start = cds_cod[0][0]\n+ cds_3end = cds_cod[0][1]\n+ jun_exon = []\n+ if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:\n+ jun_exon = [utr3_start, cds_3end] \n+ if len(cds_cod) == 1: \n+ three_prime_flag = 0\n+ if jun_exon != []:\n+ three_p_utr = three_p_utr[:-1]\n+ three_prime_flag = 1\n+ for utr3 in three_p_utr:\n+ exon_pos.append(utr3)\n+ jun_exon = []\n+ (utr5_start, utr5_end) = (0, 0)\n+ if five_p_utr != []:\n+ utr5_start = five_p_utr[0][0]\n+ utr5_end = five_p_utr[0][1]\n+ if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:\n+ jun_exon = [cds_3start, utr5_end]\n+ five_prime_flag = 0\n+ if jun_exon != []:\n+ cds_cod = cds_cod[:-1]\n+ five_p_utr = five_p_utr[1:]\n+ five_prime_flag = 1\n+ if three_prime_flag == 1 and five_prime_flag == 1:\n+ exon_pos.append([utr3_start, utr5_end])\n+ if three_prime_flag == 1 and five_prime_flag == 0:\n+ exon_pos.append([utr3_start, cds_3end])\n+ cds_cod = cds_cod[:-1]\n+ if three_prime_flag == 0 and five_prime_flag == 1:\n+ exon_pos.append([cds_3start, utr5_end]) \n+ for cds in cds_cod:\n+ exon_pos.append(cds)\n+ for utr5 in five_p_utr:\n+ exon_pos.append(utr5)\n+ else:\n+ if jun_exon != []:\n+ three_p_utr = three_p_utr[:-1]\n+ cds_cod = cds_cod[1:]\n+ for utr3 in three_p_utr:\n+ exon_pos.append(utr3) \n+ if jun_exon != []:\n+ exon_pos.append(jun_exon)\n+ jun_exon = []\n+ (utr5_start, utr5_end) = (0, 0)\n+ if five_p_utr != []:\n+ utr5_start = five_p_utr[0][0]\n+ utr5_end = five_p_utr[0][1] \n+ cds_5start = cds_cod[-1][0]\n+ cds_5end = cds_cod[-1][1]\n+ if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:\n+ jun_exon = [cds_5start, utr5_end]\n+ if jun_exon != []:\n+ cds_cod = cds_cod[:-1]\n+ five_p_utr = five_p_utr[1:]\n+ for cds in 
cds_cod:\n+ exon_pos.append(cds)\n+ if jun_exon != []:\n+ exon_pos.append(jun_exon) \n+ for utr5 in five_p_utr:\n+ exon_pos.append(utr5)\n+ return exon_pos\n" |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 test-data/s_cerevisiae_SCU49845.gbk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/s_cerevisiae_SCU49845.gbk Tue Nov 04 12:15:19 2014 -0500 |
b |
b'@@ -0,0 +1,165 @@\n+LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999\n+DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p\n+ (AXL2) and Rev7p (REV7) genes, complete cds.\n+ACCESSION U49845\n+VERSION U49845.1 GI:1293613\n+KEYWORDS .\n+SOURCE Saccharomyces cerevisiae (baker\'s yeast)\n+ ORGANISM Saccharomyces cerevisiae\n+ Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;\n+ Saccharomycetales; Saccharomycetaceae; Saccharomyces.\n+REFERENCE 1 (bases 1 to 5028)\n+ AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.\n+ TITLE Cloning and sequence of REV7, a gene whose function is required for\n+ DNA damage-induced mutagenesis in Saccharomyces cerevisiae\n+ JOURNAL Yeast 10 (11), 1503-1509 (1994)\n+ PUBMED 7871890\n+REFERENCE 2 (bases 1 to 5028)\n+ AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.\n+ TITLE Selection of axial growth sites in yeast requires Axl2p, a novel\n+ plasma membrane glycoprotein\n+ JOURNAL Genes Dev. 10 (7), 777-793 (1996)\n+ PUBMED 8846915\n+REFERENCE 3 (bases 1 to 5028)\n+ AUTHORS Roemer,T.\n+ TITLE Direct Submission\n+ JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New\n+ Haven, CT, USA\n+FEATURES Location/Qualifiers\n+ source 1..5028\n+ /organism="Saccharomyces cerevisiae"\n+ /db_xref="taxon:4932"\n+ /chromosome="IX"\n+ /map="9"\n+ CDS <1..206\n+ /codon_start=3\n+ /product="TCP1-beta"\n+ /protein_id="AAA98665.1"\n+ /db_xref="GI:1293614"\n+ /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA\n+ AEVLLRVDNIIRARPRTANRQHM"\n+ gene 687..3158\n+ /gene="AXL2"\n+ CDS 687..3158\n+ /gene="AXL2"\n+ /note="plasma membrane glycoprotein"\n+ /codon_start=1\n+ /function="required for axial budding pattern of S.\n+ cerevisiae"\n+ /product="Axl2p"\n+ /protein_id="AAA98666.1"\n+ /db_xref="GI:1293615"\n+ /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF\n+ TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN\n+ VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE\n+ 
VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE\n+ TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV\n+ YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG\n+ DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ\n+ DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA\n+ NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA\n+ CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN\n+ NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ\n+ SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS\n+ YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK\n+ HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL\n+ VDFSNKSNVNVGQVKDIHGRIPEML"\n+ gene complement(3300..4037)\n+ /gene="REV7"\n+ CDS complement(3300..4037)\n+ /gene="REV7"\n+ /codon_start=1\n+ /product="Rev7p"\n+ /protein_id="AAA98667.1"\n+ /db_xref="GI:1293616"\n+ /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ\n+ FVPINRHPALIDYI'..b'cca\n+ 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc\n+ 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg\n+ 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt\n+ 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc\n+ 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg\n+ 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca\n+ 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata\n+ 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg\n+ 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga\n+ 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt\n+ 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat\n+ 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt\n+ 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc\n+ 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc 
acaaaaaact gttgatacag\n+ 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta\n+ 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa\n+ 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact\n+ 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt\n+ 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa\n+ 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag\n+ 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct\n+ 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt\n+ 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact\n+ 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa\n+ 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg\n+ 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt\n+ 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc\n+ 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca\n+ 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc\n+ 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc\n+ 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat\n+ 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa\n+ 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga\n+ 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat\n+ 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc\n+ 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc\n+ 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa\n+ 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg\n+ 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc\n+ 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt\n+ 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa 
atggacgaaa taaagagagg\n+ 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg\n+ 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt\n+ 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt\n+ 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat\n+ 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc\n+ 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct\n+ 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta\n+ 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac\n+ 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct\n+ 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct\n+ 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc\n+//\n' |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 test-data/s_cerevisiae_SCU49845.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/s_cerevisiae_SCU49845.gff3 Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,8 @@ +IX gbk_to_gff gene 687 3158 . + . ID=AXL2;Name=AXL2 +IX gbk_to_gff . 687 3158 . + . ID=Transcript:AXL2;Parent=AXL2 +IX gbk_to_gff CDS 687 3158 . + . Parent=Transcript:AXL2 +IX gbk_to_gff exon 687 3158 . + . Parent=Transcript:AXL2 +IX gbk_to_gff gene 3300 4037 . - . ID=REV7;Name=REV7 +IX gbk_to_gff . 3300 4037 . - . ID=Transcript:REV7;Parent=REV7 +IX gbk_to_gff CDS 3300 4037 . - . Parent=Transcript:REV7 +IX gbk_to_gff exon 3300 4037 . - . Parent=Transcript:REV7 |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 test-data/single_parent_feature_record.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/single_parent_feature_record.gff3 Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,10 @@ +chr1 . miRNA_primary_transcript 1380242 1380467 . - . ID=MI0031047;Alias=MI0031047;Name=gma-MIR9754 +chr1 . miRNA 1380249 1380270 . - . ID=MIMAT0036385;Alias=MIMAT0036385;Name=gma-miR9754;Derives_from=MI0031047 +chr1 . miRNA_primary_transcript 2410094 2410318 . + . ID=MI0016507;Alias=MI0016507;Name=gma-MIR4367 +chr1 . miRNA 2410242 2410263 . + . ID=MIMAT0018266;Alias=MIMAT0018266;Name=gma-miR4367;Derives_from=MI0016507 +chr1 . miRNA_primary_transcript 4792375 4792487 . - . ID=MI0021714;Alias=MI0021714;Name=gma-MIR395h +chr1 . miRNA 4792388 4792408 . - . ID=MIMAT0024920;Alias=MIMAT0024920;Name=gma-miR395h;Derives_from=MI0021714 +chr1 . miRNA_primary_transcript 4797903 4798018 . - . ID=MI0021715;Alias=MI0021715;Name=gma-MIR395i +chr1 . miRNA 4797916 4797936 . - . ID=MIMAT0024921;Alias=MIMAT0024921;Name=gma-miR395i;Derives_from=MI0021715 +chr1 . miRNA_primary_transcript 4810817 4810942 . - . ID=MI0021716;Alias=MI0021716;Name=gma-MIR395j +chr1 . miRNA 4810830 4810850 . - . ID=MIMAT0024922;Alias=MIMAT0024922;Name=gma-miR395j;Derives_from=MI0021716 |
b |
diff -r 619e0fcd9126 -r 6e589f267c14 tool_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_conf.xml.sample Tue Nov 04 12:15:19 2014 -0500 |
b |
@@ -0,0 +1,9 @@ +<section name="GFFtools" id="gfftools.web"> + <tool file="GFFtools-GX/gff_to_bed.xml"/> + <tool file="GFFtools-GX/bed_to_gff.xml"/> + <tool file="GFFtools-GX/gbk_to_gff.xml"/> + <tool file="GFFtools-GX/gff_to_gbk.xml"/> + <tool file="GFFtools-GX/gff_to_gtf.xml"/> + <tool file="GFFtools-GX/gtf_to_gff.xml"/> + <tool file="GFFtools-GX/gff_fmap.xml"/> +</section> |