fml_gff3togtf: gbk_to_gff.py comparison

comparison gbk_to_gff.py @ 10:c42c69aa81f8

fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo

author	vipints <vipin@cbio.mskcc.org>
date	Thu, 23 Apr 2015 18:01:45 -0400
parents
children

comparison

equal deleted inserted replaced

-:7d67331368f3
+:c42c69aa81f8
+#!/usr/bin/env python
+"""
+Convert data from Genbank format to GFF.
+Usage:
+python gbk_to_gff.py in.gbk > out.gff
+Requirements:
+BioPython:- http://biopython.org/
+helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+Copyright (C)
+2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+import os
+import re
+import sys
+import helper
+import collections
+from Bio import SeqIO
+def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk):
+"""
+Write the feature information
+"""
+for gname, ginfo in genes.items():
+line = [str(chr_id),
+'gbk2gff',
+ginfo[3],
+str(ginfo[0]),
+str(ginfo[1]),
+'.',
+ginfo[2],
+'.',
+'ID=%s;Name=%s' % (str(gname), str(gname))]
+sys.stdout.write('\t'.join(line)+"\n")
+## construct the transcript line is not defined in the original file
+t_line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', ginfo[2], '.']
+if not transcripts:
+t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname)))
+if exons: ## get the entire transcript region  from the defined feature
+t_line[3] = str(exons[gname][0][0])
+t_line[4] = str(exons[gname][0][-1])
+elif cds:
+t_line[3] = str(cds[gname][0][0])
+t_line[4] = str(cds[gname][0][-1])
+if not cds:
+t_line[2] = 'transcript'
+else:
+t_line[2] = 'mRNA'
+sys.stdout.write('\t'.join(t_line)+"\n")
+if exons:
+exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon')
+if cds:
+exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS')
+if not exons:
+exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon')
+else: ## transcript is defined
+for idx in transcripts[gname]:
+t_line[2] = idx[3]
+t_line[3] = str(idx[0])
+t_line[4] = str(idx[1])
+t_line.append('ID='+str(idx[2])+';Parent='+str(gname))
+sys.stdout.write('\t'.join(t_line)+"\n")
+## feature line print call
+if exons:
+exon_line_print(t_line, exons[gname], str(idx[2]), 'exon')
+if cds:
+exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS')
+if not exons:
+exon_line_print(t_line, cds[gname], str(idx[2]), 'exon')
+if len(genes) == 0: ## feature entry with fragment information
+line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', orient, '.']
+fStart = fStop = None
+for eid, ex in cds.items():
+fStart = ex[0][0]
+fStop = ex[0][-1]
+for eid, ex in exons.items():
+fStart = ex[0][0]
+fStop = ex[0][-1]
+if fStart or fStart:
+line[2] = 'gene'
+line[3] = str(fStart)
+line[4] = str(fStop)
+line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk))
+sys.stdout.write('\t'.join(line)+"\n")
+if not cds:
+line[2] = 'transcript'
+else:
+line[2] = 'mRNA'
+line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk)
+sys.stdout.write('\t'.join(line)+"\n")
+if exons:
+exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
+if cds:
+exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS')
+if not exons:
+exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
+unk +=1
+return unk
+def exon_line_print(temp_line, trx_exons, parent, ftype):
+"""
+Print the EXON feature line
+"""
+for ex in trx_exons:
+temp_line[2] = ftype
+temp_line[3] = str(ex[0])
+temp_line[4] = str(ex[1])
+temp_line[8] = 'Parent=%s' % parent
+sys.stdout.write('\t'.join(temp_line)+"\n")
+def gbk_parse(fname):
+"""
+Extract genome annotation recods from genbank format
+@args fname: gbk file name
+@type fname: str
+"""
+fhand = helper.open_file(gbkfname)
+unk = 1
+for record in SeqIO.parse(fhand, "genbank"):
+gene_tags = dict()
+tx_tags = collections.defaultdict(list)
+exon = collections.defaultdict(list)
+cds = collections.defaultdict(list)
+mol_type, chr_id = None, None
+for rec in record.features:
+if rec.type == 'source':
+try:
+mol_type = rec.qualifiers['mol_type'][0]
+except:
+mol_type = '.'
+pass
+try:
+chr_id = rec.qualifiers['chromosome'][0]
+except:
+chr_id = record.name
+continue
+strand='-'
+strand='+' if rec.strand>0 else strand
+fid = None
+try:
+fid = rec.qualifiers['gene'][0]
+except:
+pass
+transcript_id = None
+try:
+transcript_id = rec.qualifiers['transcript_id'][0]
+except:
+pass
+if re.search(r'gene', rec.type):
+gene_tags[fid] = (rec.location._start.position+1,
+rec.location._end.position,
+strand,
+rec.type
+)
+elif rec.type == 'exon':
+exon[fid].append((rec.location._start.position+1,
+rec.location._end.position))
+elif rec.type=='CDS':
+cds[fid].append((rec.location._start.position+1,
+rec.location._end.position))
+else:
+# get all transcripts
+if transcript_id:
+tx_tags[fid].append((rec.location._start.position+1,
+rec.location._end.position,
+transcript_id,
+rec.type))
+# record extracted, generate feature table
+unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
+fhand.close()
+if __name__=='__main__':
+try:
+gbkfname = sys.argv[1]
+except:
+print __doc__
+sys.exit(-1)
+## extract gbk records
+gbk_parse(gbkfname)

Mercurial > repos > vipints > fml_gff3togtf

comparison gbk_to_gff.py @ 10:c42c69aa81f8