# HG changeset patch # User brigidar # Date 1444332972 14400 # Node ID 6497d42015aaf8cf51b2be59181e489b68f5cf6e Uploaded diff -r 000000000000 -r 6497d42015aa select_product.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/select_product.py Thu Oct 08 15:36:12 2015 -0400 @@ -0,0 +1,161 @@ +#Adapted from gbk_to_gff +# Brigida Rusconi, UTSA +# October 8 2015 +# Finds all CDS that match the key words provided in the comma separated argument -k in the product description +# adds a given number of bases on each side of the transcript for the exclusion set + + + +#!/usr/bin/env python +""" +Convert data from Genbank format to GFF. + +Usage: +python gbk_to_gff.py in.gbk > out.gff + +Requirements: + BioPython:- http://biopython.org/ + helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. +""" + +import os +import re +import sys +import helper +import collections +import pdb +import argparse +from Bio import SeqIO + +def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n): + """ + Write the feature information + """ + for gname, ginfo in genes.items(): + line = [str(chr_id), + ginfo[3], + str(ginfo[0]-int(n)), + str(ginfo[1]+int(n)), + ginfo[2], + str(product)] + +# to key on specfic products only + for k in keys.split(","): + if k in str(product): + + sys.stdout.write('\t'.join(line)+"\n") + break + + + unk +=1 + return unk + + + + +def gbk_parse(fname,keys,n): + """ + Extract genome annotation recods from genbank format + + @args fname: gbk file name + @type fname: str + """ + fhand = helper.open_file(gbkfname) + unk = 1 + + for record in SeqIO.parse(fhand, "genbank"): + gene_tags = dict() + tx_tags = collections.defaultdict(list) + exon = collections.defaultdict(list) + cds = collections.defaultdict(list) + mol_type, chr_id = None, None + + for rec in record.features: + + + product = "NA" + + if rec.qualifiers.has_key('product'): + + product = rec.qualifiers['product'][0] + + + if rec.type == 'source': + try: + mol_type = rec.qualifiers['mol_type'][0] + except: + mol_type = '.' + pass + try: + chr_id = rec.qualifiers['chromosome'][0] + except: + chr_id = record.name + continue + + strand='-' + strand='+' if rec.strand>0 else strand + + fid = None + try: + fid = rec.qualifiers['gene'][0] + except: + pass + + transcript_id = None + try: + transcript_id = rec.qualifiers['transcript_id'][0] + except: + pass + + if re.search(r'gene', rec.type): + gene_tags[fid] = (rec.location._start.position+1, + rec.location._end.position, + strand, + rec.type + ) + elif rec.type == 'exon': + exon[fid].append((rec.location._start.position+1, + rec.location._end.position)) + elif rec.type=='CDS': + cds[fid].append((rec.location._start.position+1, + rec.location._end.position)) + else: + # get all transcripts + if transcript_id: + tx_tags[fid].append((rec.location._start.position+1, + rec.location._end.position, + transcript_id, + rec.type)) + # record extracted, generate feature table + unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n) + + fhand.close() + + +if __name__=='__main__': + + try: + parser = argparse.ArgumentParser() + parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list") + parser.add_argument('-g', '--gbk_file', help="genbank file of reference") + parser.add_argument('-n', '--number', help="flanking region to include") + + args = parser.parse_args() + gbkfname = args.gbk_file + n=args.number + keys=args.keys + + except: + print __doc__ + sys.exit(-1) + + ## extract gbk records + gbk_parse(gbkfname,keys,n) + + + +