view select_product.py @ 8:53f1f83e80a1 draft

Uploaded
author brigidar
date Fri, 09 Oct 2015 11:52:31 -0400
parents 6497d42015aa
children 82186605ba75
line wrap: on
line source

#Adapted from gbk_to_gff
# Brigida Rusconi, UTSA
# October 8 2015
# Finds all CDS that match the key words provided in the comma separated argument -k in the product description
# adds a given number of bases on each side of the transcript for the exclusion set



#!/usr/bin/env python


import os
import re
import sys
import collections
import pdb
import argparse
from Bio import SeqIO

def open_file(fname):
    """
        Open the file (supports .gz .bz2) and returns the handler
        
        @args fname: input file name for reading
        @type fname: str
        """
    
    try:
        if os.path.splitext(fname)[1] == ".gz":
            FH = gzip.open(fname, 'rb')
        elif os.path.splitext(fname)[1] == ".bz2":
            FH = bz2.BZ2File(fname, 'rb')
        else:
            FH = open(fname, 'rU')
    except Exception as error:
        sys.exit(error)
    
    return FH


def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n):
    """
    Write the feature information
    """
    
    lines = []
    
    for gname, ginfo in genes.items():
        line = [str(chr_id),
                ginfo[3],
                str(ginfo[0]-n),
                str(ginfo[1]+n),
                ginfo[2],
                str(product)]
                
# to key on specfic products only
        for k in keys_1.split(","):
            if k in str(product):
        
                lines.append('\t'.join(line)+"\n")
                break

    return lines




def gbk_parse(fname,keys_1,n,outfile=""):
    """
    Extract genome annotation recods from genbank format 

    @args fname: gbk file name 
    @type fname: str
    """
    fhand = open_file(gbkfname)
    lines = []

    for record in SeqIO.parse(fhand, "genbank"):
        gene_tags = dict()
        tx_tags = collections.defaultdict(list) 
        exon = collections.defaultdict(list) 
        cds = collections.defaultdict(list) 
        mol_type, chr_id = None, None 

        for rec in record.features:


            product = "NA"
            
            if rec.qualifiers.has_key('product'):
            
                product = rec.qualifiers['product'][0]
            

            if rec.type == 'source':
                try:
                    mol_type = rec.qualifiers['mol_type'][0]
                except:
                    mol_type = '.'
                    pass 
                try:
                    chr_id = rec.qualifiers['chromosome'][0]
                except:
                    chr_id = record.name 
                continue 

            strand='-'
            strand='+' if rec.strand>0 else strand
            
            fid = None 
            try:
                fid = rec.qualifiers['gene'][0]
            except:
                pass

            transcript_id = None
            try:
                transcript_id = rec.qualifiers['transcript_id'][0]
            except:
                pass 

            if re.search(r'gene', rec.type):
                gene_tags[fid] = (rec.location._start.position+1, 
                                    rec.location._end.position, 
                                    strand,
                                    rec.type
                                    )
            elif rec.type == 'exon':
                exon[fid].append((rec.location._start.position+1, 
                                    rec.location._end.position))
            elif rec.type=='CDS':
                cds[fid].append((rec.location._start.position+1, 
                                    rec.location._end.position))
            else: 
                # get all transcripts 
                if transcript_id: 
                    tx_tags[fid].append((rec.location._start.position+1,
                                    rec.location._end.position, 
                                    transcript_id,
                                    rec.type))
        # record extracted, generate feature table
        lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n)
        
    fhand.close()

    return lines

if __name__=='__main__': 

    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list")
        parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
        parser.add_argument('-n', '--number', type=int, help="flanking region to include")
        parser.add_argument('-o', '--output', help="output file")

        args = parser.parse_args()
        gbkfname = args.gbk_file
        n=args.number
        keys_1=args.keys_1
        output=args.output
    
    except Exception,e:
        print "Error: %s" % e
        sys.exit(-1)

    ## extract gbk records

    print "Filtering on keys: %s" % keys_1
    lines = gbk_parse(gbkfname,keys_1,n)

    with open(output, 'w') as of:
        for l in lines:
            of.write(l)