view select_product.py @ 10:82186605ba75 draft default tip

Uploaded
author brigidar
date Fri, 09 Oct 2015 17:30:50 -0400
parents 53f1f83e80a1
children
line wrap: on
line source

#Adapted from gbk_to_gff
# Brigida Rusconi, UTSA
# October 8 2015
# Finds all CDS that match the key words provided in the comma separated argument -k in the product description
# adds a given number of bases on each side of the transcript for the exclusion set



#!/usr/bin/env python


import os
import re
import sys
import collections
import pdb
import argparse
from Bio import SeqIO

def open_file(fname):
    """
        Open the file (supports .gz .bz2) and returns the handler
        
        @args fname: input file name for reading
        @type fname: str
        """
    
    try:
        if os.path.splitext(fname)[1] == ".gz":
            FH = gzip.open(fname, 'rb')
        elif os.path.splitext(fname)[1] == ".bz2":
            FH = bz2.BZ2File(fname, 'rb')
        else:
            FH = open(fname, 'rU')
    except Exception as error:
        sys.exit(error)
    
    return FH


def gbk_parse(fname,keys_1,n,outfile=""):
    """
    Extract genome annotation recods from genbank format 

    @args fname: gbk file name 
    @type fname: str
    """
    fhand = open_file(gbkfname)
    lines = []

    for record in SeqIO.parse(fhand, "genbank"):
        gene_tags = dict()
 
        mol_type, chr_id = None, None 

        for rec in record.features:


            product = "NA"
            
            if rec.qualifiers.has_key('product'):
            
                product = rec.qualifiers['product'][0]


            if rec.type == 'source':
                try:
                    mol_type = rec.qualifiers['mol_type'][0]
                except:
                    mol_type = '.'
                    pass 
                try:
                    chr_id = rec.qualifiers['chromosome'][0]
                except:
                    chr_id = record.name 
                continue 
            

            start = int(rec.location.start)+1-n
            stop = int(rec.location.end)+n

            if start < 0:
                start = 1
                    
            for k in keys_1.split(","):
    
                if str(k) in str(product):
                    lines.append('\t'.join([str(chr_id),str(start), str(stop), product])+"\n")

    
    fhand.close()

    return lines

if __name__=='__main__':

    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list")
        parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
        parser.add_argument('-n', '--number', type=int, help="flanking region to include")
        parser.add_argument('-o', '--output', help="output file")

        args = parser.parse_args()
        gbkfname = args.gbk_file
        n=args.number
        keys_1=args.keys_1
        output=args.output
    
    except Exception,e:
        print "Error: %s" % e
        sys.exit(-1)

    ## extract gbk records

    print "Filtering on keys: %s" % keys_1
    lines = gbk_parse(gbkfname,keys_1,n)

    with open(output, 'w') as of:
        for l in lines:
            of.write(l)