# HG changeset patch
# User brigidar
# Date 1444332972 14400
# Node ID 6497d42015aaf8cf51b2be59181e489b68f5cf6e

Uploaded

diff -r 000000000000 -r 6497d42015aa select_product.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/select_product.py	Thu Oct 08 15:36:12 2015 -0400
@@ -0,0 +1,161 @@
+#Adapted from gbk_to_gff
+# Brigida Rusconi, UTSA
+# October 8 2015
+# Finds all CDS that match the key words provided in the comma separated argument -k in the product description
+# adds a given number of bases on each side of the transcript for the exclusion set
+
+
+
+#!/usr/bin/env python
+"""
+Convert data from Genbank format to GFF. 
+
+Usage: 
+python gbk_to_gff.py in.gbk > out.gff 
+
+Requirements:
+    BioPython:- http://biopython.org/
+    helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+
+Copyright (C) 
+    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+    2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import os
+import re
+import sys
+import helper 
+import collections
+import pdb
+import argparse
+from Bio import SeqIO
+
+def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n):
+    """
+    Write the feature information
+    """
+    for gname, ginfo in genes.items():
+        line = [str(chr_id),
+                ginfo[3],
+                str(ginfo[0]-int(n)),
+                str(ginfo[1]+int(n)),
+                ginfo[2],
+                str(product)]
+                
+# to key on specfic products only
+        for k in keys.split(","):
+            if k in str(product):
+        
+                sys.stdout.write('\t'.join(line)+"\n")
+                break
+
+
+        unk +=1
+    return unk
+
+
+
+
+def gbk_parse(fname,keys,n):
+    """
+    Extract genome annotation recods from genbank format 
+
+    @args fname: gbk file name 
+    @type fname: str
+    """
+    fhand = helper.open_file(gbkfname)
+    unk = 1 
+
+    for record in SeqIO.parse(fhand, "genbank"):
+        gene_tags = dict()
+        tx_tags = collections.defaultdict(list) 
+        exon = collections.defaultdict(list) 
+        cds = collections.defaultdict(list) 
+        mol_type, chr_id = None, None 
+
+        for rec in record.features:
+
+
+            product = "NA"
+            
+            if rec.qualifiers.has_key('product'):
+            
+                product = rec.qualifiers['product'][0]
+            
+
+            if rec.type == 'source':
+                try:
+                    mol_type = rec.qualifiers['mol_type'][0]
+                except:
+                    mol_type = '.'
+                    pass 
+                try:
+                    chr_id = rec.qualifiers['chromosome'][0]
+                except:
+                    chr_id = record.name 
+                continue 
+
+            strand='-'
+            strand='+' if rec.strand>0 else strand
+            
+            fid = None 
+            try:
+                fid = rec.qualifiers['gene'][0]
+            except:
+                pass
+
+            transcript_id = None
+            try:
+                transcript_id = rec.qualifiers['transcript_id'][0]
+            except:
+                pass 
+
+            if re.search(r'gene', rec.type):
+                gene_tags[fid] = (rec.location._start.position+1, 
+                                    rec.location._end.position, 
+                                    strand,
+                                    rec.type
+                                    )
+            elif rec.type == 'exon':
+                exon[fid].append((rec.location._start.position+1, 
+                                    rec.location._end.position))
+            elif rec.type=='CDS':
+                cds[fid].append((rec.location._start.position+1, 
+                                    rec.location._end.position))
+            else: 
+                # get all transcripts 
+                if transcript_id: 
+                    tx_tags[fid].append((rec.location._start.position+1,
+                                    rec.location._end.position, 
+                                    transcript_id,
+                                    rec.type))
+        # record extracted, generate feature table
+        unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n)
+        
+    fhand.close()
+
+
+if __name__=='__main__': 
+
+    try:
+        parser = argparse.ArgumentParser()
+        parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list")
+        parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
+        parser.add_argument('-n', '--number', help="flanking region to include")
+
+        args = parser.parse_args()
+        gbkfname = args.gbk_file
+        n=args.number
+        keys=args.keys
+    
+    except:
+        print __doc__
+        sys.exit(-1)
+
+    ## extract gbk records  
+    gbk_parse(gbkfname,keys,n)
+
+
+
+