annotate select_product.py @ 0:6497d42015aa draft

Uploaded
author brigidar
date Thu, 08 Oct 2015 15:36:12 -0400
parents
children 53f1f83e80a1
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6497d42015aa Uploaded
brigidar
parents:
diff changeset
1 #Adapted from gbk_to_gff
6497d42015aa Uploaded
brigidar
parents:
diff changeset
2 # Brigida Rusconi, UTSA
6497d42015aa Uploaded
brigidar
parents:
diff changeset
3 # October 8 2015
6497d42015aa Uploaded
brigidar
parents:
diff changeset
4 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description
6497d42015aa Uploaded
brigidar
parents:
diff changeset
5 # adds a given number of bases on each side of the transcript for the exclusion set
6497d42015aa Uploaded
brigidar
parents:
diff changeset
6
6497d42015aa Uploaded
brigidar
parents:
diff changeset
7
6497d42015aa Uploaded
brigidar
parents:
diff changeset
8
6497d42015aa Uploaded
brigidar
parents:
diff changeset
9 #!/usr/bin/env python
6497d42015aa Uploaded
brigidar
parents:
diff changeset
10 """
6497d42015aa Uploaded
brigidar
parents:
diff changeset
11 Convert data from Genbank format to GFF.
6497d42015aa Uploaded
brigidar
parents:
diff changeset
12
6497d42015aa Uploaded
brigidar
parents:
diff changeset
13 Usage:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
14 python gbk_to_gff.py in.gbk > out.gff
6497d42015aa Uploaded
brigidar
parents:
diff changeset
15
6497d42015aa Uploaded
brigidar
parents:
diff changeset
16 Requirements:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
17 BioPython:- http://biopython.org/
6497d42015aa Uploaded
brigidar
parents:
diff changeset
18 helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py
6497d42015aa Uploaded
brigidar
parents:
diff changeset
19
6497d42015aa Uploaded
brigidar
parents:
diff changeset
20 Copyright (C)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
21 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
6497d42015aa Uploaded
brigidar
parents:
diff changeset
22 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
6497d42015aa Uploaded
brigidar
parents:
diff changeset
23 """
6497d42015aa Uploaded
brigidar
parents:
diff changeset
24
6497d42015aa Uploaded
brigidar
parents:
diff changeset
25 import os
6497d42015aa Uploaded
brigidar
parents:
diff changeset
26 import re
6497d42015aa Uploaded
brigidar
parents:
diff changeset
27 import sys
6497d42015aa Uploaded
brigidar
parents:
diff changeset
28 import helper
6497d42015aa Uploaded
brigidar
parents:
diff changeset
29 import collections
6497d42015aa Uploaded
brigidar
parents:
diff changeset
30 import pdb
6497d42015aa Uploaded
brigidar
parents:
diff changeset
31 import argparse
6497d42015aa Uploaded
brigidar
parents:
diff changeset
32 from Bio import SeqIO
6497d42015aa Uploaded
brigidar
parents:
diff changeset
33
6497d42015aa Uploaded
brigidar
parents:
diff changeset
34 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
35 """
6497d42015aa Uploaded
brigidar
parents:
diff changeset
36 Write the feature information
6497d42015aa Uploaded
brigidar
parents:
diff changeset
37 """
6497d42015aa Uploaded
brigidar
parents:
diff changeset
38 for gname, ginfo in genes.items():
6497d42015aa Uploaded
brigidar
parents:
diff changeset
39 line = [str(chr_id),
6497d42015aa Uploaded
brigidar
parents:
diff changeset
40 ginfo[3],
6497d42015aa Uploaded
brigidar
parents:
diff changeset
41 str(ginfo[0]-int(n)),
6497d42015aa Uploaded
brigidar
parents:
diff changeset
42 str(ginfo[1]+int(n)),
6497d42015aa Uploaded
brigidar
parents:
diff changeset
43 ginfo[2],
6497d42015aa Uploaded
brigidar
parents:
diff changeset
44 str(product)]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
45
6497d42015aa Uploaded
brigidar
parents:
diff changeset
46 # to key on specfic products only
6497d42015aa Uploaded
brigidar
parents:
diff changeset
47 for k in keys.split(","):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
48 if k in str(product):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
49
6497d42015aa Uploaded
brigidar
parents:
diff changeset
50 sys.stdout.write('\t'.join(line)+"\n")
6497d42015aa Uploaded
brigidar
parents:
diff changeset
51 break
6497d42015aa Uploaded
brigidar
parents:
diff changeset
52
6497d42015aa Uploaded
brigidar
parents:
diff changeset
53
6497d42015aa Uploaded
brigidar
parents:
diff changeset
54 unk +=1
6497d42015aa Uploaded
brigidar
parents:
diff changeset
55 return unk
6497d42015aa Uploaded
brigidar
parents:
diff changeset
56
6497d42015aa Uploaded
brigidar
parents:
diff changeset
57
6497d42015aa Uploaded
brigidar
parents:
diff changeset
58
6497d42015aa Uploaded
brigidar
parents:
diff changeset
59
6497d42015aa Uploaded
brigidar
parents:
diff changeset
60 def gbk_parse(fname,keys,n):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
61 """
6497d42015aa Uploaded
brigidar
parents:
diff changeset
62 Extract genome annotation recods from genbank format
6497d42015aa Uploaded
brigidar
parents:
diff changeset
63
6497d42015aa Uploaded
brigidar
parents:
diff changeset
64 @args fname: gbk file name
6497d42015aa Uploaded
brigidar
parents:
diff changeset
65 @type fname: str
6497d42015aa Uploaded
brigidar
parents:
diff changeset
66 """
6497d42015aa Uploaded
brigidar
parents:
diff changeset
67 fhand = helper.open_file(gbkfname)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
68 unk = 1
6497d42015aa Uploaded
brigidar
parents:
diff changeset
69
6497d42015aa Uploaded
brigidar
parents:
diff changeset
70 for record in SeqIO.parse(fhand, "genbank"):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
71 gene_tags = dict()
6497d42015aa Uploaded
brigidar
parents:
diff changeset
72 tx_tags = collections.defaultdict(list)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
73 exon = collections.defaultdict(list)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
74 cds = collections.defaultdict(list)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
75 mol_type, chr_id = None, None
6497d42015aa Uploaded
brigidar
parents:
diff changeset
76
6497d42015aa Uploaded
brigidar
parents:
diff changeset
77 for rec in record.features:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
78
6497d42015aa Uploaded
brigidar
parents:
diff changeset
79
6497d42015aa Uploaded
brigidar
parents:
diff changeset
80 product = "NA"
6497d42015aa Uploaded
brigidar
parents:
diff changeset
81
6497d42015aa Uploaded
brigidar
parents:
diff changeset
82 if rec.qualifiers.has_key('product'):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
83
6497d42015aa Uploaded
brigidar
parents:
diff changeset
84 product = rec.qualifiers['product'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
85
6497d42015aa Uploaded
brigidar
parents:
diff changeset
86
6497d42015aa Uploaded
brigidar
parents:
diff changeset
87 if rec.type == 'source':
6497d42015aa Uploaded
brigidar
parents:
diff changeset
88 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
89 mol_type = rec.qualifiers['mol_type'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
90 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
91 mol_type = '.'
6497d42015aa Uploaded
brigidar
parents:
diff changeset
92 pass
6497d42015aa Uploaded
brigidar
parents:
diff changeset
93 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
94 chr_id = rec.qualifiers['chromosome'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
95 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
96 chr_id = record.name
6497d42015aa Uploaded
brigidar
parents:
diff changeset
97 continue
6497d42015aa Uploaded
brigidar
parents:
diff changeset
98
6497d42015aa Uploaded
brigidar
parents:
diff changeset
99 strand='-'
6497d42015aa Uploaded
brigidar
parents:
diff changeset
100 strand='+' if rec.strand>0 else strand
6497d42015aa Uploaded
brigidar
parents:
diff changeset
101
6497d42015aa Uploaded
brigidar
parents:
diff changeset
102 fid = None
6497d42015aa Uploaded
brigidar
parents:
diff changeset
103 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
104 fid = rec.qualifiers['gene'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
105 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
106 pass
6497d42015aa Uploaded
brigidar
parents:
diff changeset
107
6497d42015aa Uploaded
brigidar
parents:
diff changeset
108 transcript_id = None
6497d42015aa Uploaded
brigidar
parents:
diff changeset
109 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
110 transcript_id = rec.qualifiers['transcript_id'][0]
6497d42015aa Uploaded
brigidar
parents:
diff changeset
111 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
112 pass
6497d42015aa Uploaded
brigidar
parents:
diff changeset
113
6497d42015aa Uploaded
brigidar
parents:
diff changeset
114 if re.search(r'gene', rec.type):
6497d42015aa Uploaded
brigidar
parents:
diff changeset
115 gene_tags[fid] = (rec.location._start.position+1,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
116 rec.location._end.position,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
117 strand,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
118 rec.type
6497d42015aa Uploaded
brigidar
parents:
diff changeset
119 )
6497d42015aa Uploaded
brigidar
parents:
diff changeset
120 elif rec.type == 'exon':
6497d42015aa Uploaded
brigidar
parents:
diff changeset
121 exon[fid].append((rec.location._start.position+1,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
122 rec.location._end.position))
6497d42015aa Uploaded
brigidar
parents:
diff changeset
123 elif rec.type=='CDS':
6497d42015aa Uploaded
brigidar
parents:
diff changeset
124 cds[fid].append((rec.location._start.position+1,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
125 rec.location._end.position))
6497d42015aa Uploaded
brigidar
parents:
diff changeset
126 else:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
127 # get all transcripts
6497d42015aa Uploaded
brigidar
parents:
diff changeset
128 if transcript_id:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
129 tx_tags[fid].append((rec.location._start.position+1,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
130 rec.location._end.position,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
131 transcript_id,
6497d42015aa Uploaded
brigidar
parents:
diff changeset
132 rec.type))
6497d42015aa Uploaded
brigidar
parents:
diff changeset
133 # record extracted, generate feature table
6497d42015aa Uploaded
brigidar
parents:
diff changeset
134 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
135
6497d42015aa Uploaded
brigidar
parents:
diff changeset
136 fhand.close()
6497d42015aa Uploaded
brigidar
parents:
diff changeset
137
6497d42015aa Uploaded
brigidar
parents:
diff changeset
138
6497d42015aa Uploaded
brigidar
parents:
diff changeset
139 if __name__=='__main__':
6497d42015aa Uploaded
brigidar
parents:
diff changeset
140
6497d42015aa Uploaded
brigidar
parents:
diff changeset
141 try:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
142 parser = argparse.ArgumentParser()
6497d42015aa Uploaded
brigidar
parents:
diff changeset
143 parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list")
6497d42015aa Uploaded
brigidar
parents:
diff changeset
144 parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
6497d42015aa Uploaded
brigidar
parents:
diff changeset
145 parser.add_argument('-n', '--number', help="flanking region to include")
6497d42015aa Uploaded
brigidar
parents:
diff changeset
146
6497d42015aa Uploaded
brigidar
parents:
diff changeset
147 args = parser.parse_args()
6497d42015aa Uploaded
brigidar
parents:
diff changeset
148 gbkfname = args.gbk_file
6497d42015aa Uploaded
brigidar
parents:
diff changeset
149 n=args.number
6497d42015aa Uploaded
brigidar
parents:
diff changeset
150 keys=args.keys
6497d42015aa Uploaded
brigidar
parents:
diff changeset
151
6497d42015aa Uploaded
brigidar
parents:
diff changeset
152 except:
6497d42015aa Uploaded
brigidar
parents:
diff changeset
153 print __doc__
6497d42015aa Uploaded
brigidar
parents:
diff changeset
154 sys.exit(-1)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
155
6497d42015aa Uploaded
brigidar
parents:
diff changeset
156 ## extract gbk records
6497d42015aa Uploaded
brigidar
parents:
diff changeset
157 gbk_parse(gbkfname,keys,n)
6497d42015aa Uploaded
brigidar
parents:
diff changeset
158
6497d42015aa Uploaded
brigidar
parents:
diff changeset
159
6497d42015aa Uploaded
brigidar
parents:
diff changeset
160
6497d42015aa Uploaded
brigidar
parents:
diff changeset
161