comparison gff_fmap.py @ 5:6e589f267c14

Uploaded
author devteam
date Tue, 04 Nov 2014 12:15:19 -0500
comparison 4:619e0fcd9126 -> 5:6e589f267c14
#!/usr/bin/env python
"""
GFF feature mapping program: finds the relationships between the different features described in a given GFF file.

Usage:
python gff_fmap.py in.gff > out.txt

Courtesy: Brad Chapman
A few functions are adapted from the bcbio-GFFParser program.
"""

import re
import sys
import urllib
import collections
from helper import open_file

def _gff_line_map(line):
    """Parses a line of GFF into a dictionary.
    Given an input line from a GFF file, this:
    - breaks it into component elements
    - determines the type of attribute (flat, parent, child or annotation)
    - generates a dictionary of GFF info
    """
    gff3_kw_pat = re.compile(r"\w+=")
    def _split_keyvals(keyval_str):
        """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
        GFF3 has key value pairs like:
            count=9;gene=amx-2;sequence=SAGE:aacggagccg
        GFF2 and GTF have:
            Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
            name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
        """
        quals = collections.defaultdict(list)
        if keyval_str is None:
            return quals
        # ensembl GTF has a stray semi-colon at the end
        if keyval_str[-1] == ';':
            keyval_str = keyval_str[:-1]
        # GFF2/GTF has a semi-colon with at least one space after it.
        # It can have spaces on both sides; wormbase does this.
        # GFF3 works with no spaces.
        # Split at the first one we can recognize as working
        parts = keyval_str.split(" ; ")
        if len(parts) == 1:
            parts = keyval_str.split("; ")
            if len(parts) == 1:
                parts = keyval_str.split(";")
        # check if we have GFF3 style key-vals (with =)
        is_gff2 = True
        if gff3_kw_pat.match(parts[0]):
            is_gff2 = False
            key_vals = [p.split('=') for p in parts]
        # otherwise, we are separated by a space with a key as the first item
        else:
            pieces = []
            for p in parts:
                # fix misplaced semi-colons in keys in some GFF2 files
                if p and p[0] == ';':
                    p = p[1:]
                pieces.append(p.strip().split(" "))
            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
        for key, val in key_vals:
            # remove quotes in GFF2 files
            if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
                val = val[1:-1]
            if val:
                quals[key].extend(val.split(','))
            # if we don't have a value, make this a key=True/False style
            # attribute
            else:
                quals[key].append('true')
        for key, vals in quals.items():
            quals[key] = [urllib.unquote(v) for v in vals]
        return quals, is_gff2

    def _nest_gff2_features(gff_parts):
        """Provide nesting of GFF2 transcript parts with transcript IDs.

        Exons and coding sequences are mapped to a parent with a transcript_id
        in GFF2. This is implemented differently at different genome centers,
        and this function attempts to resolve that and map things to the GFF3
        way of doing them.
        """
        # map protein or transcript ids to a parent
        for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
            try:
                gff_parts["quals"]["Parent"] = \
                        gff_parts["quals"][transcript_id]
                break
            except KeyError:
                pass
        # case for WormBase GFF -- everything labelled as Transcript or CDS
        for flat_name in ["Transcript", "CDS"]:
            if flat_name in gff_parts["quals"]:
                # parent types
                if gff_parts["type"] in [flat_name]:
                    if not gff_parts["id"]:
                        gff_parts["id"] = gff_parts["quals"][flat_name][0]
                        gff_parts["quals"]["ID"] = [gff_parts["id"]]
                # children types
                elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
                                           "coding_exon", "five_prime_UTR",
                                           "CDS", "stop_codon", "start_codon"]:
                    gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
                break

        return gff_parts

    line = line.strip()
    if line == '':
        # blank lines may be present
        return [('directive', line)]
    if line[0] == '>':
        # sometimes it will be a FASTA header
        return [('directive', '')]
    if line[0] == "#":
        return [('directive', line[2:])]
    elif line:
        parts = line.split('\t')
        if len(parts) == 1 and re.search(r'\w+', parts[0]):
            # GFF files with the FASTA sequence appended
            return [('directive', '')]
        assert len(parts) == 9, line
        gff_parts = [(None if p == '.' else p) for p in parts]
        gff_info = dict()

        # collect all of the base qualifiers for this item
        quals, is_gff2 = _split_keyvals(gff_parts[8])

        gff_info["is_gff2"] = is_gff2

        if gff_parts[1]:
            quals["source"].append(gff_parts[1])
        gff_info['quals'] = dict(quals)

        # if we are describing a location, then we are a feature
        if gff_parts[3] and gff_parts[4]:
            gff_info['type'] = gff_parts[2]
            gff_info['id'] = quals.get('ID', [''])[0]

            if is_gff2:
                gff_info = _nest_gff2_features(gff_info)
            # features that have parents need to link so we can pick up
            # the relationship
            if 'Parent' in gff_info['quals']:
                final_key = 'child'
            elif gff_info['id']:
                final_key = 'parent'
            # handle flat features
            else:
                final_key = 'feature'
        # otherwise, associate these annotations with the full record
        else:
            final_key = 'annotation'
        return [(final_key, gff_info)]

def parent_child_id_map(gff_handle):
    """
    Provide a mapping of parent-to-child relationships in the file.
    Gives a dictionary of parent-child relationships:

    keys -- tuple of (source, type) for each parent
    values -- list of (source, type) tuples for the children of that parent
    """
    # collect all of the parent and child types mapped to IDs
    parent_sts = dict()
    child_sts = collections.defaultdict(list)
    for line in gff_handle:
        line_type, line_info = _gff_line_map(line)[0]
        if line_type == 'parent' or (line_type == 'child' and line_info['id']):
            parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type'])
        if line_type == 'child':
            for parent_id in line_info['quals']['Parent']:
                child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type']))
    gff_handle.close()
    # generate a dictionary of the unique final type relationships
    pc_map = collections.defaultdict(list)
    for parent_id, parent_type in parent_sts.items():
        for child_type in child_sts[parent_id]:
            pc_map[parent_type].append(child_type)
    pc_final_map = dict()
    for ptype, ctypes in pc_map.items():
        unique_ctypes = list(set(ctypes))
        unique_ctypes.sort()
        pc_final_map[ptype] = unique_ctypes
    # in some cases the GFF file represents only a single feature type
    if not pc_final_map:
        for fid, stypes in parent_sts.items():
            pc_final_map[stypes] = dict()
    # generate a report on feature id mapping in the file
    print '+---------------------+---------------------------------+'
    print '| Parent feature type | Associated child feature type(s)|'
    print '+---------------------+---------------------------------+'
    for key, value in pc_final_map.items():
        print key[0], key[1]
        for child_to in value:
            print '\t\t\t|-', child_to[0], child_to[1]
    print '+---------------------+---------------------------------+'


if __name__ == '__main__':

    try:
        gff_file = sys.argv[1]
    except IndexError:
        print __doc__
        sys.exit(-1)

    gff_handle = open_file(gff_file)
    parent_child_id_map(gff_handle)