Mercurial > repos > vipints > fml_gff3togtf
comparison gff_fmap.py @ 5:6e589f267c14
Uploaded
author | devteam |
---|---|
date | Tue, 04 Nov 2014 12:15:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:619e0fcd9126 | 5:6e589f267c14 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 GFF feature mapping program, to find the relation between different features described in a given GFF file. | |
4 | |
5 Usage: | |
6 python gff_fmap.py in.gff > out.txt | |
7 | |
8 Courtesy: Brad Chapman | |
9 Few functions are inherited from bcbio-GFFParser program. | |
10 """ | |
11 | |
12 import re | |
13 import sys | |
14 import urllib | |
15 import collections | |
16 from helper import open_file | |
17 | |
18 def _gff_line_map(line): | |
19 """Parses a line of GFF into a dictionary. | |
20 Given an input line from a GFF file, this: | |
21 - breaks it into component elements | |
22 - determines the type of attribute (flat, parent, child or annotation) | |
23 - generates a dictionary of GFF info | |
24 """ | |
25 gff3_kw_pat = re.compile("\w+=") | |
26 def _split_keyvals(keyval_str): | |
27 """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. | |
28 GFF3 has key value pairs like: | |
29 count=9;gene=amx-2;sequence=SAGE:aacggagccg | |
30 GFF2 and GTF have: | |
31 Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" | |
32 name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 | |
33 """ | |
34 quals = collections.defaultdict(list) | |
35 if keyval_str is None: | |
36 return quals | |
37 # ensembl GTF has a stray semi-colon at the end | |
38 if keyval_str[-1] == ';': | |
39 keyval_str = keyval_str[:-1] | |
40 # GFF2/GTF has a semi-colon with at least one space after it. | |
41 # It can have spaces on both sides; wormbase does this. | |
42 # GFF3 works with no spaces. | |
43 # Split at the first one we can recognize as working | |
44 parts = keyval_str.split(" ; ") | |
45 if len(parts) == 1: | |
46 parts = keyval_str.split("; ") | |
47 if len(parts) == 1: | |
48 parts = keyval_str.split(";") | |
49 # check if we have GFF3 style key-vals (with =) | |
50 is_gff2 = True | |
51 if gff3_kw_pat.match(parts[0]): | |
52 is_gff2 = False | |
53 key_vals = [p.split('=') for p in parts] | |
54 # otherwise, we are separated by a space with a key as the first item | |
55 else: | |
56 pieces = [] | |
57 for p in parts: | |
58 # fix misplaced semi-colons in keys in some GFF2 files | |
59 if p and p[0] == ';': | |
60 p = p[1:] | |
61 pieces.append(p.strip().split(" ")) | |
62 key_vals = [(p[0], " ".join(p[1:])) for p in pieces] | |
63 for key, val in key_vals: | |
64 # remove quotes in GFF2 files | |
65 if (len(val) > 0 and val[0] == '"' and val[-1] == '"'): | |
66 val = val[1:-1] | |
67 if val: | |
68 quals[key].extend(val.split(',')) | |
69 # if we don't have a value, make this a key=True/False style | |
70 # attribute | |
71 else: | |
72 quals[key].append('true') | |
73 for key, vals in quals.items(): | |
74 quals[key] = [urllib.unquote(v) for v in vals] | |
75 return quals, is_gff2 | |
76 | |
77 def _nest_gff2_features(gff_parts): | |
78 """Provide nesting of GFF2 transcript parts with transcript IDs. | |
79 | |
80 exons and coding sequences are mapped to a parent with a transcript_id | |
81 in GFF2. This is implemented differently at different genome centers | |
82 and this function attempts to resolve that and map things to the GFF3 | |
83 way of doing them. | |
84 """ | |
85 # map protein or transcript ids to a parent | |
86 for transcript_id in ["transcript_id", "transcriptId", "proteinId"]: | |
87 try: | |
88 gff_parts["quals"]["Parent"] = \ | |
89 gff_parts["quals"][transcript_id] | |
90 break | |
91 except KeyError: | |
92 pass | |
93 # case for WormBase GFF -- everything labelled as Transcript or CDS | |
94 for flat_name in ["Transcript", "CDS"]: | |
95 if gff_parts["quals"].has_key(flat_name): | |
96 # parent types | |
97 if gff_parts["type"] in [flat_name]: | |
98 if not gff_parts["id"]: | |
99 gff_parts["id"] = gff_parts["quals"][flat_name][0] | |
100 gff_parts["quals"]["ID"] = [gff_parts["id"]] | |
101 # children types | |
102 elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", | |
103 "coding_exon", "five_prime_UTR", "CDS", "stop_codon", | |
104 "start_codon"]: | |
105 gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name] | |
106 break | |
107 | |
108 return gff_parts | |
109 | |
110 line = line.strip() | |
111 if line == '':return [('directive', line)] # sometimes the blank lines will be there | |
112 if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header | |
113 if line[0] == "#": | |
114 return [('directive', line[2:])] | |
115 elif line: | |
116 parts = line.split('\t') | |
117 if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together | |
118 assert len(parts) == 9, line | |
119 gff_parts = [(None if p == '.' else p) for p in parts] | |
120 gff_info = dict() | |
121 | |
122 # collect all of the base qualifiers for this item | |
123 quals, is_gff2 = _split_keyvals(gff_parts[8]) | |
124 | |
125 gff_info["is_gff2"] = is_gff2 | |
126 | |
127 if gff_parts[1]:quals["source"].append(gff_parts[1]) | |
128 gff_info['quals'] = dict(quals) | |
129 | |
130 # if we are describing a location, then we are a feature | |
131 if gff_parts[3] and gff_parts[4]: | |
132 gff_info['type'] = gff_parts[2] | |
133 gff_info['id'] = quals.get('ID', [''])[0] | |
134 | |
135 if is_gff2:gff_info = _nest_gff2_features(gff_info) | |
136 # features that have parents need to link so we can pick up | |
137 # the relationship | |
138 if gff_info['quals'].has_key('Parent'): | |
139 final_key = 'child' | |
140 elif gff_info['id']: | |
141 final_key = 'parent' | |
142 # Handle flat features | |
143 else: | |
144 final_key = 'feature' | |
145 # otherwise, associate these annotations with the full record | |
146 else: | |
147 final_key = 'annotation' | |
148 return [(final_key, gff_info)] | |
149 | |
150 def parent_child_id_map(gff_handle): | |
151 """ | |
152 Provide a mapping of parent to child relationships in the file. | |
153 Gives a dictionary of parent child relationships: | |
154 | |
155 keys -- tuple of (source, type) for each parent | |
156 values -- tuple of (source, type) as children of that parent | |
157 """ | |
158 # collect all of the parent and child types mapped to IDs | |
159 parent_sts = dict() | |
160 child_sts = collections.defaultdict(list) | |
161 for line in gff_handle: | |
162 line_type, line_info = _gff_line_map(line)[0] | |
163 if (line_type == 'parent' or (line_type == 'child' and line_info['id'])): | |
164 parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type']) | |
165 if line_type == 'child': | |
166 for parent_id in line_info['quals']['Parent']: | |
167 child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type'])) | |
168 gff_handle.close() | |
169 # generate a dictionary of the unique final type relationships | |
170 pc_map = collections.defaultdict(list) | |
171 for parent_id, parent_type in parent_sts.items(): | |
172 for child_type in child_sts[parent_id]: | |
173 pc_map[parent_type].append(child_type) | |
174 pc_final_map = dict() | |
175 for ptype, ctypes in pc_map.items(): | |
176 unique_ctypes = list(set(ctypes)) | |
177 unique_ctypes.sort() | |
178 pc_final_map[ptype] = unique_ctypes | |
179 # some cases the GFF file represents a single feature type | |
180 if not pc_final_map: | |
181 for fid, stypes in parent_sts.items(): | |
182 pc_final_map[stypes] = dict() | |
183 # generate a report on feature id mapping in the file | |
184 print '+---------------------+---------------------------------+' | |
185 print '| Parent feature type | Associated child feature type(s)|' | |
186 print '+---------------------+---------------------------------+' | |
187 for key, value in pc_final_map.items(): | |
188 print key[0], key[1] | |
189 for child_to in value: | |
190 print '\t\t\t|-',child_to[0], child_to[1] | |
191 print '+---------------------+---------------------------------+' | |
192 | |
193 | |
194 if __name__=='__main__': | |
195 | |
196 try: | |
197 gff_file = sys.argv[1] | |
198 except: | |
199 print __doc__ | |
200 sys.exit(-1) | |
201 | |
202 gff_handle = open_file(gff_file) | |
203 parent_child_id_map(gff_handle) |