Mercurial > repos > vipints > fml_mergeloci
comparison fml_gff_groomer/scripts/gff_id_mapper.py @ 0:79726c328621 default tip
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | vipints |
---|---|
date | Tue, 07 Jun 2011 17:29:24 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:79726c328621 |
---|---|
1 #!/usr/bin/env python | |
2 # | |
3 # This program is free software; you can redistribute it and/or modify | |
4 # it under the terms of the GNU General Public License as published by | |
5 # the Free Software Foundation; either version 3 of the License, or | |
6 # (at your option) any later version. | |
7 # | |
8 # Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society | |
9 # Copyright (C) 2010 Max Planck Society | |
10 # | |
11 | |
12 # Description : Provides feature to sub feature identifier mapping in a given GFF3 file. | |
13 | |
14 import re, sys | |
15 import collections | |
16 import urllib | |
17 import time | |
18 | |
19 def _gff_line_map(line): | |
20 """Parses a line of GFF into a dictionary. | |
21 Given an input line from a GFF file, this: | |
22 - breaks it into component elements | |
23 - determines the type of attribute (flat, parent, child or annotation) | |
24 - generates a dictionary of GFF info | |
25 """ | |
26 gff3_kw_pat = re.compile("\w+=") | |
27 def _split_keyvals(keyval_str): | |
28 """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. | |
29 | |
30 GFF3 has key value pairs like: | |
31 count=9;gene=amx-2;sequence=SAGE:aacggagccg | |
32 GFF2 and GTF have: | |
33 Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" | |
34 name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 | |
35 """ | |
36 quals = collections.defaultdict(list) | |
37 if keyval_str is None: | |
38 return quals | |
39 # ensembl GTF has a stray semi-colon at the end | |
40 if keyval_str[-1] == ';': | |
41 keyval_str = keyval_str[:-1] | |
42 # GFF2/GTF has a semi-colon with at least one space after it. | |
43 # It can have spaces on both sides; wormbase does this. | |
44 # GFF3 works with no spaces. | |
45 # Split at the first one we can recognize as working | |
46 parts = keyval_str.split(" ; ") | |
47 if len(parts) == 1: | |
48 parts = keyval_str.split("; ") | |
49 if len(parts) == 1: | |
50 parts = keyval_str.split(";") | |
51 # check if we have GFF3 style key-vals (with =) | |
52 is_gff2 = True | |
53 if gff3_kw_pat.match(parts[0]): | |
54 is_gff2 = False | |
55 key_vals = [p.split('=') for p in parts] | |
56 # otherwise, we are separated by a space with a key as the first item | |
57 else: | |
58 pieces = [] | |
59 for p in parts: | |
60 # fix misplaced semi-colons in keys in some GFF2 files | |
61 if p and p[0] == ';': | |
62 p = p[1:] | |
63 pieces.append(p.strip().split(" ")) | |
64 key_vals = [(p[0], " ".join(p[1:])) for p in pieces] | |
65 for key, val in key_vals: | |
66 # remove quotes in GFF2 files | |
67 if (len(val) > 0 and val[0] == '"' and val[-1] == '"'): | |
68 val = val[1:-1] | |
69 if val: | |
70 quals[key].extend(val.split(',')) | |
71 # if we don't have a value, make this a key=True/False style | |
72 # attribute | |
73 else: | |
74 quals[key].append('true') | |
75 for key, vals in quals.items(): | |
76 quals[key] = [urllib.unquote(v) for v in vals] | |
77 return quals, is_gff2 | |
78 | |
79 def _nest_gff2_features(gff_parts): | |
80 """Provide nesting of GFF2 transcript parts with transcript IDs. | |
81 | |
82 exons and coding sequences are mapped to a parent with a transcript_id | |
83 in GFF2. This is implemented differently at different genome centers | |
84 and this function attempts to resolve that and map things to the GFF3 | |
85 way of doing them. | |
86 """ | |
87 # map protein or transcript ids to a parent | |
88 for transcript_id in ["transcript_id", "transcriptId", "proteinId"]: | |
89 try: | |
90 gff_parts["quals"]["Parent"] = \ | |
91 gff_parts["quals"][transcript_id] | |
92 break | |
93 except KeyError: | |
94 pass | |
95 # case for WormBase GFF -- everything labelled as Transcript or CDS | |
96 for flat_name in ["Transcript", "CDS"]: | |
97 if gff_parts["quals"].has_key(flat_name): | |
98 # parent types | |
99 if gff_parts["type"] in [flat_name]: | |
100 if not gff_parts["id"]: | |
101 gff_parts["id"] = gff_parts["quals"][flat_name][0] | |
102 gff_parts["quals"]["ID"] = [gff_parts["id"]] | |
103 # children types | |
104 elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", | |
105 "coding_exon", "five_prime_UTR", "CDS", "stop_codon", | |
106 "start_codon"]: | |
107 gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name] | |
108 break | |
109 | |
110 return gff_parts | |
111 | |
112 line = line.strip() | |
113 if line == '':return [('directive', line)] # sometimes the blank lines will be there | |
114 if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header | |
115 if line[0] == "#": | |
116 return [('directive', line[2:])] | |
117 elif line: | |
118 parts = line.split('\t') | |
119 if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together | |
120 assert len(parts) == 9, line | |
121 gff_parts = [(None if p == '.' else p) for p in parts] | |
122 gff_info = dict() | |
123 | |
124 # collect all of the base qualifiers for this item | |
125 quals, is_gff2 = _split_keyvals(gff_parts[8]) | |
126 | |
127 gff_info["is_gff2"] = is_gff2 | |
128 | |
129 if gff_parts[1]:quals["source"].append(gff_parts[1]) | |
130 gff_info['quals'] = dict(quals) | |
131 | |
132 # if we are describing a location, then we are a feature | |
133 if gff_parts[3] and gff_parts[4]: | |
134 gff_info['type'] = gff_parts[2] | |
135 gff_info['id'] = quals.get('ID', [''])[0] | |
136 | |
137 if is_gff2:gff_info = _nest_gff2_features(gff_info) | |
138 # features that have parents need to link so we can pick up | |
139 # the relationship | |
140 if gff_info['quals'].has_key('Parent'): | |
141 final_key = 'child' | |
142 elif gff_info['id']: | |
143 final_key = 'parent' | |
144 # Handle flat features | |
145 else: | |
146 final_key = 'feature' | |
147 # otherwise, associate these annotations with the full record | |
148 else: | |
149 final_key = 'annotation' | |
150 return [(final_key, gff_info)] | |
151 | |
def parent_child_id_map(gff_handle):
    """Print a report of parent to child feature relationships in the file.

    Builds a dictionary of parent child relationships:

    keys -- tuple of (source, type) for each parent
    values -- tuple of (source, type) as children of that parent

    then prints the first/second/third level features to stdout.
    Closes (and, for flat files, reopens by name) the given handle;
    returns None.
    """

    # collect all of the parent and child types mapped to IDs
    parent_sts = dict()
    child_sts = collections.defaultdict(list)

    for line in gff_handle:
        line_type, line_info = _gff_line_map(line)[0]
        # a 'child' carrying its own ID can itself be a parent
        if line_type == 'parent' or (line_type == 'child' and line_info['id']):
            parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type'])
        if line_type == 'child':
            for parent_id in line_info['quals']['Parent']:
                child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type']))
    gff_handle.close()

    # generate a dictionary of the unique final type relationships
    pc_map = collections.defaultdict(list)
    for parent_id, parent_type in parent_sts.items():
        for child_type in child_sts[parent_id]:
            pc_map[parent_type].append(child_type)
    pc_final_map = dict()
    for ptype, ctypes in pc_map.items():
        unique_ctypes = list(set(ctypes))
        unique_ctypes.sort()
        pc_final_map[ptype] = unique_ctypes

    # Check for Parent Child relations
    level1, level2, level3, sec_level_mis = {}, {}, {}, {}
    for etype, fchild in pc_final_map.items():
        level2_flag = 0
        # a type appearing as somebody's child is a second level feature
        for kp, vp in pc_final_map.items():
            if etype in vp:
                level2_flag = 1
                level2[etype] = 1
        if level2_flag == 0:
            # first level features
            level1[etype] = 1
            # check that all level1 children were also defined as level2
            # keys; figure out the missing level2 objects
            for eachfch in fchild:
                if eachfch not in pc_final_map:
                    sec_level_mis.setdefault(etype, []).append(eachfch)
        if level2_flag == 1:
            # children of second level features are third level features;
            # keyed on str(list) so the list is hashable
            level3[str(fchild)] = 1

    # display the result
    if level1 == level2 == level3 == {} and sec_level_mis == {}:
        print('')
        print('ONLY FIRST level feature(s):')
        # no nesting found: re-scan the file for distinct (source, type)
        source_type = dict()
        # 'rU' mode was removed in Python 3.11; plain 'r' suffices since
        # each line is explicitly stripped of newline characters below
        gff_handle = open(gff_handle.name, 'r')
        for line in gff_handle:
            line = line.strip('\n\r')
            # BUGFIX: also skip blank lines, which previously raised an
            # IndexError on line[0]
            if not line or line[0] == '#':
                continue
            parts = line.split('\t')
            # tolerate a trailing tab at the end of the line
            if parts[-1] == '':
                parts.pop()
            assert len(parts) == 9, line
            source_type[(parts[1], parts[2])] = 1
        gff_handle.close()
        for ele in source_type:
            print('\t' + str(ele))
        print('')
    else:
        print('')
        print('===Report on different level features from GFF file===')
        print('')
        print('FIRST level feature(s):')
        for ele in level1:
            print('\t' + str(ele))
        print('')
        print('SECOND level feature(s):')
        for ele in level2:
            print('\t' + str(ele))
        print('')
        print('THIRD level feature(s):')
        for ele in level3:
            # keys are str(list); strip the surrounding '[' and ']'
            print('\t' + str(ele[1:-1]))
        print('')
        # wrong way mapped feature mapping description
        for wf, wfv in sec_level_mis.items():
            # wf is a (source, type) tuple; explain only for genes
            if wf[1] == 'gene':
                print('GFF Parsing modules from publicly available packages like Bio-python, Bio-perl etc. are heavily dependent on feature identifier mapping.')
                print('Here few features seems to be wrongly mapped to its child, which inturn cause problems while extracting the annotation based on feature identifier.\n')
            # NOTE(review): original indentation was lost in this dump;
            # the error list is assumed to apply to every mis-mapped
            # parent, not only genes -- confirm against upstream source
            for ehv in wfv:
                if ehv[1] in ('exon', 'intron', 'CDS', 'three_prime_UTR', 'five_prime_UTR'):
                    print('Error in ID mapping: Level1 feature ' + str(wf) + ' maps to Level3 feature ' + str(ehv))
235 | |
if __name__ == '__main__':

    # report the start time banner
    stime = time.asctime(time.localtime(time.time()))
    print('-------------------------------------------------------')
    print('GFFExamine started on ' + stime)
    print('-------------------------------------------------------')

    try:
        # 'rU' mode was removed in Python 3.11; the parser strips line
        # endings itself, so plain 'r' is equivalent here
        gff_handle = open(sys.argv[1], 'r')
    # narrowed from a bare except: IndexError covers a missing command
    # line argument, IOError an unreadable/missing file
    except (IndexError, IOError):
        sys.stderr.write("Can't open the GFF3 file, Cannot continue...\n")
        sys.stderr.write("USAGE: gff_id_mapper.py <gff3 file> \n")
        sys.exit(-1)
    parent_child_id_map(gff_handle)

    # report the finish time banner
    stime = time.asctime(time.localtime(time.time()))
    print('-------------------------------------------------------')
    print('GFFExamine finished at ' + stime)
    print('-------------------------------------------------------')