Mercurial > repos > vipints > rdiff
comparison rDiff/tools/ParseGFF.py @ 0:0f80a5141704
version 0.3 uploaded
author | vipints |
---|---|
date | Thu, 14 Feb 2013 23:38:36 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0f80a5141704 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Extract genome annotation from a GFF3 (a tab delimited | |
4 format for storing sequence features and annotations: | |
5 http://www.sequenceontology.org/gff3.shtml) file. | |
6 | |
7 Usage: ParseGFF.py in.gff3 out.mat | |
8 | |
9 Requirements: | |
10 Scipy :- http://scipy.org/ | |
11 Numpy :- http://numpy.org/ | |
12 | |
13 Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany | |
14 2012- Memorial Sloan-Kettering Cancer Center, New York City, USA | |
15 """ | |
16 | |
17 import re, sys | |
18 import scipy.io as sio | |
19 import numpy as np | |
20 | |
21 def addCDSphase(strand, cds): | |
22 """Add CDS phase to the CDS exons | |
23 """ | |
24 cds_region, cds_flag = [], 0 | |
25 if strand == '+': | |
26 for cdspos in cds: | |
27 if cds_flag == 0: | |
28 cdspos = (cdspos[0], cdspos[1], 0) | |
29 diff = (cdspos[1]-(cdspos[0]-1))%3 | |
30 else: | |
31 xy = 0 | |
32 if diff == 0: | |
33 cdspos = (cdspos[0], cdspos[1], 0) | |
34 elif diff == 1: | |
35 cdspos = (cdspos[0], cdspos[1], 2) | |
36 xy = 2 | |
37 elif diff == 2: | |
38 cdspos = (cdspos[0], cdspos[1], 1) | |
39 xy = 1 | |
40 diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3 | |
41 cds_region.append(cdspos) | |
42 cds_flag = 1 | |
43 elif strand == '-': | |
44 cds.reverse() | |
45 for cdspos in cds: | |
46 if cds_flag == 0: | |
47 cdspos = (cdspos[0], cdspos[1], 0) | |
48 diff = (cdspos[1]-(cdspos[0]-1))%3 | |
49 else: | |
50 xy = 0 | |
51 if diff == 0: | |
52 cdspos = (cdspos[0], cdspos[1], 0) | |
53 elif diff == 1: | |
54 cdspos = (cdspos[0], cdspos[1], 2) | |
55 xy = 2 | |
56 elif diff == 2: | |
57 cdspos = (cdspos[0], cdspos[1], 1) | |
58 xy = 1 | |
59 diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3 | |
60 cds_region.append(cdspos) | |
61 cds_flag = 1 | |
62 cds_region.reverse() | |
63 return cds_region | |
64 | |
65 def createExon(strand_p, five_p_utr, cds_cod, three_p_utr): | |
66 """Create exon cordinates from UTR's and CDS region | |
67 """ | |
68 exon_pos = [] | |
69 if strand_p == '+': | |
70 utr5_start, utr5_end = 0, 0 | |
71 if five_p_utr != []: | |
72 utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1] | |
73 cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1] | |
74 jun_exon = [] | |
75 if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1: | |
76 jun_exon = [utr5_start, cds_5end] | |
77 if len(cds_cod) == 1: | |
78 five_prime_flag = 0 | |
79 if jun_exon != []: | |
80 five_p_utr = five_p_utr[:-1] | |
81 five_prime_flag = 1 | |
82 for utr5 in five_p_utr: | |
83 exon_pos.append(utr5) | |
84 jun_exon = [] | |
85 utr3_start, utr3_end = 0, 0 | |
86 if three_p_utr != []: | |
87 utr3_start = three_p_utr[0][0] | |
88 utr3_end = three_p_utr[0][1] | |
89 if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1: | |
90 jun_exon = [cds_5start, utr3_end] | |
91 three_prime_flag = 0 | |
92 if jun_exon != []: | |
93 cds_cod = cds_cod[:-1] | |
94 three_p_utr = three_p_utr[1:] | |
95 three_prime_flag = 1 | |
96 if five_prime_flag == 1 and three_prime_flag == 1: | |
97 exon_pos.append([utr5_start, utr3_end]) | |
98 if five_prime_flag == 1 and three_prime_flag == 0: | |
99 exon_pos.append([utr5_start, cds_5end]) | |
100 cds_cod = cds_cod[:-1] | |
101 if five_prime_flag == 0 and three_prime_flag == 1: | |
102 exon_pos.append([cds_5start, utr3_end]) | |
103 for cds in cds_cod: | |
104 exon_pos.append(cds) | |
105 for utr3 in three_p_utr: | |
106 exon_pos.append(utr3) | |
107 else: | |
108 if jun_exon != []: | |
109 five_p_utr = five_p_utr[:-1] | |
110 cds_cod = cds_cod[1:] | |
111 for utr5 in five_p_utr: | |
112 exon_pos.append(utr5) | |
113 exon_pos.append(jun_exon) if jun_exon != [] else '' | |
114 jun_exon = [] | |
115 utr3_start, utr3_end = 0, 0 | |
116 if three_p_utr != []: | |
117 utr3_start = three_p_utr[0][0] | |
118 utr3_end = three_p_utr[0][1] | |
119 cds_3start = cds_cod[-1][0] | |
120 cds_3end = cds_cod[-1][1] | |
121 if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1: | |
122 jun_exon = [cds_3start, utr3_end] | |
123 if jun_exon != []: | |
124 cds_cod = cds_cod[:-1] | |
125 three_p_utr = three_p_utr[1:] | |
126 for cds in cds_cod: | |
127 exon_pos.append(cds) | |
128 exon_pos.append(jun_exon) if jun_exon != [] else '' | |
129 for utr3 in three_p_utr: | |
130 exon_pos.append(utr3) | |
131 elif strand_p == '-': | |
132 utr3_start, utr3_end = 0, 0 | |
133 if three_p_utr != []: | |
134 utr3_start = three_p_utr[-1][0] | |
135 utr3_end = three_p_utr[-1][1] | |
136 cds_3start = cds_cod[0][0] | |
137 cds_3end = cds_cod[0][1] | |
138 jun_exon = [] | |
139 if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1: | |
140 jun_exon = [utr3_start, cds_3end] | |
141 if len(cds_cod) == 1: | |
142 three_prime_flag = 0 | |
143 if jun_exon != []: | |
144 three_p_utr = three_p_utr[:-1] | |
145 three_prime_flag = 1 | |
146 for utr3 in three_p_utr: | |
147 exon_pos.append(utr3) | |
148 jun_exon = [] | |
149 (utr5_start, utr5_end) = (0, 0) | |
150 if five_p_utr != []: | |
151 utr5_start = five_p_utr[0][0] | |
152 utr5_end = five_p_utr[0][1] | |
153 if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1: | |
154 jun_exon = [cds_3start, utr5_end] | |
155 five_prime_flag = 0 | |
156 if jun_exon != []: | |
157 cds_cod = cds_cod[:-1] | |
158 five_p_utr = five_p_utr[1:] | |
159 five_prime_flag = 1 | |
160 if three_prime_flag == 1 and five_prime_flag == 1: | |
161 exon_pos.append([utr3_start, utr5_end]) | |
162 if three_prime_flag == 1 and five_prime_flag == 0: | |
163 exon_pos.append([utr3_start, cds_3end]) | |
164 cds_cod = cds_cod[:-1] | |
165 if three_prime_flag == 0 and five_prime_flag == 1: | |
166 exon_pos.append([cds_3start, utr5_end]) | |
167 for cds in cds_cod: | |
168 exon_pos.append(cds) | |
169 for utr5 in five_p_utr: | |
170 exon_pos.append(utr5) | |
171 else: | |
172 if jun_exon != []: | |
173 three_p_utr = three_p_utr[:-1] | |
174 cds_cod = cds_cod[1:] | |
175 for utr3 in three_p_utr: | |
176 exon_pos.append(utr3) | |
177 if jun_exon != []: | |
178 exon_pos.append(jun_exon) | |
179 jun_exon = [] | |
180 (utr5_start, utr5_end) = (0, 0) | |
181 if five_p_utr != []: | |
182 utr5_start = five_p_utr[0][0] | |
183 utr5_end = five_p_utr[0][1] | |
184 cds_5start = cds_cod[-1][0] | |
185 cds_5end = cds_cod[-1][1] | |
186 if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1: | |
187 jun_exon = [cds_5start, utr5_end] | |
188 if jun_exon != []: | |
189 cds_cod = cds_cod[:-1] | |
190 five_p_utr = five_p_utr[1:] | |
191 for cds in cds_cod: | |
192 exon_pos.append(cds) | |
193 if jun_exon != []: | |
194 exon_pos.append(jun_exon) | |
195 for utr5 in five_p_utr: | |
196 exon_pos.append(utr5) | |
197 return exon_pos | |
198 | |
199 def init_gene(): | |
200 """Initializing the gene structure | |
201 """ | |
202 gene_details=dict( | |
203 id = '', | |
204 anno_id = [], | |
205 confgenes_id = [], | |
206 name = '', | |
207 source = '', | |
208 gene_info = {}, | |
209 alias = '', | |
210 name2 = [], | |
211 strand = '', | |
212 chr = '', | |
213 chr_num = [], | |
214 paralogs = [], | |
215 start = '', | |
216 stop = '', | |
217 transcripts = [], | |
218 transcript_info = [], | |
219 transcript_status = [], | |
220 transcript_valid = [], | |
221 exons = [], | |
222 exons_confirmed = [], | |
223 cds_exons = [], | |
224 utr5_exons = [], | |
225 utr3_exons = [], | |
226 tis = [], | |
227 tis_conf = [], | |
228 tis_info = [], | |
229 cdsStop = [], | |
230 cdsStop_conf = [], | |
231 cdsStop_info = [], | |
232 tss = [], | |
233 tss_info = [], | |
234 tss_conf = [], | |
235 cleave = [], | |
236 cleave_info = [], | |
237 cleave_conf = [], | |
238 polya = [], | |
239 polya_info = [], | |
240 polya_conf = [], | |
241 is_alt = [], | |
242 is_alt_spliced = 0, | |
243 is_valid = [], | |
244 transcript_complete = [], | |
245 is_complete = [], | |
246 is_correctly_gff3_referenced = '', | |
247 splicegraph = [] | |
248 ) | |
249 return gene_details | |
250 | |
251 def FeatureValueFormat(singlegene): | |
252 """Make feature value compactable to write in a .mat format | |
253 """ | |
254 comp_exon = np.zeros((len(singlegene['exons']),), dtype=np.object) | |
255 for i in range(len(singlegene['exons'])): | |
256 comp_exon[i]= np.array(singlegene['exons'][i]) | |
257 singlegene['exons'] = comp_exon | |
258 comp_cds = np.zeros((len(singlegene['cds_exons']),), dtype=np.object) | |
259 for i in range(len(singlegene['cds_exons'])): | |
260 comp_cds[i]= np.array(singlegene['cds_exons'][i]) | |
261 singlegene['cds_exons'] = comp_cds | |
262 comp_utr3 = np.zeros((len(singlegene['utr3_exons']),), dtype=np.object) | |
263 for i in range(len(singlegene['utr3_exons'])): | |
264 comp_utr3[i]= np.array(singlegene['utr3_exons'][i]) | |
265 singlegene['utr3_exons'] = comp_utr3 | |
266 comp_utr5 = np.zeros((len(singlegene['utr5_exons']),), dtype=np.object) | |
267 for i in range(len(singlegene['utr5_exons'])): | |
268 comp_utr5[i]= np.array(singlegene['utr5_exons'][i]) | |
269 singlegene['utr5_exons'] = comp_utr5 | |
270 comp_transcripts = np.zeros((len(singlegene['transcripts']),), dtype=np.object) | |
271 for i in range(len(singlegene['transcripts'])): | |
272 comp_transcripts[i]= np.array(singlegene['transcripts'][i]) | |
273 singlegene['transcripts'] = comp_transcripts | |
274 comp_tss = np.zeros((len(singlegene['tss']),), dtype=np.object) | |
275 for i in range(len(singlegene['tss'])): | |
276 comp_tss[i]= np.array(singlegene['tss'][i]) | |
277 singlegene['tss'] = comp_tss | |
278 comp_tis = np.zeros((len(singlegene['tis']),), dtype=np.object) | |
279 for i in range(len(singlegene['tis'])): | |
280 comp_tis[i]= np.array(singlegene['tis'][i]) | |
281 singlegene['tis'] = comp_tis | |
282 comp_cleave = np.zeros((len(singlegene['cleave']),), dtype=np.object) | |
283 for i in range(len(singlegene['cleave'])): | |
284 comp_cleave[i]= np.array(singlegene['cleave'][i]) | |
285 singlegene['cleave'] = comp_cleave | |
286 comp_cdsStop = np.zeros((len(singlegene['cdsStop']),), dtype=np.object) | |
287 for i in range(len(singlegene['cdsStop'])): | |
288 comp_cdsStop[i]= np.array(singlegene['cdsStop'][i]) | |
289 singlegene['cdsStop'] = comp_cdsStop | |
290 | |
291 return singlegene | |
292 | |
293 def CreateGeneModels(genes_cmpt, transcripts_cmpt, exons_cmpt, utr3_cmpt, utr5_cmpt, cds_cmpt): | |
294 """Creating Coding/Non-coding gene models from parsed GFF objects. | |
295 """ | |
296 gene_counter, gene_models = 1, [] | |
297 for gene_entry in genes_cmpt: ## Figure out the genes and transcripts associated feature | |
298 if gene_entry in transcripts_cmpt: | |
299 gene=init_gene() | |
300 gene['id']=gene_counter | |
301 gene['name']=gene_entry[1] | |
302 gene['chr']=genes_cmpt[gene_entry]['chr'] | |
303 gene['source']=genes_cmpt[gene_entry]['source'] | |
304 gene['start']=genes_cmpt[gene_entry]['start'] | |
305 gene['stop']=genes_cmpt[gene_entry]['stop'] | |
306 gene['strand']=genes_cmpt[gene_entry]['strand'] | |
307 if not gene['strand'] in ['+', '-']: | |
308 gene['strand']='.' # Strand info not known replaced with a dot symbol instead of None, ?, . etc. | |
309 if len(transcripts_cmpt[gene_entry])>1: | |
310 gene['is_alt_spliced'] = 1 | |
311 gene['is_alt'] = 1 | |
312 gtype=[] | |
313 for tids in transcripts_cmpt[gene_entry]: ## transcript section related tags | |
314 gene['transcripts'].append(tids['ID']) | |
315 gtype.append(tids['type']) | |
316 exon_cod, utr5_cod, utr3_cod, cds_cod = [], [], [], [] | |
317 if (gene['chr'], tids['ID']) in exons_cmpt: | |
318 exon_cod = [[feat_exon['start'], feat_exon['stop']] for feat_exon in exons_cmpt[(gene['chr'], tids['ID'])]] | |
319 if (gene['chr'], tids['ID']) in utr5_cmpt: | |
320 utr5_cod = [[feat_utr5['start'], feat_utr5['stop']] for feat_utr5 in utr5_cmpt[(gene['chr'], tids['ID'])]] | |
321 if (gene['chr'], tids['ID']) in utr3_cmpt: | |
322 utr3_cod = [[feat_utr3['start'], feat_utr3['stop']] for feat_utr3 in utr3_cmpt[(gene['chr'], tids['ID'])]] | |
323 if (gene['chr'], tids['ID']) in cds_cmpt: | |
324 cds_cod = [[feat_cds['start'], feat_cds['stop']] for feat_cds in cds_cmpt[(gene['chr'], tids['ID'])]] | |
325 if len(exon_cod) == 0: ## build exon coordinates from UTR3, UTR5 and CDS | |
326 if cds_cod != []: | |
327 exon_cod=createExon(gene['strand'], utr5_cod, cds_cod, utr3_cod) | |
328 | |
329 if gene['strand']=='-': ## general order to coordinates | |
330 if len(exon_cod) >1: | |
331 if exon_cod[0][0] > exon_cod[-1][0]: | |
332 exon_cod.reverse() | |
333 if len(cds_cod) >1: | |
334 if cds_cod[0][0] > cds_cod[-1][0]: | |
335 cds_cod.reverse() | |
336 if len(utr3_cod) >1: | |
337 if utr3_cod[0][0] > utr3_cod[-1][0]: | |
338 utr3_cod.reverse() | |
339 if len(utr5_cod) >1: | |
340 if utr5_cod[0][0] > utr5_cod[-1][0]: | |
341 utr5_cod.reverse() | |
342 | |
343 tis, cdsStop, tss, cleave = [], [], [], [] ## speacial sited in the gene region | |
344 if cds_cod != []: | |
345 if gene['strand'] == '+': | |
346 tis = [cds_cod[0][0]] | |
347 cdsStop = [cds_cod[-1][1]-3] | |
348 elif gene['strand'] == '-': | |
349 tis = [cds_cod[-1][1]] | |
350 cdsStop = [cds_cod[0][0]+3] | |
351 if utr5_cod != []: | |
352 if gene['strand'] == '+': | |
353 tss = [utr5_cod[0][0]] | |
354 elif gene['strand'] == '-': | |
355 tss = [utr5_cod[-1][1]] | |
356 if utr3_cod != []: | |
357 if gene['strand'] == '+': | |
358 cleave = [utr3_cod[-1][1]] | |
359 elif gene['strand'] == '-': | |
360 cleave = [utr3_cod[0][0]] | |
361 | |
362 cds_status, exon_status, utr_status = 0, 0, 0 ## status of the complete elements of the gene | |
363 if cds_cod != []: ## adding phase to the CDS region | |
364 cds_cod_phase = addCDSphase(gene['strand'], cds_cod) | |
365 cds_status = 1 | |
366 gene['cds_exons'].append(cds_cod_phase) | |
367 | |
368 if exon_cod != []: | |
369 exon_status = 1 | |
370 if utr5_cod != [] or utr3_cod != []: | |
371 utr_status = 1 | |
372 if cds_status != 0 and exon_status != 0 and utr_status != 0: | |
373 gene['transcript_status'].append(1) | |
374 else: | |
375 gene['transcript_status'].append(0) | |
376 | |
377 if exon_cod: ## final check point for a valid gene model | |
378 gene['exons'].append(exon_cod) | |
379 gene['utr3_exons'].append(utr3_cod) | |
380 gene['utr5_exons'].append(utr5_cod) | |
381 gene['tis'].append(tis) | |
382 gene['cdsStop'].append(cdsStop) | |
383 gene['tss'].append(tss) | |
384 gene['cleave'].append(cleave) | |
385 | |
386 gtype=list(set(gtype)) ## different types | |
387 gene['gene_info']=dict(ID=gene_entry[1], | |
388 Source=genes_cmpt[gene_entry]['source'], | |
389 Type=gtype) | |
390 gene=FeatureValueFormat(gene) ## get prepare for MAT writing | |
391 gene_counter+=1 | |
392 gene_models.append(gene) | |
393 return gene_models | |
394 | |
395 def GFFParse(gff_file): | |
396 """Parsing GFF file based on feature relationship. | |
397 """ | |
398 genes, utr5, exons=dict(), dict(), dict() | |
399 transcripts, utr3, cds=dict(), dict(), dict() | |
400 # TODO Include growing key words of different non-coding/coding transcripts | |
401 features=['mrna', 'transcript', 'ncRNA', 'mirna', 'pseudogenic_transcript', 'rrna', 'snorna', 'snrna', 'trna', 'scrna'] | |
402 gff_handle=open(gff_file, "rU") | |
403 for gff_line in gff_handle: | |
404 gff_line=gff_line.strip('\n\r').split('\t') | |
405 if re.match(r'#|>', gff_line[0]): # skip commented line or fasta identifier line | |
406 continue | |
407 if len(gff_line)==1: # skip fasta sequence/empty line if present | |
408 continue | |
409 assert len(gff_line)==9, '\t'.join(gff_line) # not found 9 tab-delimited fields in this line | |
410 if '' in gff_line: # skip this line if there any field with an empty value | |
411 print 'Skipping..', '\t'.join(gff_line) | |
412 continue | |
413 if gff_line[-1][-1]==';': # trim the last ';' character | |
414 gff_line[-1]=gff_line[-1].strip(';') | |
415 if gff_line[2].lower() in ['gene', 'pseudogene']: | |
416 gid, gene_info=None, dict() | |
417 gene_info['start']=int(gff_line[3]) | |
418 gene_info['stop']=int(gff_line[4]) | |
419 gene_info['chr']=gff_line[0] | |
420 gene_info['source']=gff_line[1] | |
421 gene_info['strand']=gff_line[6] | |
422 for attb in gff_line[-1].split(';'): | |
423 attb=attb.split('=') # gff attributes are separated by key=value pair | |
424 if attb[0]=='ID': | |
425 gid=attb[1] | |
426 break | |
427 genes[(gff_line[0], gid)]=gene_info # store gene information based on the chromosome and gene symbol. | |
428 elif gff_line[2].lower() in features: | |
429 gid, mrna_info=None, dict() | |
430 mrna_info['start']=int(gff_line[3]) | |
431 mrna_info['stop']=int(gff_line[4]) | |
432 mrna_info['chr']=gff_line[0] | |
433 mrna_info['strand']=gff_line[6] | |
434 mrna_info['type'] = gff_line[2] | |
435 for attb in gff_line[-1].split(';'): | |
436 attb=attb.split('=') | |
437 if attb[0]=='Parent': | |
438 gid=attb[1] | |
439 elif attb[0]=='ID': | |
440 mrna_info[attb[0]]=attb[1] | |
441 for fid in gid.split(','): # child may be mapped to multiple parents ex: Parent=AT01,AT01-1-Protein | |
442 if (gff_line[0], fid) in transcripts: | |
443 transcripts[(gff_line[0], fid)].append(mrna_info) | |
444 else: | |
445 transcripts[(gff_line[0], fid)]=[mrna_info] | |
446 elif gff_line[2].lower() in ['exon']: | |
447 tids, exon_info=None, dict() | |
448 exon_info['start']=int(gff_line[3]) | |
449 exon_info['stop']=int(gff_line[4]) | |
450 exon_info['chr']=gff_line[0] | |
451 exon_info['strand']=gff_line[6] | |
452 for attb in gff_line[-1].split(';'): | |
453 attb=attb.split('=') | |
454 if attb[0]=='Parent': | |
455 tids=attb[1] | |
456 break | |
457 for tid in tids.split(','): | |
458 if (gff_line[0], tid) in exons: | |
459 exons[(gff_line[0], tid)].append(exon_info) | |
460 else: | |
461 exons[(gff_line[0], tid)]=[exon_info] | |
462 elif gff_line[2].lower() in ['five_prime_utr']: | |
463 utr5_info, tids=dict(), None | |
464 utr5_info['start']=int(gff_line[3]) | |
465 utr5_info['stop']=int(gff_line[4]) | |
466 utr5_info['chr']=gff_line[0] | |
467 utr5_info['strand']=gff_line[6] | |
468 for attb in gff_line[-1].split(';'): | |
469 attb=attb.split('=') | |
470 if attb[0]=='Parent': | |
471 tids=attb[1] | |
472 break | |
473 for tid in tids.split(','): | |
474 if (gff_line[0], tid) in utr5: | |
475 utr5[(gff_line[0], tid)].append(utr5_info) | |
476 else: | |
477 utr5[(gff_line[0], tid)]=[utr5_info] | |
478 elif gff_line[2].lower() in ['cds']: | |
479 cds_info, tids=dict(), None | |
480 cds_info['start']=int(gff_line[3]) | |
481 cds_info['stop']=int(gff_line[4]) | |
482 cds_info['chr']=gff_line[0] | |
483 cds_info['strand']=gff_line[6] | |
484 for attb in gff_line[-1].split(';'): | |
485 attb=attb.split('=') | |
486 if attb[0]=='Parent': | |
487 tids=attb[1] | |
488 break | |
489 for tid in tids.split(','): | |
490 if (gff_line[0], tid) in cds: | |
491 cds[(gff_line[0], tid)].append(cds_info) | |
492 else: | |
493 cds[(gff_line[0], tid)]=[cds_info] | |
494 elif gff_line[2].lower() in ['three_prime_utr']: | |
495 utr3_info, tids=dict(), None | |
496 utr3_info['start']=int(gff_line[3]) | |
497 utr3_info['stop']=int(gff_line[4]) | |
498 utr3_info['chr']=gff_line[0] | |
499 utr3_info['strand']=gff_line[6] | |
500 for attb in gff_line[-1].split(';'): | |
501 attb=attb.split('=') | |
502 if attb[0]=='Parent': | |
503 tids=attb[1] | |
504 break | |
505 for tid in tids.split(','): | |
506 if (gff_line[0], tid) in utr3: | |
507 utr3[(gff_line[0], tid)].append(utr3_info) | |
508 else: | |
509 utr3[(gff_line[0], tid)]=[utr3_info] | |
510 gff_handle.close() | |
511 return genes, transcripts, exons, utr3, utr5, cds | |
512 | |
513 def __main__(): | |
514 """extract genome feature information | |
515 """ | |
516 try: | |
517 gff_file = sys.argv[1] | |
518 mat_file = sys.argv[2] | |
519 except: | |
520 print __doc__ | |
521 sys.exit(-1) | |
522 | |
523 genes, transcripts, exons, utr3, utr5, cds = GFFParse(gff_file) | |
524 gene_models = CreateGeneModels(genes, transcripts, exons, utr3, utr5, cds) | |
525 # TODO Write to matlab/octave struct instead of cell arrays. | |
526 sio.savemat(mat_file, | |
527 mdict=dict(genes=gene_models), | |
528 format='5', | |
529 oned_as='row') | |
530 | |
531 if __name__=='__main__': | |
532 __main__() |