rDiff/tools/GFFParser.py @ 2:233c30f91d66
Updated Python-based GFF parsing module which handles GTF/GFF/GFF3 file types
| author | vipints <vipin@cbio.mskcc.org> |
|---|---|
| date | Tue, 08 Oct 2013 07:15:44 -0400 |
| comparison | 0:0f80a5141704 → 2:233c30f91d66 |
#!/usr/bin/env python
"""
Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.

Requirements:
    Numpy :- http://numpy.org/
    Scipy :- http://scipy.org/

Copyright (C)

2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
2012-2013 Memorial Sloan-Kettering Cancer Center, New York City, USA.
"""

import re
import os
import sys
import urllib
import numpy as np
import scipy.io as sio
from collections import defaultdict
import helper as utils
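For orientation, a minimal sketch (not part of the original module) of the nine-column, tab-separated record layout the parser below expects; the sample line and its identifiers are made up:

```python
# A made-up GFF3 record; real files are tab-separated with nine columns:
# seqid  source  type  start  end  score  strand  phase  attributes
rec = "chr1\tdemo_src\tgene\t1000\t5000\t.\t+\t.\tID=gene_1;Name=DemoGene"
parts = rec.split('\t')
# Parse() below reads: parts[0] -> chromosome, parts[1] -> source, parts[2] -> type,
# parts[3]/parts[4] -> start/end, parts[6] -> strand, parts[7] -> phase, and
# parts[-1] -> the attribute column, which _attribute_tags() splits into key/value tags.
```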
def _attribute_tags(col9):
    """
    Split the key-value tags from the attribute column; takes column number 9 of a GTF/GFF file.
    """
    info = defaultdict(list)
    is_gff = False

    if not col9:
        return is_gff, info

    # trim the trailing semi-colon; UCSC files may also carry trailing white-space
    col9 = col9.rstrip(';| ')
    # attributes from the 9th column
    atbs = col9.split(" ; ")
    if len(atbs) == 1:
        atbs = col9.split("; ")
    if len(atbs) == 1:
        atbs = col9.split(";")
    # check for the GFF3 pattern, which has key=value pairs
    gff3_pat = re.compile(r"\w+=")
    # GTF attributes look like: gene_id uc002zkg.1;
    gtf_pat = re.compile(r"\s?\w+\s")

    key_vals = []

    if gff3_pat.match(atbs[0]): # gff3 pattern
        is_gff = True
        key_vals = [at.split('=') for at in atbs]
    elif gtf_pat.match(atbs[0]): # gtf pattern
        for at in atbs:
            key_vals.append(at.strip().split(" ", 1))
    else:
        # handle an attribute column that carries only a single value
        key_vals.append(['ID', atbs[0]])
    # get key, val items
    for item in key_vals:
        key, val = item
        # remove the double quotes around the feature identifier
        val = re.sub('"', '', val)
        # decode URL-escaped place holders back to plain text
        info[key].extend([urllib.unquote(v) for v in val.split(',') if v])

    return is_gff, info

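A quick sketch of what `_attribute_tags` returns for the two attribute styles it distinguishes, assuming the module (and its `helper` dependency) is importable; the GFF3-style string is made up, the GTF-style one reuses an identifier from the example further down the file:

```python
from GFFParser import _attribute_tags   # assumes GFFParser.py and helper.py are importable

is_gff, info = _attribute_tags('ID=Gene:demo01;Name=DemoGene')
# GFF3 style: is_gff is True and info == {'ID': ['Gene:demo01'], 'Name': ['DemoGene']}

is_gff, info = _attribute_tags('gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";')
# GTF style: is_gff is False and info == {'gene_id': ['uc002zkg.1'], 'transcript_id': ['uc002zkg.1']}
# (info is a defaultdict(list); shown here as plain dicts)
```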
def _spec_features_keywd(gff_parts):
    """
    Specify the feature key word according to the GFF specifications.
    """
    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
        try:
            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
            break
        except KeyError:
            pass
    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
        try:
            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
            break
        except KeyError:
            pass
    ## TODO key words
    for flat_name in ["Transcript", "CDS"]:
        if flat_name in gff_parts["info"]:
            # parents
            if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
                if not gff_parts['id']:
                    gff_parts['id'] = gff_parts['info'][flat_name][0]
                    #gff_parts["info"]["ID"] = [gff_parts["id"]]
            # children
            elif gff_parts["type"] in ["intron", "exon", "pseudogenic_exon", "three_prime_UTR",
                                       "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
                                       "start_codon"]:
                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
            break
    return gff_parts

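For a GTF-style record the function above effectively maps `transcript_id`/`gene_id` onto the `Parent`/`GParent` links that the rest of the parser relies on. A minimal sketch, assuming the module is importable (the identifier reuses the one from the example further down the file):

```python
from GFFParser import _spec_features_keywd   # assumes GFFParser.py and helper.py are importable

record = dict(type='exon', id='',
              info={'transcript_id': ['uc002zkg.1'], 'gene_id': ['uc002zkg.1']})
record = _spec_features_keywd(record)
# record['info'] now also carries Parent=['uc002zkg.1'] (from transcript_id) and
# GParent=['uc002zkg.1'] (from gene_id), so the GTF record can be linked like a
# GFF3 ID/Parent record.
```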
def Parse(ga_file):
    """
    Parse a GFF/GTF file based on the feature relationships; takes the input file name.
    """
    child_map = defaultdict(list)
    parent_map = dict()

    ga_handle = utils._open_file(ga_file)

    for rec in ga_handle:
        rec = rec.strip('\n\r')

        # skip empty lines, FASTA identifiers and comment lines
        if not rec or rec[0] in ['#', '>']:
            continue
        # skip the genome sequence
        if not re.search('\t', rec):
            continue

        parts = rec.split('\t')
        assert len(parts) >= 8, rec

        # process the attribute column (9th column)
        ftype, tags = _attribute_tags(parts[-1])
        if not tags: # skip the line if there is no attribute column
            continue

        # extract fields
        if parts[1]:
            tags["source"] = parts[1]
        if parts[7]:
            tags["phase"] = parts[7]

        gff_info = dict()
        gff_info['info'] = dict(tags)
        #gff_info["is_gff3"] = ftype
        gff_info['chr'] = parts[0]

        if parts[3] and parts[4]:
            gff_info['location'] = [int(parts[3]),
                                    int(parts[4])]
            gff_info['type'] = parts[2]
            gff_info['id'] = tags.get('ID', [''])[0]
            if parts[6] in ['?', '.']:
                parts[6] = None
            gff_info['strand'] = parts[6]

            # key word according to the GFF spec.
            if not ftype:
                gff_info = _spec_features_keywd(gff_info)

            # link the feature relationships
            if 'Parent' in gff_info['info']:
                for p in gff_info['info']['Parent']:
                    if p == gff_info['id']:
                        gff_info['id'] = ''
                        break
                rec_category = 'child'
            elif gff_info['id']:
                rec_category = 'parent'
            else:
                rec_category = 'record'

            # organize the features depending on the record category
            if rec_category == 'child':
                for p in gff_info['info']['Parent']:
                    # create the data structure based on source and feature id
                    child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
                        dict(type = gff_info['type'],
                             location = gff_info['location'],
                             strand = gff_info['strand'],
                             ID = gff_info['id'],
                             gene_id = gff_info['info'].get('GParent', '')))
            elif rec_category == 'parent':
                parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict(
                    type = gff_info['type'],
                    location = gff_info['location'],
                    strand = gff_info['strand'],
                    name = tags.get('Name', [''])[0])
            elif rec_category == 'record':
                #TODO how to handle plain records?
                c = 1
    ga_handle.close()

    # depending on the file type, create the missing parent features
    if not ftype:
        parent_map, child_map = _create_missing_feature_type(parent_map, child_map)

    # connect the parent-child relations
    # // at this point the parent and child features are available from any type of GTF/GFF2/GFF3 file
    gene_mat = _format_gene_models(parent_map, child_map)

    return gene_mat

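Before moving on, here is a sketch (all identifiers made up) of the two intermediate containers `Parse` hands to `_format_gene_models`; both are keyed by `(chromosome, source, feature id)` tuples:

```python
# Parent features (genes) found in the file:
parent_map = {('chr1', 'demo_src', 'gene_1'):
                  dict(type='gene', location=[1000, 5000], strand='+', name='DemoGene')}
# Child features, grouped under the id of their parent:
child_map = {('chr1', 'demo_src', 'gene_1'):    # the gene's transcripts
                  [dict(type='mRNA', location=[1000, 5000], strand='+', ID='tx_1', gene_id='')],
             ('chr1', 'demo_src', 'tx_1'):      # the transcript's exons/CDS/UTR parts
                  [dict(type='exon', location=[1000, 2000], strand='+', ID='', gene_id='')]}
```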
def _format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate the gene object based on the parsed file contents.

    parent_nf_map: parent features with source and chromosome information
    child_nf_map: transcript and exon information are encoded
    """
    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene_DE())

    for pkey, pdet in parent_nf_map.items():
        # considering only gene features
        if not re.search(r'gene', pdet.get('type', '')):
            continue
        # infer the gene start and stop if they are not given on the parent feature
        if not pdet.get('location', []):
            GNS, GNE = [], []
            # over all transcripts of this gene
            for L1 in child_nf_map[pkey]:
                GNS.append(L1.get('location', [])[0])
                GNE.append(L1.get('location', [])[1])
            GNS.sort()
            GNE.sort()
            pdet['location'] = [GNS[0], GNE[-1]]
        orient = pdet.get('strand', '')

        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = orient

        # default value
        gene[g_cnt]['is_alt_spliced'] = 0
        if len(child_nf_map[pkey]) > 1:
            gene[g_cnt]['is_alt_spliced'] = 1

        # collect the sub-features for all transcripts
        dim = len(child_nf_map[pkey])
        TRS = np.zeros((dim,), dtype=np.object)
        EXON = np.zeros((dim,), dtype=np.object)

        # fetching the corresponding transcripts
        for xq, Lv1 in enumerate(child_nf_map[pkey]):

            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            orient = Lv1.get('strand', '')

            # fetching the different sub-features
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                E_TYP = Lv2.get('type', '')
                child_feat[E_TYP].append(Lv2.get('location'))

            # make the exon coordinates from the CDS and UTR regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    exon_cod = utils.make_Exon_cod(orient,
                                NonetoemptyList(child_feat.get('five_prime_UTR')),
                                NonetoemptyList(child_feat.get('CDS')),
                                NonetoemptyList(child_feat.get('three_prime_UTR')))
                    child_feat['exon'] = exon_cod
                else:
                    # search through the keys to find one describing an exon feature
                    ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
                    child_feat['exon'] = child_feat[ex_key_pattern[0]]
                    # TODO only UTR's

            # bring the coordinates into general ascending order
            if orient == '-':
                for etype, excod in child_feat.items():
                    if len(excod) > 1:
                        if excod[0][0] > excod[-1][0]:
                            excod.reverse()
                            child_feat[etype] = excod

            # add the sub-feature as an array for export to different output formats
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)

        # add the sub-features to the parent gene feature
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON

        gene[g_cnt]['gene_info'] = dict(ID = pkey[-1],
                                        Name = pdet.get('name'),
                                        Source = pkey[1])
        g_cnt += 1

    ## delete the empty gene records from the main array; only the first g_cnt
    ## slots were filled above, the remaining slots belong to non-gene parent features
    XQC = range(g_cnt, len(gene))
    gene = np.delete(gene, XQC)

    return gene

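The record array filled above uses the dtype returned by `utils.init_gene_DE()` (defined in the companion `helper` module imported at the top); the fields written here are `id`, `chr`, `source`, `name`, `start`, `stop`, `strand`, `is_alt_spliced`, `transcripts`, `exons` and `gene_info`. A hypothetical access pattern (file name made up):

```python
from GFFParser import Parse      # assumes GFFParser.py and helper.py are importable

genes = Parse('demo.gff3')       # hypothetical annotation file
first = genes[0]
print first['name'], first['start'], first['stop']    # gene coordinates
print first['exons'][0]                               # exon matrix of its first transcript
```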
def NonetoemptyList(XS):
    """
    Convert a None type to an empty list.
    """
    return [] if XS is None else XS

def _create_missing_feature_type(p_feat, c_feat):
    """
    Some GTF/GFF files define only child features. This function tries to create
    the parent features from the information provided in the attribute column.

    example:
    chr21  hg19_knownGene  exon  9690071  9690100  0.000000  +  .  gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
    chr21  hg19_knownGene  exon  9692178  9692207  0.000000  +  .  gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
    chr21  hg19_knownGene  exon  9711935  9712038  0.000000  +  .  gene_id "uc011abu.2"; transcript_id "uc011abu.2";

    This function gets the parsed feature annotations.
    """
    child_n_map = defaultdict(list)
    for fid, det in c_feat.items():
        # get the details from the grandchild features
        GID = STRD = None
        SPOS, EPOS = [], []
        TYP = dict()
        for gchild in det:
            GID = gchild.get('gene_id', [''])[0]
            SPOS.append(gchild.get('location', [])[0])
            EPOS.append(gchild.get('location', [])[1])
            STRD = gchild.get('strand', '')
            TYP[gchild.get('type', '')] = 1
        SPOS.sort()
        EPOS.sort()

        # infer the transcript type
        transcript_type = 'transcript'
        transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type

        # gene id and transcript id are the same
        transcript_id = fid[-1]
        if GID == transcript_id:
            transcript_id = 'Transcript:' + str(GID)

        # level-1 feature type
        p_feat[(fid[0], fid[1], GID)] = dict(type = 'gene',
                                             location = [], ## infer the location based on multiple transcripts
                                             strand = STRD,
                                             name = GID)
        # level-2 feature type
        child_n_map[(fid[0], fid[1], GID)].append(
                                        dict(type = transcript_type,
                                             location = [SPOS[0], EPOS[-1]],
                                             strand = STRD,
                                             ID = transcript_id,
                                             gene_id = ''))
        # reorganizing the grandchild features
        for gchild in det:
            child_n_map[(fid[0], fid[1], transcript_id)].append(
                                        dict(type = gchild.get('type', ''),
                                             location = gchild.get('location'),
                                             strand = gchild.get('strand'),
                                             ID = gchild.get('ID'),
                                             gene_id = ''))
    return p_feat, child_n_map

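The docstring example can be traced through the function directly; in the sketch below (assuming the module is importable) the key, coordinates and identifiers come from the first example line above:

```python
from GFFParser import _create_missing_feature_type   # assumes GFFParser.py and helper.py are importable

c_feat = {('chr21', 'hg19_knownGene', 'uc002zkg.1'):
              [dict(type='exon', location=[9690071, 9690100], strand='+',
                    ID='', gene_id=['uc002zkg.1'])]}
p_feat, child_n_map = _create_missing_feature_type(dict(), c_feat)
# p_feat now maps ('chr21', 'hg19_knownGene', 'uc002zkg.1') to the inferred 'gene' parent;
# child_n_map maps the same key to one 'transcript' entry whose ID is 'Transcript:uc002zkg.1'
# (prefixed because gene id and transcript id collide here), and maps
# ('chr21', 'hg19_knownGene', 'Transcript:uc002zkg.1') to the original exon record.
```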

## General instruction to use the above functions:
## Usage: GFFParser.py in.gff3 out.mat

if __name__ == "__main__":

    try:
        gff_file = sys.argv[1]
        out_mat = sys.argv[2]
    except IndexError:
        print __doc__
        sys.exit(-1)

    ## Parse the file according to its type and return the gene information --
    gene_struct = Parse(gff_file)

    ## Write the gene annotations to a matlab struct array format --
    sio.savemat(out_mat,
                mdict = dict(genes = gene_struct),
                format = '5',
                oned_as = 'row')
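To inspect the saved annotation afterwards from Python (illustrative; `out.mat` stands for whatever file name was passed as the second argument):

```python
import scipy.io as sio

mat = sio.loadmat('out.mat', struct_as_record=False, squeeze_me=True)
genes = mat['genes']                     # one record per annotated gene
print genes[0].name, genes[0].start, genes[0].stop
```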
