Mercurial > repos > vipints > fml_gff3togtf
view gff_fmap.py @ 5:6e589f267c14
Uploaded
author | devteam |
---|---|
date | Tue, 04 Nov 2014 12:15:19 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python """ GFF feature mapping program, to find the relation between different features described in a given GFF file. Usage: python gff_fmap.py in.gff > out.txt Courtesy: Brad Chapman Few functions are inherited from bcbio-GFFParser program. """ import re import sys import urllib import collections from helper import open_file def _gff_line_map(line): """Parses a line of GFF into a dictionary. Given an input line from a GFF file, this: - breaks it into component elements - determines the type of attribute (flat, parent, child or annotation) - generates a dictionary of GFF info """ gff3_kw_pat = re.compile("\w+=") def _split_keyvals(keyval_str): """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. GFF3 has key value pairs like: count=9;gene=amx-2;sequence=SAGE:aacggagccg GFF2 and GTF have: Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 """ quals = collections.defaultdict(list) if keyval_str is None: return quals # ensembl GTF has a stray semi-colon at the end if keyval_str[-1] == ';': keyval_str = keyval_str[:-1] # GFF2/GTF has a semi-colon with at least one space after it. # It can have spaces on both sides; wormbase does this. # GFF3 works with no spaces. # Split at the first one we can recognize as working parts = keyval_str.split(" ; ") if len(parts) == 1: parts = keyval_str.split("; ") if len(parts) == 1: parts = keyval_str.split(";") # check if we have GFF3 style key-vals (with =) is_gff2 = True if gff3_kw_pat.match(parts[0]): is_gff2 = False key_vals = [p.split('=') for p in parts] # otherwise, we are separated by a space with a key as the first item else: pieces = [] for p in parts: # fix misplaced semi-colons in keys in some GFF2 files if p and p[0] == ';': p = p[1:] pieces.append(p.strip().split(" ")) key_vals = [(p[0], " ".join(p[1:])) for p in pieces] for key, val in key_vals: # remove quotes in GFF2 files if (len(val) > 0 and val[0] == '"' and val[-1] == '"'): val = val[1:-1] if val: quals[key].extend(val.split(',')) # if we don't have a value, make this a key=True/False style # attribute else: quals[key].append('true') for key, vals in quals.items(): quals[key] = [urllib.unquote(v) for v in vals] return quals, is_gff2 def _nest_gff2_features(gff_parts): """Provide nesting of GFF2 transcript parts with transcript IDs. exons and coding sequences are mapped to a parent with a transcript_id in GFF2. This is implemented differently at different genome centers and this function attempts to resolve that and map things to the GFF3 way of doing them. """ # map protein or transcript ids to a parent for transcript_id in ["transcript_id", "transcriptId", "proteinId"]: try: gff_parts["quals"]["Parent"] = \ gff_parts["quals"][transcript_id] break except KeyError: pass # case for WormBase GFF -- everything labelled as Transcript or CDS for flat_name in ["Transcript", "CDS"]: if gff_parts["quals"].has_key(flat_name): # parent types if gff_parts["type"] in [flat_name]: if not gff_parts["id"]: gff_parts["id"] = gff_parts["quals"][flat_name][0] gff_parts["quals"]["ID"] = [gff_parts["id"]] # children types elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", "coding_exon", "five_prime_UTR", "CDS", "stop_codon", "start_codon"]: gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name] break return gff_parts line = line.strip() if line == '':return [('directive', line)] # sometimes the blank lines will be there if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header if line[0] == "#": return [('directive', line[2:])] elif line: parts = line.split('\t') if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together assert len(parts) == 9, line gff_parts = [(None if p == '.' else p) for p in parts] gff_info = dict() # collect all of the base qualifiers for this item quals, is_gff2 = _split_keyvals(gff_parts[8]) gff_info["is_gff2"] = is_gff2 if gff_parts[1]:quals["source"].append(gff_parts[1]) gff_info['quals'] = dict(quals) # if we are describing a location, then we are a feature if gff_parts[3] and gff_parts[4]: gff_info['type'] = gff_parts[2] gff_info['id'] = quals.get('ID', [''])[0] if is_gff2:gff_info = _nest_gff2_features(gff_info) # features that have parents need to link so we can pick up # the relationship if gff_info['quals'].has_key('Parent'): final_key = 'child' elif gff_info['id']: final_key = 'parent' # Handle flat features else: final_key = 'feature' # otherwise, associate these annotations with the full record else: final_key = 'annotation' return [(final_key, gff_info)] def parent_child_id_map(gff_handle): """ Provide a mapping of parent to child relationships in the file. Gives a dictionary of parent child relationships: keys -- tuple of (source, type) for each parent values -- tuple of (source, type) as children of that parent """ # collect all of the parent and child types mapped to IDs parent_sts = dict() child_sts = collections.defaultdict(list) for line in gff_handle: line_type, line_info = _gff_line_map(line)[0] if (line_type == 'parent' or (line_type == 'child' and line_info['id'])): parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type']) if line_type == 'child': for parent_id in line_info['quals']['Parent']: child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type'])) gff_handle.close() # generate a dictionary of the unique final type relationships pc_map = collections.defaultdict(list) for parent_id, parent_type in parent_sts.items(): for child_type in child_sts[parent_id]: pc_map[parent_type].append(child_type) pc_final_map = dict() for ptype, ctypes in pc_map.items(): unique_ctypes = list(set(ctypes)) unique_ctypes.sort() pc_final_map[ptype] = unique_ctypes # some cases the GFF file represents a single feature type if not pc_final_map: for fid, stypes in parent_sts.items(): pc_final_map[stypes] = dict() # generate a report on feature id mapping in the file print '+---------------------+---------------------------------+' print '| Parent feature type | Associated child feature type(s)|' print '+---------------------+---------------------------------+' for key, value in pc_final_map.items(): print key[0], key[1] for child_to in value: print '\t\t\t|-',child_to[0], child_to[1] print '+---------------------+---------------------------------+' if __name__=='__main__': try: gff_file = sys.argv[1] except: print __doc__ sys.exit(-1) gff_handle = open_file(gff_file) parent_child_id_map(gff_handle)