Mercurial > repos > vipints > fml_mergeloci
diff fml_gff_groomer/scripts/gff_id_mapper.py @ 0:79726c328621 default tip
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | vipints |
---|---|
date | Tue, 07 Jun 2011 17:29:24 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fml_gff_groomer/scripts/gff_id_mapper.py Tue Jun 07 17:29:24 2011 -0400 @@ -0,0 +1,254 @@ +#!/usr/bin/env python +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society +# Copyright (C) 2010 Max Planck Society +# + +# Description : Provides feature to sub feature identifier mapping in a given GFF3 file. + +import re, sys +import collections +import urllib +import time + +def _gff_line_map(line): + """Parses a line of GFF into a dictionary. + Given an input line from a GFF file, this: + - breaks it into component elements + - determines the type of attribute (flat, parent, child or annotation) + - generates a dictionary of GFF info + """ + gff3_kw_pat = re.compile("\w+=") + def _split_keyvals(keyval_str): + """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. + + GFF3 has key value pairs like: + count=9;gene=amx-2;sequence=SAGE:aacggagccg + GFF2 and GTF have: + Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" + name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 + """ + quals = collections.defaultdict(list) + if keyval_str is None: + return quals + # ensembl GTF has a stray semi-colon at the end + if keyval_str[-1] == ';': + keyval_str = keyval_str[:-1] + # GFF2/GTF has a semi-colon with at least one space after it. + # It can have spaces on both sides; wormbase does this. + # GFF3 works with no spaces. + # Split at the first one we can recognize as working + parts = keyval_str.split(" ; ") + if len(parts) == 1: + parts = keyval_str.split("; ") + if len(parts) == 1: + parts = keyval_str.split(";") + # check if we have GFF3 style key-vals (with =) + is_gff2 = True + if gff3_kw_pat.match(parts[0]): + is_gff2 = False + key_vals = [p.split('=') for p in parts] + # otherwise, we are separated by a space with a key as the first item + else: + pieces = [] + for p in parts: + # fix misplaced semi-colons in keys in some GFF2 files + if p and p[0] == ';': + p = p[1:] + pieces.append(p.strip().split(" ")) + key_vals = [(p[0], " ".join(p[1:])) for p in pieces] + for key, val in key_vals: + # remove quotes in GFF2 files + if (len(val) > 0 and val[0] == '"' and val[-1] == '"'): + val = val[1:-1] + if val: + quals[key].extend(val.split(',')) + # if we don't have a value, make this a key=True/False style + # attribute + else: + quals[key].append('true') + for key, vals in quals.items(): + quals[key] = [urllib.unquote(v) for v in vals] + return quals, is_gff2 + + def _nest_gff2_features(gff_parts): + """Provide nesting of GFF2 transcript parts with transcript IDs. + + exons and coding sequences are mapped to a parent with a transcript_id + in GFF2. This is implemented differently at different genome centers + and this function attempts to resolve that and map things to the GFF3 + way of doing them. + """ + # map protein or transcript ids to a parent + for transcript_id in ["transcript_id", "transcriptId", "proteinId"]: + try: + gff_parts["quals"]["Parent"] = \ + gff_parts["quals"][transcript_id] + break + except KeyError: + pass + # case for WormBase GFF -- everything labelled as Transcript or CDS + for flat_name in ["Transcript", "CDS"]: + if gff_parts["quals"].has_key(flat_name): + # parent types + if gff_parts["type"] in [flat_name]: + if not gff_parts["id"]: + gff_parts["id"] = gff_parts["quals"][flat_name][0] + gff_parts["quals"]["ID"] = [gff_parts["id"]] + # children types + elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", + "coding_exon", "five_prime_UTR", "CDS", "stop_codon", + "start_codon"]: + gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name] + break + + return gff_parts + + line = line.strip() + if line == '':return [('directive', line)] # sometimes the blank lines will be there + if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header + if line[0] == "#": + return [('directive', line[2:])] + elif line: + parts = line.split('\t') + if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together + assert len(parts) == 9, line + gff_parts = [(None if p == '.' else p) for p in parts] + gff_info = dict() + + # collect all of the base qualifiers for this item + quals, is_gff2 = _split_keyvals(gff_parts[8]) + + gff_info["is_gff2"] = is_gff2 + + if gff_parts[1]:quals["source"].append(gff_parts[1]) + gff_info['quals'] = dict(quals) + + # if we are describing a location, then we are a feature + if gff_parts[3] and gff_parts[4]: + gff_info['type'] = gff_parts[2] + gff_info['id'] = quals.get('ID', [''])[0] + + if is_gff2:gff_info = _nest_gff2_features(gff_info) + # features that have parents need to link so we can pick up + # the relationship + if gff_info['quals'].has_key('Parent'): + final_key = 'child' + elif gff_info['id']: + final_key = 'parent' + # Handle flat features + else: + final_key = 'feature' + # otherwise, associate these annotations with the full record + else: + final_key = 'annotation' + return [(final_key, gff_info)] + +def parent_child_id_map(gff_handle): + """Provide a mapping of parent to child relationships in the file. + + Gives a dictionary of parent child relationships: + + keys -- tuple of (source, type) for each parent + values -- tuple of (source, type) as children of that parent""" + + # collect all of the parent and child types mapped to IDs + parent_sts = dict() + child_sts = collections.defaultdict(list) + + for line in gff_handle: + line_type, line_info = _gff_line_map(line)[0] + if (line_type == 'parent' or (line_type == 'child' and line_info['id'])): + parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type']) + if line_type == 'child': + for parent_id in line_info['quals']['Parent']: + child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type'])) + gff_handle.close() + + # generate a dictionary of the unique final type relationships + pc_map = collections.defaultdict(list) + for parent_id, parent_type in parent_sts.items(): + for child_type in child_sts[parent_id]: + pc_map[parent_type].append(child_type) + pc_final_map = dict() + for ptype, ctypes in pc_map.items(): + unique_ctypes = list(set(ctypes)) + unique_ctypes.sort() + pc_final_map[ptype] = unique_ctypes + + # Check for Parent Child relations + level1, level2, level3, sec_level_mis = {}, {}, {}, {} + for etype, fchild in pc_final_map.items(): + level2_flag = 0 + for kp, vp in pc_final_map.items(): + if etype in vp:level2_flag = 1; level2[etype] = 1 # check for second level features + if level2_flag == 0: # first level features + level1[etype] =1 + for eachfch in fchild: # perform a check for all level1 objects values were defined as level2 keys. + if not eachfch in pc_final_map.keys(): # figure out the missing level2 objects + if etype in sec_level_mis: + sec_level_mis[etype].append(eachfch) + else: + sec_level_mis[etype]=[eachfch] + if level2_flag == 1:level3[str(fchild)] =1 # taking third level features + # disply the result + if level1==level2==level3=={} and sec_level_mis == {}: + print + print 'ONLY FIRST level feature(s):' + source_type = dict() + gff_handle = open(gff_handle.name, 'rU') + for line in gff_handle: + line = line.strip('\n\r') + if line[0] == '#': continue + parts = line.split('\t') + if parts[-1] == '':parts.pop() + assert len(parts) == 9, line + source_type[(parts[1], parts[2])] = 1 + gff_handle.close() + for ele in source_type:print '\t' + str(ele) + print + else: + print + print '===Report on different level features from GFF file===' + print + print 'FIRST level feature(s):' + for ele in level1: print '\t' + str(ele) + print + print 'SECOND level feature(s):' + for ele in level2: print '\t' + str(ele) + print + print 'THIRD level feature(s):' + for ele in level3:print '\t' + str(ele[1:-1]) + print + # wrong way mapped feature mapping description + for wf, wfv in sec_level_mis.items(): + if wf[1]=='gene': + print 'GFF Parsing modules from publicly available packages like Bio-python, Bio-perl etc. are heavily dependent on feature identifier mapping.' + print 'Here few features seems to be wrongly mapped to its child, which inturn cause problems while extracting the annotation based on feature identifier.\n' + for ehv in wfv: + if ehv[1]=='exon' or ehv[1]=='intron' or ehv[1]=='CDS' or ehv[1]=='three_prime_UTR' or ehv[1]=='five_prime_UTR':print 'Error in ID mapping: Level1 feature ' + str(wf) + ' maps to Level3 feature ' + str(ehv) + +if __name__=='__main__': + + stime = time.asctime( time.localtime(time.time()) ) + print '-------------------------------------------------------' + print 'GFFExamine started on ' + stime + print '-------------------------------------------------------' + + try: + gff_handle = open(sys.argv[1], 'rU') + except: + sys.stderr.write("Can't open the GFF3 file, Cannot continue...\n") + sys.stderr.write("USAGE: gff_id_mapper.py <gff3 file> \n") + sys.exit(-1) + parent_child_id_map(gff_handle) + + stime = time.asctime( time.localtime(time.time()) ) + print '-------------------------------------------------------' + print 'GFFExamine finished at ' + stime + print '-------------------------------------------------------'