view fml_gff_groomer/scripts/gff_id_mapper.py @ 0:79726c328621 default tip

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author vipints
date Tue, 07 Jun 2011 17:29:24 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society
# Copyright (C) 2010 Max Planck Society
#

# Description : Provides feature to sub feature identifier mapping in a given GFF3 file.

import re, sys 
import collections
import urllib
import time 

def _gff_line_map(line):
    """Parses a line of GFF into a dictionary.
    Given an input line from a GFF file, this:
        - breaks it into component elements
        - determines the type of attribute (flat, parent, child or annotation)
        - generates a dictionary of GFF info 
    """
    gff3_kw_pat = re.compile("\w+=")
    def _split_keyvals(keyval_str):
        """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.

        GFF3 has key value pairs like:
          count=9;gene=amx-2;sequence=SAGE:aacggagccg
        GFF2 and GTF have:           
          Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
          name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
        """
        quals = collections.defaultdict(list)
        if keyval_str is None:
            return quals
        # ensembl GTF has a stray semi-colon at the end
        if keyval_str[-1] == ';':
            keyval_str = keyval_str[:-1]
        # GFF2/GTF has a semi-colon with at least one space after it.
        # It can have spaces on both sides; wormbase does this.
        # GFF3 works with no spaces.
        # Split at the first one we can recognize as working
        parts = keyval_str.split(" ; ")
        if len(parts) == 1:
            parts = keyval_str.split("; ")
            if len(parts) == 1:
                parts = keyval_str.split(";")
        # check if we have GFF3 style key-vals (with =)
        is_gff2 = True
        if gff3_kw_pat.match(parts[0]):
            is_gff2 = False
            key_vals = [p.split('=') for p in parts]
        # otherwise, we are separated by a space with a key as the first item
        else:
            pieces = []
            for p in parts:
                # fix misplaced semi-colons in keys in some GFF2 files
                if p and p[0] == ';':
                    p = p[1:]
                pieces.append(p.strip().split(" "))
            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
        for key, val in key_vals:
            # remove quotes in GFF2 files
            if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
                val = val[1:-1] 
            if val:
                quals[key].extend(val.split(','))
            # if we don't have a value, make this a key=True/False style
            # attribute
            else:
                quals[key].append('true')
        for key, vals in quals.items():
            quals[key] = [urllib.unquote(v) for v in vals]
        return quals, is_gff2

    def _nest_gff2_features(gff_parts):
        """Provide nesting of GFF2 transcript parts with transcript IDs.

        exons and coding sequences are mapped to a parent with a transcript_id
        in GFF2. This is implemented differently at different genome centers
        and this function attempts to resolve that and map things to the GFF3
        way of doing them.
        """
        # map protein or transcript ids to a parent
        for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
            try:
                gff_parts["quals"]["Parent"] = \
                        gff_parts["quals"][transcript_id]
                break
            except KeyError:
                pass
        # case for WormBase GFF -- everything labelled as Transcript or CDS
        for flat_name in ["Transcript", "CDS"]:
            if gff_parts["quals"].has_key(flat_name):
                # parent types
                if gff_parts["type"] in [flat_name]:
                    if not gff_parts["id"]:
                        gff_parts["id"] = gff_parts["quals"][flat_name][0]
                        gff_parts["quals"]["ID"] = [gff_parts["id"]]
                # children types
                elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
                        "start_codon"]:
                    gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
                break

        return gff_parts

    line = line.strip()
    if line == '':return [('directive', line)] # sometimes the blank lines will be there 
    if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header
    if line[0] == "#":
        return [('directive', line[2:])]
    elif line:
        parts = line.split('\t')
        if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together 
        assert len(parts) == 9, line
        gff_parts = [(None if p == '.' else p) for p in parts]
        gff_info = dict()
            
        # collect all of the base qualifiers for this item
        quals, is_gff2 = _split_keyvals(gff_parts[8])

        gff_info["is_gff2"] = is_gff2

        if gff_parts[1]:quals["source"].append(gff_parts[1])
        gff_info['quals'] = dict(quals)

        # if we are describing a location, then we are a feature
        if gff_parts[3] and gff_parts[4]:
            gff_info['type'] = gff_parts[2]
            gff_info['id'] = quals.get('ID', [''])[0]
            
            if is_gff2:gff_info = _nest_gff2_features(gff_info)
            # features that have parents need to link so we can pick up
            # the relationship
            if gff_info['quals'].has_key('Parent'):
                final_key = 'child'
            elif gff_info['id']:
                final_key = 'parent'
            # Handle flat features
            else:
                final_key = 'feature'
        # otherwise, associate these annotations with the full record
        else:
            final_key = 'annotation'
        return [(final_key, gff_info)]
    
def parent_child_id_map(gff_handle):
    """Provide a mapping of parent to child relationships in the file.

    Gives a dictionary of parent child relationships:

    keys -- tuple of (source, type) for each parent
    values -- tuple of (source, type) as children of that parent"""

    # collect all of the parent and child types mapped to IDs
    parent_sts = dict()
    child_sts = collections.defaultdict(list)

    for line in gff_handle:
        line_type, line_info = _gff_line_map(line)[0]
        if (line_type == 'parent' or (line_type == 'child' and line_info['id'])):
            parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type'])
        if line_type == 'child':
            for parent_id in line_info['quals']['Parent']:
                child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type']))
    gff_handle.close()
     
    # generate a dictionary of the unique final type relationships
    pc_map = collections.defaultdict(list)
    for parent_id, parent_type in parent_sts.items():
        for child_type in child_sts[parent_id]:
            pc_map[parent_type].append(child_type)
    pc_final_map = dict()
    for ptype, ctypes in pc_map.items():
        unique_ctypes = list(set(ctypes))
        unique_ctypes.sort()
        pc_final_map[ptype] = unique_ctypes

    # Check for Parent Child relations
    level1, level2, level3, sec_level_mis = {}, {}, {}, {}
    for etype, fchild in pc_final_map.items():
        level2_flag = 0
        for kp, vp in pc_final_map.items():
            if etype in vp:level2_flag = 1; level2[etype] = 1 # check for second level features
        if level2_flag == 0: # first level features
            level1[etype] =1 
            for eachfch in fchild: # perform a check for all level1 objects values were defined as level2 keys.  
                if not eachfch in pc_final_map.keys(): # figure out the missing level2 objects  
                    if etype in sec_level_mis:
                        sec_level_mis[etype].append(eachfch)
                    else:
                        sec_level_mis[etype]=[eachfch]
        if level2_flag == 1:level3[str(fchild)] =1  # taking third level features 
    # disply the result 
    if level1==level2==level3=={} and sec_level_mis == {}:
        print
        print 'ONLY FIRST level feature(s):'
        source_type = dict()
        gff_handle = open(gff_handle.name, 'rU')
        for line in gff_handle:
            line = line.strip('\n\r')
            if line[0] == '#': continue
            parts = line.split('\t')
            if parts[-1] == '':parts.pop()
            assert len(parts) == 9, line
            source_type[(parts[1], parts[2])] = 1
        gff_handle.close()
        for ele in source_type:print '\t' + str(ele)
        print
    else:
        print
        print '===Report on different level features from GFF file==='
        print
        print 'FIRST level feature(s):'
        for ele in level1: print '\t' + str(ele)
        print 
        print 'SECOND level feature(s):'
        for ele in level2: print '\t' + str(ele)
        print 
        print 'THIRD level feature(s):'
        for ele in level3:print '\t' + str(ele[1:-1])
        print
        # wrong way mapped feature mapping description 
        for wf, wfv in sec_level_mis.items():
            if wf[1]=='gene':
                print 'GFF Parsing modules from publicly available packages like Bio-python, Bio-perl etc. are heavily dependent on feature identifier mapping.' 
                print 'Here few features seems to be wrongly mapped to its child, which inturn cause problems while extracting the annotation based on feature identifier.\n'
                for ehv in wfv:
                    if ehv[1]=='exon' or ehv[1]=='intron' or ehv[1]=='CDS' or ehv[1]=='three_prime_UTR' or ehv[1]=='five_prime_UTR':print 'Error in ID mapping: Level1 feature ' + str(wf) + ' maps to Level3 feature ' + str(ehv)

if __name__=='__main__':
    
    stime = time.asctime( time.localtime(time.time()) )
    print '-------------------------------------------------------'
    print 'GFFExamine started on ' + stime
    print '-------------------------------------------------------'

    try:
        gff_handle = open(sys.argv[1], 'rU')
    except:
        sys.stderr.write("Can't open the GFF3 file, Cannot continue...\n")
        sys.stderr.write("USAGE: gff_id_mapper.py <gff3 file> \n")
        sys.exit(-1)
    parent_child_id_map(gff_handle)
    
    stime = time.asctime( time.localtime(time.time()) )
    print '-------------------------------------------------------'
    print 'GFFExamine finished at ' + stime
    print '-------------------------------------------------------'