diff fml_gff_groomer/scripts/gff_id_mapper.py @ 0:79726c328621 default tip

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author vipints
date Tue, 07 Jun 2011 17:29:24 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_groomer/scripts/gff_id_mapper.py	Tue Jun 07 17:29:24 2011 -0400
@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society
+# Copyright (C) 2010 Max Planck Society
+#
+
+# Description : Provides feature to sub feature identifier mapping in a given GFF3 file.
+
+import re, sys 
+import collections
+import urllib
+import time 
+
+def _gff_line_map(line):
+    """Parses a line of GFF into a dictionary.
+    Given an input line from a GFF file, this:
+        - breaks it into component elements
+        - determines the type of attribute (flat, parent, child or annotation)
+        - generates a dictionary of GFF info 
+    """
+    gff3_kw_pat = re.compile("\w+=")
+    def _split_keyvals(keyval_str):
+        """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
+
+        GFF3 has key value pairs like:
+          count=9;gene=amx-2;sequence=SAGE:aacggagccg
+        GFF2 and GTF have:           
+          Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
+          name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
+        """
+        quals = collections.defaultdict(list)
+        if keyval_str is None:
+            return quals
+        # ensembl GTF has a stray semi-colon at the end
+        if keyval_str[-1] == ';':
+            keyval_str = keyval_str[:-1]
+        # GFF2/GTF has a semi-colon with at least one space after it.
+        # It can have spaces on both sides; wormbase does this.
+        # GFF3 works with no spaces.
+        # Split at the first one we can recognize as working
+        parts = keyval_str.split(" ; ")
+        if len(parts) == 1:
+            parts = keyval_str.split("; ")
+            if len(parts) == 1:
+                parts = keyval_str.split(";")
+        # check if we have GFF3 style key-vals (with =)
+        is_gff2 = True
+        if gff3_kw_pat.match(parts[0]):
+            is_gff2 = False
+            key_vals = [p.split('=') for p in parts]
+        # otherwise, we are separated by a space with a key as the first item
+        else:
+            pieces = []
+            for p in parts:
+                # fix misplaced semi-colons in keys in some GFF2 files
+                if p and p[0] == ';':
+                    p = p[1:]
+                pieces.append(p.strip().split(" "))
+            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
+        for key, val in key_vals:
+            # remove quotes in GFF2 files
+            if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
+                val = val[1:-1] 
+            if val:
+                quals[key].extend(val.split(','))
+            # if we don't have a value, make this a key=True/False style
+            # attribute
+            else:
+                quals[key].append('true')
+        for key, vals in quals.items():
+            quals[key] = [urllib.unquote(v) for v in vals]
+        return quals, is_gff2
+
+    def _nest_gff2_features(gff_parts):
+        """Provide nesting of GFF2 transcript parts with transcript IDs.
+
+        exons and coding sequences are mapped to a parent with a transcript_id
+        in GFF2. This is implemented differently at different genome centers
+        and this function attempts to resolve that and map things to the GFF3
+        way of doing them.
+        """
+        # map protein or transcript ids to a parent
+        for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
+            try:
+                gff_parts["quals"]["Parent"] = \
+                        gff_parts["quals"][transcript_id]
+                break
+            except KeyError:
+                pass
+        # case for WormBase GFF -- everything labelled as Transcript or CDS
+        for flat_name in ["Transcript", "CDS"]:
+            if gff_parts["quals"].has_key(flat_name):
+                # parent types
+                if gff_parts["type"] in [flat_name]:
+                    if not gff_parts["id"]:
+                        gff_parts["id"] = gff_parts["quals"][flat_name][0]
+                        gff_parts["quals"]["ID"] = [gff_parts["id"]]
+                # children types
+                elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
+                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+                        "start_codon"]:
+                    gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
+                break
+
+        return gff_parts
+
+    line = line.strip()
+    if line == '':return [('directive', line)] # sometimes the blank lines will be there 
+    if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header
+    if line[0] == "#":
+        return [('directive', line[2:])]
+    elif line:
+        parts = line.split('\t')
+        if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together 
+        assert len(parts) == 9, line
+        gff_parts = [(None if p == '.' else p) for p in parts]
+        gff_info = dict()
+            
+        # collect all of the base qualifiers for this item
+        quals, is_gff2 = _split_keyvals(gff_parts[8])
+
+        gff_info["is_gff2"] = is_gff2
+
+        if gff_parts[1]:quals["source"].append(gff_parts[1])
+        gff_info['quals'] = dict(quals)
+
+        # if we are describing a location, then we are a feature
+        if gff_parts[3] and gff_parts[4]:
+            gff_info['type'] = gff_parts[2]
+            gff_info['id'] = quals.get('ID', [''])[0]
+            
+            if is_gff2:gff_info = _nest_gff2_features(gff_info)
+            # features that have parents need to link so we can pick up
+            # the relationship
+            if gff_info['quals'].has_key('Parent'):
+                final_key = 'child'
+            elif gff_info['id']:
+                final_key = 'parent'
+            # Handle flat features
+            else:
+                final_key = 'feature'
+        # otherwise, associate these annotations with the full record
+        else:
+            final_key = 'annotation'
+        return [(final_key, gff_info)]
+    
def parent_child_id_map(gff_handle):
    """Print a report of parent to child feature relationships in the file.

    Internally builds a dictionary of parent/child relationships:
    keys -- tuple of (source, type) for each parent
    values -- tuple of (source, type) as children of that parent

    The handle is read to the end and closed; for files containing only
    flat features it is reopened by name to list the (source, type) pairs.
    """
    # collect all of the parent and child types mapped to IDs
    parent_sts = dict()
    child_sts = collections.defaultdict(list)

    for line in gff_handle:
        line_type, line_info = _gff_line_map(line)[0]
        if (line_type == 'parent' or (line_type == 'child' and line_info['id'])):
            parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type'])
        if line_type == 'child':
            for parent_id in line_info['quals']['Parent']:
                child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type']))
    gff_handle.close()

    # generate a dictionary of the unique final type relationships
    pc_map = collections.defaultdict(list)
    for parent_id, parent_type in parent_sts.items():
        for child_type in child_sts[parent_id]:
            pc_map[parent_type].append(child_type)
    pc_final_map = dict()
    for ptype, ctypes in pc_map.items():
        pc_final_map[ptype] = sorted(set(ctypes))

    # Check for Parent Child relations
    level1, level2, level3, sec_level_mis = {}, {}, {}, {}
    for etype, fchild in pc_final_map.items():
        level2_flag = 0
        for kp, vp in pc_final_map.items():
            # appears as somebody's child -> second level feature
            if etype in vp:
                level2_flag = 1
                level2[etype] = 1
        if level2_flag == 0:  # first level features
            level1[etype] = 1
            # perform a check that all level1 children were themselves
            # defined as level2 keys; figure out the missing level2 objects
            for eachfch in fchild:
                if eachfch not in pc_final_map:
                    sec_level_mis.setdefault(etype, []).append(eachfch)
        if level2_flag == 1:
            # children of a second level feature are third level
            level3[str(fchild)] = 1
    # display the result
    if level1 == level2 == level3 == {} and sec_level_mis == {}:
        # no nesting at all: re-read the file and list flat (source, type)
        print('')
        print('ONLY FIRST level feature(s):')
        source_type = dict()
        # 'rU' open mode was removed in Python 3.11; universal newlines is
        # the default for text mode anyway
        gff_handle = open(gff_handle.name, 'r')
        for line in gff_handle:
            line = line.strip('\n\r')
            # guard against blank lines (the original indexed line[0] and
            # crashed on them) as well as comments
            if not line or line[0] == '#':
                continue
            parts = line.split('\t')
            if parts[-1] == '':
                parts.pop()
            assert len(parts) == 9, line
            source_type[(parts[1], parts[2])] = 1
        gff_handle.close()
        for ele in source_type:
            print('\t' + str(ele))
        print('')
    else:
        print('')
        print('===Report on different level features from GFF file===')
        print('')
        print('FIRST level feature(s):')
        for ele in level1:
            print('\t' + str(ele))
        print('')
        print('SECOND level feature(s):')
        for ele in level2:
            print('\t' + str(ele))
        print('')
        print('THIRD level feature(s):')
        for ele in level3:
            # strip the surrounding brackets of the stringified list
            print('\t' + str(ele[1:-1]))
        print('')
        # wrong way mapped feature mapping description
        for wf, wfv in sec_level_mis.items():
            if wf[1] == 'gene':
                print('GFF Parsing modules from publicly available packages like Bio-python, Bio-perl etc. are heavily dependent on feature identifier mapping.')
                print('Here few features seems to be wrongly mapped to its child, which inturn cause problems while extracting the annotation based on feature identifier.\n')
                for ehv in wfv:
                    # a gene mapped straight onto exon/intron/CDS/UTR skipped
                    # the transcript level -- report the broken ID mapping
                    if ehv[1] in ('exon', 'intron', 'CDS', 'three_prime_UTR', 'five_prime_UTR'):
                        print('Error in ID mapping: Level1 feature ' + str(wf) + ' maps to Level3 feature ' + str(ehv))
+
if __name__ == '__main__':
    # Command line entry point: gff_id_mapper.py <gff3 file>

    stime = time.asctime(time.localtime(time.time()))
    print('-------------------------------------------------------')
    print('GFFExamine started on ' + stime)
    print('-------------------------------------------------------')

    try:
        # 'rU' mode was removed in Python 3.11; plain text mode already
        # handles universal newlines
        gff_handle = open(sys.argv[1], 'r')
    # catch only the expected failures (missing argument / unreadable file)
    # instead of a bare except that would swallow KeyboardInterrupt too
    except (IndexError, IOError):
        sys.stderr.write("Can't open the GFF3 file, Cannot continue...\n")
        sys.stderr.write("USAGE: gff_id_mapper.py <gff3 file> \n")
        sys.exit(-1)
    parent_child_id_map(gff_handle)

    stime = time.asctime(time.localtime(time.time()))
    print('-------------------------------------------------------')
    print('GFFExamine finished at ' + stime)
    print('-------------------------------------------------------')