changeset 2:233c30f91d66

updated python based GFF parsing module which will handle GTF/GFF/GFF3 file types
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 07:15:44 -0400
parents 0f80a5141704
children 29a698dc5c7e
files rDiff/bin/genes_cell2struct rDiff/bin/rdiff rDiff/galaxy/rdiff.xml rDiff/src/genes_cell2struct.m rDiff/tools/GFFParser.py rDiff/tools/ParseGFF.py rDiff/tools/helper.py
diffstat 7 files changed, 588 insertions(+), 600 deletions(-) [+]
line wrap: on
line diff
--- a/rDiff/bin/genes_cell2struct	Thu Feb 14 23:38:36 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-#/bin/bash
-# rDiff wrapper script to start the interpreter with the correct list of arguments
-
-set -e
-
-PROG=`basename $0`
-DIR=`dirname $0`
-
-exec ${DIR}/start_interpreter.sh ${PROG} "`${DIR}/genarglist.sh $@`"
--- a/rDiff/bin/rdiff	Thu Feb 14 23:38:36 2013 -0500
+++ b/rDiff/bin/rdiff	Tue Oct 08 07:15:44 2013 -0400
@@ -161,19 +161,19 @@
 echo
 
 #Check wether results directory exists
-#if [ -d $RDIFF_RES_DIR ]
-#then       
-#	echo "Writting results to: $RDIFF_RES_DIR"
-#else
-#	mkdir $RDIFF_RES_DIR
-#fi 
+if [ -d $RDIFF_RES_DIR ]
+then       
+	echo "Writting results to: $RDIFF_RES_DIR"
+else
+	mkdir $RDIFF_RES_DIR
+fi 
 
 
 echo 1a. load the genome annotation in GFF3 format, create an annotation object #\[Log file in ${RDIFF_RES_DIR}}/elegans-gff2anno.log\]
-export PYTHONPATH=$PYTHONPATH:$RDIFF_PYTHON_PATH:${SCIPY_PATH}
-${RDIFF_PYTHON_PATH} -W ignore::RuntimeWarning ${DIR}/../tools/ParseGFF.py ${GFF_INPUT} ${RDIFF_RES_DIR}/genes.mat #> ${RDIFF_RES_DIR}}/elegans-gff2anno.log
-${DIR}/../bin/genes_cell2struct ${RDIFF_RES_DIR}/genes.mat 
+export PYTHONPATH=$PYTHONPATH:$RDIFF_PYTHON_PATH
+touch ${RDIFF_RES_DIR}/genes.mat
 
+${RDIFF_PYTHON_PATH} ${DIR}/../tools/GFFParser.py ${GFF_INPUT} ${RDIFF_RES_DIR}/genes.mat #> ${RDIFF_RES_DIR}}/elegans-gff2anno.log
 
 PARAM_VECT="$PARAM_VECT""GFF_INPUT:${RDIFF_RES_DIR}/genes.mat;"
 
@@ -183,4 +183,3 @@
 echo
 
 exec ${DIR}/start_interpreter.sh ${PROG} "$PARAM_VECT"
-
--- a/rDiff/galaxy/rdiff.xml	Thu Feb 14 23:38:36 2013 -0500
+++ b/rDiff/galaxy/rdiff.xml	Tue Oct 08 07:15:44 2013 -0400
@@ -1,19 +1,27 @@
 <tool id="rdiff-web" name="rDiff" version="0.3">
     <description>Determines differentially expressed transcripts from read alignments</description>
     <command interpreter="bash"> 
-        rdiff_run.sh $test_meth $anno_input $read_length $rdiff_out $rdiff_out.extra_files_path
-        #for $i in $replicate_groups
-        #for $j in $i.replicates
-        $j.bam_alignment?
-        #end for
-        :
-        #end for
+        rdiff_run.sh
+        $test_meth 
+        $anno_input 
+        $read_length 
+        $rdiff_out $rdiff_out.extra_files_path
+##
+## replicate groups 
+##
+#for $i in $replicate_groups
+#for $j in $i.replicates
+$j.bam_alignment?
+#end for
+:
+#end for
+
         >> $Log_File 
     </command>
   <inputs>
 
     <!-- genome annotation in GFF format -->
-	<param format="gff3" name="anno_input" type="data" label="Genome annotation in GFF3 file" help="A tab delimited format for storing sequence features and annotations"/>
+	<param format="gff,gtf,gff3" name="anno_input" type="data" label="Genome annotation file" help="A tab delimited format for storing sequence features and annotations(GFF/GTF/GFF3)"/>
 
     <!-- RNA-seq samples and corresponding replicates -->
    <repeat name="replicate_groups" title="Sample" min="2" max="2">
@@ -22,19 +30,21 @@
      </repeat>
    </repeat>
 
-    
+    <!-- rDiff test method --> 
      <param name="test_meth" type="select" label="rDiff testing method" help="Statistical test on differential transcript expression data." >
             <option value="poisson">Poisson</option>
             <option value="param">Parametric</option>
             <option value="nonparam">Nonparametric</option>
     </param>
+
+    <!-- read length uesd in the experiment-->
     <param name="read_length" type="text" value="75" help="default 75 nucleotide." label="Read length from sequencing experiment"/>
 
   </inputs>
 
   <outputs>
-    <data format="txt" name="rdiff_out" label="${tool.name} on ${on_string}: Differential Testing Result" />
-    <data format="txt" name="Log_File" label="${tool.name} on ${on_string}: log"/>
+    <data format="txt" name="rdiff_out" label="${tool.name} on ${on_string}: Differential Expression" />
+    <data format="txt" name="Log_File" label="${tool.name} on ${on_string}: Log"/>
   </outputs>
   
   <help>
--- a/rDiff/src/genes_cell2struct.m	Thu Feb 14 23:38:36 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-function genes_cell2struct(anno_fname)
-% GENES_CELL2STRUCT   Converts genes stored as a cell to struct.
-%
-%   genes_cell2struct(anno_fname)
-%
-%   -- input --
-%   anno_fname:   name of file where genes as cell are stored
-%
-%   -- output --
-%   genes as a struct
-%
-%
-%   This program is free software; you can redistribute it and/or modify
-%   it under the terms of the GNU General Public License as published by
-%   the Free Software Foundation; either version 3 of the License, or
-%   (at your option) any later version.
-%
-%   Written (W) 2009-2011 Regina Bohnert, Gunnar Raetsch
-%   Copyright (C) 2009-2011 Max Planck Society
-%
-
-load(anno_fname, 'genes');
-if iscell(genes)
-  genes_cell = genes;
-  clear genes;
-  for g = 1:length(genes_cell), 
-    gene = genes_cell{g};
-    for e = 1:length(gene.exons)
-      gene.exons{e} = double(gene.exons{e});
-    end    
-    gene.exons = reshape(gene.exons, 1, length(gene.exons));
-    gene.id = double(gene.id);
-    gene.start = double(gene.start);
-    gene.stop = double(gene.stop);
-    genes(g) = gene;
-  end
-  save(anno_fname, 'genes');
-end
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rDiff/tools/GFFParser.py	Tue Oct 08 07:15:44 2013 -0400
@@ -0,0 +1,379 @@
+#!/usr/bin/env python
+"""
+Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.
+
+Requirements: 
+    Numpy :- http://numpy.org/ 
+    Scipy :- http://scipy.org/ 
+
+Copyright (C)	
+
+2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. 
+2012-2013 Memorial Sloan-Kettering Cancer Center, New York City, USA.
+"""
+
+import re
+import os
+import sys
+import urllib
+import numpy as np
+import scipy.io as sio
+from collections import defaultdict
+import helper as utils 
+
+def _attribute_tags(col9):
+    """ 
+    Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file 
+    """
+    info = defaultdict(list)
+    is_gff = False
+    
+    if not col9:
+        return is_gff, info
+        
+    # trim the line ending semi-colon  ucsc may have some white-space  
+    col9 = col9.rstrip(';| ')
+    # attributes from 9th column 
+    atbs = col9.split(" ; ")
+    if len(atbs) == 1:
+        atbs = col9.split("; ")
+        if len(atbs) == 1:
+            atbs = col9.split(";")
+    # check the GFF3 pattern which has key value pairs like:
+    gff3_pat = re.compile("\w+=")
+    # sometime GTF have: gene_id uc002zkg.1;
+    gtf_pat = re.compile("\s?\w+\s")
+
+    key_vals = []
+
+    if gff3_pat.match(atbs[0]): # gff3 pattern 
+        is_gff = True
+        key_vals = [at.split('=') for at in atbs]
+    elif gtf_pat.match(atbs[0]): # gtf pattern
+        for at in atbs:
+            key_vals.append(at.strip().split(" ",1))
+    else:
+        # to handle attribute column has only single value 
+        key_vals.append(['ID', atbs[0]])
+    # get key, val items 
+    for item in key_vals:
+        key, val = item
+        # replace the double qoutes from feature identifier 
+        val = re.sub('"', '', val)
+        # replace the web formating place holders to plain text format 
+        info[key].extend([urllib.unquote(v) for v in val.split(',') if v])
+
+    return is_gff, info
+                
+def _spec_features_keywd(gff_parts):
+    """
+    Specify the feature key word according to the GFF specifications
+    """
+    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
+        try:
+            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
+            break
+        except KeyError:
+            pass
+    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
+        try:
+            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
+            break
+        except KeyError:
+            pass
+    ## TODO key words
+    for flat_name in ["Transcript", "CDS"]:
+        if gff_parts["info"].has_key(flat_name):
+            # parents
+            if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
+                if not gff_parts['id']:
+                    gff_parts['id'] = gff_parts['info'][flat_name][0]
+                    #gff_parts["info"]["ID"] = [gff_parts["id"]]
+            # children 
+            elif gff_parts["type"] in ["intron", "exon", "pseudogenic_exon", "three_prime_UTR",
+                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+                        "start_codon"]:
+                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
+            break
+    return gff_parts
+
+def Parse(ga_file):
+    """
+    Parsing GFF/GTF file based on feature relationship, it takes the input file.
+    """
+    child_map = defaultdict(list)
+    parent_map = dict()
+
+    ga_handle = utils._open_file(ga_file)
+
+    for rec in ga_handle:
+        rec = rec.strip('\n\r')
+
+        # skip empty line fasta identifier and commented line
+        if not rec or rec[0] in  ['#', '>']:
+            continue
+        # skip the genome sequence 
+        if not re.search('\t', rec):
+            continue
+
+        parts = rec.split('\t')
+        assert len(parts) >= 8, rec
+
+        # process the attribute column (9th column)
+        ftype, tags = _attribute_tags(parts[-1])
+        if not tags: # skip the line if no attribute column.
+	        continue 
+        
+        # extract fields  
+        if parts[1]:
+            tags["source"] = parts[1]
+        if parts[7]:
+            tags["phase"] = parts[7]
+
+        gff_info = dict()
+        gff_info['info'] = dict(tags)
+        #gff_info["is_gff3"] = ftype
+        gff_info['chr'] = parts[0]
+
+        if parts[3] and parts[4]:
+            gff_info['location'] = [int(parts[3]) ,
+                        int(parts[4])]
+            gff_info['type'] = parts[2]
+            gff_info['id'] = tags.get('ID', [''])[0]
+            if parts[6] in ['?', '.']:
+                parts[6] = None 
+            gff_info['strand'] = parts[6]
+
+            # key word according to the GFF spec.
+            if not ftype:
+                gff_info = _spec_features_keywd(gff_info)
+        
+            # link the feature relationships
+            if gff_info['info'].has_key('Parent'): 
+                for p in gff_info['info']['Parent']:
+                    if p == gff_info['id']:
+                        gff_info['id'] = ''
+                        break
+                rec_category = 'child'
+            elif gff_info['id']:
+                rec_category = 'parent'
+            else:
+                rec_category = 'record'
+
+            # depends on the record category organize the features
+            if rec_category == 'child':
+                for p in gff_info['info']['Parent']:
+                    # create the data structure based on source and feature id 
+                    child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
+                                            dict( type = gff_info['type'], 
+                                            location =  gff_info['location'], 
+                                            strand = gff_info['strand'], 
+                                            ID = gff_info['id'],
+                                            gene_id = gff_info['info'].get('GParent', '') 
+                                            ))
+            elif rec_category == 'parent':
+                parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict( 
+                                            type = gff_info['type'], 
+                                            location = gff_info['location'],
+                                            strand = gff_info['strand'],
+                                            name = tags.get('Name', [''])[0])
+            elif rec_category == 'record':
+                #TODO how to handle plain records?
+                c = 1 
+    ga_handle.close()
+    
+    # depends on file type create parent feature  
+    if not ftype:
+        parent_map, child_map = _create_missing_feature_type(parent_map, child_map)    
+    
+    # connecting parent child relations  
+    # // essentially the parent child features are here from any type of GTF/GFF2/GFF3 file
+    gene_mat = _format_gene_models(parent_map, child_map) 
+
+    return gene_mat 
+    
+def _format_gene_models(parent_nf_map, child_nf_map): 
+    """
+    Genarate GeneObject based on the parsed file contents
+
+    parent_map: parent features with source and chromosome information 
+    child_map: transctipt and exon information are encoded 
+    """
+    g_cnt = 0 
+    gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene_DE())
+
+    for pkey, pdet in parent_nf_map.items():
+        # considering only gene features 
+        if not re.search(r'gene', pdet.get('type', '')):
+            continue 
+        # infer the gene start and stop if not there in the 
+        if not pdet.get('location', []):
+            GNS, GNE = [], []
+            # multiple number of transcripts 
+            for L1 in child_nf_map[pkey]:
+                GNS.append(L1.get('location', [])[0]) 
+                GNE.append(L1.get('location', [])[1]) 
+            GNS.sort()
+            GNE.sort()
+            pdet['location'] = [GNS[0], GNE[-1]]
+        orient = pdet.get('strand', '')
+
+        gene[g_cnt]['id'] = g_cnt +1 
+        gene[g_cnt]['chr'] = pkey[0]
+        gene[g_cnt]['source'] = pkey[1]
+        gene[g_cnt]['name'] = pkey[-1]
+        gene[g_cnt]['start'] = pdet.get('location', [])[0]
+        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
+        gene[g_cnt]['strand'] = orient  
+        
+        # default value 
+        gene[g_cnt]['is_alt_spliced'] = 0
+        if len(child_nf_map[pkey]) > 1:
+            gene[g_cnt]['is_alt_spliced'] = 1
+
+        # complete sub-feature for all transcripts 
+        dim = len(child_nf_map[pkey])
+        TRS = np.zeros((dim,), dtype=np.object)
+        EXON = np.zeros((dim,), dtype=np.object)
+
+        # fetching corresponding transcripts 
+        for xq, Lv1 in enumerate(child_nf_map[pkey]):
+
+            TID = Lv1.get('ID', '')
+            TRS[xq]= np.array([TID])
+
+            orient = Lv1.get('strand', '')
+
+            # fetching different sub-features 
+            child_feat = defaultdict(list)
+            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
+                E_TYP = Lv2.get('type', '')
+                child_feat[E_TYP].append(Lv2.get('location'))
+            
+            # make exon coordinate from cds and utr regions 
+            if not child_feat.get('exon'):  
+                if child_feat.get('CDS'):
+                    exon_cod = utils.make_Exon_cod( orient, 
+                                NonetoemptyList(child_feat.get('five_prime_UTR')), 
+                                NonetoemptyList(child_feat.get('CDS')),
+                                NonetoemptyList(child_feat.get('three_prime_UTR')))
+                    child_feat['exon'] = exon_cod 
+                else:
+                    # searching through keys to find a pattern describing exon feature 
+                    ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
+                    child_feat['exon'] = child_feat[ex_key_pattern[0]]
+                    # TODO only UTR's
+
+            # make general ascending order of coordinates 
+            if orient == '-':
+                for etype, excod in child_feat.items():
+                    if len(excod) > 1:
+                        if excod[0][0] > excod[-1][0]:
+                            excod.reverse()
+                            child_feat[etype] = excod
+
+            # add sub-feature # make array for export to different out
+            EXON[xq] = np.array(child_feat.get('exon'), np.float64)
+            
+        # add sub-features to the parent gene feature
+        gene[g_cnt]['transcripts'] = TRS 
+        gene[g_cnt]['exons'] = EXON
+        
+        gene[g_cnt]['gene_info'] = dict( ID = pkey[-1], 
+                                Name = pdet.get('name'), 
+                                Source = pkey[1]) 
+        g_cnt += 1 
+
+    ## deleting empty gene records from the main array
+    for XP, ens in enumerate(gene):
+        if ens[0]==0:
+            break
+
+    XQC = range(XP, len(gene)+1)
+    gene = np.delete(gene, XQC)
+
+    return gene 
+
+def NonetoemptyList(XS):
+    """
+    Convert a None type to empty list 
+    """
+    return [] if XS is None else XS 
+
+def _create_missing_feature_type(p_feat, c_feat):
+    """
+    GFF/GTF file defines only child features. This function tries to create 
+    the parent feature from the information provided in the attribute column. 
+
+    example: 
+    chr21   hg19_knownGene  exon    9690071 9690100 0.000000        +       .       gene_id "uc002zkg.1"; transcript_id "uc002zkg.1"; 
+    chr21   hg19_knownGene  exon    9692178 9692207 0.000000        +       .       gene_id "uc021wgt.1"; transcript_id "uc021wgt.1"; 
+    chr21   hg19_knownGene  exon    9711935 9712038 0.000000        +       .       gene_id "uc011abu.2"; transcript_id "uc011abu.2"; 
+
+    This function gets the parsed feature annotations. 
+    """
+    child_n_map = defaultdict(list)
+    for fid, det in c_feat.items():
+        # get the details from grand child  
+        GID = STRD = None
+        SPOS, EPOS = [], [] 
+        TYP = dict()
+        for gchild in det:
+            GID = gchild.get('gene_id', [''])[0] 
+            SPOS.append(gchild.get('location', [])[0]) 
+            EPOS.append(gchild.get('location', [])[1]) 
+            STRD = gchild.get('strand', '')
+            TYP[gchild.get('type', '')] = 1
+        SPOS.sort() 
+        EPOS.sort()
+        
+        # infer transcript type
+        transcript_type = 'transcript'
+        transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type
+        
+        # gene id and transcript id are same
+        transcript_id = fid[-1]
+        if GID == transcript_id:
+            transcript_id = 'Transcript:' + str(GID)
+        
+        # level -1 feature type 
+        p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene',
+                                            location = [], ## infer location based on multiple transcripts  
+                                            strand = STRD,
+                                            name = GID )
+        # level -2 feature type 
+        child_n_map[(fid[0], fid[1], GID)].append(
+                                            dict( type = transcript_type,
+                                            location =  [SPOS[0], EPOS[-1]], 
+                                            strand = STRD, 
+                                            ID = transcript_id,
+                                            gene_id = '' ))
+        # reorganizing the grand child
+        for gchild in det:
+            child_n_map[(fid[0], fid[1], transcript_id)].append(
+                                            dict( type = gchild.get('type', ''),
+                                            location =  gchild.get('location'),
+                                            strand = gchild.get('strand'), 
+                                            ID = gchild.get('ID'),
+                                            gene_id = '' ))
+    return p_feat, child_n_map 
+
+
+## General instruction to use the above functions:
+## Usage: GFFParser.py in.gff3 out.mat 
+
+try:
+    gff_file = sys.argv[1]
+    out_mat = sys.argv[2]
+except:
+    print __doc__
+    sys.exit(-1)
+
+## Parse the file accoring to the type and returns the genes informations --
+gene_struct = Parse(gff_file)
+
+## Write the gene annotations to a matlab struct array format --
+sio.savemat(out_mat, 
+    mdict = dict(genes = gene_struct), 
+    format = '5', 
+    oned_as = 'row')
--- a/rDiff/tools/ParseGFF.py	Thu Feb 14 23:38:36 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,532 +0,0 @@
-#!/usr/bin/env python
-"""
-Extract genome annotation from a GFF3 (a tab delimited 
-format for storing sequence features and annotations:
-http://www.sequenceontology.org/gff3.shtml) file.
-
-Usage: ParseGFF.py in.gff3 out.mat 
-
-Requirements: 
-    Scipy :- http://scipy.org/ 
-    Numpy :- http://numpy.org/ 
-
-Copyright (C)	2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany 
-		2012- Memorial Sloan-Kettering Cancer Center, New York City, USA 
-"""
-
-import re, sys
-import scipy.io as sio
-import numpy as np
-
-def addCDSphase(strand, cds):
-    """Add CDS phase to the CDS exons
-    """
-    cds_region, cds_flag = [], 0
-    if strand == '+':
-        for cdspos in cds:
-            if cds_flag == 0:
-                cdspos = (cdspos[0], cdspos[1], 0)
-                diff = (cdspos[1]-(cdspos[0]-1))%3
-            else:
-                xy = 0
-                if diff == 0:
-                    cdspos = (cdspos[0], cdspos[1], 0)
-                elif diff == 1:
-                    cdspos = (cdspos[0], cdspos[1], 2)
-                    xy = 2
-                elif diff == 2:
-                    cdspos = (cdspos[0], cdspos[1], 1)
-                    xy = 1
-                diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
-            cds_region.append(cdspos)
-            cds_flag = 1
-    elif strand == '-':
-        cds.reverse()
-        for cdspos in cds:
-            if cds_flag == 0:
-                cdspos = (cdspos[0], cdspos[1], 0)
-                diff = (cdspos[1]-(cdspos[0]-1))%3
-            else:
-                xy = 0
-                if diff == 0:
-                    cdspos = (cdspos[0], cdspos[1], 0)
-                elif diff == 1:
-                    cdspos = (cdspos[0], cdspos[1], 2)
-                    xy = 2
-                elif diff == 2:
-                    cdspos = (cdspos[0], cdspos[1], 1)
-                    xy = 1
-                diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
-            cds_region.append(cdspos)
-            cds_flag = 1
-        cds_region.reverse()
-    return cds_region
-
-def createExon(strand_p, five_p_utr, cds_cod, three_p_utr):
-    """Create exon cordinates from UTR's and CDS region
-    """
-    exon_pos = []
-    if strand_p == '+':        
-        utr5_start, utr5_end = 0, 0
-        if five_p_utr != []:
-            utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1] 
-        cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
-        jun_exon = []
-        if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
-            jun_exon = [utr5_start, cds_5end]    
-        if len(cds_cod) == 1:
-            five_prime_flag = 0
-            if jun_exon != []:
-                five_p_utr = five_p_utr[:-1]
-                five_prime_flag = 1
-            for utr5 in five_p_utr:
-                exon_pos.append(utr5)
-            jun_exon = []
-            utr3_start, utr3_end = 0, 0
-            if three_p_utr != []: 
-                utr3_start = three_p_utr[0][0]
-                utr3_end = three_p_utr[0][1]
-            if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
-                jun_exon = [cds_5start, utr3_end]
-            three_prime_flag = 0
-            if jun_exon != []: 
-                cds_cod = cds_cod[:-1]
-                three_p_utr = three_p_utr[1:]
-                three_prime_flag = 1
-            if five_prime_flag == 1 and three_prime_flag == 1:
-                exon_pos.append([utr5_start, utr3_end])
-            if five_prime_flag == 1 and three_prime_flag == 0:
-                exon_pos.append([utr5_start, cds_5end])
-                cds_cod = cds_cod[:-1]
-            if five_prime_flag == 0 and three_prime_flag == 1:
-                exon_pos.append([cds_5start, utr3_end])
-            for cds in cds_cod:
-                exon_pos.append(cds)
-            for utr3 in three_p_utr:
-                exon_pos.append(utr3)
-        else:    
-            if jun_exon != []:
-                five_p_utr = five_p_utr[:-1]
-                cds_cod = cds_cod[1:]
-            for utr5 in five_p_utr:
-                exon_pos.append(utr5)
-            exon_pos.append(jun_exon) if jun_exon != [] else ''
-            jun_exon = []
-            utr3_start, utr3_end = 0, 0
-            if three_p_utr != []:
-                utr3_start = three_p_utr[0][0]
-                utr3_end = three_p_utr[0][1]
-            cds_3start = cds_cod[-1][0]
-            cds_3end = cds_cod[-1][1]
-            if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:       
-                jun_exon = [cds_3start, utr3_end]
-            if jun_exon != []:
-                cds_cod = cds_cod[:-1]
-                three_p_utr = three_p_utr[1:]
-            for cds in cds_cod:
-                exon_pos.append(cds)
-            exon_pos.append(jun_exon) if jun_exon != [] else ''
-            for utr3 in three_p_utr:
-                exon_pos.append(utr3)
-    elif strand_p == '-':
-        utr3_start, utr3_end = 0, 0        
-        if three_p_utr != []:
-            utr3_start = three_p_utr[-1][0]
-            utr3_end = three_p_utr[-1][1]
-        cds_3start = cds_cod[0][0]
-        cds_3end = cds_cod[0][1]
-        jun_exon = []
-        if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
-            jun_exon = [utr3_start, cds_3end]  
-        if len(cds_cod) == 1:    
-            three_prime_flag = 0
-            if jun_exon != []:
-                three_p_utr = three_p_utr[:-1]
-                three_prime_flag = 1
-            for utr3 in three_p_utr:
-                exon_pos.append(utr3)
-            jun_exon = []
-            (utr5_start, utr5_end) = (0, 0)
-            if five_p_utr != []:
-                utr5_start = five_p_utr[0][0]
-                utr5_end = five_p_utr[0][1]
-            if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
-                jun_exon = [cds_3start, utr5_end]
-            five_prime_flag = 0
-            if jun_exon != []:
-                cds_cod = cds_cod[:-1]
-                five_p_utr = five_p_utr[1:]
-                five_prime_flag = 1
-            if three_prime_flag == 1 and five_prime_flag == 1:
-                exon_pos.append([utr3_start, utr5_end])
-            if three_prime_flag == 1 and five_prime_flag == 0:
-                exon_pos.append([utr3_start, cds_3end])
-                cds_cod = cds_cod[:-1]
-            if three_prime_flag == 0 and five_prime_flag == 1:
-                exon_pos.append([cds_3start, utr5_end])        
-            for cds in cds_cod:
-                exon_pos.append(cds)
-            for utr5 in five_p_utr:
-                exon_pos.append(utr5)
-        else:
-            if jun_exon != []:
-                three_p_utr = three_p_utr[:-1]
-                cds_cod = cds_cod[1:]
-            for utr3 in three_p_utr:
-                exon_pos.append(utr3)   
-            if jun_exon != []:
-                exon_pos.append(jun_exon)
-            jun_exon = []
-            (utr5_start, utr5_end) = (0, 0)
-            if five_p_utr != []:
-                utr5_start = five_p_utr[0][0]
-                utr5_end = five_p_utr[0][1]    
-            cds_5start = cds_cod[-1][0]
-            cds_5end = cds_cod[-1][1]
-            if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
-                jun_exon = [cds_5start, utr5_end]
-            if jun_exon != []:
-                cds_cod = cds_cod[:-1]
-                five_p_utr = five_p_utr[1:]
-            for cds in cds_cod:
-                exon_pos.append(cds)
-            if jun_exon != []:
-                exon_pos.append(jun_exon)    
-            for utr5 in five_p_utr:
-                exon_pos.append(utr5)
-    return exon_pos
-
-def init_gene():
-    """Initializing the gene structure
-    """
-    gene_details=dict(
-                    id = '', 
-                    anno_id = [],
-                    confgenes_id = [],
-                    name = '',
-                    source = '',
-                    gene_info = {},
-                    alias = '',
-                    name2 = [],
-                    strand = '',
-                    chr = '',
-                    chr_num = [],
-                    paralogs = [],
-                    start = '',
-                    stop = '',
-                    transcripts = [],
-                    transcript_info = [],
-                    transcript_status = [],
-                    transcript_valid = [],
-                    exons = [],
-                    exons_confirmed = [],
-                    cds_exons = [],
-                    utr5_exons = [],
-                    utr3_exons = [],
-                    tis = [],
-                    tis_conf = [],
-                    tis_info = [],
-                    cdsStop = [],
-                    cdsStop_conf = [],
-                    cdsStop_info = [],
-                    tss = [],
-                    tss_info = [],
-                    tss_conf = [],
-                    cleave = [],
-                    cleave_info = [],
-                    cleave_conf = [],
-                    polya = [],
-                    polya_info = [],
-                    polya_conf = [],
-                    is_alt = [],
-                    is_alt_spliced = 0,
-                    is_valid = [],
-                    transcript_complete = [],
-                    is_complete = [],
-                    is_correctly_gff3_referenced = '',
-                    splicegraph = []
-                    )
-    return gene_details
-
-def FeatureValueFormat(singlegene):
-    """Make feature value compactable to write in a .mat format
-    """
-    comp_exon = np.zeros((len(singlegene['exons']),), dtype=np.object)
-    for i in range(len(singlegene['exons'])):
-        comp_exon[i]= np.array(singlegene['exons'][i])
-    singlegene['exons'] = comp_exon
-    comp_cds = np.zeros((len(singlegene['cds_exons']),), dtype=np.object)
-    for i in range(len(singlegene['cds_exons'])):
-        comp_cds[i]= np.array(singlegene['cds_exons'][i])
-    singlegene['cds_exons'] = comp_cds
-    comp_utr3 = np.zeros((len(singlegene['utr3_exons']),), dtype=np.object)
-    for i in range(len(singlegene['utr3_exons'])):
-        comp_utr3[i]= np.array(singlegene['utr3_exons'][i])
-    singlegene['utr3_exons'] = comp_utr3
-    comp_utr5 = np.zeros((len(singlegene['utr5_exons']),), dtype=np.object)
-    for i in range(len(singlegene['utr5_exons'])):
-        comp_utr5[i]= np.array(singlegene['utr5_exons'][i])
-    singlegene['utr5_exons'] = comp_utr5
-    comp_transcripts = np.zeros((len(singlegene['transcripts']),), dtype=np.object)
-    for i in range(len(singlegene['transcripts'])):
-        comp_transcripts[i]= np.array(singlegene['transcripts'][i])
-    singlegene['transcripts'] = comp_transcripts
-    comp_tss = np.zeros((len(singlegene['tss']),), dtype=np.object)
-    for i in range(len(singlegene['tss'])):
-        comp_tss[i]= np.array(singlegene['tss'][i])
-    singlegene['tss'] = comp_tss
-    comp_tis = np.zeros((len(singlegene['tis']),), dtype=np.object)
-    for i in range(len(singlegene['tis'])):
-        comp_tis[i]= np.array(singlegene['tis'][i])
-    singlegene['tis'] = comp_tis
-    comp_cleave = np.zeros((len(singlegene['cleave']),), dtype=np.object)
-    for i in range(len(singlegene['cleave'])):
-        comp_cleave[i]= np.array(singlegene['cleave'][i])
-    singlegene['cleave'] = comp_cleave
-    comp_cdsStop = np.zeros((len(singlegene['cdsStop']),), dtype=np.object)
-    for i in range(len(singlegene['cdsStop'])):
-        comp_cdsStop[i]= np.array(singlegene['cdsStop'][i])
-    singlegene['cdsStop'] = comp_cdsStop
-
-    return singlegene 
-
-def CreateGeneModels(genes_cmpt, transcripts_cmpt, exons_cmpt, utr3_cmpt, utr5_cmpt, cds_cmpt):
-    """Creating Coding/Non-coding gene models from parsed GFF objects.
-    """
-    gene_counter, gene_models = 1, []
-    for gene_entry in genes_cmpt: ## Figure out the genes and transcripts associated feature 
-        if gene_entry in transcripts_cmpt:
-            gene=init_gene() 
-            gene['id']=gene_counter
-            gene['name']=gene_entry[1]
-            gene['chr']=genes_cmpt[gene_entry]['chr']
-            gene['source']=genes_cmpt[gene_entry]['source']
-            gene['start']=genes_cmpt[gene_entry]['start']
-            gene['stop']=genes_cmpt[gene_entry]['stop']
-            gene['strand']=genes_cmpt[gene_entry]['strand']
-            if not gene['strand'] in ['+', '-']:
-                gene['strand']='.' # Strand info not known replaced with a dot symbol instead of None, ?, . etc.
-            if len(transcripts_cmpt[gene_entry])>1:
-                gene['is_alt_spliced'] = 1
-                gene['is_alt'] = 1
-	    gtype=[]
-            for tids in transcripts_cmpt[gene_entry]: ## transcript section related tags 
-                gene['transcripts'].append(tids['ID'])
-		gtype.append(tids['type'])
-                exon_cod, utr5_cod, utr3_cod, cds_cod = [], [], [], []
-                if (gene['chr'], tids['ID']) in exons_cmpt:
-                    exon_cod = [[feat_exon['start'], feat_exon['stop']] for feat_exon in exons_cmpt[(gene['chr'], tids['ID'])]]
-                if (gene['chr'], tids['ID']) in utr5_cmpt:
-                    utr5_cod = [[feat_utr5['start'], feat_utr5['stop']] for feat_utr5 in utr5_cmpt[(gene['chr'], tids['ID'])]]
-                if (gene['chr'], tids['ID']) in utr3_cmpt:
-                    utr3_cod = [[feat_utr3['start'], feat_utr3['stop']] for feat_utr3 in utr3_cmpt[(gene['chr'], tids['ID'])]]
-                if (gene['chr'], tids['ID']) in cds_cmpt:
-                    cds_cod = [[feat_cds['start'], feat_cds['stop']] for feat_cds in cds_cmpt[(gene['chr'], tids['ID'])]]
-                if len(exon_cod) == 0: ## build exon coordinates from UTR3, UTR5 and CDS
-                    if cds_cod != []:
-                        exon_cod=createExon(gene['strand'], utr5_cod, cds_cod, utr3_cod) 
-
-                if gene['strand']=='-': ## general order to coordinates
-                    if len(exon_cod) >1:
-                        if exon_cod[0][0] > exon_cod[-1][0]:
-                            exon_cod.reverse()
-                    if len(cds_cod) >1:
-                        if cds_cod[0][0] > cds_cod[-1][0]: 
-                            cds_cod.reverse()
-                    if len(utr3_cod) >1:
-                        if utr3_cod[0][0] > utr3_cod[-1][0]: 
-                            utr3_cod.reverse()
-                    if len(utr5_cod) >1:
-                        if utr5_cod[0][0] > utr5_cod[-1][0]:
-                            utr5_cod.reverse()
-
-                tis, cdsStop, tss, cleave = [], [], [], [] ## speacial sited in the gene region 
-                if cds_cod != []:
-                    if gene['strand'] == '+':
-                        tis = [cds_cod[0][0]]
-                        cdsStop = [cds_cod[-1][1]-3]
-                    elif gene['strand'] == '-':
-                        tis = [cds_cod[-1][1]]
-                        cdsStop = [cds_cod[0][0]+3]
-                if utr5_cod != []:
-                    if gene['strand'] == '+':
-                        tss = [utr5_cod[0][0]]
-                    elif gene['strand'] == '-':
-                        tss = [utr5_cod[-1][1]]
-                if utr3_cod != []:
-                    if gene['strand'] == '+':
-                        cleave = [utr3_cod[-1][1]]
-                    elif gene['strand'] == '-':
-                        cleave = [utr3_cod[0][0]]
-
-                cds_status, exon_status, utr_status = 0, 0, 0 ## status of the complete elements of the gene
-                if cds_cod != []: ## adding phase to the CDS region 
-                    cds_cod_phase = addCDSphase(gene['strand'], cds_cod)
-                    cds_status = 1
-                    gene['cds_exons'].append(cds_cod_phase)
-
-                if exon_cod != []: 
-                    exon_status = 1
-                if utr5_cod != [] or utr3_cod != []: 
-                    utr_status = 1
-                if cds_status != 0 and exon_status != 0 and utr_status != 0:
-                    gene['transcript_status'].append(1)
-                else:
-                    gene['transcript_status'].append(0)
-
-                if exon_cod: ## final check point for a valid gene model 
-                    gene['exons'].append(exon_cod)
-                    gene['utr3_exons'].append(utr3_cod)
-                    gene['utr5_exons'].append(utr5_cod)
-                    gene['tis'].append(tis)
-                    gene['cdsStop'].append(cdsStop)
-                    gene['tss'].append(tss)
-                    gene['cleave'].append(cleave)    
-	    
-	    gtype=list(set(gtype)) ## different types  
-            gene['gene_info']=dict(ID=gene_entry[1],
-				Source=genes_cmpt[gene_entry]['source'],
-				Type=gtype)
-            gene=FeatureValueFormat(gene) ## get prepare for MAT writing 
-            gene_counter+=1
-            gene_models.append(gene)
-    return gene_models    
-
-def GFFParse(gff_file):
-    """Parsing GFF file based on feature relationship.
-    """
-    genes, utr5, exons=dict(), dict(), dict()
-    transcripts, utr3, cds=dict(), dict(), dict()
-    # TODO Include growing key words of different non-coding/coding transcripts 
-    features=['mrna', 'transcript', 'ncRNA', 'mirna', 'pseudogenic_transcript', 'rrna', 'snorna', 'snrna', 'trna', 'scrna']
-    gff_handle=open(gff_file, "rU")
-    for gff_line in gff_handle:
-        gff_line=gff_line.strip('\n\r').split('\t')
-        if re.match(r'#|>', gff_line[0]): # skip commented line or fasta identifier line 
-            continue
-        if len(gff_line)==1: # skip fasta sequence/empty line if present 
-            continue 
-        assert len(gff_line)==9, '\t'.join(gff_line) # not found 9 tab-delimited fields in this line     
-        if '' in gff_line: # skip this line if there any field with an empty value
-            print 'Skipping..', '\t'.join(gff_line)
-            continue
-        if gff_line[-1][-1]==';': # trim the last ';' character 
-            gff_line[-1]=gff_line[-1].strip(';')
-        if gff_line[2].lower() in ['gene', 'pseudogene']:
-            gid, gene_info=None, dict()
-            gene_info['start']=int(gff_line[3])
-            gene_info['stop']=int(gff_line[4])
-            gene_info['chr']=gff_line[0]
-            gene_info['source']=gff_line[1]
-            gene_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=') # gff attributes are separated by key=value pair 
-                if attb[0]=='ID':
-                    gid=attb[1]
-                    break
-            genes[(gff_line[0], gid)]=gene_info # store gene information based on the chromosome and gene symbol.
-        elif gff_line[2].lower() in features: 
-            gid, mrna_info=None, dict() 
-            mrna_info['start']=int(gff_line[3])
-            mrna_info['stop']=int(gff_line[4])
-            mrna_info['chr']=gff_line[0]
-            mrna_info['strand']=gff_line[6]
-            mrna_info['type'] = gff_line[2]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    gid=attb[1]
-                elif attb[0]=='ID':
-                    mrna_info[attb[0]]=attb[1]
-            for fid in gid.split(','): # child may be mapped to multiple parents ex: Parent=AT01,AT01-1-Protein 
-                if (gff_line[0], fid) in transcripts:
-                    transcripts[(gff_line[0], fid)].append(mrna_info)
-                else:
-                    transcripts[(gff_line[0], fid)]=[mrna_info]
-        elif gff_line[2].lower() in ['exon']:
-            tids, exon_info=None, dict()
-            exon_info['start']=int(gff_line[3])
-            exon_info['stop']=int(gff_line[4])
-            exon_info['chr']=gff_line[0]
-            exon_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    tids=attb[1]
-                    break
-            for tid in tids.split(','):
-                if (gff_line[0], tid) in exons:
-                    exons[(gff_line[0], tid)].append(exon_info)
-                else:
-                    exons[(gff_line[0], tid)]=[exon_info]
-        elif gff_line[2].lower() in ['five_prime_utr']:
-            utr5_info, tids=dict(), None
-            utr5_info['start']=int(gff_line[3])
-            utr5_info['stop']=int(gff_line[4])
-            utr5_info['chr']=gff_line[0]
-            utr5_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    tids=attb[1]
-                    break
-            for tid in tids.split(','):
-                if (gff_line[0], tid) in utr5:
-                    utr5[(gff_line[0], tid)].append(utr5_info)
-                else:
-                    utr5[(gff_line[0], tid)]=[utr5_info]
-        elif gff_line[2].lower() in ['cds']:
-            cds_info, tids=dict(), None
-            cds_info['start']=int(gff_line[3])
-            cds_info['stop']=int(gff_line[4])
-            cds_info['chr']=gff_line[0]
-            cds_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    tids=attb[1]
-                    break
-            for tid in tids.split(','):
-                if (gff_line[0], tid) in cds:
-                    cds[(gff_line[0], tid)].append(cds_info)
-                else:
-                    cds[(gff_line[0], tid)]=[cds_info]
-        elif gff_line[2].lower() in ['three_prime_utr']:
-            utr3_info, tids=dict(), None
-            utr3_info['start']=int(gff_line[3])
-            utr3_info['stop']=int(gff_line[4])
-            utr3_info['chr']=gff_line[0]
-            utr3_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    tids=attb[1]
-                    break
-            for tid in tids.split(','):
-                if (gff_line[0], tid) in utr3:
-                    utr3[(gff_line[0], tid)].append(utr3_info)
-                else:
-                    utr3[(gff_line[0], tid)]=[utr3_info]
-    gff_handle.close()
-    return genes, transcripts, exons, utr3, utr5, cds
-
-def __main__():
-    """extract genome feature information 
-    """
-    try:
-        gff_file = sys.argv[1]
-        mat_file = sys.argv[2]
-    except:
-        print __doc__
-        sys.exit(-1)
-
-    genes, transcripts, exons, utr3, utr5, cds = GFFParse(gff_file) 
-    gene_models = CreateGeneModels(genes, transcripts, exons, utr3, utr5, cds)
-    # TODO Write to matlab/octave struct instead of cell arrays.
-    sio.savemat(mat_file, 
-                    mdict=dict(genes=gene_models), 
-                    format='5', 
-                    oned_as='row')
-
-if __name__=='__main__':
-    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rDiff/tools/helper.py	Tue Oct 08 07:15:44 2013 -0400
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+"""
+Common utility functions
+"""
+
+import os 
+import re
+import sys 
+import gzip 
+import bz2
+import numpy 
+
+def init_gene_DE():
+    """
+    Initializing the gene structure for DE
+    """
+    gene_det = [('id', 'f8'), 
+                    ('chr', 'S15'), 
+                    ('exons', numpy.dtype),
+                    ('gene_info', numpy.dtype),
+                    ('is_alt_spliced', 'f8'), 
+                    ('name', 'S25'),
+                    ('source', 'S25'),
+                    ('start', 'f8'),
+                    ('stop', 'f8'), 
+                    ('strand', 'S2'), 
+                    ('transcripts', numpy.dtype)]
+
+    return gene_det
+
+def _open_file(fname):
+    """
+    Open the file (supports .gz .bz2) and returns the handler
+    """
+    try:
+        if os.path.splitext(fname)[1] == ".gz":
+            FH = gzip.open(fname, 'rb')
+        elif os.path.splitext(fname)[1] == ".bz2":
+            FH = bz2.BZ2File(fname, 'rb')
+        else:
+            FH = open(fname, 'rU')
+    except Exception as error:
+        sys.exit(error)
+    return FH
+
+def make_Exon_cod(strand_p, five_p_utr, cds_cod, three_p_utr):
+    """
+    Create exon cordinates from UTR's and CDS region
+    """
+    exon_pos = []
+    if strand_p == '+':        
+        utr5_start, utr5_end = 0, 0
+        if five_p_utr != []:
+            utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1] 
+        cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
+        jun_exon = []
+        if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
+            jun_exon = [utr5_start, cds_5end]    
+        if len(cds_cod) == 1:
+            five_prime_flag = 0
+            if jun_exon != []:
+                five_p_utr = five_p_utr[:-1]
+                five_prime_flag = 1
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+            jun_exon = []
+            utr3_start, utr3_end = 0, 0
+            if three_p_utr != []: 
+                utr3_start = three_p_utr[0][0]
+                utr3_end = three_p_utr[0][1]
+            if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
+                jun_exon = [cds_5start, utr3_end]
+            three_prime_flag = 0
+            if jun_exon != []: 
+                cds_cod = cds_cod[:-1]
+                three_p_utr = three_p_utr[1:]
+                three_prime_flag = 1
+            if five_prime_flag == 1 and three_prime_flag == 1:
+                exon_pos.append([utr5_start, utr3_end])
+            if five_prime_flag == 1 and three_prime_flag == 0:
+                exon_pos.append([utr5_start, cds_5end])
+                cds_cod = cds_cod[:-1]
+            if five_prime_flag == 0 and three_prime_flag == 1:
+                exon_pos.append([cds_5start, utr3_end])
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+        else:    
+            if jun_exon != []:
+                five_p_utr = five_p_utr[:-1]
+                cds_cod = cds_cod[1:]
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+            exon_pos.append(jun_exon) if jun_exon != [] else ''
+            jun_exon = []
+            utr3_start, utr3_end = 0, 0
+            if three_p_utr != []:
+                utr3_start = three_p_utr[0][0]
+                utr3_end = three_p_utr[0][1]
+            cds_3start = cds_cod[-1][0]
+            cds_3end = cds_cod[-1][1]
+            if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:       
+                jun_exon = [cds_3start, utr3_end]
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                three_p_utr = three_p_utr[1:]
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            exon_pos.append(jun_exon) if jun_exon != [] else ''
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+    elif strand_p == '-':
+        utr3_start, utr3_end = 0, 0        
+        if three_p_utr != []:
+            utr3_start = three_p_utr[-1][0]
+            utr3_end = three_p_utr[-1][1]
+        cds_3start = cds_cod[0][0]
+        cds_3end = cds_cod[0][1]
+        jun_exon = []
+        if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
+            jun_exon = [utr3_start, cds_3end]  
+        if len(cds_cod) == 1:    
+            three_prime_flag = 0
+            if jun_exon != []:
+                three_p_utr = three_p_utr[:-1]
+                three_prime_flag = 1
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+            jun_exon = []
+            (utr5_start, utr5_end) = (0, 0)
+            if five_p_utr != []:
+                utr5_start = five_p_utr[0][0]
+                utr5_end = five_p_utr[0][1]
+            if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
+                jun_exon = [cds_3start, utr5_end]
+            five_prime_flag = 0
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                five_p_utr = five_p_utr[1:]
+                five_prime_flag = 1
+            if three_prime_flag == 1 and five_prime_flag == 1:
+                exon_pos.append([utr3_start, utr5_end])
+            if three_prime_flag == 1 and five_prime_flag == 0:
+                exon_pos.append([utr3_start, cds_3end])
+                cds_cod = cds_cod[:-1]
+            if three_prime_flag == 0 and five_prime_flag == 1:
+                exon_pos.append([cds_3start, utr5_end])        
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+        else:
+            if jun_exon != []:
+                three_p_utr = three_p_utr[:-1]
+                cds_cod = cds_cod[1:]
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)   
+            if jun_exon != []:
+                exon_pos.append(jun_exon)
+            jun_exon = []
+            (utr5_start, utr5_end) = (0, 0)
+            if five_p_utr != []:
+                utr5_start = five_p_utr[0][0]
+                utr5_end = five_p_utr[0][1]    
+            cds_5start = cds_cod[-1][0]
+            cds_5end = cds_cod[-1][1]
+            if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
+                jun_exon = [cds_5start, utr5_end]
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                five_p_utr = five_p_utr[1:]
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            if jun_exon != []:
+                exon_pos.append(jun_exon)    
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+    return exon_pos