Mercurial > repos > vipints > rdiff
diff rDiff/tools/ParseGFF.py @ 0:0f80a5141704
version 0.3 uploaded
author | vipints |
---|---|
date | Thu, 14 Feb 2013 23:38:36 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rDiff/tools/ParseGFF.py Thu Feb 14 23:38:36 2013 -0500 @@ -0,0 +1,532 @@ +#!/usr/bin/env python +""" +Extract genome annotation from a GFF3 (a tab delimited +format for storing sequence features and annotations: +http://www.sequenceontology.org/gff3.shtml) file. + +Usage: ParseGFF.py in.gff3 out.mat + +Requirements: + Scipy :- http://scipy.org/ + Numpy :- http://numpy.org/ + +Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany + 2012- Memorial Sloan-Kettering Cancer Center, New York City, USA +""" + +import re, sys +import scipy.io as sio +import numpy as np + +def addCDSphase(strand, cds): + """Add CDS phase to the CDS exons + """ + cds_region, cds_flag = [], 0 + if strand == '+': + for cdspos in cds: + if cds_flag == 0: + cdspos = (cdspos[0], cdspos[1], 0) + diff = (cdspos[1]-(cdspos[0]-1))%3 + else: + xy = 0 + if diff == 0: + cdspos = (cdspos[0], cdspos[1], 0) + elif diff == 1: + cdspos = (cdspos[0], cdspos[1], 2) + xy = 2 + elif diff == 2: + cdspos = (cdspos[0], cdspos[1], 1) + xy = 1 + diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3 + cds_region.append(cdspos) + cds_flag = 1 + elif strand == '-': + cds.reverse() + for cdspos in cds: + if cds_flag == 0: + cdspos = (cdspos[0], cdspos[1], 0) + diff = (cdspos[1]-(cdspos[0]-1))%3 + else: + xy = 0 + if diff == 0: + cdspos = (cdspos[0], cdspos[1], 0) + elif diff == 1: + cdspos = (cdspos[0], cdspos[1], 2) + xy = 2 + elif diff == 2: + cdspos = (cdspos[0], cdspos[1], 1) + xy = 1 + diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3 + cds_region.append(cdspos) + cds_flag = 1 + cds_region.reverse() + return cds_region + +def createExon(strand_p, five_p_utr, cds_cod, three_p_utr): + """Create exon cordinates from UTR's and CDS region + """ + exon_pos = [] + if strand_p == '+': + utr5_start, utr5_end = 0, 0 + if five_p_utr != []: + utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1] + cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1] + jun_exon = [] + if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1: + jun_exon = [utr5_start, cds_5end] + if len(cds_cod) == 1: + five_prime_flag = 0 + if jun_exon != []: + five_p_utr = five_p_utr[:-1] + five_prime_flag = 1 + for utr5 in five_p_utr: + exon_pos.append(utr5) + jun_exon = [] + utr3_start, utr3_end = 0, 0 + if three_p_utr != []: + utr3_start = three_p_utr[0][0] + utr3_end = three_p_utr[0][1] + if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1: + jun_exon = [cds_5start, utr3_end] + three_prime_flag = 0 + if jun_exon != []: + cds_cod = cds_cod[:-1] + three_p_utr = three_p_utr[1:] + three_prime_flag = 1 + if five_prime_flag == 1 and three_prime_flag == 1: + exon_pos.append([utr5_start, utr3_end]) + if five_prime_flag == 1 and three_prime_flag == 0: + exon_pos.append([utr5_start, cds_5end]) + cds_cod = cds_cod[:-1] + if five_prime_flag == 0 and three_prime_flag == 1: + exon_pos.append([cds_5start, utr3_end]) + for cds in cds_cod: + exon_pos.append(cds) + for utr3 in three_p_utr: + exon_pos.append(utr3) + else: + if jun_exon != []: + five_p_utr = five_p_utr[:-1] + cds_cod = cds_cod[1:] + for utr5 in five_p_utr: + exon_pos.append(utr5) + exon_pos.append(jun_exon) if jun_exon != [] else '' + jun_exon = [] + utr3_start, utr3_end = 0, 0 + if three_p_utr != []: + utr3_start = three_p_utr[0][0] + utr3_end = three_p_utr[0][1] + cds_3start = cds_cod[-1][0] + cds_3end = cds_cod[-1][1] + if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1: + jun_exon = [cds_3start, utr3_end] + if jun_exon != []: + cds_cod = cds_cod[:-1] + three_p_utr = three_p_utr[1:] + for cds in cds_cod: + exon_pos.append(cds) + exon_pos.append(jun_exon) if jun_exon != [] else '' + for utr3 in three_p_utr: + exon_pos.append(utr3) + elif strand_p == '-': + utr3_start, utr3_end = 0, 0 + if three_p_utr != []: + utr3_start = three_p_utr[-1][0] + utr3_end = three_p_utr[-1][1] + cds_3start = cds_cod[0][0] + cds_3end = cds_cod[0][1] + jun_exon = [] + if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1: + jun_exon = [utr3_start, cds_3end] + if len(cds_cod) == 1: + three_prime_flag = 0 + if jun_exon != []: + three_p_utr = three_p_utr[:-1] + three_prime_flag = 1 + for utr3 in three_p_utr: + exon_pos.append(utr3) + jun_exon = [] + (utr5_start, utr5_end) = (0, 0) + if five_p_utr != []: + utr5_start = five_p_utr[0][0] + utr5_end = five_p_utr[0][1] + if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1: + jun_exon = [cds_3start, utr5_end] + five_prime_flag = 0 + if jun_exon != []: + cds_cod = cds_cod[:-1] + five_p_utr = five_p_utr[1:] + five_prime_flag = 1 + if three_prime_flag == 1 and five_prime_flag == 1: + exon_pos.append([utr3_start, utr5_end]) + if three_prime_flag == 1 and five_prime_flag == 0: + exon_pos.append([utr3_start, cds_3end]) + cds_cod = cds_cod[:-1] + if three_prime_flag == 0 and five_prime_flag == 1: + exon_pos.append([cds_3start, utr5_end]) + for cds in cds_cod: + exon_pos.append(cds) + for utr5 in five_p_utr: + exon_pos.append(utr5) + else: + if jun_exon != []: + three_p_utr = three_p_utr[:-1] + cds_cod = cds_cod[1:] + for utr3 in three_p_utr: + exon_pos.append(utr3) + if jun_exon != []: + exon_pos.append(jun_exon) + jun_exon = [] + (utr5_start, utr5_end) = (0, 0) + if five_p_utr != []: + utr5_start = five_p_utr[0][0] + utr5_end = five_p_utr[0][1] + cds_5start = cds_cod[-1][0] + cds_5end = cds_cod[-1][1] + if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1: + jun_exon = [cds_5start, utr5_end] + if jun_exon != []: + cds_cod = cds_cod[:-1] + five_p_utr = five_p_utr[1:] + for cds in cds_cod: + exon_pos.append(cds) + if jun_exon != []: + exon_pos.append(jun_exon) + for utr5 in five_p_utr: + exon_pos.append(utr5) + return exon_pos + +def init_gene(): + """Initializing the gene structure + """ + gene_details=dict( + id = '', + anno_id = [], + confgenes_id = [], + name = '', + source = '', + gene_info = {}, + alias = '', + name2 = [], + strand = '', + chr = '', + chr_num = [], + paralogs = [], + start = '', + stop = '', + transcripts = [], + transcript_info = [], + transcript_status = [], + transcript_valid = [], + exons = [], + exons_confirmed = [], + cds_exons = [], + utr5_exons = [], + utr3_exons = [], + tis = [], + tis_conf = [], + tis_info = [], + cdsStop = [], + cdsStop_conf = [], + cdsStop_info = [], + tss = [], + tss_info = [], + tss_conf = [], + cleave = [], + cleave_info = [], + cleave_conf = [], + polya = [], + polya_info = [], + polya_conf = [], + is_alt = [], + is_alt_spliced = 0, + is_valid = [], + transcript_complete = [], + is_complete = [], + is_correctly_gff3_referenced = '', + splicegraph = [] + ) + return gene_details + +def FeatureValueFormat(singlegene): + """Make feature value compactable to write in a .mat format + """ + comp_exon = np.zeros((len(singlegene['exons']),), dtype=np.object) + for i in range(len(singlegene['exons'])): + comp_exon[i]= np.array(singlegene['exons'][i]) + singlegene['exons'] = comp_exon + comp_cds = np.zeros((len(singlegene['cds_exons']),), dtype=np.object) + for i in range(len(singlegene['cds_exons'])): + comp_cds[i]= np.array(singlegene['cds_exons'][i]) + singlegene['cds_exons'] = comp_cds + comp_utr3 = np.zeros((len(singlegene['utr3_exons']),), dtype=np.object) + for i in range(len(singlegene['utr3_exons'])): + comp_utr3[i]= np.array(singlegene['utr3_exons'][i]) + singlegene['utr3_exons'] = comp_utr3 + comp_utr5 = np.zeros((len(singlegene['utr5_exons']),), dtype=np.object) + for i in range(len(singlegene['utr5_exons'])): + comp_utr5[i]= np.array(singlegene['utr5_exons'][i]) + singlegene['utr5_exons'] = comp_utr5 + comp_transcripts = np.zeros((len(singlegene['transcripts']),), dtype=np.object) + for i in range(len(singlegene['transcripts'])): + comp_transcripts[i]= np.array(singlegene['transcripts'][i]) + singlegene['transcripts'] = comp_transcripts + comp_tss = np.zeros((len(singlegene['tss']),), dtype=np.object) + for i in range(len(singlegene['tss'])): + comp_tss[i]= np.array(singlegene['tss'][i]) + singlegene['tss'] = comp_tss + comp_tis = np.zeros((len(singlegene['tis']),), dtype=np.object) + for i in range(len(singlegene['tis'])): + comp_tis[i]= np.array(singlegene['tis'][i]) + singlegene['tis'] = comp_tis + comp_cleave = np.zeros((len(singlegene['cleave']),), dtype=np.object) + for i in range(len(singlegene['cleave'])): + comp_cleave[i]= np.array(singlegene['cleave'][i]) + singlegene['cleave'] = comp_cleave + comp_cdsStop = np.zeros((len(singlegene['cdsStop']),), dtype=np.object) + for i in range(len(singlegene['cdsStop'])): + comp_cdsStop[i]= np.array(singlegene['cdsStop'][i]) + singlegene['cdsStop'] = comp_cdsStop + + return singlegene + +def CreateGeneModels(genes_cmpt, transcripts_cmpt, exons_cmpt, utr3_cmpt, utr5_cmpt, cds_cmpt): + """Creating Coding/Non-coding gene models from parsed GFF objects. + """ + gene_counter, gene_models = 1, [] + for gene_entry in genes_cmpt: ## Figure out the genes and transcripts associated feature + if gene_entry in transcripts_cmpt: + gene=init_gene() + gene['id']=gene_counter + gene['name']=gene_entry[1] + gene['chr']=genes_cmpt[gene_entry]['chr'] + gene['source']=genes_cmpt[gene_entry]['source'] + gene['start']=genes_cmpt[gene_entry]['start'] + gene['stop']=genes_cmpt[gene_entry]['stop'] + gene['strand']=genes_cmpt[gene_entry]['strand'] + if not gene['strand'] in ['+', '-']: + gene['strand']='.' # Strand info not known replaced with a dot symbol instead of None, ?, . etc. + if len(transcripts_cmpt[gene_entry])>1: + gene['is_alt_spliced'] = 1 + gene['is_alt'] = 1 + gtype=[] + for tids in transcripts_cmpt[gene_entry]: ## transcript section related tags + gene['transcripts'].append(tids['ID']) + gtype.append(tids['type']) + exon_cod, utr5_cod, utr3_cod, cds_cod = [], [], [], [] + if (gene['chr'], tids['ID']) in exons_cmpt: + exon_cod = [[feat_exon['start'], feat_exon['stop']] for feat_exon in exons_cmpt[(gene['chr'], tids['ID'])]] + if (gene['chr'], tids['ID']) in utr5_cmpt: + utr5_cod = [[feat_utr5['start'], feat_utr5['stop']] for feat_utr5 in utr5_cmpt[(gene['chr'], tids['ID'])]] + if (gene['chr'], tids['ID']) in utr3_cmpt: + utr3_cod = [[feat_utr3['start'], feat_utr3['stop']] for feat_utr3 in utr3_cmpt[(gene['chr'], tids['ID'])]] + if (gene['chr'], tids['ID']) in cds_cmpt: + cds_cod = [[feat_cds['start'], feat_cds['stop']] for feat_cds in cds_cmpt[(gene['chr'], tids['ID'])]] + if len(exon_cod) == 0: ## build exon coordinates from UTR3, UTR5 and CDS + if cds_cod != []: + exon_cod=createExon(gene['strand'], utr5_cod, cds_cod, utr3_cod) + + if gene['strand']=='-': ## general order to coordinates + if len(exon_cod) >1: + if exon_cod[0][0] > exon_cod[-1][0]: + exon_cod.reverse() + if len(cds_cod) >1: + if cds_cod[0][0] > cds_cod[-1][0]: + cds_cod.reverse() + if len(utr3_cod) >1: + if utr3_cod[0][0] > utr3_cod[-1][0]: + utr3_cod.reverse() + if len(utr5_cod) >1: + if utr5_cod[0][0] > utr5_cod[-1][0]: + utr5_cod.reverse() + + tis, cdsStop, tss, cleave = [], [], [], [] ## speacial sited in the gene region + if cds_cod != []: + if gene['strand'] == '+': + tis = [cds_cod[0][0]] + cdsStop = [cds_cod[-1][1]-3] + elif gene['strand'] == '-': + tis = [cds_cod[-1][1]] + cdsStop = [cds_cod[0][0]+3] + if utr5_cod != []: + if gene['strand'] == '+': + tss = [utr5_cod[0][0]] + elif gene['strand'] == '-': + tss = [utr5_cod[-1][1]] + if utr3_cod != []: + if gene['strand'] == '+': + cleave = [utr3_cod[-1][1]] + elif gene['strand'] == '-': + cleave = [utr3_cod[0][0]] + + cds_status, exon_status, utr_status = 0, 0, 0 ## status of the complete elements of the gene + if cds_cod != []: ## adding phase to the CDS region + cds_cod_phase = addCDSphase(gene['strand'], cds_cod) + cds_status = 1 + gene['cds_exons'].append(cds_cod_phase) + + if exon_cod != []: + exon_status = 1 + if utr5_cod != [] or utr3_cod != []: + utr_status = 1 + if cds_status != 0 and exon_status != 0 and utr_status != 0: + gene['transcript_status'].append(1) + else: + gene['transcript_status'].append(0) + + if exon_cod: ## final check point for a valid gene model + gene['exons'].append(exon_cod) + gene['utr3_exons'].append(utr3_cod) + gene['utr5_exons'].append(utr5_cod) + gene['tis'].append(tis) + gene['cdsStop'].append(cdsStop) + gene['tss'].append(tss) + gene['cleave'].append(cleave) + + gtype=list(set(gtype)) ## different types + gene['gene_info']=dict(ID=gene_entry[1], + Source=genes_cmpt[gene_entry]['source'], + Type=gtype) + gene=FeatureValueFormat(gene) ## get prepare for MAT writing + gene_counter+=1 + gene_models.append(gene) + return gene_models + +def GFFParse(gff_file): + """Parsing GFF file based on feature relationship. + """ + genes, utr5, exons=dict(), dict(), dict() + transcripts, utr3, cds=dict(), dict(), dict() + # TODO Include growing key words of different non-coding/coding transcripts + features=['mrna', 'transcript', 'ncRNA', 'mirna', 'pseudogenic_transcript', 'rrna', 'snorna', 'snrna', 'trna', 'scrna'] + gff_handle=open(gff_file, "rU") + for gff_line in gff_handle: + gff_line=gff_line.strip('\n\r').split('\t') + if re.match(r'#|>', gff_line[0]): # skip commented line or fasta identifier line + continue + if len(gff_line)==1: # skip fasta sequence/empty line if present + continue + assert len(gff_line)==9, '\t'.join(gff_line) # not found 9 tab-delimited fields in this line + if '' in gff_line: # skip this line if there any field with an empty value + print 'Skipping..', '\t'.join(gff_line) + continue + if gff_line[-1][-1]==';': # trim the last ';' character + gff_line[-1]=gff_line[-1].strip(';') + if gff_line[2].lower() in ['gene', 'pseudogene']: + gid, gene_info=None, dict() + gene_info['start']=int(gff_line[3]) + gene_info['stop']=int(gff_line[4]) + gene_info['chr']=gff_line[0] + gene_info['source']=gff_line[1] + gene_info['strand']=gff_line[6] + for attb in gff_line[-1].split(';'): + attb=attb.split('=') # gff attributes are separated by key=value pair + if attb[0]=='ID': + gid=attb[1] + break + genes[(gff_line[0], gid)]=gene_info # store gene information based on the chromosome and gene symbol. + elif gff_line[2].lower() in features: + gid, mrna_info=None, dict() + mrna_info['start']=int(gff_line[3]) + mrna_info['stop']=int(gff_line[4]) + mrna_info['chr']=gff_line[0] + mrna_info['strand']=gff_line[6] + mrna_info['type'] = gff_line[2] + for attb in gff_line[-1].split(';'): + attb=attb.split('=') + if attb[0]=='Parent': + gid=attb[1] + elif attb[0]=='ID': + mrna_info[attb[0]]=attb[1] + for fid in gid.split(','): # child may be mapped to multiple parents ex: Parent=AT01,AT01-1-Protein + if (gff_line[0], fid) in transcripts: + transcripts[(gff_line[0], fid)].append(mrna_info) + else: + transcripts[(gff_line[0], fid)]=[mrna_info] + elif gff_line[2].lower() in ['exon']: + tids, exon_info=None, dict() + exon_info['start']=int(gff_line[3]) + exon_info['stop']=int(gff_line[4]) + exon_info['chr']=gff_line[0] + exon_info['strand']=gff_line[6] + for attb in gff_line[-1].split(';'): + attb=attb.split('=') + if attb[0]=='Parent': + tids=attb[1] + break + for tid in tids.split(','): + if (gff_line[0], tid) in exons: + exons[(gff_line[0], tid)].append(exon_info) + else: + exons[(gff_line[0], tid)]=[exon_info] + elif gff_line[2].lower() in ['five_prime_utr']: + utr5_info, tids=dict(), None + utr5_info['start']=int(gff_line[3]) + utr5_info['stop']=int(gff_line[4]) + utr5_info['chr']=gff_line[0] + utr5_info['strand']=gff_line[6] + for attb in gff_line[-1].split(';'): + attb=attb.split('=') + if attb[0]=='Parent': + tids=attb[1] + break + for tid in tids.split(','): + if (gff_line[0], tid) in utr5: + utr5[(gff_line[0], tid)].append(utr5_info) + else: + utr5[(gff_line[0], tid)]=[utr5_info] + elif gff_line[2].lower() in ['cds']: + cds_info, tids=dict(), None + cds_info['start']=int(gff_line[3]) + cds_info['stop']=int(gff_line[4]) + cds_info['chr']=gff_line[0] + cds_info['strand']=gff_line[6] + for attb in gff_line[-1].split(';'): + attb=attb.split('=') + if attb[0]=='Parent': + tids=attb[1] + break + for tid in tids.split(','): + if (gff_line[0], tid) in cds: + cds[(gff_line[0], tid)].append(cds_info) + else: + cds[(gff_line[0], tid)]=[cds_info] + elif gff_line[2].lower() in ['three_prime_utr']: + utr3_info, tids=dict(), None + utr3_info['start']=int(gff_line[3]) + utr3_info['stop']=int(gff_line[4]) + utr3_info['chr']=gff_line[0] + utr3_info['strand']=gff_line[6] + for attb in gff_line[-1].split(';'): + attb=attb.split('=') + if attb[0]=='Parent': + tids=attb[1] + break + for tid in tids.split(','): + if (gff_line[0], tid) in utr3: + utr3[(gff_line[0], tid)].append(utr3_info) + else: + utr3[(gff_line[0], tid)]=[utr3_info] + gff_handle.close() + return genes, transcripts, exons, utr3, utr5, cds + +def __main__(): + """extract genome feature information + """ + try: + gff_file = sys.argv[1] + mat_file = sys.argv[2] + except: + print __doc__ + sys.exit(-1) + + genes, transcripts, exons, utr3, utr5, cds = GFFParse(gff_file) + gene_models = CreateGeneModels(genes, transcripts, exons, utr3, utr5, cds) + # TODO Write to matlab/octave struct instead of cell arrays. + sio.savemat(mat_file, + mdict=dict(genes=gene_models), + format='5', + oned_as='row') + +if __name__=='__main__': + __main__()