Mercurial > repos > siyuan > prada
view pyPRADA_1.2/prada-frame @ 3:f17965495ec9 draft default tip
Uploaded
author | siyuan |
---|---|
date | Tue, 11 Mar 2014 12:14:01 -0400 |
parents | acc2ca1a3ba4 |
children |
line wrap: on
line source
#!/usr/bin/env python #This script is part of the pyPRADA package. It is used to predict the functional consequences of #gene fusions. Possible predictions include in-frame, out-of-frame, UTR-UTR, UTR-CDS, Unknown etc. #It relies on the definition of transcript CDS/UTR boundaries from ENSEMBL, and gives a prediction #for every pair of transcripts coded by the fusion genes. The whole point is to see if the fusion #lead to reading shift of the CDS sequences. #Two files are generated, brief/detail result. #Result is purely predicted, no guarantee is assumed. #Contact: Siyuan Zheng, szheng2@mdanderson.org import os import sys import re import time import subprocess import bioclass from Bio import SeqIO import ioprada ###################################################################################### help_menu='''\nUsage: prada-frame -conf xx.txt -i inputfile -docstr XX -outdir ./ **parameters** -conf see prada-fusion -conf -i input file, a tab/space delimited four-column file, each row like "GeneA GeneA_break GeneB GeneB_break" Example:FGFR3 1808661 TACC3 1739325 -docstr outfile prefix. output to two files: XX.frame.brief.txt, XX.frame.detail.txt -outdir output directory, default is ./ ''' args=sys.argv if '-h' in args or '-help' in args or len(args)==1: print help_menu sys.exit(0) if '-i' not in sys.argv: sys.exit('Input file needed') else: i=sys.argv.index('-i') datafilename=sys.argv[i+1] if '-outdir' not in args: outpath=os.path.abspath('./') else: i=args.index('-outdir') outpath=os.path.abspath(args[i+1]) if os.path.exists(outpath): print 'WARNING: outdir %s exists'%outpath else: os.mkdir(outpath) if '-docstr' not in sys.argv: sys.exit('docstring needed') else: i=sys.argv.index('-docstr') outfile_prefix=sys.argv[i+1] outfile_brief='%s/%s.frame.brief.txt'%(outpath,outfile_prefix) outfile_detail='%s/%s.frame.detail.txt'%(outpath,outfile_prefix) #PRADA executive path prada_path=os.path.dirname(os.path.abspath(__file__)) #### ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line if '-ref' in args: i=args.index('-conf') reffile=args[i+1] if os.path.exists(reffile): pass else: for pth in ref_search_path: new_reffile='%s/%s'%(pth, os.path.basename(reffile)) if os.path.exists(new_reffile): reffile=new_reffile break else: sys.exit('ERROR: conf file %s not found'%reffile) else: reffile='%s/conf.txt'%prada_path if not os.path.exists(reffile): sys.exit('ERROR: No default conf.txt found and none specified') #read reference files refdict=ioprada.read_conf(reffile) featurefile=refdict['--REF--']['feature_file'] cdsfile=refdict['--REF--']['cds_file'] txcatfile=refdict['--REF--']['txcat_file'] #Now print all input parameters print 'CMD: %s'%('\t'.join(args)) ########################################################################################### ##Read information from annotation files, for all genes print 'Reading annotations' #call functions in ioprada module // txdb,genedb=ioprada.read_feature(featurefile,verbose=False) tx_cds=ioprada.read_cds(cdsfile) tx_primary=ioprada.read_tx_cat(txcatfile) ########################################################################################### def maptranscript(gene,exon_end,part=''): '''Find all transcripts of the gene having the fusion boundary. gene is a Gene object,return a list of Transcript objects''' if part not in ['5','3']: raise Exception('part must be "5" or "3"') txuse=[] ## g=gene.name strand=gene.strand if part=='5': tag='big' if strand=='1' else 'small' else: tag='small' if strand=='1' else 'big' txs=gene.transcript if tag=='small': for tx in txs: e_pos=[x.start for x in tx.exon] if exon_end in e_pos: txuse.append(tx) elif tag=='big': for tx in txs: e_pos=[x.end for x in tx.exon] if exon_end in e_pos: txuse.append(tx) return txuse def overlap(r1,r2): '''compare two ranges, return the overlapped''' if r1[0] <= r2[0]: ra,rb=r1[:],r2[:] else: ra,rb=r2[:],r1[:] if rb[0] > ra[1]: return [] else: if rb[1] <= ra[1]: return rb else: return [rb[0],ra[1]] def br_front_len(tx,br,part): '''No matter it is 5'/3' partner,calculate the ORF length before the break''' global tx_cds txname=tx.name strand=tx.strand cds_start,cds_end=[int(x) for x in tx_cds[txname]] e_pos=[[x.start,x.end] for x in tx.exon] if part=='5' and strand=='1': posset=[x.end for x in tx.exon] if br not in posset: raise Exception('Br is not exon boundary') if br < cds_start: return '5UTR' elif br > cds_end: return '3UTR' else: chim_r=[cds_start,br] ol=[overlap(x,chim_r) for x in e_pos] L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) return L if part=='5' and strand=='-1': posset=[x.start for x in tx.exon] if br not in posset: raise Exception('Br is not exon boundary') if br > cds_end: return '5UTR' elif br < cds_start: return '3UTR' else: chim_r=[br,cds_end] ol=[overlap(x,chim_r) for x in e_pos] L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) return L if part=='3' and strand=='1': posset=[x.start for x in tx.exon] if br not in posset: raise Exception('Br is not exon boundary') if br < cds_start: return '5UTR' elif br > cds_end: return '3UTR' else: chim_r=[cds_start,br-1] ol=[overlap(x,chim_r) for x in e_pos] L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) return L if part=='3' and strand=='-1': posset=[x.end for x in tx.exon] if br not in posset: raise Exception('Br is not exon boundary') if br > cds_end: return '5UTR' elif br < cds_start: return '3UTR' else: chim_r=[br+1,cds_end] ol=[overlap(x,chim_r) for x in e_pos] L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) return L def fusion_frame(gene_a,ga_br,gene_b,gb_br): '''gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429 or chr4.1808661.chr4.1741429''' global genedb,txdb ga_br=int(ga_br) gb_br=int(gb_br) ga_txs=maptranscript(genedb[gene_a],ga_br,part='5') gb_txs=maptranscript(genedb[gene_b],gb_br,part='3') res=[] for ta in ga_txs: for tb in gb_txs: s1=br_front_len(ta,ga_br,'5') s2=br_front_len(tb,gb_br,'3') fusion_conseq='Unknown' if isinstance(s1,int) and isinstance(s2,int): #both br in CDS region if s1%3==s2%3: fusion_conseq='In-frame' else: fusion_conseq='Out-of-frame' elif isinstance(s1,int) and not isinstance(s2,int): fusion_conseq='CDS'+'-'+s2 elif isinstance(s2,int) and not isinstance(s1,int): fusion_conseq=s1+'-'+'CDS' else: fusion_conseq=s1+'-'+s2 res.append((ta.name,tb.name,fusion_conseq)) if ga_txs==[] and gb_txs==[]: res.append(['NA','NA','NA']) elif ga_txs==[]: for tb in gb_txs: res.append(['NA',tb.name,'NA']) elif gb_txs==[]: for ta in ga_txs: res.append([ta.name,'NA','NA']) return res ############################################################################ print 'Predicting...' infile=open(datafilename) outfile_detail=open(outfile_detail,'w') outfile_brief=open(outfile_brief,'w') detailM=[] briefM=[] for line in infile: if not line.strip(): continue geneA,ga_br,geneB,gb_br=line.split() try: M=fusion_frame(geneA,ga_br,geneB,gb_br) except: M=[] ppcollected=0 for ccc in M: a_cat,b_cat='alternative','alternative' if tx_primary[geneA]==ccc[0]: a_cat='primary' if tx_primary[geneB]==ccc[1]: b_cat='primary' tag='%s_%s'%(a_cat,b_cat) if ccc[0]=='NA' or ccc[1]=='NA': tag='NA' row_d=[geneA,ga_br,geneB,gb_br,ccc[0],ccc[1],tag,ccc[2]] #genea,geneb,tag,consequence detailM.append(row_d) if tag=='primary_primary': row_b=[geneA,ga_br,geneB,gb_br,ccc[2]] ppcollected=1 if ppcollected==0: row_b=[geneA,ga_br,geneB,gb_br,'not-classified'] briefM.append(row_b) infile.close() for line in detailM: outfile_detail.write('%s\n'%('\t'.join(line))) outfile_detail.close() for line in briefM: outfile_brief.write('%s\n'%('\t'.join(line))) outfile_brief.close() print 'Done!'