| 0 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 #This script is part of the pyPRADA package. It is used to predict the functional consequences of | 
|  | 4 #gene fusions. Possible predictions include in-frame, out-of-frame, UTR-UTR, UTR-CDS, Unknown etc. | 
|  | 5 #It relies on the definition of transcript CDS/UTR boundaries from ENSEMBL, and gives a prediction | 
|  | 6 #for every pair of transcripts coded by the fusion genes. The whole point is to see if the fusion | 
|  | 7 #lead to reading shift of the CDS sequences. | 
|  | 8 #Two files are generated, brief/detail result. | 
|  | 9 #Result is purely predicted, no guarantee is assumed. | 
|  | 10 #Contact: Siyuan Zheng, szheng2@mdanderson.org | 
|  | 11 | 
|  | 12 import os | 
|  | 13 import sys | 
|  | 14 import re | 
|  | 15 import time | 
|  | 16 import subprocess | 
|  | 17 import bioclass | 
|  | 18 from Bio import SeqIO | 
|  | 19 import ioprada | 
|  | 20 | 
|  | 21 ###################################################################################### | 
|  | 22 help_menu='''\nUsage: prada-frame -conf xx.txt -i inputfile -docstr XX -outdir ./ | 
|  | 23 **parameters** | 
|  | 24     -conf	see prada-fusion -conf | 
|  | 25     -i		input file, a tab/space delimited four-column file, each row like "GeneA GeneA_break GeneB GeneB_break" | 
|  | 26     		Example:FGFR3 1808661 TACC3 1739325 | 
|  | 27     -docstr	outfile prefix. output to two files: XX.frame.brief.txt, XX.frame.detail.txt | 
|  | 28     -outdir	output directory, default is ./ | 
|  | 29 ''' | 
|  | 30 | 
|  | 31 args=sys.argv | 
|  | 32 if '-h' in args or '-help' in args or len(args)==1: | 
|  | 33     print help_menu | 
|  | 34     sys.exit(0) | 
|  | 35 | 
|  | 36 if '-i' not in sys.argv: | 
|  | 37     sys.exit('Input file needed') | 
|  | 38 else: | 
|  | 39     i=sys.argv.index('-i') | 
|  | 40     datafilename=sys.argv[i+1] | 
|  | 41 | 
|  | 42 if '-outdir' not in args: | 
|  | 43     outpath=os.path.abspath('./') | 
|  | 44 else: | 
|  | 45     i=args.index('-outdir') | 
|  | 46     outpath=os.path.abspath(args[i+1]) | 
|  | 47 if os.path.exists(outpath): | 
|  | 48     print 'WARNING: outdir %s exists'%outpath | 
|  | 49 else: | 
|  | 50     os.mkdir(outpath) | 
|  | 51 | 
|  | 52 if '-docstr' not in sys.argv: | 
|  | 53     sys.exit('docstring needed') | 
|  | 54 else: | 
|  | 55     i=sys.argv.index('-docstr') | 
|  | 56     outfile_prefix=sys.argv[i+1] | 
|  | 57     outfile_brief='%s/%s.frame.brief.txt'%(outpath,outfile_prefix) | 
|  | 58     outfile_detail='%s/%s.frame.detail.txt'%(outpath,outfile_prefix) | 
|  | 59 | 
|  | 60 #PRADA executive path | 
|  | 61 prada_path=os.path.dirname(os.path.abspath(__file__))   #### | 
|  | 62 ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line | 
|  | 63 | 
|  | 64 if '-ref' in args: | 
|  | 65     i=args.index('-conf') | 
|  | 66     reffile=args[i+1] | 
|  | 67     if os.path.exists(reffile): | 
|  | 68         pass | 
|  | 69     else: | 
|  | 70         for pth in ref_search_path: | 
|  | 71             new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | 
|  | 72             if os.path.exists(new_reffile): | 
|  | 73                 reffile=new_reffile | 
|  | 74                 break | 
|  | 75         else: | 
|  | 76             sys.exit('ERROR: conf file %s not found'%reffile) | 
|  | 77 else: | 
|  | 78     reffile='%s/conf.txt'%prada_path | 
|  | 79     if not os.path.exists(reffile): | 
|  | 80         sys.exit('ERROR: No default conf.txt found and none specified') | 
|  | 81 | 
|  | 82 #read reference files | 
|  | 83 refdict=ioprada.read_conf(reffile) | 
|  | 84 featurefile=refdict['--REF--']['feature_file'] | 
|  | 85 cdsfile=refdict['--REF--']['cds_file'] | 
|  | 86 txcatfile=refdict['--REF--']['txcat_file'] | 
|  | 87 | 
|  | 88 #Now print all input parameters | 
|  | 89 print 'CMD: %s'%('\t'.join(args)) | 
|  | 90 | 
|  | 91 ########################################################################################### | 
|  | 92 ##Read information from annotation files, for all genes | 
|  | 93 print 'Reading annotations' | 
|  | 94 #call functions in ioprada module // | 
|  | 95 txdb,genedb=ioprada.read_feature(featurefile,verbose=False) | 
|  | 96 tx_cds=ioprada.read_cds(cdsfile) | 
|  | 97 tx_primary=ioprada.read_tx_cat(txcatfile) | 
|  | 98 | 
|  | 99 ########################################################################################### | 
|  | 100 def maptranscript(gene,exon_end,part=''): | 
|  | 101     '''Find all transcripts of the gene having the fusion boundary. | 
|  | 102     gene is a Gene object,return a list of Transcript objects''' | 
|  | 103     if part not in ['5','3']: | 
|  | 104         raise Exception('part must be "5" or "3"') | 
|  | 105     txuse=[] ## | 
|  | 106     g=gene.name | 
|  | 107     strand=gene.strand | 
|  | 108     if part=='5': | 
|  | 109         tag='big' if strand=='1' else 'small' | 
|  | 110     else: | 
|  | 111         tag='small' if strand=='1' else 'big' | 
|  | 112     txs=gene.transcript | 
|  | 113     if tag=='small': | 
|  | 114         for tx in txs: | 
|  | 115             e_pos=[x.start for x in tx.exon] | 
|  | 116             if exon_end in e_pos: | 
|  | 117                 txuse.append(tx) | 
|  | 118     elif tag=='big': | 
|  | 119         for tx in txs: | 
|  | 120             e_pos=[x.end for x in tx.exon] | 
|  | 121             if exon_end in e_pos: | 
|  | 122                 txuse.append(tx) | 
|  | 123     return txuse | 
|  | 124 | 
|  | 125 def overlap(r1,r2): | 
|  | 126     '''compare two ranges, return the overlapped''' | 
|  | 127     if r1[0] <= r2[0]: | 
|  | 128         ra,rb=r1[:],r2[:] | 
|  | 129     else: | 
|  | 130         ra,rb=r2[:],r1[:] | 
|  | 131     if rb[0] > ra[1]: | 
|  | 132         return [] | 
|  | 133     else: | 
|  | 134         if rb[1] <= ra[1]: | 
|  | 135             return rb | 
|  | 136         else: | 
|  | 137             return [rb[0],ra[1]] | 
|  | 138 | 
|  | 139 def br_front_len(tx,br,part): | 
|  | 140     '''No matter it is 5'/3' partner,calculate the ORF length before the break''' | 
|  | 141     global tx_cds | 
|  | 142     txname=tx.name | 
|  | 143     strand=tx.strand | 
|  | 144     cds_start,cds_end=[int(x) for x in tx_cds[txname]] | 
|  | 145     e_pos=[[x.start,x.end] for x in tx.exon] | 
|  | 146     if part=='5' and strand=='1': | 
|  | 147         posset=[x.end for x in tx.exon] | 
|  | 148         if br not in posset: | 
|  | 149             raise Exception('Br is not exon boundary') | 
|  | 150         if br < cds_start: | 
|  | 151             return '5UTR' | 
|  | 152         elif br > cds_end: | 
|  | 153             return '3UTR' | 
|  | 154         else: | 
|  | 155            chim_r=[cds_start,br] | 
|  | 156            ol=[overlap(x,chim_r) for x in e_pos] | 
|  | 157            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | 
|  | 158            return L | 
|  | 159     if part=='5' and strand=='-1': | 
|  | 160         posset=[x.start for x in tx.exon] | 
|  | 161         if br not in posset: | 
|  | 162             raise Exception('Br is not exon boundary') | 
|  | 163         if br > cds_end: | 
|  | 164             return '5UTR' | 
|  | 165         elif br < cds_start: | 
|  | 166             return '3UTR' | 
|  | 167         else: | 
|  | 168             chim_r=[br,cds_end] | 
|  | 169             ol=[overlap(x,chim_r) for x in e_pos] | 
|  | 170             L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | 
|  | 171             return L | 
|  | 172     if part=='3' and strand=='1': | 
|  | 173         posset=[x.start for x in tx.exon] | 
|  | 174         if br not in posset: | 
|  | 175             raise Exception('Br is not exon boundary') | 
|  | 176         if br < cds_start: | 
|  | 177             return '5UTR' | 
|  | 178         elif br > cds_end: | 
|  | 179             return '3UTR' | 
|  | 180         else: | 
|  | 181             chim_r=[cds_start,br-1] | 
|  | 182             ol=[overlap(x,chim_r) for x in e_pos] | 
|  | 183             L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | 
|  | 184             return L | 
|  | 185     if part=='3' and strand=='-1': | 
|  | 186         posset=[x.end for x in tx.exon] | 
|  | 187         if br not in posset: | 
|  | 188             raise Exception('Br is not exon boundary') | 
|  | 189         if br > cds_end: | 
|  | 190             return '5UTR' | 
|  | 191         elif br < cds_start: | 
|  | 192             return '3UTR' | 
|  | 193         else: | 
|  | 194             chim_r=[br+1,cds_end] | 
|  | 195             ol=[overlap(x,chim_r) for x in e_pos] | 
|  | 196             L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | 
|  | 197             return L | 
|  | 198 | 
|  | 199 def fusion_frame(gene_a,ga_br,gene_b,gb_br): | 
|  | 200     '''gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429 | 
|  | 201     or chr4.1808661.chr4.1741429''' | 
|  | 202     global genedb,txdb | 
|  | 203     ga_br=int(ga_br) | 
|  | 204     gb_br=int(gb_br) | 
|  | 205     ga_txs=maptranscript(genedb[gene_a],ga_br,part='5') | 
|  | 206     gb_txs=maptranscript(genedb[gene_b],gb_br,part='3') | 
|  | 207     res=[] | 
|  | 208     for ta in ga_txs: | 
|  | 209         for tb in gb_txs: | 
|  | 210             s1=br_front_len(ta,ga_br,'5') | 
|  | 211             s2=br_front_len(tb,gb_br,'3') | 
|  | 212             fusion_conseq='Unknown' | 
|  | 213             if isinstance(s1,int) and isinstance(s2,int):    #both br in CDS region | 
|  | 214                 if s1%3==s2%3: | 
|  | 215                     fusion_conseq='In-frame' | 
|  | 216                 else: | 
|  | 217                     fusion_conseq='Out-of-frame' | 
|  | 218             elif isinstance(s1,int) and not isinstance(s2,int): | 
|  | 219                 fusion_conseq='CDS'+'-'+s2 | 
|  | 220             elif isinstance(s2,int) and not isinstance(s1,int): | 
|  | 221                 fusion_conseq=s1+'-'+'CDS' | 
|  | 222             else: | 
|  | 223                 fusion_conseq=s1+'-'+s2 | 
|  | 224             res.append((ta.name,tb.name,fusion_conseq)) | 
|  | 225     if ga_txs==[] and gb_txs==[]: | 
|  | 226         res.append(['NA','NA','NA']) | 
|  | 227     elif ga_txs==[]: | 
|  | 228         for tb in gb_txs: | 
|  | 229             res.append(['NA',tb.name,'NA']) | 
|  | 230     elif gb_txs==[]: | 
|  | 231         for ta in ga_txs: | 
|  | 232             res.append([ta.name,'NA','NA']) | 
|  | 233     return res | 
|  | 234 | 
|  | 235 ############################################################################ | 
|  | 236 print 'Predicting...' | 
|  | 237 infile=open(datafilename) | 
|  | 238 outfile_detail=open(outfile_detail,'w') | 
|  | 239 outfile_brief=open(outfile_brief,'w') | 
|  | 240 detailM=[] | 
|  | 241 briefM=[] | 
|  | 242 for line in infile: | 
|  | 243     if not line.strip(): | 
|  | 244         continue | 
|  | 245     geneA,ga_br,geneB,gb_br=line.split() | 
|  | 246     try: | 
|  | 247         M=fusion_frame(geneA,ga_br,geneB,gb_br) | 
|  | 248     except: | 
|  | 249         M=[] | 
|  | 250     ppcollected=0 | 
|  | 251     for ccc in M: | 
|  | 252         a_cat,b_cat='alternative','alternative' | 
|  | 253         if tx_primary[geneA]==ccc[0]: | 
|  | 254             a_cat='primary' | 
|  | 255         if tx_primary[geneB]==ccc[1]: | 
|  | 256             b_cat='primary' | 
|  | 257         tag='%s_%s'%(a_cat,b_cat) | 
|  | 258         if ccc[0]=='NA' or ccc[1]=='NA': | 
|  | 259             tag='NA' | 
|  | 260         row_d=[geneA,ga_br,geneB,gb_br,ccc[0],ccc[1],tag,ccc[2]] #genea,geneb,tag,consequence | 
|  | 261         detailM.append(row_d) | 
|  | 262         if tag=='primary_primary': | 
|  | 263             row_b=[geneA,ga_br,geneB,gb_br,ccc[2]] | 
|  | 264             ppcollected=1 | 
|  | 265     if ppcollected==0: | 
|  | 266         row_b=[geneA,ga_br,geneB,gb_br,'not-classified'] | 
|  | 267     briefM.append(row_b) | 
|  | 268 infile.close() | 
|  | 269 | 
|  | 270 for line in detailM: | 
|  | 271     outfile_detail.write('%s\n'%('\t'.join(line))) | 
|  | 272 outfile_detail.close() | 
|  | 273 | 
|  | 274 for line in briefM: | 
|  | 275     outfile_brief.write('%s\n'%('\t'.join(line))) | 
|  | 276 outfile_brief.close() | 
|  | 277 | 
|  | 278 print 'Done!' |