Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/prada-frame @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 #This script is part of the pyPRADA package. It is used to predict the functional consequences of | |
| 4 #gene fusions. Possible predictions include in-frame, out-of-frame, UTR-UTR, UTR-CDS, Unknown etc. | |
| 5 #It relies on the definition of transcript CDS/UTR boundaries from ENSEMBL, and gives a prediction | |
| 6 #for every pair of transcripts coded by the fusion genes. The whole point is to see if the fusion | |
| 7 #lead to reading shift of the CDS sequences. | |
| 8 #Two files are generated, brief/detail result. | |
| 9 #Result is purely predicted, no guarantee is assumed. | |
| 10 #Contact: Siyuan Zheng, szheng2@mdanderson.org | |
| 11 | |
| 12 import os | |
| 13 import sys | |
| 14 import re | |
| 15 import time | |
| 16 import subprocess | |
| 17 import bioclass | |
| 18 from Bio import SeqIO | |
| 19 import ioprada | |
| 20 | |
| 21 ###################################################################################### | |
| 22 help_menu='''\nUsage: prada-frame -conf xx.txt -i inputfile -docstr XX -outdir ./ | |
| 23 **parameters** | |
| 24 -conf see prada-fusion -conf | |
| 25 -i input file, a tab/space delimited four-column file, each row like "GeneA GeneA_break GeneB GeneB_break" | |
| 26 Example:FGFR3 1808661 TACC3 1739325 | |
| 27 -docstr outfile prefix. output to two files: XX.frame.brief.txt, XX.frame.detail.txt | |
| 28 -outdir output directory, default is ./ | |
| 29 ''' | |
| 30 | |
| 31 args=sys.argv | |
| 32 if '-h' in args or '-help' in args or len(args)==1: | |
| 33 print help_menu | |
| 34 sys.exit(0) | |
| 35 | |
| 36 if '-i' not in sys.argv: | |
| 37 sys.exit('Input file needed') | |
| 38 else: | |
| 39 i=sys.argv.index('-i') | |
| 40 datafilename=sys.argv[i+1] | |
| 41 | |
| 42 if '-outdir' not in args: | |
| 43 outpath=os.path.abspath('./') | |
| 44 else: | |
| 45 i=args.index('-outdir') | |
| 46 outpath=os.path.abspath(args[i+1]) | |
| 47 if os.path.exists(outpath): | |
| 48 print 'WARNING: outdir %s exists'%outpath | |
| 49 else: | |
| 50 os.mkdir(outpath) | |
| 51 | |
| 52 if '-docstr' not in sys.argv: | |
| 53 sys.exit('docstring needed') | |
| 54 else: | |
| 55 i=sys.argv.index('-docstr') | |
| 56 outfile_prefix=sys.argv[i+1] | |
| 57 outfile_brief='%s/%s.frame.brief.txt'%(outpath,outfile_prefix) | |
| 58 outfile_detail='%s/%s.frame.detail.txt'%(outpath,outfile_prefix) | |
| 59 | |
| 60 #PRADA executive path | |
| 61 prada_path=os.path.dirname(os.path.abspath(__file__)) #### | |
| 62 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line | |
| 63 | |
| 64 if '-ref' in args: | |
| 65 i=args.index('-conf') | |
| 66 reffile=args[i+1] | |
| 67 if os.path.exists(reffile): | |
| 68 pass | |
| 69 else: | |
| 70 for pth in ref_search_path: | |
| 71 new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | |
| 72 if os.path.exists(new_reffile): | |
| 73 reffile=new_reffile | |
| 74 break | |
| 75 else: | |
| 76 sys.exit('ERROR: conf file %s not found'%reffile) | |
| 77 else: | |
| 78 reffile='%s/conf.txt'%prada_path | |
| 79 if not os.path.exists(reffile): | |
| 80 sys.exit('ERROR: No default conf.txt found and none specified') | |
| 81 | |
| 82 #read reference files | |
| 83 refdict=ioprada.read_conf(reffile) | |
| 84 featurefile=refdict['--REF--']['feature_file'] | |
| 85 cdsfile=refdict['--REF--']['cds_file'] | |
| 86 txcatfile=refdict['--REF--']['txcat_file'] | |
| 87 | |
| 88 #Now print all input parameters | |
| 89 print 'CMD: %s'%('\t'.join(args)) | |
| 90 | |
| 91 ########################################################################################### | |
| 92 ##Read information from annotation files, for all genes | |
| 93 print 'Reading annotations' | |
| 94 #call functions in ioprada module // | |
| 95 txdb,genedb=ioprada.read_feature(featurefile,verbose=False) | |
| 96 tx_cds=ioprada.read_cds(cdsfile) | |
| 97 tx_primary=ioprada.read_tx_cat(txcatfile) | |
| 98 | |
| 99 ########################################################################################### | |
| 100 def maptranscript(gene,exon_end,part=''): | |
| 101 '''Find all transcripts of the gene having the fusion boundary. | |
| 102 gene is a Gene object,return a list of Transcript objects''' | |
| 103 if part not in ['5','3']: | |
| 104 raise Exception('part must be "5" or "3"') | |
| 105 txuse=[] ## | |
| 106 g=gene.name | |
| 107 strand=gene.strand | |
| 108 if part=='5': | |
| 109 tag='big' if strand=='1' else 'small' | |
| 110 else: | |
| 111 tag='small' if strand=='1' else 'big' | |
| 112 txs=gene.transcript | |
| 113 if tag=='small': | |
| 114 for tx in txs: | |
| 115 e_pos=[x.start for x in tx.exon] | |
| 116 if exon_end in e_pos: | |
| 117 txuse.append(tx) | |
| 118 elif tag=='big': | |
| 119 for tx in txs: | |
| 120 e_pos=[x.end for x in tx.exon] | |
| 121 if exon_end in e_pos: | |
| 122 txuse.append(tx) | |
| 123 return txuse | |
| 124 | |
| 125 def overlap(r1,r2): | |
| 126 '''compare two ranges, return the overlapped''' | |
| 127 if r1[0] <= r2[0]: | |
| 128 ra,rb=r1[:],r2[:] | |
| 129 else: | |
| 130 ra,rb=r2[:],r1[:] | |
| 131 if rb[0] > ra[1]: | |
| 132 return [] | |
| 133 else: | |
| 134 if rb[1] <= ra[1]: | |
| 135 return rb | |
| 136 else: | |
| 137 return [rb[0],ra[1]] | |
| 138 | |
| 139 def br_front_len(tx,br,part): | |
| 140 '''No matter it is 5'/3' partner,calculate the ORF length before the break''' | |
| 141 global tx_cds | |
| 142 txname=tx.name | |
| 143 strand=tx.strand | |
| 144 cds_start,cds_end=[int(x) for x in tx_cds[txname]] | |
| 145 e_pos=[[x.start,x.end] for x in tx.exon] | |
| 146 if part=='5' and strand=='1': | |
| 147 posset=[x.end for x in tx.exon] | |
| 148 if br not in posset: | |
| 149 raise Exception('Br is not exon boundary') | |
| 150 if br < cds_start: | |
| 151 return '5UTR' | |
| 152 elif br > cds_end: | |
| 153 return '3UTR' | |
| 154 else: | |
| 155 chim_r=[cds_start,br] | |
| 156 ol=[overlap(x,chim_r) for x in e_pos] | |
| 157 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
| 158 return L | |
| 159 if part=='5' and strand=='-1': | |
| 160 posset=[x.start for x in tx.exon] | |
| 161 if br not in posset: | |
| 162 raise Exception('Br is not exon boundary') | |
| 163 if br > cds_end: | |
| 164 return '5UTR' | |
| 165 elif br < cds_start: | |
| 166 return '3UTR' | |
| 167 else: | |
| 168 chim_r=[br,cds_end] | |
| 169 ol=[overlap(x,chim_r) for x in e_pos] | |
| 170 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
| 171 return L | |
| 172 if part=='3' and strand=='1': | |
| 173 posset=[x.start for x in tx.exon] | |
| 174 if br not in posset: | |
| 175 raise Exception('Br is not exon boundary') | |
| 176 if br < cds_start: | |
| 177 return '5UTR' | |
| 178 elif br > cds_end: | |
| 179 return '3UTR' | |
| 180 else: | |
| 181 chim_r=[cds_start,br-1] | |
| 182 ol=[overlap(x,chim_r) for x in e_pos] | |
| 183 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
| 184 return L | |
| 185 if part=='3' and strand=='-1': | |
| 186 posset=[x.end for x in tx.exon] | |
| 187 if br not in posset: | |
| 188 raise Exception('Br is not exon boundary') | |
| 189 if br > cds_end: | |
| 190 return '5UTR' | |
| 191 elif br < cds_start: | |
| 192 return '3UTR' | |
| 193 else: | |
| 194 chim_r=[br+1,cds_end] | |
| 195 ol=[overlap(x,chim_r) for x in e_pos] | |
| 196 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
| 197 return L | |
| 198 | |
| 199 def fusion_frame(gene_a,ga_br,gene_b,gb_br): | |
| 200 '''gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429 | |
| 201 or chr4.1808661.chr4.1741429''' | |
| 202 global genedb,txdb | |
| 203 ga_br=int(ga_br) | |
| 204 gb_br=int(gb_br) | |
| 205 ga_txs=maptranscript(genedb[gene_a],ga_br,part='5') | |
| 206 gb_txs=maptranscript(genedb[gene_b],gb_br,part='3') | |
| 207 res=[] | |
| 208 for ta in ga_txs: | |
| 209 for tb in gb_txs: | |
| 210 s1=br_front_len(ta,ga_br,'5') | |
| 211 s2=br_front_len(tb,gb_br,'3') | |
| 212 fusion_conseq='Unknown' | |
| 213 if isinstance(s1,int) and isinstance(s2,int): #both br in CDS region | |
| 214 if s1%3==s2%3: | |
| 215 fusion_conseq='In-frame' | |
| 216 else: | |
| 217 fusion_conseq='Out-of-frame' | |
| 218 elif isinstance(s1,int) and not isinstance(s2,int): | |
| 219 fusion_conseq='CDS'+'-'+s2 | |
| 220 elif isinstance(s2,int) and not isinstance(s1,int): | |
| 221 fusion_conseq=s1+'-'+'CDS' | |
| 222 else: | |
| 223 fusion_conseq=s1+'-'+s2 | |
| 224 res.append((ta.name,tb.name,fusion_conseq)) | |
| 225 if ga_txs==[] and gb_txs==[]: | |
| 226 res.append(['NA','NA','NA']) | |
| 227 elif ga_txs==[]: | |
| 228 for tb in gb_txs: | |
| 229 res.append(['NA',tb.name,'NA']) | |
| 230 elif gb_txs==[]: | |
| 231 for ta in ga_txs: | |
| 232 res.append([ta.name,'NA','NA']) | |
| 233 return res | |
| 234 | |
| 235 ############################################################################ | |
| 236 print 'Predicting...' | |
| 237 infile=open(datafilename) | |
| 238 outfile_detail=open(outfile_detail,'w') | |
| 239 outfile_brief=open(outfile_brief,'w') | |
| 240 detailM=[] | |
| 241 briefM=[] | |
| 242 for line in infile: | |
| 243 if not line.strip(): | |
| 244 continue | |
| 245 geneA,ga_br,geneB,gb_br=line.split() | |
| 246 try: | |
| 247 M=fusion_frame(geneA,ga_br,geneB,gb_br) | |
| 248 except: | |
| 249 M=[] | |
| 250 ppcollected=0 | |
| 251 for ccc in M: | |
| 252 a_cat,b_cat='alternative','alternative' | |
| 253 if tx_primary[geneA]==ccc[0]: | |
| 254 a_cat='primary' | |
| 255 if tx_primary[geneB]==ccc[1]: | |
| 256 b_cat='primary' | |
| 257 tag='%s_%s'%(a_cat,b_cat) | |
| 258 if ccc[0]=='NA' or ccc[1]=='NA': | |
| 259 tag='NA' | |
| 260 row_d=[geneA,ga_br,geneB,gb_br,ccc[0],ccc[1],tag,ccc[2]] #genea,geneb,tag,consequence | |
| 261 detailM.append(row_d) | |
| 262 if tag=='primary_primary': | |
| 263 row_b=[geneA,ga_br,geneB,gb_br,ccc[2]] | |
| 264 ppcollected=1 | |
| 265 if ppcollected==0: | |
| 266 row_b=[geneA,ga_br,geneB,gb_br,'not-classified'] | |
| 267 briefM.append(row_b) | |
| 268 infile.close() | |
| 269 | |
| 270 for line in detailM: | |
| 271 outfile_detail.write('%s\n'%('\t'.join(line))) | |
| 272 outfile_detail.close() | |
| 273 | |
| 274 for line in briefM: | |
| 275 outfile_brief.write('%s\n'%('\t'.join(line))) | |
| 276 outfile_brief.close() | |
| 277 | |
| 278 print 'Done!' |
