comparison pyPRADA_1.2/prada-fusion @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children f17965495ec9
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 #!/usr/bin/env python
2
3 #PRADA: Pipeline for RnAseq Data Analysis
4 #Fusion detection module, algorithm published by Michael Berger et al (Genome Res, 2010) at Broad Institute.
5 #Implemented by Siyuan Zheng, Wandaliz Torres-Garcia and Rahul Vegesna.
6 #Copy Right: The Univ of Texas MD Anderson Cancer Center, Department of Bioinformatics and Computational Biology
7 #Contact: Roel Verhaak (rverhaak@mdanderson.org)
8 #Citation: to be added
9 #Tested with Python v2.6 and v2.7
10 #The program requires NM tag and high quality mapping reads with mapping score more than -minmapq.
11 #Last modified: 04/11/2013
12
13 ######################################################################################
14 import sys
15 import time
16 import os
17 import os.path
18 import subprocess
19 import operator
20 import pysam
21 import bioclass
22 import gfclass
23 import ioprada
24 import privutils
25 from Bio import SeqIO,Seq
26
27 ######################################################################################
28 args=sys.argv
29
30 help_menu='''\nPipeline for RNAseq Data Analaysis - fusion detection (PRADA).
31 **Command**:
32 prada-fusion -bam XX.bam -conf xx.txt -tag XX -mm 1 -junL XX -minmapq 30 -outdir ./
33 **Parameters**:
34 -h print help message
35 -bam input BAM file, must has a .bam suffix. BAM is the output from PRADA preprocess module.
36 -conf config file for references and parameters. Use conf.txt in py-PRADA installation folder if none specified.
37 -tag a tag to describe the sample, used to name output files. Default is ''.
38 -mm number of mismatches allowed in discordant pairs and fusion spanning reads.Default is 1.
39 -junL length of sequences taken from EACH side of exons when making hypothetical junctions. No default.
40 -minmapq minimum read mapping quality to be considered as fusion evidences. Default is 30.
41 -outdir output directory.
42 -v print version
43 '''
44
45 if '-h' in args or '-help' in args or len(args)==1:
46 print help_menu
47 sys.exit(0)
48
49 if '-v' in args:
50 import version
51 print version.version
52 sys.exit(0)
53
54 if '-bam' not in args:
55 sys.exit('ERROR: BAM file needed')
56 i=args.index('-bam')
57 bampath=args[i+1]
58 bampath=os.path.abspath(bampath)
59 bam=os.path.basename(bampath)
60 if bam[-4:] != '.bam':
61 sys.exit('ERROR: BAM file must have suffix .bam')
62
63 #Mismatch allowed. This filter is applied at the very end of the pipeline.
64 #I strongly suggest 1. We also record how many junction spanning reads (JSRs) are perfect matched.
65 if '-mm' not in args:
66 mm=1
67 else:
68 i=args.index('-mm')
69 mm=int(args[i+1])
70
71 #junL should be less than the read length in the dataset. I suggest it to be read_length*0.8
72 if '-junL' not in args:
73 sys.exit('ERROR: junL must be specified')
74 i=args.index('-junL')
75 overlap=int(args[i+1])
76
77 #minimum mapping quality for reads as fusion evidences
78 if '-minmapq' not in args:
79 minmapq=30
80 else:
81 i=args.index('-minmapq')
82 minmapq=int(args[i+1])
83
84 if '-outdir' not in args:
85 outpath=os.path.abspath('./')
86 else:
87 i=args.index('-outdir')
88 outpath=os.path.abspath(args[i+1])
89 if os.path.exists(outpath):
90 print 'WARNING: outdir %s exists'%outpath
91 else:
92 os.mkdir(outpath)
93
94 if '-tag' not in args:
95 docstring='prada'
96 else:
97 i=args.index('-tag')
98 docstring=args[i+1]
99
100 #by default, search conf.txt in the prada folder
101 prada_path=os.path.dirname(os.path.abspath(__file__)) #path to find libraries and executives.
102 ref_search_path=[prada_path,os.getcwd()] #search path for conf file if not specified in command line
103
104 if '-conf' in args:
105 i=args.index('-ref')
106 reffile=args[i+1]
107 if os.path.exists(reffile):
108 pass
109 else:
110 for pth in ref_search_path:
111 new_reffile='%s/%s'%(pth, os.path.basename(reffile))
112 if os.path.exists(new_reffile):
113 reffile=new_reffile
114 break
115 else:
116 sys.exit('ERROR: conf file %s not found'%reffile)
117 else:
118 reffile='%s/conf.txt'%prada_path
119 if not os.path.exists(reffile):
120 sys.exit('ERROR: No default conf.txt found and none specified')
121
122 #Now print all input parameters
123 print 'CMD: %s'%('\t'.join(args))
124
125 #reference files pointing to the annotation files.
126 refdict=ioprada.read_conf(reffile)
127 featurefile=refdict['--REF--']['feature_file']
128 txseqfile=refdict['--REF--']['tx_seq_file']
129 txcatfile=refdict['--REF--']['txcat_file']
130 cdsfile=refdict['--REF--']['cds_file']
131
132 #underlying utilities, automatically detected
133 #these are customized tools. update is needed.
134 samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
135 bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path
136 blastn='%s/tools/blastn'%prada_path
137
138 ######################################################################################
139 print 'step 0: loading gene annotations @ %s'%time.ctime()
140 #call functions in ioprada module //
141 txdb,genedb=ioprada.read_feature(featurefile,verbose=True)
142 tx_primary=ioprada.read_tx_cat(txcatfile)
143 tx_cds=ioprada.read_cds(cdsfile)
144
145 ######################################################################################
146 print 'step 1: finding discordant pairs @ %s'%time.ctime()
147
148 #We sift through all exons of protein coding genes and get the mapping reads.
149 #Within, we exclude low mapping quality reads and PCR duplicates. For pairs that the two ends
150 #map to different genes, we all it a discordant pair.
151 #This is a step for finding all possible candidate fusions.
152
153 samfile=pysam.Samfile(bampath,'rb')
154
155 read1_ab={}
156 read2_ab={}
157 db1={}
158 db2={}
159
160 i,N=0,len(genedb)
161 for gene in genedb:
162 i+=1
163 if i%200==0:
164 print '%d/%d genes processed for discordant pairs'%(i,N)
165 g=genedb[gene]
166 exons=g.get_exons()
167 for e in exons.values():
168 for rd in samfile.fetch(e.chr,e.start-1,e.end):
169 if rd.mapq < minmapq:
170 continue
171 if rd.is_duplicate:
172 continue
173 if rd.mate_is_unmapped: #at this point, only consider pairs
174 continue
175 if rd.rnext == rd.tid and rd.mpos <= g.end and rd.mpos >= g.start-1: #remove reads that fall into the same gene range
176 continue
177 if rd.is_read1:
178 if read1_ab.has_key(rd.qname):
179 read1_ab[rd.qname].add(gene)
180 else:
181 read1_ab[rd.qname]=set([gene])
182 db1[rd.qname]=rd
183 if rd.is_read2:
184 if read2_ab.has_key(rd.qname):
185 read2_ab[rd.qname].add(gene)
186 else:
187 read2_ab[rd.qname]=set([gene])
188 db2[rd.qname]=rd
189
190 ##output the discordant pairs and determine the orientation of candidate fusions
191 discordant={} #catalogue all discordant pairs, using gene pairs as keys
192 outfile=open('%s/%s.discordant.txt'%(outpath,docstring),'w')
193 title=['read','gene1','gene1_chr','read1_pos','read1_mm','read1_strand','read1_orient','gene2',\
194 'gene2_chr','read2_pos','read2_mm','read2_strand','read2_orient']
195 outfile.write('%s\n'%('\t'.join(title)))
196 i=0
197 for rdid in read1_ab:
198 i+=1
199 if i%10000==0:
200 print '%d discordant pairs processed'%i
201 if not read2_ab.has_key(rdid): #skip if not all ends are catalogued
202 continue
203 g1set=read1_ab[rdid] #consider all combinations if a read maps to multiple genes
204 g2set=read2_ab[rdid] #consider all combinations if a read maps to multiple genes
205 r1,r2=db1[rdid],db2[rdid]
206 read1strd='-1' if r1.is_reverse else '1'
207 read2strd='-1' if r2.is_reverse else '1'
208 for g1 in g1set:
209 for g2 in g2set:
210 if g1==g2: #for some uncasual cases
211 continue
212 g1obj,g2obj=genedb[g1],genedb[g2]
213 read1orient='F' if read1strd == g1obj.strand else 'R' #read1 --> gene1
214 read2orient='F' if read2strd == g2obj.strand else 'R' #read2 --> gene2
215 fkey=''
216 if read1orient=='F' and read2orient=='R': ##scenario I, gene1-gene2
217 fkey=g1+'_'+g2
218 if read1orient=='R' and read2orient=='F': ##scenario II, gene2-gene1
219 fkey=g2+'_'+g1
220 if fkey:
221 if discordant.has_key(fkey):
222 discordant[fkey].update({rdid:'%s:%s'%(read1orient,read2orient)})
223 else:
224 discordant[fkey]={rdid:'%s:%s'%(read1orient,read2orient)}
225 ##output
226 nm1=str([x[1] for x in r1.tags if x[0]=='NM'][0]) #output mismatch, but does not consider it at this point
227 nm2=str([x[1] for x in r2.tags if x[0]=='NM'][0])
228 uvec=[rdid,g1,g1obj.chr,str(r1.pos+1),nm1,read1strd,read1orient,g2,g2obj.chr,str(r2.pos+1),nm2,read2strd,read2orient]
229 outfile.write('%s\n'%('\t'.join(uvec)))
230 outfile.close()
231
232 ##########################################################################
233 print 'step 2: finding recurrent pairs (candidates) @ %s'%time.ctime()
234
235 #step 2 finds all candidates that have at least 2 discordant pairs. Meanwhile, filter out potential
236 #read through events. read through is defined as reads with mapping position less than 1M, while meeting
237 #the strand expectation.
238
239 guess=[]
240 outfile=open('%s/%s.recurrent.txt'%(outpath,docstring),'w')
241 title=['geneA','geneA_chr','geneB','geneB_chr','num_pairs','IDs']
242 outfile.write('%s\n'%('\t'.join(title)))
243 for pp in discordant:
244 if len(discordant[pp]) < 2: #consider only "recurrent" (more than 1 pair support) cases
245 continue
246 gene1,gene2=pp.split('_')
247 g1obj,g2obj=genedb[gene1],genedb[gene2]
248 rdset=discordant[pp].keys()
249 #filter read-through
250 #readthrough is defined at read level, regardless of mapping genes
251 for rd in rdset:
252 r1,r2=db1[rd],db2[rd]
253 read1strd='-1' if r1.is_reverse else '1'
254 read2strd='-1' if r2.is_reverse else '1'
255 readthrough=False
256 if db1[rd].tid == db2[rd].tid and abs(db1[rd].pos - db2[rd].pos) <= 1000000:
257 if discordant[pp][rd]=='F:R':
258 if read1strd=='1' and read2strd=='-1' and db1[rd].pos < db2[rd].pos:
259 readthrough=True
260 if read1strd=='-1' and read2strd=='1' and db1[rd].pos > db2[rd].pos:
261 readthrough=True
262 if discordant[pp][rd]=='R:F':
263 if read2strd=='1' and read1strd=='-1' and db2[rd].pos < db1[rd].pos:
264 readthrough=True
265 if read2strd=='-1' and read1strd=='1' and db2[rd].pos > db1[rd].pos:
266 readthrough=True
267 if readthrough:
268 del discordant[pp][rd] #in-place deletion!!!! Change the discordant variable in place!
269 if len(discordant[pp]) < 2: #skip all that have less than 2 supporting discordant read pairs after readthrough filter
270 continue
271 guess.append(pp)
272 #output
273 uvec2=[gene1,g1obj.chr,gene2,g2obj.chr,str(len(discordant[pp])),'|'.join(discordant[pp])]
274 outfile.write('%s\n'%('\t'.join(uvec2)))
275 outfile.close()
276
277 ##########################################################################
278 print 'step 3: finding potential junction spanning reads @ %s'%time.ctime()
279
280 #For all candidates, find potential junction spanning reads (JSRs). A JSR is defined as a unmapped read but with the mate mapping
281 #to either F or R partner, with high mapping quality. Since the JSR is unmapped, it is not practical to consider PCR duplicate
282 #because they are not properly marked.
283
284 Fpartners=set()
285 Rpartners=set()
286 for pp in guess:
287 gs=pp.split('_')
288 Fpartners.add(gs[0])
289 Rpartners.add(gs[1])
290 AllPartners=Fpartners|Rpartners
291
292 samfile.reset()
293 posjun={} ##catalogue all JSRs, with track of the mate mapping genes and orientation.
294 i,N=0,len(AllPartners)
295 for gene in AllPartners:
296 i+=1
297 if i%200==0:
298 print '%d/%d genes processed for potential junc reads'%(i,N)
299 g=genedb[gene]
300 exons=g.get_exons().values()
301 for e in exons:
302 for rd in samfile.fetch(e.chr,e.start-1,e.end):
303 if rd.mapq < minmapq:
304 continue
305 if not rd.mate_is_unmapped:
306 continue
307 readstrd='-1' if rd.is_reverse else '1'
308 orient='F' if readstrd == g.strand else 'R' #mapping info of mate read. JSR per se is unmapped in BAM
309 posjun[rd.qname]={'gene':gene,'orient':orient}
310
311 samfile.reset()
312 outfile=open('%s/%s.pos_junc_unmapped.fastq'%(outpath,docstring),'w')
313 i,N=0,len(posjun)
314 for rd in samfile:
315 if rd.mate_is_unmapped: #since the read is potential jun spanning read, all mate map to A or B
316 continue
317 if rd.is_unmapped:
318 if posjun.has_key(rd.qname):
319 i+=1
320 if i%10000==0:
321 print 'extracted %d/%d potential junc reads'%(i,N)
322 rdname='%s_prada_%s_prada_%s'%(rd.qname,posjun[rd.qname]['gene'],posjun[rd.qname]['orient']) #_prada_ as split
323 outfile.write('@%s\n'%rdname)
324 outfile.write('%s\n'%rd.seq)
325 outfile.write('+\n')
326 outfile.write('%s\n'%rd.qual)
327 outfile.close()
328
329 ######################################################################################
330 print 'step 4: building junction database @ %s'%time.ctime()
331
332 #Make hypothetical junctions between candidate fusion partners. To improve speed, we make a big junction database comprising
333 #exons of all candidates, instead of by candidate individually. This also gives the possibility to assess the JSR mapping ambiguity
334 #across many junctions. It turned out very useful in filtering out false positives.
335 #Note that in PRADA transcript sequence file, all sequences are + strand sequences. For - strand transcript, one need to
336 #reverse complement the sequence to get the real transcript sequences.
337
338 seqdb={}
339 for record in SeqIO.parse(txseqfile,'fasta'):
340 seqdb[record.name]=record
341
342 outfile=open('%s/%s.junction.fasta'%(outpath,docstring),'w')
343 i,N=0,len(guess)
344 for pp in guess:
345 i+=1
346 if i%100==0:
347 print 'building junction for %d/%d pairs'%(i,N)
348 gene1,gene2=pp.split('_')
349 g1obj,g2obj=genedb[gene1],genedb[gene2]
350 eset1=g1obj.get_exons() #unique exons in gene 1
351 eset2=g2obj.get_exons() #unique exons in gene 2
352 #collect unique junctions
353 juncseqdict={} #save junction sequences
354 for e1 in eset1.values():
355 for e2 in eset2.values():
356 e1_jun_name='%s:%s:%s'%(gene1,e1.chr,e1.end) if e1.strand=='1' else '%s:%s:%s'%(gene1,e1.chr,e1.start)
357 e2_jun_name='%s:%s:%s'%(gene2,e2.chr,e2.start) if e2.strand=='1' else '%s:%s:%s'%(gene2,e2.chr,e2.end)
358 jun_name=e1_jun_name+'_'+e2_jun_name
359 tx1,tx2=txdb[e1.transcript],txdb[e2.transcript]
360 max_a=tx1.exon_relative_pos()[e1.name][1]
361 min_a=0 if max_a - overlap < 0 else max_a - overlap
362 min_b=tx2.exon_relative_pos()[e2.name][0]-1
363 max_b=tx2.length if min_b + overlap > tx2.length else min_b + overlap
364 #reverse complementary when necessary
365 try:
366 tx1seq=seqdb[tx1.name].seq.tostring() if tx1.strand=='1' else seqdb[tx1.name].reverse_complement().seq.tostring()
367 tx2seq=seqdb[tx2.name].seq.tostring() if tx2.strand=='1' else seqdb[tx2.name].reverse_complement().seq.tostring()
368 except KeyError: #in case transcript not found in sequence file
369 continue
370 jun_seq=tx1seq[min_a:max_a]+tx2seq[min_b:max_b]
371 juncseqdict[jun_name]=jun_seq
372 for junc in juncseqdict:
373 outfile.write('>%s\n'%junc)
374 outfile.write('%s\n'%juncseqdict[junc])
375 outfile.close()
376 samfile.close()
377
378 #for memory efficiecy, del seqdb
379 del seqdb
380
381 ########################################################################################
382 print 'step 5: aligning potential junction reads to junction database @ %s'%time.ctime()
383
384 #Mapping potential JSRs to hypothetical junction database.
385 #Allow 4 mismatches at the beginning.
386
387 idx_cmd='%s index %s/%s.junction.fasta'%(bwa,outpath,docstring)
388 os.system(idx_cmd)
389 aln_cmd='%s aln -n 4 -R 100 %s/%s.junction.fasta %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sai'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring)
390 os.system(aln_cmd)
391 samse_cmd='%s samse -n 1000 -s %s/%s.junction.fasta %s/%s.juncmap.sai %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sam'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring,outpath,docstring)
392 os.system(samse_cmd)
393 sam2bam_cmd='%s view -bS %s/%s.juncmap.sam -o %s/%s.juncmap.bam'%(samtools,outpath,docstring,outpath,docstring)
394 os.system(sam2bam_cmd)
395
396 jsam=pysam.Samfile('%s/%s.juncmap.bam'%(outpath,docstring),'rb')
397 #get the junction name directory
398 junctions=jsam.references
399 junname=dict(zip(range(0,len(junctions)),junctions)) #this is essential for quick speed.
400 junc_align={}
401
402 #go through the BAM file for meaningful (meeting fusion orientation etc) reads
403 strd_right_reads={}
404 rdb={} #collect all junction spanning reads
405 i=0
406 for rd in jsam:
407 i+=1
408 if i%100000==0:
409 print '%d junction alignments parsed'%i
410 if rd.is_unmapped:
411 continue
412 read,mate_gene,mate_orient=rd.qname.split('_prada_')
413 junc=junname[rd.tid]
414 tmp=junc.split('_')
415 gene1,gene2=[x.split(':')[0] for x in tmp]
416 if gene1==mate_gene:
417 if mate_orient=='F':
418 if rd.is_reverse:
419 if strd_right_reads.has_key(rd.qname):
420 strd_right_reads[rd.qname]+=1 #count how many times the read maps
421 else:
422 strd_right_reads[rd.qname]=1
423 rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones
424 elif gene2==mate_gene:
425 if mate_orient == 'R':
426 if not rd.is_reverse:
427 if strd_right_reads.has_key(rd.qname):
428 strd_right_reads[rd.qname]+=1
429 else:
430 strd_right_reads[rd.qname]=1
431 rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones
432
433 #find uniquely mapped reads and their gene pairs
434 junc_map={} #a dictionary from junction to mapping reads
435 for rdname in strd_right_reads:
436 if strd_right_reads[rdname] > 1: #remove non-unique junction spanning reads
437 continue
438 infodict=rdb[rdname]
439 pp=infodict['gene1']+'_'+infodict['gene2']
440 if junc_map.has_key(pp):
441 junc_map[pp].add(rdname)
442 else:
443 junc_map[pp]=set([rdname])
444
445 ########################################################################################
446 print 'step 6: summarizing fusion evidences @ %s'%time.ctime()
447
448 #Now, time to apply mismatch filter and summarize the results
449 #Candidate fusions --> guess
450 #Discordant pairs --> discordant, db1, db2
451 #Junction reads --> junc_map, rdb
452 #Gene info --> genedb
453
454 outfile_s=open('%s/%s.fus.candidates.txt'%(outpath,docstring),'w')
455 outfile_d=open('%s/%s.fus.evidences.txt'%(outpath,docstring),'w')
456
457 title=['Gene_A','Gene_B','A_chr','B_chr','A_strand','B_strand','Discordant_n','JSR_n','perfectJSR_n','Junc_n','Position_Consist','Junction']
458 outfile_s.write('%s\n'%('\t'.join(title)))
459
460 for pp in junc_map: #all pairs with junc reads
461 gene1,gene2=pp.split('_')
462 g1obj,g2obj=genedb[gene1],genedb[gene2]
463 fus_disc=[] #collecting discordant pairs
464 for rdname in discordant[pp]:
465 #arrange read1/read2 into F/R so it will be easier for GeneFusion obj to handle
466 r1,r2=db1[rdname],db2[rdname]
467 orient=discordant[pp][rdname]
468 if orient=='F:R':
469 fus_disc.append((r1,r2))
470 elif orient=='R:F':
471 fus_disc.append((r2,r1))
472 fus_jsr=[]
473 if junc_map.has_key(pp):
474 for rdname in junc_map[pp]:
475 r=rdb[rdname]['read']
476 junc=rdb[rdname]['junc']
477 jsr=gfclass.JSR(r,junc)
478 fus_jsr.append(jsr)
479 gf=gfclass.GeneFusion(gene1,gene2,fus_disc,fus_jsr)
480 gf_new=gf.update(mm=mm) ##apply the mismatch parameter, default is 1
481 #output the results
482 disc_n=str(len(gf_new.discordantpairs))
483 junctions=sorted(gf_new.get_junction_freq(),key=operator.itemgetter(1),reverse=True) #sort junc by # of JSRs
484 junc_n=str(len(junctions))
485 junc_str='|'.join([','.join([x[0],str(x[1])]) for x in junctions])
486 jsr_n=str(len(gf_new.fusionreads))
487 pjsr_n=str(len(gf_new.get_perfect_JSR()))
488 pos_consist=gf_new.positioncheck()
489 svec=[gene1,gene2,g1obj.chr,g2obj.chr,g1obj.strand,g2obj.strand,disc_n,jsr_n,pjsr_n,junc_n,pos_consist,junc_str]
490 outfile_s.write('%s\n'%('\t'.join(svec)))
491 outfile_d.write('@@\t%s,%s,%s\t%s,%s,%s\n'%(gene1,g1obj.chr,g1obj.strand,gene2,g2obj.chr,g2obj.strand))
492 outfile_d.write('\n')
493 outfile_d.write('>discordant\n')
494 for rp in gf_new.discordantpairs:
495 rf,rr=rp
496 nm1=[x[1] for x in rf.tags if x[0]=='NM'][0]
497 nm2=[x[1] for x in rr.tags if x[0]=='NM'][0]
498 outfile_d.write('%s\tF\t%s.%s.mm%d\n'%(rf.qname,gene1,rf.pos+1,nm1)) ##0-based coordinates
499 outfile_d.write('%s\tR\t%s.%s.mm%d\n'%(rr.qname,gene2,rr.pos+1,nm2)) ##0-based coordinates
500 outfile_d.write('\n')
501 outfile_d.write('>spanning\n')
502 for jsr in gf_new.fusionreads:
503 r=jsr.read
504 nm=[x[1] for x in r.tags if x[0]=='NM'][0]
505 outfile_d.write('%s\t%s.mm%d\n'%(r.qname,jsr.junction,nm))
506 outfile_d.write('\n')
507 outfile_d.write('>junction\n')
508 for junc_info in junctions:
509 outfile_d.write('%s\t%d\n'%(junc_info[0],junc_info[1]))
510 outfile_d.write('\n')
511 outfile_d.write('>summary\n')
512 outfile_d.write('Number of Discordant Pairs = %s\n'%disc_n)
513 outfile_d.write('Number of Fusion Reads = %s\n'%jsr_n)
514 outfile_d.write('Number of Perfect Fusion Reads = %s\n'%pjsr_n)
515 outfile_d.write('Number of Distinct Junctions = %s\n'%junc_n)
516 outfile_d.write('Position Consistency = %s\n'%pos_consist)
517 outfile_d.write('\n')
518
519 outfile_s.close()
520 outfile_d.close()
521
522 ########################################################################################
523 print 'step 7: generating fusion lists @ %s'%time.ctime()
524
525 #For convenience, filter the lists to candidates with
526 # 1) at least 2 discordant pairs
527 # 2) at least 1 perfect JSR
528 #meanwhile, calculate sequence similarity for each pair
529 #user may need to manually filter the lists per this measure.
530
531 #The following code is a copy of prada-homology
532 outfile_o=open('%s/%s.fus.summary.txt'%(outpath,docstring),'w')
533 ifname='%s/%s.fus.candidates.txt'%(outpath,docstring)
534 if not os.path.exists(ifname):
535 sys.exit('ERROR: %s was not found'%ifname)
536
537 blastseq_tmp_dir='%s/blast_tmp/'%outpath
538 if not os.path.exists(blastseq_tmp_dir):
539 os.mkdir(blastseq_tmp_dir)
540
541 flists=[]
542 infile=open(ifname)
543 iN=0
544 for line in open(ifname):
545 info=line.strip().split('\t')
546 if iN==0:
547 iN+=1 #skip title
548 flists.append(info)
549 continue
550 else:
551 if int(info[6])>=2 and int(info[8])>=1 and info[10] in ['PARTIALLY','YES']:
552 flists.append(info)
553 infile.close()
554
555 if len(flists)==1: #if no candidate passes the filters
556 outfile_o.write('%s\n'%'\t'.join(flists[0]))
557 outfile_o.close()
558 print 'step done @ %s'%time.ctime()
559 sys.exit(0)
560
561 candidates={}
562 for line in flists[1:]:
563 geneA,geneB=line[0],line[1]
564 key='%s_%s'%(geneA,geneB)
565 candidates[key]=''
566
567 selecttranscript={}
568 for gene in genedb:
569 txs=genedb[gene].transcript
570 stx=txs[0]
571 initlen=stx.length
572 for tx in txs:
573 if tx.length >= initlen:
574 stx=tx
575 initlen=stx.length
576 selecttranscript[gene]=stx.name
577
578 allpartners=set()
579 for item in candidates:
580 sset=set(item.split('_'))
581 allpartners=allpartners.union(sset)
582
583 presenttxs=[] #tx that is present in our annotation
584 absent=[] #tx that is not in our annotation
585 for gene in allpartners:
586 if selecttranscript.has_key(gene):
587 presenttxs.append(selecttranscript[gene])
588 else:
589 absent.append(gene)
590
591 for seq_record in SeqIO.parse(txseqfile,'fasta'):
592 sid=seq_record.id
593 seq=seq_record.seq
594 if sid in presenttxs:
595 g=txdb[sid].gene
596 fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
597 SeqIO.write(seq_record,fastafile,'fasta')
598 fastafile.close()
599
600 for gp in candidates:
601 geneA,geneB=gp.split('_')
602 if geneA in absent or geneB in absent:
603 candidates[gp]=['NA']*4
604 else:
605 gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
606 gaobj=SeqIO.parse(gaseq,'fasta').next()
607 gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
608 gbobj=SeqIO.parse(gbseq,'fasta').next()
609 ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
610 a=privutils.seqblast(gaseq,gbseq,blastn)
611 if a==None:
612 candidates[gp]=['NA','NA','100','0']
613 else:
614 candidates[gp]=a
615
616 header=flists[0][:]
617 header.extend(['Identity','Align_Len','Evalue','BitScore'])
618 outfile_o.write('%s\n'%('\t'.join(header)))
619
620 for info in flists[1:]:
621 geneA,geneB=info[0],info[1]
622 key='%s_%s'%(geneA,geneB)
623 vv=candidates[key]
624 row=info[:]
625 row.extend(vv)
626 outfile_o.write('%s\n'%('\t'.join(row)))
627 outfile_o.close()
628
629 ########################################################################################
630 print 'step done @ %s'%time.ctime()