0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 #PRADA: Pipeline for RnAseq Data Analysis
|
|
4 #Fusion detection module, algorithm published by Michael Berger et al (Genome Res, 2010) at Broad Institute.
|
|
5 #Implemented by Siyuan Zheng, Wandaliz Torres-Garcia and Rahul Vegesna.
|
|
6 #Copy Right: The Univ of Texas MD Anderson Cancer Center, Department of Bioinformatics and Computational Biology
|
|
7 #Contact: Roel Verhaak (rverhaak@mdanderson.org)
|
|
8 #Citation: to be added
|
|
9 #Tested with Python v2.6 and v2.7
|
|
10 #The program requires NM tag and high quality mapping reads with mapping score more than -minmapq.
|
|
11 #Last modified: 04/11/2013
|
|
12
|
|
13 ######################################################################################
|
|
14 import sys
|
|
15 import time
|
|
16 import os
|
|
17 import os.path
|
|
18 import subprocess
|
|
19 import operator
|
|
20 import pysam
|
|
21 import bioclass
|
|
22 import gfclass
|
|
23 import ioprada
|
|
24 import privutils
|
|
25 from Bio import SeqIO,Seq
|
|
26
|
|
27 ######################################################################################
|
|
28 args=sys.argv
|
|
29
|
|
30 help_menu='''\nPipeline for RNAseq Data Analaysis - fusion detection (PRADA).
|
|
31 **Command**:
|
|
32 prada-fusion -bam XX.bam -conf xx.txt -tag XX -mm 1 -junL XX -minmapq 30 -outdir ./
|
|
33 **Parameters**:
|
|
34 -h print help message
|
|
35 -bam input BAM file, must has a .bam suffix. BAM is the output from PRADA preprocess module.
|
|
36 -conf config file for references and parameters. Use conf.txt in py-PRADA installation folder if none specified.
|
|
37 -tag a tag to describe the sample, used to name output files. Default is ''.
|
|
38 -mm number of mismatches allowed in discordant pairs and fusion spanning reads.Default is 1.
|
|
39 -junL length of sequences taken from EACH side of exons when making hypothetical junctions. No default.
|
|
40 -minmapq minimum read mapping quality to be considered as fusion evidences. Default is 30.
|
|
41 -outdir output directory.
|
|
42 -v print version
|
|
43 '''
|
|
44
|
|
45 if '-h' in args or '-help' in args or len(args)==1:
|
|
46 print help_menu
|
|
47 sys.exit(0)
|
|
48
|
|
49 if '-v' in args:
|
|
50 import version
|
|
51 print version.version
|
|
52 sys.exit(0)
|
|
53
|
|
54 if '-bam' not in args:
|
|
55 sys.exit('ERROR: BAM file needed')
|
|
56 i=args.index('-bam')
|
|
57 bampath=args[i+1]
|
|
58 bampath=os.path.abspath(bampath)
|
|
59 bam=os.path.basename(bampath)
|
|
60 if bam[-4:] != '.bam':
|
|
61 sys.exit('ERROR: BAM file must have suffix .bam')
|
|
62
|
|
63 #Mismatch allowed. This filter is applied at the very end of the pipeline.
|
|
64 #I strongly suggest 1. We also record how many junction spanning reads (JSRs) are perfect matched.
|
|
65 if '-mm' not in args:
|
|
66 mm=1
|
|
67 else:
|
|
68 i=args.index('-mm')
|
|
69 mm=int(args[i+1])
|
|
70
|
|
71 #junL should be less than the read length in the dataset. I suggest it to be read_length*0.8
|
|
72 if '-junL' not in args:
|
|
73 sys.exit('ERROR: junL must be specified')
|
|
74 i=args.index('-junL')
|
|
75 overlap=int(args[i+1])
|
|
76
|
|
77 #minimum mapping quality for reads as fusion evidences
|
|
78 if '-minmapq' not in args:
|
|
79 minmapq=30
|
|
80 else:
|
|
81 i=args.index('-minmapq')
|
|
82 minmapq=int(args[i+1])
|
|
83
|
|
84 if '-outdir' not in args:
|
|
85 outpath=os.path.abspath('./')
|
|
86 else:
|
|
87 i=args.index('-outdir')
|
|
88 outpath=os.path.abspath(args[i+1])
|
|
89 if os.path.exists(outpath):
|
|
90 print 'WARNING: outdir %s exists'%outpath
|
|
91 else:
|
|
92 os.mkdir(outpath)
|
|
93
|
|
94 if '-tag' not in args:
|
|
95 docstring='prada'
|
|
96 else:
|
|
97 i=args.index('-tag')
|
|
98 docstring=args[i+1]
|
|
99
|
|
100 #by default, search conf.txt in the prada folder
|
|
101 prada_path=os.path.dirname(os.path.abspath(__file__)) #path to find libraries and executives.
|
|
102 ref_search_path=[prada_path,os.getcwd()] #search path for conf file if not specified in command line
|
|
103
|
|
104 if '-conf' in args:
|
3
|
105 i=args.index('-conf')
|
0
|
106 reffile=args[i+1]
|
|
107 if os.path.exists(reffile):
|
|
108 pass
|
|
109 else:
|
|
110 for pth in ref_search_path:
|
|
111 new_reffile='%s/%s'%(pth, os.path.basename(reffile))
|
|
112 if os.path.exists(new_reffile):
|
|
113 reffile=new_reffile
|
|
114 break
|
|
115 else:
|
|
116 sys.exit('ERROR: conf file %s not found'%reffile)
|
|
117 else:
|
|
118 reffile='%s/conf.txt'%prada_path
|
|
119 if not os.path.exists(reffile):
|
|
120 sys.exit('ERROR: No default conf.txt found and none specified')
|
|
121
|
|
122 #Now print all input parameters
|
|
123 print 'CMD: %s'%('\t'.join(args))
|
|
124
|
|
125 #reference files pointing to the annotation files.
|
|
126 refdict=ioprada.read_conf(reffile)
|
|
127 featurefile=refdict['--REF--']['feature_file']
|
|
128 txseqfile=refdict['--REF--']['tx_seq_file']
|
|
129 txcatfile=refdict['--REF--']['txcat_file']
|
|
130 cdsfile=refdict['--REF--']['cds_file']
|
|
131
|
|
132 #underlying utilities, automatically detected
|
|
133 #these are customized tools. update is needed.
|
|
134 samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
|
|
135 bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path
|
|
136 blastn='%s/tools/blastn'%prada_path
|
|
137
|
|
138 ######################################################################################
|
|
139 print 'step 0: loading gene annotations @ %s'%time.ctime()
|
|
140 #call functions in ioprada module //
|
|
141 txdb,genedb=ioprada.read_feature(featurefile,verbose=True)
|
|
142 tx_primary=ioprada.read_tx_cat(txcatfile)
|
|
143 tx_cds=ioprada.read_cds(cdsfile)
|
|
144
|
|
145 ######################################################################################
|
|
146 print 'step 1: finding discordant pairs @ %s'%time.ctime()
|
|
147
|
|
148 #We sift through all exons of protein coding genes and get the mapping reads.
|
|
149 #Within, we exclude low mapping quality reads and PCR duplicates. For pairs that the two ends
|
|
150 #map to different genes, we all it a discordant pair.
|
|
151 #This is a step for finding all possible candidate fusions.
|
|
152
|
|
153 samfile=pysam.Samfile(bampath,'rb')
|
|
154
|
|
155 read1_ab={}
|
|
156 read2_ab={}
|
|
157 db1={}
|
|
158 db2={}
|
|
159
|
|
160 i,N=0,len(genedb)
|
|
161 for gene in genedb:
|
|
162 i+=1
|
|
163 if i%200==0:
|
|
164 print '%d/%d genes processed for discordant pairs'%(i,N)
|
|
165 g=genedb[gene]
|
|
166 exons=g.get_exons()
|
|
167 for e in exons.values():
|
|
168 for rd in samfile.fetch(e.chr,e.start-1,e.end):
|
|
169 if rd.mapq < minmapq:
|
|
170 continue
|
|
171 if rd.is_duplicate:
|
|
172 continue
|
|
173 if rd.mate_is_unmapped: #at this point, only consider pairs
|
|
174 continue
|
|
175 if rd.rnext == rd.tid and rd.mpos <= g.end and rd.mpos >= g.start-1: #remove reads that fall into the same gene range
|
|
176 continue
|
|
177 if rd.is_read1:
|
|
178 if read1_ab.has_key(rd.qname):
|
|
179 read1_ab[rd.qname].add(gene)
|
|
180 else:
|
|
181 read1_ab[rd.qname]=set([gene])
|
|
182 db1[rd.qname]=rd
|
|
183 if rd.is_read2:
|
|
184 if read2_ab.has_key(rd.qname):
|
|
185 read2_ab[rd.qname].add(gene)
|
|
186 else:
|
|
187 read2_ab[rd.qname]=set([gene])
|
|
188 db2[rd.qname]=rd
|
|
189
|
|
190 ##output the discordant pairs and determine the orientation of candidate fusions
|
|
191 discordant={} #catalogue all discordant pairs, using gene pairs as keys
|
|
192 outfile=open('%s/%s.discordant.txt'%(outpath,docstring),'w')
|
|
193 title=['read','gene1','gene1_chr','read1_pos','read1_mm','read1_strand','read1_orient','gene2',\
|
|
194 'gene2_chr','read2_pos','read2_mm','read2_strand','read2_orient']
|
|
195 outfile.write('%s\n'%('\t'.join(title)))
|
|
196 i=0
|
|
197 for rdid in read1_ab:
|
|
198 i+=1
|
|
199 if i%10000==0:
|
|
200 print '%d discordant pairs processed'%i
|
|
201 if not read2_ab.has_key(rdid): #skip if not all ends are catalogued
|
|
202 continue
|
|
203 g1set=read1_ab[rdid] #consider all combinations if a read maps to multiple genes
|
|
204 g2set=read2_ab[rdid] #consider all combinations if a read maps to multiple genes
|
|
205 r1,r2=db1[rdid],db2[rdid]
|
|
206 read1strd='-1' if r1.is_reverse else '1'
|
|
207 read2strd='-1' if r2.is_reverse else '1'
|
|
208 for g1 in g1set:
|
|
209 for g2 in g2set:
|
|
210 if g1==g2: #for some uncasual cases
|
|
211 continue
|
|
212 g1obj,g2obj=genedb[g1],genedb[g2]
|
|
213 read1orient='F' if read1strd == g1obj.strand else 'R' #read1 --> gene1
|
|
214 read2orient='F' if read2strd == g2obj.strand else 'R' #read2 --> gene2
|
|
215 fkey=''
|
|
216 if read1orient=='F' and read2orient=='R': ##scenario I, gene1-gene2
|
|
217 fkey=g1+'_'+g2
|
|
218 if read1orient=='R' and read2orient=='F': ##scenario II, gene2-gene1
|
|
219 fkey=g2+'_'+g1
|
|
220 if fkey:
|
|
221 if discordant.has_key(fkey):
|
|
222 discordant[fkey].update({rdid:'%s:%s'%(read1orient,read2orient)})
|
|
223 else:
|
|
224 discordant[fkey]={rdid:'%s:%s'%(read1orient,read2orient)}
|
|
225 ##output
|
|
226 nm1=str([x[1] for x in r1.tags if x[0]=='NM'][0]) #output mismatch, but does not consider it at this point
|
|
227 nm2=str([x[1] for x in r2.tags if x[0]=='NM'][0])
|
|
228 uvec=[rdid,g1,g1obj.chr,str(r1.pos+1),nm1,read1strd,read1orient,g2,g2obj.chr,str(r2.pos+1),nm2,read2strd,read2orient]
|
|
229 outfile.write('%s\n'%('\t'.join(uvec)))
|
|
230 outfile.close()
|
|
231
|
|
232 ##########################################################################
|
|
233 print 'step 2: finding recurrent pairs (candidates) @ %s'%time.ctime()
|
|
234
|
|
235 #step 2 finds all candidates that have at least 2 discordant pairs. Meanwhile, filter out potential
|
|
236 #read through events. read through is defined as reads with mapping position less than 1M, while meeting
|
|
237 #the strand expectation.
|
|
238
|
|
239 guess=[]
|
|
240 outfile=open('%s/%s.recurrent.txt'%(outpath,docstring),'w')
|
|
241 title=['geneA','geneA_chr','geneB','geneB_chr','num_pairs','IDs']
|
|
242 outfile.write('%s\n'%('\t'.join(title)))
|
|
243 for pp in discordant:
|
|
244 if len(discordant[pp]) < 2: #consider only "recurrent" (more than 1 pair support) cases
|
|
245 continue
|
|
246 gene1,gene2=pp.split('_')
|
|
247 g1obj,g2obj=genedb[gene1],genedb[gene2]
|
|
248 rdset=discordant[pp].keys()
|
|
249 #filter read-through
|
|
250 #readthrough is defined at read level, regardless of mapping genes
|
|
251 for rd in rdset:
|
|
252 r1,r2=db1[rd],db2[rd]
|
|
253 read1strd='-1' if r1.is_reverse else '1'
|
|
254 read2strd='-1' if r2.is_reverse else '1'
|
|
255 readthrough=False
|
|
256 if db1[rd].tid == db2[rd].tid and abs(db1[rd].pos - db2[rd].pos) <= 1000000:
|
|
257 if discordant[pp][rd]=='F:R':
|
|
258 if read1strd=='1' and read2strd=='-1' and db1[rd].pos < db2[rd].pos:
|
|
259 readthrough=True
|
|
260 if read1strd=='-1' and read2strd=='1' and db1[rd].pos > db2[rd].pos:
|
|
261 readthrough=True
|
|
262 if discordant[pp][rd]=='R:F':
|
|
263 if read2strd=='1' and read1strd=='-1' and db2[rd].pos < db1[rd].pos:
|
|
264 readthrough=True
|
|
265 if read2strd=='-1' and read1strd=='1' and db2[rd].pos > db1[rd].pos:
|
|
266 readthrough=True
|
|
267 if readthrough:
|
|
268 del discordant[pp][rd] #in-place deletion!!!! Change the discordant variable in place!
|
|
269 if len(discordant[pp]) < 2: #skip all that have less than 2 supporting discordant read pairs after readthrough filter
|
|
270 continue
|
|
271 guess.append(pp)
|
|
272 #output
|
|
273 uvec2=[gene1,g1obj.chr,gene2,g2obj.chr,str(len(discordant[pp])),'|'.join(discordant[pp])]
|
|
274 outfile.write('%s\n'%('\t'.join(uvec2)))
|
|
275 outfile.close()
|
|
276
|
|
277 ##########################################################################
|
|
278 print 'step 3: finding potential junction spanning reads @ %s'%time.ctime()
|
|
279
|
|
280 #For all candidates, find potential junction spanning reads (JSRs). A JSR is defined as a unmapped read but with the mate mapping
|
|
281 #to either F or R partner, with high mapping quality. Since the JSR is unmapped, it is not practical to consider PCR duplicate
|
|
282 #because they are not properly marked.
|
|
283
|
|
284 Fpartners=set()
|
|
285 Rpartners=set()
|
|
286 for pp in guess:
|
|
287 gs=pp.split('_')
|
|
288 Fpartners.add(gs[0])
|
|
289 Rpartners.add(gs[1])
|
|
290 AllPartners=Fpartners|Rpartners
|
|
291
|
|
292 samfile.reset()
|
|
293 posjun={} ##catalogue all JSRs, with track of the mate mapping genes and orientation.
|
|
294 i,N=0,len(AllPartners)
|
|
295 for gene in AllPartners:
|
|
296 i+=1
|
|
297 if i%200==0:
|
|
298 print '%d/%d genes processed for potential junc reads'%(i,N)
|
|
299 g=genedb[gene]
|
|
300 exons=g.get_exons().values()
|
|
301 for e in exons:
|
|
302 for rd in samfile.fetch(e.chr,e.start-1,e.end):
|
|
303 if rd.mapq < minmapq:
|
|
304 continue
|
|
305 if not rd.mate_is_unmapped:
|
|
306 continue
|
|
307 readstrd='-1' if rd.is_reverse else '1'
|
|
308 orient='F' if readstrd == g.strand else 'R' #mapping info of mate read. JSR per se is unmapped in BAM
|
|
309 posjun[rd.qname]={'gene':gene,'orient':orient}
|
|
310
|
|
311 samfile.reset()
|
|
312 outfile=open('%s/%s.pos_junc_unmapped.fastq'%(outpath,docstring),'w')
|
|
313 i,N=0,len(posjun)
|
|
314 for rd in samfile:
|
|
315 if rd.mate_is_unmapped: #since the read is potential jun spanning read, all mate map to A or B
|
|
316 continue
|
|
317 if rd.is_unmapped:
|
|
318 if posjun.has_key(rd.qname):
|
|
319 i+=1
|
|
320 if i%10000==0:
|
|
321 print 'extracted %d/%d potential junc reads'%(i,N)
|
|
322 rdname='%s_prada_%s_prada_%s'%(rd.qname,posjun[rd.qname]['gene'],posjun[rd.qname]['orient']) #_prada_ as split
|
|
323 outfile.write('@%s\n'%rdname)
|
|
324 outfile.write('%s\n'%rd.seq)
|
|
325 outfile.write('+\n')
|
|
326 outfile.write('%s\n'%rd.qual)
|
|
327 outfile.close()
|
|
328
|
|
329 ######################################################################################
|
|
330 print 'step 4: building junction database @ %s'%time.ctime()
|
|
331
|
|
332 #Make hypothetical junctions between candidate fusion partners. To improve speed, we make a big junction database comprising
|
|
333 #exons of all candidates, instead of by candidate individually. This also gives the possibility to assess the JSR mapping ambiguity
|
|
334 #across many junctions. It turned out very useful in filtering out false positives.
|
|
335 #Note that in PRADA transcript sequence file, all sequences are + strand sequences. For - strand transcript, one need to
|
|
336 #reverse complement the sequence to get the real transcript sequences.
|
|
337
|
|
338 seqdb={}
|
|
339 for record in SeqIO.parse(txseqfile,'fasta'):
|
|
340 seqdb[record.name]=record
|
|
341
|
|
342 outfile=open('%s/%s.junction.fasta'%(outpath,docstring),'w')
|
|
343 i,N=0,len(guess)
|
|
344 for pp in guess:
|
|
345 i+=1
|
|
346 if i%100==0:
|
|
347 print 'building junction for %d/%d pairs'%(i,N)
|
|
348 gene1,gene2=pp.split('_')
|
|
349 g1obj,g2obj=genedb[gene1],genedb[gene2]
|
|
350 eset1=g1obj.get_exons() #unique exons in gene 1
|
|
351 eset2=g2obj.get_exons() #unique exons in gene 2
|
|
352 #collect unique junctions
|
|
353 juncseqdict={} #save junction sequences
|
|
354 for e1 in eset1.values():
|
|
355 for e2 in eset2.values():
|
|
356 e1_jun_name='%s:%s:%s'%(gene1,e1.chr,e1.end) if e1.strand=='1' else '%s:%s:%s'%(gene1,e1.chr,e1.start)
|
|
357 e2_jun_name='%s:%s:%s'%(gene2,e2.chr,e2.start) if e2.strand=='1' else '%s:%s:%s'%(gene2,e2.chr,e2.end)
|
|
358 jun_name=e1_jun_name+'_'+e2_jun_name
|
|
359 tx1,tx2=txdb[e1.transcript],txdb[e2.transcript]
|
|
360 max_a=tx1.exon_relative_pos()[e1.name][1]
|
|
361 min_a=0 if max_a - overlap < 0 else max_a - overlap
|
|
362 min_b=tx2.exon_relative_pos()[e2.name][0]-1
|
|
363 max_b=tx2.length if min_b + overlap > tx2.length else min_b + overlap
|
|
364 #reverse complementary when necessary
|
|
365 try:
|
|
366 tx1seq=seqdb[tx1.name].seq.tostring() if tx1.strand=='1' else seqdb[tx1.name].reverse_complement().seq.tostring()
|
|
367 tx2seq=seqdb[tx2.name].seq.tostring() if tx2.strand=='1' else seqdb[tx2.name].reverse_complement().seq.tostring()
|
|
368 except KeyError: #in case transcript not found in sequence file
|
|
369 continue
|
|
370 jun_seq=tx1seq[min_a:max_a]+tx2seq[min_b:max_b]
|
|
371 juncseqdict[jun_name]=jun_seq
|
|
372 for junc in juncseqdict:
|
|
373 outfile.write('>%s\n'%junc)
|
|
374 outfile.write('%s\n'%juncseqdict[junc])
|
|
375 outfile.close()
|
|
376 samfile.close()
|
|
377
|
|
378 #for memory efficiecy, del seqdb
|
|
379 del seqdb
|
|
380
|
|
381 ########################################################################################
|
|
382 print 'step 5: aligning potential junction reads to junction database @ %s'%time.ctime()
|
|
383
|
|
384 #Mapping potential JSRs to hypothetical junction database.
|
|
385 #Allow 4 mismatches at the beginning.
|
|
386
|
|
387 idx_cmd='%s index %s/%s.junction.fasta'%(bwa,outpath,docstring)
|
|
388 os.system(idx_cmd)
|
|
389 aln_cmd='%s aln -n 4 -R 100 %s/%s.junction.fasta %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sai'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring)
|
|
390 os.system(aln_cmd)
|
|
391 samse_cmd='%s samse -n 1000 -s %s/%s.junction.fasta %s/%s.juncmap.sai %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sam'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring,outpath,docstring)
|
|
392 os.system(samse_cmd)
|
|
393 sam2bam_cmd='%s view -bS %s/%s.juncmap.sam -o %s/%s.juncmap.bam'%(samtools,outpath,docstring,outpath,docstring)
|
|
394 os.system(sam2bam_cmd)
|
|
395
|
|
396 jsam=pysam.Samfile('%s/%s.juncmap.bam'%(outpath,docstring),'rb')
|
|
397 #get the junction name directory
|
|
398 junctions=jsam.references
|
|
399 junname=dict(zip(range(0,len(junctions)),junctions)) #this is essential for quick speed.
|
|
400 junc_align={}
|
|
401
|
|
402 #go through the BAM file for meaningful (meeting fusion orientation etc) reads
|
|
403 strd_right_reads={}
|
|
404 rdb={} #collect all junction spanning reads
|
|
405 i=0
|
|
406 for rd in jsam:
|
|
407 i+=1
|
|
408 if i%100000==0:
|
|
409 print '%d junction alignments parsed'%i
|
|
410 if rd.is_unmapped:
|
|
411 continue
|
|
412 read,mate_gene,mate_orient=rd.qname.split('_prada_')
|
|
413 junc=junname[rd.tid]
|
|
414 tmp=junc.split('_')
|
|
415 gene1,gene2=[x.split(':')[0] for x in tmp]
|
|
416 if gene1==mate_gene:
|
|
417 if mate_orient=='F':
|
|
418 if rd.is_reverse:
|
|
419 if strd_right_reads.has_key(rd.qname):
|
|
420 strd_right_reads[rd.qname]+=1 #count how many times the read maps
|
|
421 else:
|
|
422 strd_right_reads[rd.qname]=1
|
|
423 rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones
|
|
424 elif gene2==mate_gene:
|
|
425 if mate_orient == 'R':
|
|
426 if not rd.is_reverse:
|
|
427 if strd_right_reads.has_key(rd.qname):
|
|
428 strd_right_reads[rd.qname]+=1
|
|
429 else:
|
|
430 strd_right_reads[rd.qname]=1
|
|
431 rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones
|
|
432
|
|
433 #find uniquely mapped reads and their gene pairs
|
|
434 junc_map={} #a dictionary from junction to mapping reads
|
|
435 for rdname in strd_right_reads:
|
|
436 if strd_right_reads[rdname] > 1: #remove non-unique junction spanning reads
|
|
437 continue
|
|
438 infodict=rdb[rdname]
|
|
439 pp=infodict['gene1']+'_'+infodict['gene2']
|
|
440 if junc_map.has_key(pp):
|
|
441 junc_map[pp].add(rdname)
|
|
442 else:
|
|
443 junc_map[pp]=set([rdname])
|
|
444
|
|
445 ########################################################################################
|
|
446 print 'step 6: summarizing fusion evidences @ %s'%time.ctime()
|
|
447
|
|
448 #Now, time to apply mismatch filter and summarize the results
|
|
449 #Candidate fusions --> guess
|
|
450 #Discordant pairs --> discordant, db1, db2
|
|
451 #Junction reads --> junc_map, rdb
|
|
452 #Gene info --> genedb
|
|
453
|
|
454 outfile_s=open('%s/%s.fus.candidates.txt'%(outpath,docstring),'w')
|
|
455 outfile_d=open('%s/%s.fus.evidences.txt'%(outpath,docstring),'w')
|
|
456
|
|
457 title=['Gene_A','Gene_B','A_chr','B_chr','A_strand','B_strand','Discordant_n','JSR_n','perfectJSR_n','Junc_n','Position_Consist','Junction']
|
|
458 outfile_s.write('%s\n'%('\t'.join(title)))
|
|
459
|
|
460 for pp in junc_map: #all pairs with junc reads
|
|
461 gene1,gene2=pp.split('_')
|
|
462 g1obj,g2obj=genedb[gene1],genedb[gene2]
|
|
463 fus_disc=[] #collecting discordant pairs
|
|
464 for rdname in discordant[pp]:
|
|
465 #arrange read1/read2 into F/R so it will be easier for GeneFusion obj to handle
|
|
466 r1,r2=db1[rdname],db2[rdname]
|
|
467 orient=discordant[pp][rdname]
|
|
468 if orient=='F:R':
|
|
469 fus_disc.append((r1,r2))
|
|
470 elif orient=='R:F':
|
|
471 fus_disc.append((r2,r1))
|
|
472 fus_jsr=[]
|
|
473 if junc_map.has_key(pp):
|
|
474 for rdname in junc_map[pp]:
|
|
475 r=rdb[rdname]['read']
|
|
476 junc=rdb[rdname]['junc']
|
|
477 jsr=gfclass.JSR(r,junc)
|
|
478 fus_jsr.append(jsr)
|
|
479 gf=gfclass.GeneFusion(gene1,gene2,fus_disc,fus_jsr)
|
|
480 gf_new=gf.update(mm=mm) ##apply the mismatch parameter, default is 1
|
|
481 #output the results
|
|
482 disc_n=str(len(gf_new.discordantpairs))
|
|
483 junctions=sorted(gf_new.get_junction_freq(),key=operator.itemgetter(1),reverse=True) #sort junc by # of JSRs
|
|
484 junc_n=str(len(junctions))
|
|
485 junc_str='|'.join([','.join([x[0],str(x[1])]) for x in junctions])
|
|
486 jsr_n=str(len(gf_new.fusionreads))
|
|
487 pjsr_n=str(len(gf_new.get_perfect_JSR()))
|
|
488 pos_consist=gf_new.positioncheck()
|
|
489 svec=[gene1,gene2,g1obj.chr,g2obj.chr,g1obj.strand,g2obj.strand,disc_n,jsr_n,pjsr_n,junc_n,pos_consist,junc_str]
|
|
490 outfile_s.write('%s\n'%('\t'.join(svec)))
|
|
491 outfile_d.write('@@\t%s,%s,%s\t%s,%s,%s\n'%(gene1,g1obj.chr,g1obj.strand,gene2,g2obj.chr,g2obj.strand))
|
|
492 outfile_d.write('\n')
|
|
493 outfile_d.write('>discordant\n')
|
|
494 for rp in gf_new.discordantpairs:
|
|
495 rf,rr=rp
|
|
496 nm1=[x[1] for x in rf.tags if x[0]=='NM'][0]
|
|
497 nm2=[x[1] for x in rr.tags if x[0]=='NM'][0]
|
|
498 outfile_d.write('%s\tF\t%s.%s.mm%d\n'%(rf.qname,gene1,rf.pos+1,nm1)) ##0-based coordinates
|
|
499 outfile_d.write('%s\tR\t%s.%s.mm%d\n'%(rr.qname,gene2,rr.pos+1,nm2)) ##0-based coordinates
|
|
500 outfile_d.write('\n')
|
|
501 outfile_d.write('>spanning\n')
|
|
502 for jsr in gf_new.fusionreads:
|
|
503 r=jsr.read
|
|
504 nm=[x[1] for x in r.tags if x[0]=='NM'][0]
|
|
505 outfile_d.write('%s\t%s.mm%d\n'%(r.qname,jsr.junction,nm))
|
|
506 outfile_d.write('\n')
|
|
507 outfile_d.write('>junction\n')
|
|
508 for junc_info in junctions:
|
|
509 outfile_d.write('%s\t%d\n'%(junc_info[0],junc_info[1]))
|
|
510 outfile_d.write('\n')
|
|
511 outfile_d.write('>summary\n')
|
|
512 outfile_d.write('Number of Discordant Pairs = %s\n'%disc_n)
|
|
513 outfile_d.write('Number of Fusion Reads = %s\n'%jsr_n)
|
|
514 outfile_d.write('Number of Perfect Fusion Reads = %s\n'%pjsr_n)
|
|
515 outfile_d.write('Number of Distinct Junctions = %s\n'%junc_n)
|
|
516 outfile_d.write('Position Consistency = %s\n'%pos_consist)
|
|
517 outfile_d.write('\n')
|
|
518
|
|
519 outfile_s.close()
|
|
520 outfile_d.close()
|
|
521
|
|
522 ########################################################################################
|
|
523 print 'step 7: generating fusion lists @ %s'%time.ctime()
|
|
524
|
|
525 #For convenience, filter the lists to candidates with
|
|
526 # 1) at least 2 discordant pairs
|
|
527 # 2) at least 1 perfect JSR
|
|
528 #meanwhile, calculate sequence similarity for each pair
|
|
529 #user may need to manually filter the lists per this measure.
|
|
530
|
|
531 #The following code is a copy of prada-homology
|
|
532 outfile_o=open('%s/%s.fus.summary.txt'%(outpath,docstring),'w')
|
|
533 ifname='%s/%s.fus.candidates.txt'%(outpath,docstring)
|
|
534 if not os.path.exists(ifname):
|
|
535 sys.exit('ERROR: %s was not found'%ifname)
|
|
536
|
|
537 blastseq_tmp_dir='%s/blast_tmp/'%outpath
|
|
538 if not os.path.exists(blastseq_tmp_dir):
|
|
539 os.mkdir(blastseq_tmp_dir)
|
|
540
|
|
541 flists=[]
|
|
542 infile=open(ifname)
|
|
543 iN=0
|
|
544 for line in open(ifname):
|
|
545 info=line.strip().split('\t')
|
|
546 if iN==0:
|
|
547 iN+=1 #skip title
|
|
548 flists.append(info)
|
|
549 continue
|
|
550 else:
|
|
551 if int(info[6])>=2 and int(info[8])>=1 and info[10] in ['PARTIALLY','YES']:
|
|
552 flists.append(info)
|
|
553 infile.close()
|
|
554
|
|
555 if len(flists)==1: #if no candidate passes the filters
|
|
556 outfile_o.write('%s\n'%'\t'.join(flists[0]))
|
|
557 outfile_o.close()
|
|
558 print 'step done @ %s'%time.ctime()
|
|
559 sys.exit(0)
|
|
560
|
|
561 candidates={}
|
|
562 for line in flists[1:]:
|
|
563 geneA,geneB=line[0],line[1]
|
|
564 key='%s_%s'%(geneA,geneB)
|
|
565 candidates[key]=''
|
|
566
|
|
567 selecttranscript={}
|
|
568 for gene in genedb:
|
|
569 txs=genedb[gene].transcript
|
|
570 stx=txs[0]
|
|
571 initlen=stx.length
|
|
572 for tx in txs:
|
|
573 if tx.length >= initlen:
|
|
574 stx=tx
|
|
575 initlen=stx.length
|
|
576 selecttranscript[gene]=stx.name
|
|
577
|
|
578 allpartners=set()
|
|
579 for item in candidates:
|
|
580 sset=set(item.split('_'))
|
|
581 allpartners=allpartners.union(sset)
|
|
582
|
|
583 presenttxs=[] #tx that is present in our annotation
|
|
584 absent=[] #tx that is not in our annotation
|
|
585 for gene in allpartners:
|
|
586 if selecttranscript.has_key(gene):
|
|
587 presenttxs.append(selecttranscript[gene])
|
|
588 else:
|
|
589 absent.append(gene)
|
|
590
|
|
591 for seq_record in SeqIO.parse(txseqfile,'fasta'):
|
|
592 sid=seq_record.id
|
|
593 seq=seq_record.seq
|
|
594 if sid in presenttxs:
|
|
595 g=txdb[sid].gene
|
|
596 fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
|
|
597 SeqIO.write(seq_record,fastafile,'fasta')
|
|
598 fastafile.close()
|
|
599
|
|
600 for gp in candidates:
|
|
601 geneA,geneB=gp.split('_')
|
|
602 if geneA in absent or geneB in absent:
|
|
603 candidates[gp]=['NA']*4
|
|
604 else:
|
|
605 gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
|
|
606 gaobj=SeqIO.parse(gaseq,'fasta').next()
|
|
607 gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
|
|
608 gbobj=SeqIO.parse(gbseq,'fasta').next()
|
|
609 ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
|
|
610 a=privutils.seqblast(gaseq,gbseq,blastn)
|
|
611 if a==None:
|
|
612 candidates[gp]=['NA','NA','100','0']
|
|
613 else:
|
|
614 candidates[gp]=a
|
|
615
|
|
616 header=flists[0][:]
|
|
617 header.extend(['Identity','Align_Len','Evalue','BitScore'])
|
|
618 outfile_o.write('%s\n'%('\t'.join(header)))
|
|
619
|
|
620 for info in flists[1:]:
|
|
621 geneA,geneB=info[0],info[1]
|
|
622 key='%s_%s'%(geneA,geneB)
|
|
623 vv=candidates[key]
|
|
624 row=info[:]
|
|
625 row.extend(vv)
|
|
626 outfile_o.write('%s\n'%('\t'.join(row)))
|
|
627 outfile_o.close()
|
|
628
|
|
629 ########################################################################################
|
|
630 print 'step done @ %s'%time.ctime()
|