Mercurial > repos > siyuan > prada

diff pyPRADA_1.2/prada-fusion @ 0:acc2ca1a3ba4
Uploaded
author: siyuan
date: Thu, 20 Feb 2014 00:44:58 -0500
children: f17965495ec9
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/prada-fusion	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,630 @@
+#!/usr/bin/env python
+
+#PRADA: Pipeline for RnAseq Data Analysis
+#Fusion detection module, algorithm published by Michael Berger et al (Genome Res, 2010) at Broad Institute. 
+#Implemented by Siyuan Zheng, Wandaliz Torres-Garcia and Rahul Vegesna. 
+#Copy Right: The Univ of Texas MD Anderson Cancer Center, Department of Bioinformatics and Computational Biology
+#Contact: Roel Verhaak (rverhaak@mdanderson.org)
+#Citation: to be added
+#Tested with Python v2.6 and v2.7 
+#The program requires NM tag and high quality mapping reads with mapping score more than -minmapq. 
+#Last modified: 04/11/2013
+
+######################################################################################
+import sys
+import time
+import os
+import os.path
+import subprocess
+import operator
+import pysam
+import bioclass
+import gfclass 
+import ioprada
+import privutils
+from Bio import SeqIO,Seq
+
+######################################################################################
+args=sys.argv
+
+help_menu='''\nPipeline for RNAseq Data Analaysis - fusion detection (PRADA).
+    **Command**:
+    prada-fusion -bam XX.bam -conf xx.txt -tag XX -mm 1 -junL XX -minmapq 30 -outdir ./ 
+    **Parameters**:
+    -h		print help message
+    -bam	input BAM file, must has a .bam suffix. BAM is the output from PRADA preprocess module.
+    -conf	config file for references and parameters. Use conf.txt in py-PRADA installation folder if none specified.
+    -tag	a tag to describe the sample, used to name output files. Default is ''.
+    -mm		number of mismatches allowed in discordant pairs and fusion spanning reads.Default is 1.  
+    -junL	length of sequences taken from EACH side of exons when making hypothetical junctions. No default.
+    -minmapq	minimum read mapping quality to be considered as fusion evidences. Default is 30.
+    -outdir	output directory.
+    -v		print version
+'''
+
+if '-h' in args or '-help' in args or len(args)==1:
+    print help_menu
+    sys.exit(0)
+
+if '-v' in args:
+    import version
+    print version.version
+    sys.exit(0)
+
+if '-bam' not in args:
+    sys.exit('ERROR: BAM file needed')
+i=args.index('-bam')
+bampath=args[i+1]
+bampath=os.path.abspath(bampath)
+bam=os.path.basename(bampath)
+if bam[-4:] != '.bam':
+    sys.exit('ERROR: BAM file must have suffix .bam')
+
+#Mismatch allowed. This filter is applied at the very end of the pipeline.
+#I strongly suggest 1. We also record how many junction spanning reads (JSRs) are perfect matched.
+if '-mm' not in args:
+    mm=1
+else:
+    i=args.index('-mm')
+    mm=int(args[i+1])
+
+#junL should be less than the read length in the dataset. I suggest it to be read_length*0.8
+if '-junL' not in args:
+    sys.exit('ERROR: junL must be specified')
+i=args.index('-junL')
+overlap=int(args[i+1])
+
+#minimum mapping quality for reads as fusion evidences
+if '-minmapq' not in args:
+    minmapq=30
+else:
+    i=args.index('-minmapq')
+    minmapq=int(args[i+1])
+
+if '-outdir' not in args:
+    outpath=os.path.abspath('./')
+else:
+    i=args.index('-outdir')
+    outpath=os.path.abspath(args[i+1])
+if os.path.exists(outpath):
+    print 'WARNING: outdir %s exists'%outpath
+else:
+    os.mkdir(outpath)
+
+if '-tag' not in args:
+    docstring='prada'
+else:
+    i=args.index('-tag')
+    docstring=args[i+1]
+
+#by default, search conf.txt in the prada folder
+prada_path=os.path.dirname(os.path.abspath(__file__))   #path to find libraries and executives.
+ref_search_path=[prada_path,os.getcwd()]                #search path for conf file if not specified in command line
+
+if '-conf' in args:
+    i=args.index('-ref')
+    reffile=args[i+1]
+    if os.path.exists(reffile):
+        pass
+    else:
+        for pth in ref_search_path:
+            new_reffile='%s/%s'%(pth, os.path.basename(reffile))  
+            if os.path.exists(new_reffile):
+                reffile=new_reffile
+                break
+        else:
+            sys.exit('ERROR: conf file %s not found'%reffile)
+else:
+    reffile='%s/conf.txt'%prada_path
+    if not os.path.exists(reffile):
+        sys.exit('ERROR: No default conf.txt found and none specified')
+
+#Now print all input parameters
+print 'CMD: %s'%('\t'.join(args))
+
+#reference files pointing to the annotation files. 
+refdict=ioprada.read_conf(reffile)
+featurefile=refdict['--REF--']['feature_file']
+txseqfile=refdict['--REF--']['tx_seq_file']
+txcatfile=refdict['--REF--']['txcat_file']
+cdsfile=refdict['--REF--']['cds_file']
+
+#underlying utilities, automatically detected
+#these are customized tools. update is needed. 
+samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
+bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path
+blastn='%s/tools/blastn'%prada_path
+
+######################################################################################
+print 'step 0: loading gene annotations @ %s'%time.ctime()
+#call functions in ioprada module // 
+txdb,genedb=ioprada.read_feature(featurefile,verbose=True)
+tx_primary=ioprada.read_tx_cat(txcatfile)
+tx_cds=ioprada.read_cds(cdsfile)
+
+######################################################################################
+print 'step 1: finding discordant pairs @ %s'%time.ctime()
+
+#We sift through all exons of protein coding genes and get the mapping reads. 
+#Within, we exclude low mapping quality reads and PCR duplicates. For pairs that the two ends
+#map to different genes, we all it a discordant pair. 
+#This is a step for finding all possible candidate fusions. 
+
+samfile=pysam.Samfile(bampath,'rb')
+
+read1_ab={}
+read2_ab={}
+db1={}
+db2={}
+
+i,N=0,len(genedb)
+for gene in genedb:
+    i+=1
+    if i%200==0:
+        print '%d/%d genes processed for discordant pairs'%(i,N)
+    g=genedb[gene]
+    exons=g.get_exons()
+    for e in exons.values():
+        for rd in samfile.fetch(e.chr,e.start-1,e.end):
+            if rd.mapq < minmapq:
+                continue
+            if rd.is_duplicate:
+                continue
+            if rd.mate_is_unmapped:   #at this point, only consider pairs
+                continue
+            if rd.rnext == rd.tid and rd.mpos <= g.end and rd.mpos >= g.start-1: #remove reads that fall into the same gene range
+                continue
+            if rd.is_read1:
+                if read1_ab.has_key(rd.qname):
+                    read1_ab[rd.qname].add(gene)
+                else:
+                    read1_ab[rd.qname]=set([gene])
+                db1[rd.qname]=rd
+            if rd.is_read2:
+                if read2_ab.has_key(rd.qname):
+                    read2_ab[rd.qname].add(gene)
+                else:
+                    read2_ab[rd.qname]=set([gene])
+                db2[rd.qname]=rd
+
+##output the discordant pairs and determine the orientation of candidate fusions
+discordant={}   #catalogue all discordant pairs, using gene pairs as keys
+outfile=open('%s/%s.discordant.txt'%(outpath,docstring),'w')
+title=['read','gene1','gene1_chr','read1_pos','read1_mm','read1_strand','read1_orient','gene2',\
+      'gene2_chr','read2_pos','read2_mm','read2_strand','read2_orient']
+outfile.write('%s\n'%('\t'.join(title)))
+i=0
+for rdid in read1_ab:
+    i+=1
+    if i%10000==0:
+        print '%d discordant pairs processed'%i
+    if not read2_ab.has_key(rdid):   #skip if not all ends are catalogued
+        continue
+    g1set=read1_ab[rdid]    #consider all combinations if a read maps to multiple genes
+    g2set=read2_ab[rdid]    #consider all combinations if a read maps to multiple genes
+    r1,r2=db1[rdid],db2[rdid]
+    read1strd='-1' if r1.is_reverse else '1'
+    read2strd='-1' if r2.is_reverse else '1'
+    for g1 in g1set:
+        for g2 in g2set:
+            if g1==g2:    #for some uncasual cases
+                continue
+            g1obj,g2obj=genedb[g1],genedb[g2]
+            read1orient='F' if read1strd == g1obj.strand else 'R' #read1 --> gene1
+            read2orient='F' if read2strd == g2obj.strand else 'R' #read2 --> gene2
+            fkey=''
+            if read1orient=='F' and read2orient=='R':    ##scenario I, gene1-gene2
+                fkey=g1+'_'+g2
+            if read1orient=='R' and read2orient=='F':    ##scenario II, gene2-gene1
+                fkey=g2+'_'+g1
+            if fkey:
+                if discordant.has_key(fkey):
+                    discordant[fkey].update({rdid:'%s:%s'%(read1orient,read2orient)})
+                else:
+                    discordant[fkey]={rdid:'%s:%s'%(read1orient,read2orient)}
+                ##output
+                nm1=str([x[1] for x in r1.tags if x[0]=='NM'][0])   #output mismatch, but does not consider it at this point
+                nm2=str([x[1] for x in r2.tags if x[0]=='NM'][0])
+                uvec=[rdid,g1,g1obj.chr,str(r1.pos+1),nm1,read1strd,read1orient,g2,g2obj.chr,str(r2.pos+1),nm2,read2strd,read2orient]
+                outfile.write('%s\n'%('\t'.join(uvec)))
+outfile.close()
+      
+##########################################################################
+print 'step 2: finding recurrent pairs (candidates) @ %s'%time.ctime()    
+
+#step 2 finds all candidates that have at least 2 discordant pairs. Meanwhile, filter out potential
+#read through events. read through is defined as reads with mapping position less than 1M, while meeting
+#the strand expectation.
+ 
+guess=[]
+outfile=open('%s/%s.recurrent.txt'%(outpath,docstring),'w')
+title=['geneA','geneA_chr','geneB','geneB_chr','num_pairs','IDs']
+outfile.write('%s\n'%('\t'.join(title)))
+for pp in discordant:
+    if len(discordant[pp]) < 2:   #consider only "recurrent" (more than 1 pair support) cases
+        continue
+    gene1,gene2=pp.split('_')
+    g1obj,g2obj=genedb[gene1],genedb[gene2]
+    rdset=discordant[pp].keys()
+    #filter read-through
+    #readthrough is defined at read level, regardless of mapping genes
+    for rd in rdset:
+        r1,r2=db1[rd],db2[rd]
+        read1strd='-1' if r1.is_reverse else '1'
+        read2strd='-1' if r2.is_reverse else '1'
+        readthrough=False
+        if db1[rd].tid == db2[rd].tid and abs(db1[rd].pos - db2[rd].pos) <= 1000000:
+            if discordant[pp][rd]=='F:R':
+                if read1strd=='1' and read2strd=='-1' and db1[rd].pos < db2[rd].pos:
+                    readthrough=True
+                if read1strd=='-1' and read2strd=='1' and db1[rd].pos > db2[rd].pos:
+                    readthrough=True
+            if discordant[pp][rd]=='R:F':
+                if read2strd=='1' and read1strd=='-1' and db2[rd].pos < db1[rd].pos:
+                    readthrough=True
+                if read2strd=='-1' and read1strd=='1' and db2[rd].pos > db1[rd].pos:
+                    readthrough=True
+            if readthrough:
+                del discordant[pp][rd]    #in-place deletion!!!! Change the discordant variable in place!
+    if len(discordant[pp]) < 2:    #skip all that have less than 2 supporting discordant read pairs after readthrough filter
+        continue
+    guess.append(pp)
+    #output
+    uvec2=[gene1,g1obj.chr,gene2,g2obj.chr,str(len(discordant[pp])),'|'.join(discordant[pp])]
+    outfile.write('%s\n'%('\t'.join(uvec2)))
+outfile.close()
+
+##########################################################################
+print 'step 3: finding potential junction spanning reads @ %s'%time.ctime()
+
+#For all candidates, find potential junction spanning reads (JSRs). A JSR is defined as a unmapped read but with the mate mapping
+#to either F or R partner, with high mapping quality. Since the JSR is unmapped, it is not practical to consider PCR duplicate 
+#because they are not properly marked. 
+
+Fpartners=set()
+Rpartners=set()
+for pp in guess:
+    gs=pp.split('_')
+    Fpartners.add(gs[0])
+    Rpartners.add(gs[1])
+AllPartners=Fpartners|Rpartners
+
+samfile.reset()
+posjun={}    ##catalogue all JSRs, with track of the mate mapping genes and orientation.
+i,N=0,len(AllPartners)
+for gene in AllPartners:
+    i+=1
+    if i%200==0:
+        print '%d/%d genes processed for potential junc reads'%(i,N)
+    g=genedb[gene]
+    exons=g.get_exons().values()
+    for e in exons:
+        for rd in samfile.fetch(e.chr,e.start-1,e.end):
+            if rd.mapq < minmapq:
+                continue
+            if not rd.mate_is_unmapped:
+                continue
+            readstrd='-1' if rd.is_reverse else '1'
+            orient='F' if readstrd == g.strand else 'R'  #mapping info of mate read. JSR per se is unmapped in BAM
+            posjun[rd.qname]={'gene':gene,'orient':orient}
+
+samfile.reset()
+outfile=open('%s/%s.pos_junc_unmapped.fastq'%(outpath,docstring),'w')
+i,N=0,len(posjun)
+for rd in samfile:
+    if rd.mate_is_unmapped:  #since the read is potential jun spanning read, all mate map to A or B
+        continue
+    if rd.is_unmapped:
+        if posjun.has_key(rd.qname):
+            i+=1
+            if i%10000==0:
+                print 'extracted %d/%d potential junc reads'%(i,N)
+            rdname='%s_prada_%s_prada_%s'%(rd.qname,posjun[rd.qname]['gene'],posjun[rd.qname]['orient']) #_prada_ as split
+            outfile.write('@%s\n'%rdname)
+            outfile.write('%s\n'%rd.seq)
+            outfile.write('+\n')
+            outfile.write('%s\n'%rd.qual)
+outfile.close()
+
+######################################################################################
+print 'step 4: building junction database @ %s'%time.ctime()
+
+#Make hypothetical junctions between candidate fusion partners. To improve speed, we make a big junction database comprising 
+#exons of all candidates, instead of by candidate individually. This also gives the possibility to assess the JSR mapping ambiguity
+#across many junctions. It turned out very useful in filtering out false positives. 
+#Note that in PRADA transcript sequence file, all sequences are + strand sequences. For - strand transcript, one need to 
+#reverse complement the sequence to get the real transcript sequences. 
+
+seqdb={}
+for record in SeqIO.parse(txseqfile,'fasta'):
+    seqdb[record.name]=record
+
+outfile=open('%s/%s.junction.fasta'%(outpath,docstring),'w')
+i,N=0,len(guess)
+for pp in guess:
+    i+=1
+    if i%100==0:
+        print 'building junction for %d/%d pairs'%(i,N)
+    gene1,gene2=pp.split('_')
+    g1obj,g2obj=genedb[gene1],genedb[gene2]
+    eset1=g1obj.get_exons()    #unique exons in gene 1
+    eset2=g2obj.get_exons()    #unique exons in gene 2
+    #collect unique junctions
+    juncseqdict={}    #save junction sequences
+    for e1 in eset1.values():
+        for e2 in eset2.values():
+            e1_jun_name='%s:%s:%s'%(gene1,e1.chr,e1.end) if e1.strand=='1' else '%s:%s:%s'%(gene1,e1.chr,e1.start)
+            e2_jun_name='%s:%s:%s'%(gene2,e2.chr,e2.start) if e2.strand=='1' else '%s:%s:%s'%(gene2,e2.chr,e2.end)
+            jun_name=e1_jun_name+'_'+e2_jun_name
+            tx1,tx2=txdb[e1.transcript],txdb[e2.transcript]
+            max_a=tx1.exon_relative_pos()[e1.name][1]
+            min_a=0 if max_a - overlap < 0 else max_a - overlap
+            min_b=tx2.exon_relative_pos()[e2.name][0]-1
+            max_b=tx2.length if min_b + overlap > tx2.length else min_b + overlap
+            #reverse complementary when necessary
+            try:
+                tx1seq=seqdb[tx1.name].seq.tostring() if tx1.strand=='1' else seqdb[tx1.name].reverse_complement().seq.tostring()
+                tx2seq=seqdb[tx2.name].seq.tostring() if tx2.strand=='1' else seqdb[tx2.name].reverse_complement().seq.tostring()
+            except KeyError:   #in case transcript not found in sequence file
+                continue
+            jun_seq=tx1seq[min_a:max_a]+tx2seq[min_b:max_b]
+            juncseqdict[jun_name]=jun_seq
+    for junc in juncseqdict:
+        outfile.write('>%s\n'%junc)
+        outfile.write('%s\n'%juncseqdict[junc])
+outfile.close()
+samfile.close()
+
+#for memory efficiecy, del seqdb
+del seqdb
+
+########################################################################################
+print 'step 5: aligning potential junction reads to junction database @ %s'%time.ctime()
+
+#Mapping potential JSRs to hypothetical junction database. 
+#Allow 4 mismatches at the beginning.
+
+idx_cmd='%s index %s/%s.junction.fasta'%(bwa,outpath,docstring)
+os.system(idx_cmd)
+aln_cmd='%s aln -n 4 -R 100 %s/%s.junction.fasta %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sai'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring)
+os.system(aln_cmd)
+samse_cmd='%s samse -n 1000 -s %s/%s.junction.fasta %s/%s.juncmap.sai %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sam'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring,outpath,docstring)
+os.system(samse_cmd)
+sam2bam_cmd='%s view -bS %s/%s.juncmap.sam -o %s/%s.juncmap.bam'%(samtools,outpath,docstring,outpath,docstring)
+os.system(sam2bam_cmd)
+
+jsam=pysam.Samfile('%s/%s.juncmap.bam'%(outpath,docstring),'rb')
+#get the junction name directory
+junctions=jsam.references
+junname=dict(zip(range(0,len(junctions)),junctions))   #this is essential for quick speed.
+junc_align={}
+
+#go through the BAM file for meaningful (meeting fusion orientation etc) reads
+strd_right_reads={}
+rdb={}    #collect all junction spanning reads
+i=0
+for rd in jsam:
+    i+=1
+    if i%100000==0:
+        print '%d junction alignments parsed'%i
+    if rd.is_unmapped:
+        continue
+    read,mate_gene,mate_orient=rd.qname.split('_prada_')
+    junc=junname[rd.tid]
+    tmp=junc.split('_')
+    gene1,gene2=[x.split(':')[0] for x in tmp]
+    if gene1==mate_gene:
+        if mate_orient=='F':
+            if rd.is_reverse:
+                if strd_right_reads.has_key(rd.qname):
+                    strd_right_reads[rd.qname]+=1    #count how many times the read maps
+                else:  
+                    strd_right_reads[rd.qname]=1
+                rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones
+    elif gene2==mate_gene:
+        if mate_orient == 'R':
+            if not rd.is_reverse:
+                if strd_right_reads.has_key(rd.qname):
+                    strd_right_reads[rd.qname]+=1
+                else:
+                    strd_right_reads[rd.qname]=1
+                rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones
+
+#find uniquely mapped reads and their gene pairs
+junc_map={}  #a dictionary from junction to mapping reads
+for rdname in strd_right_reads:
+    if strd_right_reads[rdname] > 1: #remove non-unique junction spanning reads
+        continue
+    infodict=rdb[rdname]
+    pp=infodict['gene1']+'_'+infodict['gene2']
+    if junc_map.has_key(pp):
+        junc_map[pp].add(rdname)
+    else:
+        junc_map[pp]=set([rdname])
+    
+########################################################################################
+print 'step 6: summarizing fusion evidences @ %s'%time.ctime()
+
+#Now, time to apply mismatch filter and summarize the results
+#Candidate fusions --> guess
+#Discordant pairs --> discordant, db1, db2
+#Junction reads --> junc_map, rdb
+#Gene info --> genedb
+
+outfile_s=open('%s/%s.fus.candidates.txt'%(outpath,docstring),'w')
+outfile_d=open('%s/%s.fus.evidences.txt'%(outpath,docstring),'w')
+
+title=['Gene_A','Gene_B','A_chr','B_chr','A_strand','B_strand','Discordant_n','JSR_n','perfectJSR_n','Junc_n','Position_Consist','Junction']
+outfile_s.write('%s\n'%('\t'.join(title)))
+
+for pp in junc_map: #all pairs with junc reads
+    gene1,gene2=pp.split('_')
+    g1obj,g2obj=genedb[gene1],genedb[gene2]
+    fus_disc=[]									#collecting discordant pairs
+    for rdname in discordant[pp]:
+        #arrange read1/read2 into F/R so it will be easier for GeneFusion obj to handle
+        r1,r2=db1[rdname],db2[rdname]
+        orient=discordant[pp][rdname]
+        if orient=='F:R':
+            fus_disc.append((r1,r2))
+        elif orient=='R:F':
+            fus_disc.append((r2,r1))
+    fus_jsr=[]    
+    if junc_map.has_key(pp):
+        for rdname in junc_map[pp]:
+            r=rdb[rdname]['read']
+            junc=rdb[rdname]['junc']
+            jsr=gfclass.JSR(r,junc)
+            fus_jsr.append(jsr)
+    gf=gfclass.GeneFusion(gene1,gene2,fus_disc,fus_jsr)
+    gf_new=gf.update(mm=mm)     ##apply the mismatch parameter, default is 1
+    #output the results
+    disc_n=str(len(gf_new.discordantpairs))
+    junctions=sorted(gf_new.get_junction_freq(),key=operator.itemgetter(1),reverse=True) #sort junc by # of JSRs
+    junc_n=str(len(junctions))
+    junc_str='|'.join([','.join([x[0],str(x[1])]) for x in junctions])
+    jsr_n=str(len(gf_new.fusionreads))
+    pjsr_n=str(len(gf_new.get_perfect_JSR()))
+    pos_consist=gf_new.positioncheck()
+    svec=[gene1,gene2,g1obj.chr,g2obj.chr,g1obj.strand,g2obj.strand,disc_n,jsr_n,pjsr_n,junc_n,pos_consist,junc_str]
+    outfile_s.write('%s\n'%('\t'.join(svec)))
+    outfile_d.write('@@\t%s,%s,%s\t%s,%s,%s\n'%(gene1,g1obj.chr,g1obj.strand,gene2,g2obj.chr,g2obj.strand))
+    outfile_d.write('\n')
+    outfile_d.write('>discordant\n')
+    for rp in gf_new.discordantpairs:
+        rf,rr=rp
+        nm1=[x[1] for x in rf.tags if x[0]=='NM'][0]
+        nm2=[x[1] for x in rr.tags if x[0]=='NM'][0]    
+        outfile_d.write('%s\tF\t%s.%s.mm%d\n'%(rf.qname,gene1,rf.pos+1,nm1))    ##0-based coordinates
+        outfile_d.write('%s\tR\t%s.%s.mm%d\n'%(rr.qname,gene2,rr.pos+1,nm2))    ##0-based coordinates
+    outfile_d.write('\n')
+    outfile_d.write('>spanning\n') 
+    for jsr in gf_new.fusionreads:
+        r=jsr.read
+        nm=[x[1] for x in r.tags if x[0]=='NM'][0]
+        outfile_d.write('%s\t%s.mm%d\n'%(r.qname,jsr.junction,nm))
+    outfile_d.write('\n')
+    outfile_d.write('>junction\n')
+    for junc_info in junctions:
+        outfile_d.write('%s\t%d\n'%(junc_info[0],junc_info[1]))
+    outfile_d.write('\n')
+    outfile_d.write('>summary\n')
+    outfile_d.write('Number of Discordant Pairs = %s\n'%disc_n)
+    outfile_d.write('Number of Fusion Reads = %s\n'%jsr_n)
+    outfile_d.write('Number of Perfect Fusion Reads = %s\n'%pjsr_n)
+    outfile_d.write('Number of Distinct Junctions = %s\n'%junc_n)
+    outfile_d.write('Position Consistency = %s\n'%pos_consist)
+    outfile_d.write('\n')
+
+outfile_s.close()
+outfile_d.close() 
+
+########################################################################################
+print 'step 7: generating fusion lists @ %s'%time.ctime()
+
+#For convenience, filter the lists to candidates with 
+# 1) at least 2 discordant pairs
+# 2) at least 1 perfect JSR
+#meanwhile, calculate sequence similarity for each pair
+#user may need to manually filter the lists per this measure.
+
+#The following code is a copy of prada-homology
+outfile_o=open('%s/%s.fus.summary.txt'%(outpath,docstring),'w')
+ifname='%s/%s.fus.candidates.txt'%(outpath,docstring) 
+if not os.path.exists(ifname):
+    sys.exit('ERROR: %s was not found'%ifname)
+
+blastseq_tmp_dir='%s/blast_tmp/'%outpath
+if not os.path.exists(blastseq_tmp_dir):
+    os.mkdir(blastseq_tmp_dir)
+
+flists=[]
+infile=open(ifname)
+iN=0
+for line in open(ifname):
+    info=line.strip().split('\t')
+    if iN==0:
+        iN+=1		#skip title
+        flists.append(info)
+        continue
+    else:
+        if int(info[6])>=2 and int(info[8])>=1 and info[10] in ['PARTIALLY','YES']:
+            flists.append(info) 
+infile.close()
+
+if len(flists)==1:  						#if no candidate passes the filters
+    outfile_o.write('%s\n'%'\t'.join(flists[0]))
+    outfile_o.close()
+    print 'step done @ %s'%time.ctime()
+    sys.exit(0)
+
+candidates={}
+for line in flists[1:]:
+    geneA,geneB=line[0],line[1]
+    key='%s_%s'%(geneA,geneB)
+    candidates[key]=''
+
+selecttranscript={}
+for gene in genedb:
+    txs=genedb[gene].transcript
+    stx=txs[0]
+    initlen=stx.length
+    for tx in txs:
+        if tx.length >= initlen:
+            stx=tx
+            initlen=stx.length
+    selecttranscript[gene]=stx.name
+
+allpartners=set()
+for item in candidates:
+    sset=set(item.split('_'))
+    allpartners=allpartners.union(sset)
+
+presenttxs=[]    #tx that is present in our annotation
+absent=[]     #tx that is not in our annotation
+for gene in allpartners:
+    if selecttranscript.has_key(gene):
+        presenttxs.append(selecttranscript[gene])
+    else:
+        absent.append(gene)
+
+for seq_record in SeqIO.parse(txseqfile,'fasta'):
+    sid=seq_record.id
+    seq=seq_record.seq
+    if sid in presenttxs:
+        g=txdb[sid].gene
+        fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
+        SeqIO.write(seq_record,fastafile,'fasta')
+        fastafile.close()
+
+for gp in candidates:
+    geneA,geneB=gp.split('_')
+    if geneA in absent or geneB in absent:
+        candidates[gp]=['NA']*4
+    else:
+        gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
+        gaobj=SeqIO.parse(gaseq,'fasta').next()
+        gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
+        gbobj=SeqIO.parse(gbseq,'fasta').next()
+        ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
+        a=privutils.seqblast(gaseq,gbseq,blastn)
+        if a==None:
+            candidates[gp]=['NA','NA','100','0']
+        else:
+            candidates[gp]=a
+
+header=flists[0][:]
+header.extend(['Identity','Align_Len','Evalue','BitScore'])
+outfile_o.write('%s\n'%('\t'.join(header)))
+
+for info in flists[1:]:
+    geneA,geneB=info[0],info[1]
+    key='%s_%s'%(geneA,geneB)
+    vv=candidates[key]
+    row=info[:]
+    row.extend(vv)
+    outfile_o.write('%s\n'%('\t'.join(row)))
+outfile_o.close()
+
+########################################################################################
+print 'step done @ %s'%time.ctime()
author	siyuan
date	Thu, 20 Feb 2014 00:44:58 -0500
parents
children	f17965495ec9