Mercurial > repos > siyuan > prada

#!/usr/bin/env python

#PRADA: Pipeline for RnAseq Data Analysis
#Fusion detection module, algorithm published by Michael Berger et al (Genome Res, 2010) at Broad Institute.
#Implemented by Siyuan Zheng, Wandaliz Torres-Garcia and Rahul Vegesna.
#Copy Right: The Univ of Texas MD Anderson Cancer Center, Department of Bioinformatics and Computational Biology
#Contact: Roel Verhaak (rverhaak@mdanderson.org)
#Citation: to be added
#Tested with Python v2.6 and v2.7
#The program requires NM tag and high quality mapping reads with mapping score more than -minmapq.
#Last modified: 04/11/2013

######################################################################################
import sys
import time
import os
import os.path
import subprocess
import operator
import pysam
import bioclass
import gfclass
import ioprada
import privutils
from Bio import SeqIO,Seq

######################################################################################
args=sys.argv

help_menu='''\nPipeline for RNAseq Data Analaysis - fusion detection (PRADA).
    **Command**:
    prada-fusion -bam XX.bam -conf xx.txt -tag XX -mm 1 -junL XX -minmapq 30 -outdir ./
    **Parameters**:
    -h		print help message
    -bam	input BAM file, must has a .bam suffix. BAM is the output from PRADA preprocess module.
    -conf	config file for references and parameters. Use conf.txt in py-PRADA installation folder if none specified.
    -tag	a tag to describe the sample, used to name output files. Default is ''.
    -mm		number of mismatches allowed in discordant pairs and fusion spanning reads.Default is 1.
    -junL	length of sequences taken from EACH side of exons when making hypothetical junctions. No default.
    -minmapq	minimum read mapping quality to be considered as fusion evidences. Default is 30.
    -outdir	output directory.
    -v		print version
'''

if '-h' in args or '-help' in args or len(args)==1:
    print help_menu
    sys.exit(0)

if '-v' in args:
    import version
    print version.version
    sys.exit(0)

if '-bam' not in args:
    sys.exit('ERROR: BAM file needed')
i=args.index('-bam')
bampath=args[i+1]
bampath=os.path.abspath(bampath)
bam=os.path.basename(bampath)
if bam[-4:] != '.bam':
    sys.exit('ERROR: BAM file must have suffix .bam')

#Mismatch allowed. This filter is applied at the very end of the pipeline.
#I strongly suggest 1. We also record how many junction spanning reads (JSRs) are perfect matched.
if '-mm' not in args:
    mm=1
else:
    i=args.index('-mm')
    mm=int(args[i+1])

#junL should be less than the read length in the dataset. I suggest it to be read_length*0.8
if '-junL' not in args:
    sys.exit('ERROR: junL must be specified')
i=args.index('-junL')
overlap=int(args[i+1])

#minimum mapping quality for reads as fusion evidences
if '-minmapq' not in args:
    minmapq=30
else:
    i=args.index('-minmapq')
    minmapq=int(args[i+1])

if '-outdir' not in args:
    outpath=os.path.abspath('./')
else:
    i=args.index('-outdir')
    outpath=os.path.abspath(args[i+1])
if os.path.exists(outpath):
    print 'WARNING: outdir %s exists'%outpath
else:
    os.mkdir(outpath)

if '-tag' not in args:
    docstring='prada'
else:
    i=args.index('-tag')
    docstring=args[i+1]

#by default, search conf.txt in the prada folder
prada_path=os.path.dirname(os.path.abspath(__file__))   #path to find libraries and executives.
ref_search_path=[prada_path,os.getcwd()]                #search path for conf file if not specified in command line

if '-conf' in args:
    i=args.index('-conf')
    reffile=args[i+1]
    if os.path.exists(reffile):
        pass
    else:
        for pth in ref_search_path:
            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
            if os.path.exists(new_reffile):
                reffile=new_reffile
                break
        else:
            sys.exit('ERROR: conf file %s not found'%reffile)
else:
    reffile='%s/conf.txt'%prada_path
    if not os.path.exists(reffile):
        sys.exit('ERROR: No default conf.txt found and none specified')

#Now print all input parameters
print 'CMD: %s'%('\t'.join(args))

#reference files pointing to the annotation files.
refdict=ioprada.read_conf(reffile)
featurefile=refdict['--REF--']['feature_file']
txseqfile=refdict['--REF--']['tx_seq_file']
txcatfile=refdict['--REF--']['txcat_file']
cdsfile=refdict['--REF--']['cds_file']

#underlying utilities, automatically detected
#these are customized tools. update is needed.
samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path
blastn='%s/tools/blastn'%prada_path

######################################################################################
print 'step 0: loading gene annotations @ %s'%time.ctime()
#call functions in ioprada module //
txdb,genedb=ioprada.read_feature(featurefile,verbose=True)
tx_primary=ioprada.read_tx_cat(txcatfile)
tx_cds=ioprada.read_cds(cdsfile)

######################################################################################
print 'step 1: finding discordant pairs @ %s'%time.ctime()

#We sift through all exons of protein coding genes and get the mapping reads.
#Within, we exclude low mapping quality reads and PCR duplicates. For pairs that the two ends
#map to different genes, we all it a discordant pair.
#This is a step for finding all possible candidate fusions.

samfile=pysam.Samfile(bampath,'rb')

read1_ab={}
read2_ab={}
db1={}
db2={}

i,N=0,len(genedb)
for gene in genedb:
    i+=1
    if i%200==0:
        print '%d/%d genes processed for discordant pairs'%(i,N)
    g=genedb[gene]
    exons=g.get_exons()
    for e in exons.values():
        for rd in samfile.fetch(e.chr,e.start-1,e.end):
            if rd.mapq < minmapq:
                continue
            if rd.is_duplicate:
                continue
            if rd.mate_is_unmapped:   #at this point, only consider pairs
                continue
            if rd.rnext == rd.tid and rd.mpos <= g.end and rd.mpos >= g.start-1: #remove reads that fall into the same gene range
                continue
            if rd.is_read1:
                if read1_ab.has_key(rd.qname):
                    read1_ab[rd.qname].add(gene)
                else:
                    read1_ab[rd.qname]=set([gene])
                db1[rd.qname]=rd
            if rd.is_read2:
                if read2_ab.has_key(rd.qname):
                    read2_ab[rd.qname].add(gene)
                else:
                    read2_ab[rd.qname]=set([gene])
                db2[rd.qname]=rd

##output the discordant pairs and determine the orientation of candidate fusions
discordant={}   #catalogue all discordant pairs, using gene pairs as keys
outfile=open('%s/%s.discordant.txt'%(outpath,docstring),'w')
title=['read','gene1','gene1_chr','read1_pos','read1_mm','read1_strand','read1_orient','gene2',\
      'gene2_chr','read2_pos','read2_mm','read2_strand','read2_orient']
outfile.write('%s\n'%('\t'.join(title)))
i=0
for rdid in read1_ab:
    i+=1
    if i%10000==0:
        print '%d discordant pairs processed'%i
    if not read2_ab.has_key(rdid):   #skip if not all ends are catalogued
        continue
    g1set=read1_ab[rdid]    #consider all combinations if a read maps to multiple genes
    g2set=read2_ab[rdid]    #consider all combinations if a read maps to multiple genes
    r1,r2=db1[rdid],db2[rdid]
    read1strd='-1' if r1.is_reverse else '1'
    read2strd='-1' if r2.is_reverse else '1'
    for g1 in g1set:
        for g2 in g2set:
            if g1==g2:    #for some uncasual cases
                continue
            g1obj,g2obj=genedb[g1],genedb[g2]
            read1orient='F' if read1strd == g1obj.strand else 'R' #read1 --> gene1
            read2orient='F' if read2strd == g2obj.strand else 'R' #read2 --> gene2
            fkey=''
            if read1orient=='F' and read2orient=='R':    ##scenario I, gene1-gene2
                fkey=g1+'_'+g2
            if read1orient=='R' and read2orient=='F':    ##scenario II, gene2-gene1
                fkey=g2+'_'+g1
            if fkey:
                if discordant.has_key(fkey):
                    discordant[fkey].update({rdid:'%s:%s'%(read1orient,read2orient)})
                else:
                    discordant[fkey]={rdid:'%s:%s'%(read1orient,read2orient)}
                ##output
                nm1=str([x[1] for x in r1.tags if x[0]=='NM'][0])   #output mismatch, but does not consider it at this point
                nm2=str([x[1] for x in r2.tags if x[0]=='NM'][0])
                uvec=[rdid,g1,g1obj.chr,str(r1.pos+1),nm1,read1strd,read1orient,g2,g2obj.chr,str(r2.pos+1),nm2,read2strd,read2orient]
                outfile.write('%s\n'%('\t'.join(uvec)))
outfile.close()

##########################################################################
print 'step 2: finding recurrent pairs (candidates) @ %s'%time.ctime()

#step 2 finds all candidates that have at least 2 discordant pairs. Meanwhile, filter out potential
#read through events. read through is defined as reads with mapping position less than 1M, while meeting
#the strand expectation.

guess=[]
outfile=open('%s/%s.recurrent.txt'%(outpath,docstring),'w')
title=['geneA','geneA_chr','geneB','geneB_chr','num_pairs','IDs']
outfile.write('%s\n'%('\t'.join(title)))
for pp in discordant:
    if len(discordant[pp]) < 2:   #consider only "recurrent" (more than 1 pair support) cases
        continue
    gene1,gene2=pp.split('_')
    g1obj,g2obj=genedb[gene1],genedb[gene2]
    rdset=discordant[pp].keys()
    #filter read-through
    #readthrough is defined at read level, regardless of mapping genes
    for rd in rdset:
        r1,r2=db1[rd],db2[rd]
        read1strd='-1' if r1.is_reverse else '1'
        read2strd='-1' if r2.is_reverse else '1'
        readthrough=False
        if db1[rd].tid == db2[rd].tid and abs(db1[rd].pos - db2[rd].pos) <= 1000000:
            if discordant[pp][rd]=='F:R':
                if read1strd=='1' and read2strd=='-1' and db1[rd].pos < db2[rd].pos:
                    readthrough=True
                if read1strd=='-1' and read2strd=='1' and db1[rd].pos > db2[rd].pos:
                    readthrough=True
            if discordant[pp][rd]=='R:F':
                if read2strd=='1' and read1strd=='-1' and db2[rd].pos < db1[rd].pos:
                    readthrough=True
                if read2strd=='-1' and read1strd=='1' and db2[rd].pos > db1[rd].pos:
                    readthrough=True
            if readthrough:
                del discordant[pp][rd]    #in-place deletion!!!! Change the discordant variable in place!
    if len(discordant[pp]) < 2:    #skip all that have less than 2 supporting discordant read pairs after readthrough filter
        continue
    guess.append(pp)
    #output
    uvec2=[gene1,g1obj.chr,gene2,g2obj.chr,str(len(discordant[pp])),'|'.join(discordant[pp])]
    outfile.write('%s\n'%('\t'.join(uvec2)))
outfile.close()

##########################################################################
print 'step 3: finding potential junction spanning reads @ %s'%time.ctime()

#For all candidates, find potential junction spanning reads (JSRs). A JSR is defined as a unmapped read but with the mate mapping
#to either F or R partner, with high mapping quality. Since the JSR is unmapped, it is not practical to consider PCR duplicate
#because they are not properly marked.

Fpartners=set()
Rpartners=set()
for pp in guess:
    gs=pp.split('_')
    Fpartners.add(gs[0])
    Rpartners.add(gs[1])
AllPartners=Fpartners|Rpartners

samfile.reset()
posjun={}    ##catalogue all JSRs, with track of the mate mapping genes and orientation.
i,N=0,len(AllPartners)
for gene in AllPartners:
    i+=1
    if i%200==0:
        print '%d/%d genes processed for potential junc reads'%(i,N)
    g=genedb[gene]
    exons=g.get_exons().values()
    for e in exons:
        for rd in samfile.fetch(e.chr,e.start-1,e.end):
            if rd.mapq < minmapq:
                continue
            if not rd.mate_is_unmapped:
                continue
            readstrd='-1' if rd.is_reverse else '1'
            orient='F' if readstrd == g.strand else 'R'  #mapping info of mate read. JSR per se is unmapped in BAM
            posjun[rd.qname]={'gene':gene,'orient':orient}

samfile.reset()
outfile=open('%s/%s.pos_junc_unmapped.fastq'%(outpath,docstring),'w')
i,N=0,len(posjun)
for rd in samfile:
    if rd.mate_is_unmapped:  #since the read is potential jun spanning read, all mate map to A or B
        continue
    if rd.is_unmapped:
        if posjun.has_key(rd.qname):
            i+=1
            if i%10000==0:
                print 'extracted %d/%d potential junc reads'%(i,N)
            rdname='%s_prada_%s_prada_%s'%(rd.qname,posjun[rd.qname]['gene'],posjun[rd.qname]['orient']) #_prada_ as split
            outfile.write('@%s\n'%rdname)
            outfile.write('%s\n'%rd.seq)
            outfile.write('+\n')
            outfile.write('%s\n'%rd.qual)
outfile.close()

######################################################################################
print 'step 4: building junction database @ %s'%time.ctime()

#Make hypothetical junctions between candidate fusion partners. To improve speed, we make a big junction database comprising
#exons of all candidates, instead of by candidate individually. This also gives the possibility to assess the JSR mapping ambiguity
#across many junctions. It turned out very useful in filtering out false positives.
#Note that in PRADA transcript sequence file, all sequences are + strand sequences. For - strand transcript, one need to
#reverse complement the sequence to get the real transcript sequences.

seqdb={}
for record in SeqIO.parse(txseqfile,'fasta'):
    seqdb[record.name]=record

outfile=open('%s/%s.junction.fasta'%(outpath,docstring),'w')
i,N=0,len(guess)
for pp in guess:
    i+=1
    if i%100==0:
        print 'building junction for %d/%d pairs'%(i,N)
    gene1,gene2=pp.split('_')
    g1obj,g2obj=genedb[gene1],genedb[gene2]
    eset1=g1obj.get_exons()    #unique exons in gene 1
    eset2=g2obj.get_exons()    #unique exons in gene 2
    #collect unique junctions
    juncseqdict={}    #save junction sequences
    for e1 in eset1.values():
        for e2 in eset2.values():
            e1_jun_name='%s:%s:%s'%(gene1,e1.chr,e1.end) if e1.strand=='1' else '%s:%s:%s'%(gene1,e1.chr,e1.start)
            e2_jun_name='%s:%s:%s'%(gene2,e2.chr,e2.start) if e2.strand=='1' else '%s:%s:%s'%(gene2,e2.chr,e2.end)
            jun_name=e1_jun_name+'_'+e2_jun_name
            tx1,tx2=txdb[e1.transcript],txdb[e2.transcript]
            max_a=tx1.exon_relative_pos()[e1.name][1]
            min_a=0 if max_a - overlap < 0 else max_a - overlap
            min_b=tx2.exon_relative_pos()[e2.name][0]-1
            max_b=tx2.length if min_b + overlap > tx2.length else min_b + overlap
            #reverse complementary when necessary
            try:
                tx1seq=seqdb[tx1.name].seq.tostring() if tx1.strand=='1' else seqdb[tx1.name].reverse_complement().seq.tostring()
                tx2seq=seqdb[tx2.name].seq.tostring() if tx2.strand=='1' else seqdb[tx2.name].reverse_complement().seq.tostring()
            except KeyError:   #in case transcript not found in sequence file
                continue
            jun_seq=tx1seq[min_a:max_a]+tx2seq[min_b:max_b]
            juncseqdict[jun_name]=jun_seq
    for junc in juncseqdict:
        outfile.write('>%s\n'%junc)
        outfile.write('%s\n'%juncseqdict[junc])
outfile.close()
samfile.close()

#for memory efficiecy, del seqdb
del seqdb

########################################################################################
print 'step 5: aligning potential junction reads to junction database @ %s'%time.ctime()

#Mapping potential JSRs to hypothetical junction database.
#Allow 4 mismatches at the beginning.

idx_cmd='%s index %s/%s.junction.fasta'%(bwa,outpath,docstring)
os.system(idx_cmd)
aln_cmd='%s aln -n 4 -R 100 %s/%s.junction.fasta %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sai'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring)
os.system(aln_cmd)
samse_cmd='%s samse -n 1000 -s %s/%s.junction.fasta %s/%s.juncmap.sai %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sam'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring,outpath,docstring)
os.system(samse_cmd)
sam2bam_cmd='%s view -bS %s/%s.juncmap.sam -o %s/%s.juncmap.bam'%(samtools,outpath,docstring,outpath,docstring)
os.system(sam2bam_cmd)

jsam=pysam.Samfile('%s/%s.juncmap.bam'%(outpath,docstring),'rb')
#get the junction name directory
junctions=jsam.references
junname=dict(zip(range(0,len(junctions)),junctions))   #this is essential for quick speed.
junc_align={}

#go through the BAM file for meaningful (meeting fusion orientation etc) reads
strd_right_reads={}
rdb={}    #collect all junction spanning reads
i=0
for rd in jsam:
    i+=1
    if i%100000==0:
        print '%d junction alignments parsed'%i
    if rd.is_unmapped:
        continue
    read,mate_gene,mate_orient=rd.qname.split('_prada_')
    junc=junname[rd.tid]
    tmp=junc.split('_')
    gene1,gene2=[x.split(':')[0] for x in tmp]
    if gene1==mate_gene:
        if mate_orient=='F':
            if rd.is_reverse:
                if strd_right_reads.has_key(rd.qname):
                    strd_right_reads[rd.qname]+=1    #count how many times the read maps
                else:
                    strd_right_reads[rd.qname]=1
                rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones
    elif gene2==mate_gene:
        if mate_orient == 'R':
            if not rd.is_reverse:
                if strd_right_reads.has_key(rd.qname):
                    strd_right_reads[rd.qname]+=1
                else:
                    strd_right_reads[rd.qname]=1
                rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones

#find uniquely mapped reads and their gene pairs
junc_map={}  #a dictionary from junction to mapping reads
for rdname in strd_right_reads:
    if strd_right_reads[rdname] > 1: #remove non-unique junction spanning reads
        continue
    infodict=rdb[rdname]
    pp=infodict['gene1']+'_'+infodict['gene2']
    if junc_map.has_key(pp):
        junc_map[pp].add(rdname)
    else:
        junc_map[pp]=set([rdname])

########################################################################################
print 'step 6: summarizing fusion evidences @ %s'%time.ctime()

#Now, time to apply mismatch filter and summarize the results
#Candidate fusions --> guess
#Discordant pairs --> discordant, db1, db2
#Junction reads --> junc_map, rdb
#Gene info --> genedb

outfile_s=open('%s/%s.fus.candidates.txt'%(outpath,docstring),'w')
outfile_d=open('%s/%s.fus.evidences.txt'%(outpath,docstring),'w')

title=['Gene_A','Gene_B','A_chr','B_chr','A_strand','B_strand','Discordant_n','JSR_n','perfectJSR_n','Junc_n','Position_Consist','Junction']
outfile_s.write('%s\n'%('\t'.join(title)))

for pp in junc_map: #all pairs with junc reads
    gene1,gene2=pp.split('_')
    g1obj,g2obj=genedb[gene1],genedb[gene2]
    fus_disc=[]									#collecting discordant pairs
    for rdname in discordant[pp]:
        #arrange read1/read2 into F/R so it will be easier for GeneFusion obj to handle
        r1,r2=db1[rdname],db2[rdname]
        orient=discordant[pp][rdname]
        if orient=='F:R':
            fus_disc.append((r1,r2))
        elif orient=='R:F':
            fus_disc.append((r2,r1))
    fus_jsr=[]
    if junc_map.has_key(pp):
        for rdname in junc_map[pp]:
            r=rdb[rdname]['read']
            junc=rdb[rdname]['junc']
            jsr=gfclass.JSR(r,junc)
            fus_jsr.append(jsr)
    gf=gfclass.GeneFusion(gene1,gene2,fus_disc,fus_jsr)
    gf_new=gf.update(mm=mm)     ##apply the mismatch parameter, default is 1
    #output the results
    disc_n=str(len(gf_new.discordantpairs))
    junctions=sorted(gf_new.get_junction_freq(),key=operator.itemgetter(1),reverse=True) #sort junc by # of JSRs
    junc_n=str(len(junctions))
    junc_str='|'.join([','.join([x[0],str(x[1])]) for x in junctions])
    jsr_n=str(len(gf_new.fusionreads))
    pjsr_n=str(len(gf_new.get_perfect_JSR()))
    pos_consist=gf_new.positioncheck()
    svec=[gene1,gene2,g1obj.chr,g2obj.chr,g1obj.strand,g2obj.strand,disc_n,jsr_n,pjsr_n,junc_n,pos_consist,junc_str]
    outfile_s.write('%s\n'%('\t'.join(svec)))
    outfile_d.write('@@\t%s,%s,%s\t%s,%s,%s\n'%(gene1,g1obj.chr,g1obj.strand,gene2,g2obj.chr,g2obj.strand))
    outfile_d.write('\n')
    outfile_d.write('>discordant\n')
    for rp in gf_new.discordantpairs:
        rf,rr=rp
        nm1=[x[1] for x in rf.tags if x[0]=='NM'][0]
        nm2=[x[1] for x in rr.tags if x[0]=='NM'][0]
        outfile_d.write('%s\tF\t%s.%s.mm%d\n'%(rf.qname,gene1,rf.pos+1,nm1))    ##0-based coordinates
        outfile_d.write('%s\tR\t%s.%s.mm%d\n'%(rr.qname,gene2,rr.pos+1,nm2))    ##0-based coordinates
    outfile_d.write('\n')
    outfile_d.write('>spanning\n')
    for jsr in gf_new.fusionreads:
        r=jsr.read
        nm=[x[1] for x in r.tags if x[0]=='NM'][0]
        outfile_d.write('%s\t%s.mm%d\n'%(r.qname,jsr.junction,nm))
    outfile_d.write('\n')
    outfile_d.write('>junction\n')
    for junc_info in junctions:
        outfile_d.write('%s\t%d\n'%(junc_info[0],junc_info[1]))
    outfile_d.write('\n')
    outfile_d.write('>summary\n')
    outfile_d.write('Number of Discordant Pairs = %s\n'%disc_n)
    outfile_d.write('Number of Fusion Reads = %s\n'%jsr_n)
    outfile_d.write('Number of Perfect Fusion Reads = %s\n'%pjsr_n)
    outfile_d.write('Number of Distinct Junctions = %s\n'%junc_n)
    outfile_d.write('Position Consistency = %s\n'%pos_consist)
    outfile_d.write('\n')

outfile_s.close()
outfile_d.close()

########################################################################################
print 'step 7: generating fusion lists @ %s'%time.ctime()

#For convenience, filter the lists to candidates with
# 1) at least 2 discordant pairs
# 2) at least 1 perfect JSR
#meanwhile, calculate sequence similarity for each pair
#user may need to manually filter the lists per this measure.

#The following code is a copy of prada-homology
outfile_o=open('%s/%s.fus.summary.txt'%(outpath,docstring),'w')
ifname='%s/%s.fus.candidates.txt'%(outpath,docstring)
if not os.path.exists(ifname):
    sys.exit('ERROR: %s was not found'%ifname)

blastseq_tmp_dir='%s/blast_tmp/'%outpath
if not os.path.exists(blastseq_tmp_dir):
    os.mkdir(blastseq_tmp_dir)

flists=[]
infile=open(ifname)
iN=0
for line in open(ifname):
    info=line.strip().split('\t')
    if iN==0:
        iN+=1		#skip title
        flists.append(info)
        continue
    else:
        if int(info[6])>=2 and int(info[8])>=1 and info[10] in ['PARTIALLY','YES']:
            flists.append(info)
infile.close()

if len(flists)==1:  						#if no candidate passes the filters
    outfile_o.write('%s\n'%'\t'.join(flists[0]))
    outfile_o.close()
    print 'step done @ %s'%time.ctime()
    sys.exit(0)

candidates={}
for line in flists[1:]:
    geneA,geneB=line[0],line[1]
    key='%s_%s'%(geneA,geneB)
    candidates[key]=''

selecttranscript={}
for gene in genedb:
    txs=genedb[gene].transcript
    stx=txs[0]
    initlen=stx.length
    for tx in txs:
        if tx.length >= initlen:
            stx=tx
            initlen=stx.length
    selecttranscript[gene]=stx.name

allpartners=set()
for item in candidates:
    sset=set(item.split('_'))
    allpartners=allpartners.union(sset)

presenttxs=[]    #tx that is present in our annotation
absent=[]     #tx that is not in our annotation
for gene in allpartners:
    if selecttranscript.has_key(gene):
        presenttxs.append(selecttranscript[gene])
    else:
        absent.append(gene)

for seq_record in SeqIO.parse(txseqfile,'fasta'):
    sid=seq_record.id
    seq=seq_record.seq
    if sid in presenttxs:
        g=txdb[sid].gene
        fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
        SeqIO.write(seq_record,fastafile,'fasta')
        fastafile.close()

for gp in candidates:
    geneA,geneB=gp.split('_')
    if geneA in absent or geneB in absent:
        candidates[gp]=['NA']*4
    else:
        gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
        gaobj=SeqIO.parse(gaseq,'fasta').next()
        gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
        gbobj=SeqIO.parse(gbseq,'fasta').next()
        ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
        a=privutils.seqblast(gaseq,gbseq,blastn)
        if a==None:
            candidates[gp]=['NA','NA','100','0']
        else:
            candidates[gp]=a

header=flists[0][:]
header.extend(['Identity','Align_Len','Evalue','BitScore'])
outfile_o.write('%s\n'%('\t'.join(header)))

for info in flists[1:]:
    geneA,geneB=info[0],info[1]
    key='%s_%s'%(geneA,geneB)
    vv=candidates[key]
    row=info[:]
    row.extend(vv)
    outfile_o.write('%s\n'%('\t'.join(row)))
outfile_o.close()

########################################################################################
print 'step done @ %s'%time.ctime()
author	siyuan
date	Tue, 11 Mar 2014 12:14:01 -0400
parents	acc2ca1a3ba4
children