Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/prada-fusion @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children | f17965495ec9 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/prada-fusion Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,630 @@ +#!/usr/bin/env python + +#PRADA: Pipeline for RnAseq Data Analysis +#Fusion detection module, algorithm published by Michael Berger et al (Genome Res, 2010) at Broad Institute. +#Implemented by Siyuan Zheng, Wandaliz Torres-Garcia and Rahul Vegesna. +#Copy Right: The Univ of Texas MD Anderson Cancer Center, Department of Bioinformatics and Computational Biology +#Contact: Roel Verhaak (rverhaak@mdanderson.org) +#Citation: to be added +#Tested with Python v2.6 and v2.7 +#The program requires NM tag and high quality mapping reads with mapping score more than -minmapq. +#Last modified: 04/11/2013 + +###################################################################################### +import sys +import time +import os +import os.path +import subprocess +import operator +import pysam +import bioclass +import gfclass +import ioprada +import privutils +from Bio import SeqIO,Seq + +###################################################################################### +args=sys.argv + +help_menu='''\nPipeline for RNAseq Data Analaysis - fusion detection (PRADA). + **Command**: + prada-fusion -bam XX.bam -conf xx.txt -tag XX -mm 1 -junL XX -minmapq 30 -outdir ./ + **Parameters**: + -h print help message + -bam input BAM file, must has a .bam suffix. BAM is the output from PRADA preprocess module. + -conf config file for references and parameters. Use conf.txt in py-PRADA installation folder if none specified. + -tag a tag to describe the sample, used to name output files. Default is ''. + -mm number of mismatches allowed in discordant pairs and fusion spanning reads.Default is 1. + -junL length of sequences taken from EACH side of exons when making hypothetical junctions. No default. + -minmapq minimum read mapping quality to be considered as fusion evidences. Default is 30. + -outdir output directory. + -v print version +''' + +if '-h' in args or '-help' in args or len(args)==1: + print help_menu + sys.exit(0) + +if '-v' in args: + import version + print version.version + sys.exit(0) + +if '-bam' not in args: + sys.exit('ERROR: BAM file needed') +i=args.index('-bam') +bampath=args[i+1] +bampath=os.path.abspath(bampath) +bam=os.path.basename(bampath) +if bam[-4:] != '.bam': + sys.exit('ERROR: BAM file must have suffix .bam') + +#Mismatch allowed. This filter is applied at the very end of the pipeline. +#I strongly suggest 1. We also record how many junction spanning reads (JSRs) are perfect matched. +if '-mm' not in args: + mm=1 +else: + i=args.index('-mm') + mm=int(args[i+1]) + +#junL should be less than the read length in the dataset. I suggest it to be read_length*0.8 +if '-junL' not in args: + sys.exit('ERROR: junL must be specified') +i=args.index('-junL') +overlap=int(args[i+1]) + +#minimum mapping quality for reads as fusion evidences +if '-minmapq' not in args: + minmapq=30 +else: + i=args.index('-minmapq') + minmapq=int(args[i+1]) + +if '-outdir' not in args: + outpath=os.path.abspath('./') +else: + i=args.index('-outdir') + outpath=os.path.abspath(args[i+1]) +if os.path.exists(outpath): + print 'WARNING: outdir %s exists'%outpath +else: + os.mkdir(outpath) + +if '-tag' not in args: + docstring='prada' +else: + i=args.index('-tag') + docstring=args[i+1] + +#by default, search conf.txt in the prada folder +prada_path=os.path.dirname(os.path.abspath(__file__)) #path to find libraries and executives. +ref_search_path=[prada_path,os.getcwd()] #search path for conf file if not specified in command line + +if '-conf' in args: + i=args.index('-ref') + reffile=args[i+1] + if os.path.exists(reffile): + pass + else: + for pth in ref_search_path: + new_reffile='%s/%s'%(pth, os.path.basename(reffile)) + if os.path.exists(new_reffile): + reffile=new_reffile + break + else: + sys.exit('ERROR: conf file %s not found'%reffile) +else: + reffile='%s/conf.txt'%prada_path + if not os.path.exists(reffile): + sys.exit('ERROR: No default conf.txt found and none specified') + +#Now print all input parameters +print 'CMD: %s'%('\t'.join(args)) + +#reference files pointing to the annotation files. +refdict=ioprada.read_conf(reffile) +featurefile=refdict['--REF--']['feature_file'] +txseqfile=refdict['--REF--']['tx_seq_file'] +txcatfile=refdict['--REF--']['txcat_file'] +cdsfile=refdict['--REF--']['cds_file'] + +#underlying utilities, automatically detected +#these are customized tools. update is needed. +samtools='%s/tools/samtools-0.1.16/samtools'%prada_path +bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path +blastn='%s/tools/blastn'%prada_path + +###################################################################################### +print 'step 0: loading gene annotations @ %s'%time.ctime() +#call functions in ioprada module // +txdb,genedb=ioprada.read_feature(featurefile,verbose=True) +tx_primary=ioprada.read_tx_cat(txcatfile) +tx_cds=ioprada.read_cds(cdsfile) + +###################################################################################### +print 'step 1: finding discordant pairs @ %s'%time.ctime() + +#We sift through all exons of protein coding genes and get the mapping reads. +#Within, we exclude low mapping quality reads and PCR duplicates. For pairs that the two ends +#map to different genes, we all it a discordant pair. +#This is a step for finding all possible candidate fusions. + +samfile=pysam.Samfile(bampath,'rb') + +read1_ab={} +read2_ab={} +db1={} +db2={} + +i,N=0,len(genedb) +for gene in genedb: + i+=1 + if i%200==0: + print '%d/%d genes processed for discordant pairs'%(i,N) + g=genedb[gene] + exons=g.get_exons() + for e in exons.values(): + for rd in samfile.fetch(e.chr,e.start-1,e.end): + if rd.mapq < minmapq: + continue + if rd.is_duplicate: + continue + if rd.mate_is_unmapped: #at this point, only consider pairs + continue + if rd.rnext == rd.tid and rd.mpos <= g.end and rd.mpos >= g.start-1: #remove reads that fall into the same gene range + continue + if rd.is_read1: + if read1_ab.has_key(rd.qname): + read1_ab[rd.qname].add(gene) + else: + read1_ab[rd.qname]=set([gene]) + db1[rd.qname]=rd + if rd.is_read2: + if read2_ab.has_key(rd.qname): + read2_ab[rd.qname].add(gene) + else: + read2_ab[rd.qname]=set([gene]) + db2[rd.qname]=rd + +##output the discordant pairs and determine the orientation of candidate fusions +discordant={} #catalogue all discordant pairs, using gene pairs as keys +outfile=open('%s/%s.discordant.txt'%(outpath,docstring),'w') +title=['read','gene1','gene1_chr','read1_pos','read1_mm','read1_strand','read1_orient','gene2',\ + 'gene2_chr','read2_pos','read2_mm','read2_strand','read2_orient'] +outfile.write('%s\n'%('\t'.join(title))) +i=0 +for rdid in read1_ab: + i+=1 + if i%10000==0: + print '%d discordant pairs processed'%i + if not read2_ab.has_key(rdid): #skip if not all ends are catalogued + continue + g1set=read1_ab[rdid] #consider all combinations if a read maps to multiple genes + g2set=read2_ab[rdid] #consider all combinations if a read maps to multiple genes + r1,r2=db1[rdid],db2[rdid] + read1strd='-1' if r1.is_reverse else '1' + read2strd='-1' if r2.is_reverse else '1' + for g1 in g1set: + for g2 in g2set: + if g1==g2: #for some uncasual cases + continue + g1obj,g2obj=genedb[g1],genedb[g2] + read1orient='F' if read1strd == g1obj.strand else 'R' #read1 --> gene1 + read2orient='F' if read2strd == g2obj.strand else 'R' #read2 --> gene2 + fkey='' + if read1orient=='F' and read2orient=='R': ##scenario I, gene1-gene2 + fkey=g1+'_'+g2 + if read1orient=='R' and read2orient=='F': ##scenario II, gene2-gene1 + fkey=g2+'_'+g1 + if fkey: + if discordant.has_key(fkey): + discordant[fkey].update({rdid:'%s:%s'%(read1orient,read2orient)}) + else: + discordant[fkey]={rdid:'%s:%s'%(read1orient,read2orient)} + ##output + nm1=str([x[1] for x in r1.tags if x[0]=='NM'][0]) #output mismatch, but does not consider it at this point + nm2=str([x[1] for x in r2.tags if x[0]=='NM'][0]) + uvec=[rdid,g1,g1obj.chr,str(r1.pos+1),nm1,read1strd,read1orient,g2,g2obj.chr,str(r2.pos+1),nm2,read2strd,read2orient] + outfile.write('%s\n'%('\t'.join(uvec))) +outfile.close() + +########################################################################## +print 'step 2: finding recurrent pairs (candidates) @ %s'%time.ctime() + +#step 2 finds all candidates that have at least 2 discordant pairs. Meanwhile, filter out potential +#read through events. read through is defined as reads with mapping position less than 1M, while meeting +#the strand expectation. + +guess=[] +outfile=open('%s/%s.recurrent.txt'%(outpath,docstring),'w') +title=['geneA','geneA_chr','geneB','geneB_chr','num_pairs','IDs'] +outfile.write('%s\n'%('\t'.join(title))) +for pp in discordant: + if len(discordant[pp]) < 2: #consider only "recurrent" (more than 1 pair support) cases + continue + gene1,gene2=pp.split('_') + g1obj,g2obj=genedb[gene1],genedb[gene2] + rdset=discordant[pp].keys() + #filter read-through + #readthrough is defined at read level, regardless of mapping genes + for rd in rdset: + r1,r2=db1[rd],db2[rd] + read1strd='-1' if r1.is_reverse else '1' + read2strd='-1' if r2.is_reverse else '1' + readthrough=False + if db1[rd].tid == db2[rd].tid and abs(db1[rd].pos - db2[rd].pos) <= 1000000: + if discordant[pp][rd]=='F:R': + if read1strd=='1' and read2strd=='-1' and db1[rd].pos < db2[rd].pos: + readthrough=True + if read1strd=='-1' and read2strd=='1' and db1[rd].pos > db2[rd].pos: + readthrough=True + if discordant[pp][rd]=='R:F': + if read2strd=='1' and read1strd=='-1' and db2[rd].pos < db1[rd].pos: + readthrough=True + if read2strd=='-1' and read1strd=='1' and db2[rd].pos > db1[rd].pos: + readthrough=True + if readthrough: + del discordant[pp][rd] #in-place deletion!!!! Change the discordant variable in place! + if len(discordant[pp]) < 2: #skip all that have less than 2 supporting discordant read pairs after readthrough filter + continue + guess.append(pp) + #output + uvec2=[gene1,g1obj.chr,gene2,g2obj.chr,str(len(discordant[pp])),'|'.join(discordant[pp])] + outfile.write('%s\n'%('\t'.join(uvec2))) +outfile.close() + +########################################################################## +print 'step 3: finding potential junction spanning reads @ %s'%time.ctime() + +#For all candidates, find potential junction spanning reads (JSRs). A JSR is defined as a unmapped read but with the mate mapping +#to either F or R partner, with high mapping quality. Since the JSR is unmapped, it is not practical to consider PCR duplicate +#because they are not properly marked. + +Fpartners=set() +Rpartners=set() +for pp in guess: + gs=pp.split('_') + Fpartners.add(gs[0]) + Rpartners.add(gs[1]) +AllPartners=Fpartners|Rpartners + +samfile.reset() +posjun={} ##catalogue all JSRs, with track of the mate mapping genes and orientation. +i,N=0,len(AllPartners) +for gene in AllPartners: + i+=1 + if i%200==0: + print '%d/%d genes processed for potential junc reads'%(i,N) + g=genedb[gene] + exons=g.get_exons().values() + for e in exons: + for rd in samfile.fetch(e.chr,e.start-1,e.end): + if rd.mapq < minmapq: + continue + if not rd.mate_is_unmapped: + continue + readstrd='-1' if rd.is_reverse else '1' + orient='F' if readstrd == g.strand else 'R' #mapping info of mate read. JSR per se is unmapped in BAM + posjun[rd.qname]={'gene':gene,'orient':orient} + +samfile.reset() +outfile=open('%s/%s.pos_junc_unmapped.fastq'%(outpath,docstring),'w') +i,N=0,len(posjun) +for rd in samfile: + if rd.mate_is_unmapped: #since the read is potential jun spanning read, all mate map to A or B + continue + if rd.is_unmapped: + if posjun.has_key(rd.qname): + i+=1 + if i%10000==0: + print 'extracted %d/%d potential junc reads'%(i,N) + rdname='%s_prada_%s_prada_%s'%(rd.qname,posjun[rd.qname]['gene'],posjun[rd.qname]['orient']) #_prada_ as split + outfile.write('@%s\n'%rdname) + outfile.write('%s\n'%rd.seq) + outfile.write('+\n') + outfile.write('%s\n'%rd.qual) +outfile.close() + +###################################################################################### +print 'step 4: building junction database @ %s'%time.ctime() + +#Make hypothetical junctions between candidate fusion partners. To improve speed, we make a big junction database comprising +#exons of all candidates, instead of by candidate individually. This also gives the possibility to assess the JSR mapping ambiguity +#across many junctions. It turned out very useful in filtering out false positives. +#Note that in PRADA transcript sequence file, all sequences are + strand sequences. For - strand transcript, one need to +#reverse complement the sequence to get the real transcript sequences. + +seqdb={} +for record in SeqIO.parse(txseqfile,'fasta'): + seqdb[record.name]=record + +outfile=open('%s/%s.junction.fasta'%(outpath,docstring),'w') +i,N=0,len(guess) +for pp in guess: + i+=1 + if i%100==0: + print 'building junction for %d/%d pairs'%(i,N) + gene1,gene2=pp.split('_') + g1obj,g2obj=genedb[gene1],genedb[gene2] + eset1=g1obj.get_exons() #unique exons in gene 1 + eset2=g2obj.get_exons() #unique exons in gene 2 + #collect unique junctions + juncseqdict={} #save junction sequences + for e1 in eset1.values(): + for e2 in eset2.values(): + e1_jun_name='%s:%s:%s'%(gene1,e1.chr,e1.end) if e1.strand=='1' else '%s:%s:%s'%(gene1,e1.chr,e1.start) + e2_jun_name='%s:%s:%s'%(gene2,e2.chr,e2.start) if e2.strand=='1' else '%s:%s:%s'%(gene2,e2.chr,e2.end) + jun_name=e1_jun_name+'_'+e2_jun_name + tx1,tx2=txdb[e1.transcript],txdb[e2.transcript] + max_a=tx1.exon_relative_pos()[e1.name][1] + min_a=0 if max_a - overlap < 0 else max_a - overlap + min_b=tx2.exon_relative_pos()[e2.name][0]-1 + max_b=tx2.length if min_b + overlap > tx2.length else min_b + overlap + #reverse complementary when necessary + try: + tx1seq=seqdb[tx1.name].seq.tostring() if tx1.strand=='1' else seqdb[tx1.name].reverse_complement().seq.tostring() + tx2seq=seqdb[tx2.name].seq.tostring() if tx2.strand=='1' else seqdb[tx2.name].reverse_complement().seq.tostring() + except KeyError: #in case transcript not found in sequence file + continue + jun_seq=tx1seq[min_a:max_a]+tx2seq[min_b:max_b] + juncseqdict[jun_name]=jun_seq + for junc in juncseqdict: + outfile.write('>%s\n'%junc) + outfile.write('%s\n'%juncseqdict[junc]) +outfile.close() +samfile.close() + +#for memory efficiecy, del seqdb +del seqdb + +######################################################################################## +print 'step 5: aligning potential junction reads to junction database @ %s'%time.ctime() + +#Mapping potential JSRs to hypothetical junction database. +#Allow 4 mismatches at the beginning. + +idx_cmd='%s index %s/%s.junction.fasta'%(bwa,outpath,docstring) +os.system(idx_cmd) +aln_cmd='%s aln -n 4 -R 100 %s/%s.junction.fasta %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sai'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring) +os.system(aln_cmd) +samse_cmd='%s samse -n 1000 -s %s/%s.junction.fasta %s/%s.juncmap.sai %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sam'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring,outpath,docstring) +os.system(samse_cmd) +sam2bam_cmd='%s view -bS %s/%s.juncmap.sam -o %s/%s.juncmap.bam'%(samtools,outpath,docstring,outpath,docstring) +os.system(sam2bam_cmd) + +jsam=pysam.Samfile('%s/%s.juncmap.bam'%(outpath,docstring),'rb') +#get the junction name directory +junctions=jsam.references +junname=dict(zip(range(0,len(junctions)),junctions)) #this is essential for quick speed. +junc_align={} + +#go through the BAM file for meaningful (meeting fusion orientation etc) reads +strd_right_reads={} +rdb={} #collect all junction spanning reads +i=0 +for rd in jsam: + i+=1 + if i%100000==0: + print '%d junction alignments parsed'%i + if rd.is_unmapped: + continue + read,mate_gene,mate_orient=rd.qname.split('_prada_') + junc=junname[rd.tid] + tmp=junc.split('_') + gene1,gene2=[x.split(':')[0] for x in tmp] + if gene1==mate_gene: + if mate_orient=='F': + if rd.is_reverse: + if strd_right_reads.has_key(rd.qname): + strd_right_reads[rd.qname]+=1 #count how many times the read maps + else: + strd_right_reads[rd.qname]=1 + rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones + elif gene2==mate_gene: + if mate_orient == 'R': + if not rd.is_reverse: + if strd_right_reads.has_key(rd.qname): + strd_right_reads[rd.qname]+=1 + else: + strd_right_reads[rd.qname]=1 + rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones + +#find uniquely mapped reads and their gene pairs +junc_map={} #a dictionary from junction to mapping reads +for rdname in strd_right_reads: + if strd_right_reads[rdname] > 1: #remove non-unique junction spanning reads + continue + infodict=rdb[rdname] + pp=infodict['gene1']+'_'+infodict['gene2'] + if junc_map.has_key(pp): + junc_map[pp].add(rdname) + else: + junc_map[pp]=set([rdname]) + +######################################################################################## +print 'step 6: summarizing fusion evidences @ %s'%time.ctime() + +#Now, time to apply mismatch filter and summarize the results +#Candidate fusions --> guess +#Discordant pairs --> discordant, db1, db2 +#Junction reads --> junc_map, rdb +#Gene info --> genedb + +outfile_s=open('%s/%s.fus.candidates.txt'%(outpath,docstring),'w') +outfile_d=open('%s/%s.fus.evidences.txt'%(outpath,docstring),'w') + +title=['Gene_A','Gene_B','A_chr','B_chr','A_strand','B_strand','Discordant_n','JSR_n','perfectJSR_n','Junc_n','Position_Consist','Junction'] +outfile_s.write('%s\n'%('\t'.join(title))) + +for pp in junc_map: #all pairs with junc reads + gene1,gene2=pp.split('_') + g1obj,g2obj=genedb[gene1],genedb[gene2] + fus_disc=[] #collecting discordant pairs + for rdname in discordant[pp]: + #arrange read1/read2 into F/R so it will be easier for GeneFusion obj to handle + r1,r2=db1[rdname],db2[rdname] + orient=discordant[pp][rdname] + if orient=='F:R': + fus_disc.append((r1,r2)) + elif orient=='R:F': + fus_disc.append((r2,r1)) + fus_jsr=[] + if junc_map.has_key(pp): + for rdname in junc_map[pp]: + r=rdb[rdname]['read'] + junc=rdb[rdname]['junc'] + jsr=gfclass.JSR(r,junc) + fus_jsr.append(jsr) + gf=gfclass.GeneFusion(gene1,gene2,fus_disc,fus_jsr) + gf_new=gf.update(mm=mm) ##apply the mismatch parameter, default is 1 + #output the results + disc_n=str(len(gf_new.discordantpairs)) + junctions=sorted(gf_new.get_junction_freq(),key=operator.itemgetter(1),reverse=True) #sort junc by # of JSRs + junc_n=str(len(junctions)) + junc_str='|'.join([','.join([x[0],str(x[1])]) for x in junctions]) + jsr_n=str(len(gf_new.fusionreads)) + pjsr_n=str(len(gf_new.get_perfect_JSR())) + pos_consist=gf_new.positioncheck() + svec=[gene1,gene2,g1obj.chr,g2obj.chr,g1obj.strand,g2obj.strand,disc_n,jsr_n,pjsr_n,junc_n,pos_consist,junc_str] + outfile_s.write('%s\n'%('\t'.join(svec))) + outfile_d.write('@@\t%s,%s,%s\t%s,%s,%s\n'%(gene1,g1obj.chr,g1obj.strand,gene2,g2obj.chr,g2obj.strand)) + outfile_d.write('\n') + outfile_d.write('>discordant\n') + for rp in gf_new.discordantpairs: + rf,rr=rp + nm1=[x[1] for x in rf.tags if x[0]=='NM'][0] + nm2=[x[1] for x in rr.tags if x[0]=='NM'][0] + outfile_d.write('%s\tF\t%s.%s.mm%d\n'%(rf.qname,gene1,rf.pos+1,nm1)) ##0-based coordinates + outfile_d.write('%s\tR\t%s.%s.mm%d\n'%(rr.qname,gene2,rr.pos+1,nm2)) ##0-based coordinates + outfile_d.write('\n') + outfile_d.write('>spanning\n') + for jsr in gf_new.fusionreads: + r=jsr.read + nm=[x[1] for x in r.tags if x[0]=='NM'][0] + outfile_d.write('%s\t%s.mm%d\n'%(r.qname,jsr.junction,nm)) + outfile_d.write('\n') + outfile_d.write('>junction\n') + for junc_info in junctions: + outfile_d.write('%s\t%d\n'%(junc_info[0],junc_info[1])) + outfile_d.write('\n') + outfile_d.write('>summary\n') + outfile_d.write('Number of Discordant Pairs = %s\n'%disc_n) + outfile_d.write('Number of Fusion Reads = %s\n'%jsr_n) + outfile_d.write('Number of Perfect Fusion Reads = %s\n'%pjsr_n) + outfile_d.write('Number of Distinct Junctions = %s\n'%junc_n) + outfile_d.write('Position Consistency = %s\n'%pos_consist) + outfile_d.write('\n') + +outfile_s.close() +outfile_d.close() + +######################################################################################## +print 'step 7: generating fusion lists @ %s'%time.ctime() + +#For convenience, filter the lists to candidates with +# 1) at least 2 discordant pairs +# 2) at least 1 perfect JSR +#meanwhile, calculate sequence similarity for each pair +#user may need to manually filter the lists per this measure. + +#The following code is a copy of prada-homology +outfile_o=open('%s/%s.fus.summary.txt'%(outpath,docstring),'w') +ifname='%s/%s.fus.candidates.txt'%(outpath,docstring) +if not os.path.exists(ifname): + sys.exit('ERROR: %s was not found'%ifname) + +blastseq_tmp_dir='%s/blast_tmp/'%outpath +if not os.path.exists(blastseq_tmp_dir): + os.mkdir(blastseq_tmp_dir) + +flists=[] +infile=open(ifname) +iN=0 +for line in open(ifname): + info=line.strip().split('\t') + if iN==0: + iN+=1 #skip title + flists.append(info) + continue + else: + if int(info[6])>=2 and int(info[8])>=1 and info[10] in ['PARTIALLY','YES']: + flists.append(info) +infile.close() + +if len(flists)==1: #if no candidate passes the filters + outfile_o.write('%s\n'%'\t'.join(flists[0])) + outfile_o.close() + print 'step done @ %s'%time.ctime() + sys.exit(0) + +candidates={} +for line in flists[1:]: + geneA,geneB=line[0],line[1] + key='%s_%s'%(geneA,geneB) + candidates[key]='' + +selecttranscript={} +for gene in genedb: + txs=genedb[gene].transcript + stx=txs[0] + initlen=stx.length + for tx in txs: + if tx.length >= initlen: + stx=tx + initlen=stx.length + selecttranscript[gene]=stx.name + +allpartners=set() +for item in candidates: + sset=set(item.split('_')) + allpartners=allpartners.union(sset) + +presenttxs=[] #tx that is present in our annotation +absent=[] #tx that is not in our annotation +for gene in allpartners: + if selecttranscript.has_key(gene): + presenttxs.append(selecttranscript[gene]) + else: + absent.append(gene) + +for seq_record in SeqIO.parse(txseqfile,'fasta'): + sid=seq_record.id + seq=seq_record.seq + if sid in presenttxs: + g=txdb[sid].gene + fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w') + SeqIO.write(seq_record,fastafile,'fasta') + fastafile.close() + +for gp in candidates: + geneA,geneB=gp.split('_') + if geneA in absent or geneB in absent: + candidates[gp]=['NA']*4 + else: + gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA) + gaobj=SeqIO.parse(gaseq,'fasta').next() + gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB) + gbobj=SeqIO.parse(gbseq,'fasta').next() + ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq)) + a=privutils.seqblast(gaseq,gbseq,blastn) + if a==None: + candidates[gp]=['NA','NA','100','0'] + else: + candidates[gp]=a + +header=flists[0][:] +header.extend(['Identity','Align_Len','Evalue','BitScore']) +outfile_o.write('%s\n'%('\t'.join(header))) + +for info in flists[1:]: + geneA,geneB=info[0],info[1] + key='%s_%s'%(geneA,geneB) + vv=candidates[key] + row=info[:] + row.extend(vv) + outfile_o.write('%s\n'%('\t'.join(row))) +outfile_o.close() + +######################################################################################## +print 'step done @ %s'%time.ctime()