Mercurial > repos > siyuan > prada
view pyPRADA_1.2/prada-guess-ft @ 3:f17965495ec9 draft default tip
Uploaded
author | siyuan |
---|---|
date | Tue, 11 Mar 2014 12:14:01 -0400 |
parents | acc2ca1a3ba4 |
children |
line wrap: on
line source
#!/usr/bin/env python #GUESS-ft:General UsEr defined Supervised Search for fusion transcript. #This program is to perform targeted search of gene fusions from RNAseq data. #It uses samtools, pysam package. It requires a bam file and other reference files. #Note that it is a less accurate tool than prada-fusion. import pysam import subprocess import os import time import sys import bioclass import ioprada args=sys.argv #Get all parameters help_menu='''\nUsage: prada-guess-ft GeneA GeneB -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir X -unmap X\n **Parameters**: -conf the configure file. see prada-fusion -conf for details -inputbam the input bam file -mm number of mismatch allowed -minmapq mininum mapping quality for reads to be used in fusion finding -junL length of exons to be used for junctions. see prada-fusion -outdir output directory -unmap the numapped reads. useful if user need to run guess-ft multiple times ''' if '-h' in args or '-help' in args or len(args)==1: print help_menu sys.exit(0) ################################################################################## Gene_A=args[1] Gene_B=args[2] if '-inputbam' not in args: sys.exit('Input BAM needed') else: i=args.index('-inputbam') sourcefile=args[i+1] if '-outdir' not in args: outdir='./' else: i=args.index('-outdir') outdir=args[i+1] if not os.path.exists(outdir): os.mkdir(outdir) if '-unmap' not in args: #get unmapped.bam yourself unmapbam='%s/one.end.unmapped.bam'%outdir extract_mask=1 else: i=args.index('-unmap') unmapbam=args[i+1] extract_mask=0 if '-mm' not in args: mm=1 else: i=args.index('-mm') mm=int(args[i+1]) #minimum mapping quality for reads as fusion evidences if '-minmapq' not in args: minmapq=30 else: i=args.index('-minmapq') minmapq=int(args[i+1]) if '-junL' not in args: sys.exit('-junL needed') else: i=args.index('-junL') junL=int(args[i+1]) prada_path=os.path.dirname(os.path.abspath(__file__)) #### ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line if '-conf' in args: i=args.index('-conf') reffile=args[i+1] if os.path.exists(reffile): pass else: for pth in ref_search_path: new_reffile='%s/%s'%(pth, os.path.basename(reffile)) if os.path.exists(new_reffile): reffile=new_reffile break else: sys.exit('ERROR: ref file %s not found'%reffile) else: reffile='%s/conf.txt'%prada_path if not os.path.exists(reffile): sys.exit('ERROR: No default conf.txt found and none specified') #reference files refdict=ioprada.read_conf(reffile) ref_anno=refdict['--REF--']['ref_anno'] ref_map=refdict['--REF--']['ref_map'] ref_fasta=refdict['--REF--']['ref_fasta'] featurefile=refdict['--REF--']['feature_file'] samtools='%s/tools/samtools-0.1.16/samtools'%prada_path bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path print 'GUESS start: %s'%time.ctime() print 'CMD: %s'%('\t'.join(args)) ################################################################################## ##get gene position information gA,gB=ioprada.read_feature_genes(featurefile,Gene_A,Gene_B) if gA is None: sys.exit('%s not found'%Gene_A) if gB is None: sys.exit('%s not found'%Gene_B) #Generate unmapped reads. if extract_mask: cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: print 'Extracting unmapped reads...' time.sleep(120) pass if cmdout.poll()==0: print 'Extracted unmapped reads' break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error extracting unmapped reads') #Generate junction db juncfile='%s_%s.junction.fasta'%(Gene_A,Gene_B) cmd='perl %s/make_exon_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,Gene_A,Gene_B,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: print 'Generating junction db. Waiting...' time.sleep(20) pass if cmdout.poll()==0: print 'Generated Junction DB.' break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error generated Junction DB.') #read into the sam file, get gene-strand information print 'Finding discordant read pairs.' samfile = pysam.Samfile(sourcefile, "rb" ) #mapping quality based read collection #strand information considered #mismatch filter considered a_reads=[] b_reads=[] for alignedread in samfile.fetch(gA.chr,gA.start-1,gA.end): if alignedread.mapq >= minmapq: readastrd='-1' if alignedread.is_reverse else '1' mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] if readastrd==gA.strand and mmf <= mm: a_reads.append(alignedread) for alignedread in samfile.fetch(gB.chr,gB.start-1,gB.end): if alignedread.mapq >= minmapq: readbstrd='-1' if alignedread.is_reverse else '1' mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] if readbstrd != gB.strand and mmf <= mm: b_reads.append(alignedread) samfile.close() ar_ids=[x.qname for x in a_reads] br_ids=[x.qname for x in b_reads] disc=list(set(ar_ids).intersection(set(br_ids))) #discordant pairs #detect read pair that one end map to one partner, yet the other does not align print 'Determining potential junction spanning reads.' rpaonly=[] #reads that only map to gene A -- going to map the other end to junctions for rd in a_reads: if rd.mate_is_unmapped: rpaonly.append(rd) rpbonly=[] #reads that only map to gene B -- going to map the other end to junctions for rd in b_reads: if rd.mate_is_unmapped: rpbonly.append(rd) rpaonly_names=[x.qname for x in rpaonly] rpbonly_names=[x.qname for x in rpbonly] #find read sequences print 'Extracting unmapped read sequences.' print 'mate unmapped read for gene A:',len(rpaonly) print 'mate unmapped read for gene B:',len(rpbonly) samfile=pysam.Samfile(unmapbam,'rb') taga='%s-%s_a'%(Gene_A,Gene_B) tagb='%s-%s_b'%(Gene_A,Gene_B) resfq_a=open('%s/unmapreads_%s.fq'%(outdir,taga),'w') resfq_b=open('%s/unmapreads_%s.fq'%(outdir,tagb),'w') for item in samfile: if item.qname in rpaonly_names: resfq_a.write('@%s\n'%item.qname) resfq_a.write('%s\n'%item.seq) resfq_a.write('+\n') resfq_a.write('%s\n'%item.qual) if item.qname in rpbonly_names: resfq_b.write('@%s\n'%item.qname) resfq_b.write('%s\n'%item.seq) resfq_b.write('+\n') resfq_b.write('%s\n'%item.qual) resfq_a.close() resfq_b.close() samfile.close() ##indexing junction db print 'Aligning reads to junction db' cmd='%s index %s/%s'%(bwa,outdir,juncfile) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: time.sleep(3) pass if cmdout.poll()==0: print 'Junction DB indexed.' break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error building junction db index') ##align the unmapped reads to junction database for rs in [taga,tagb]: cmd='%s aln -n %d -R 100 %s/%s %s/unmapreads_%s.fq > %s/%s.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: time.sleep(5) pass if cmdout.poll()==0: print 'Aligned unmapped reads group %s'%rs break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error aligning unmapped reads for group %s'%rs) cmd='%s samse -n 1000 %s/%s %s/%s.sai %s/unmapreads_%s.fq > %s/%s.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: time.sleep(2) pass if cmdout.poll()==0: print 'Converting to sam for group %s'%rs break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error converting to sam for group %s'%rs) #parse results qualrd_a=[] junc_a=[] samfile=pysam.Samfile('%s/%s.sam'%(outdir,taga),'r') for rd in samfile: if not rd.is_unmapped and rd.is_reverse: qualrd_a.append(rd) junc_a.append(samfile.getrname(rd.tid)) samfile.close() qualrd_b=[] junc_b=[] samfile=pysam.Samfile('%s/%s.sam'%(outdir,tagb),'r') for rd in samfile: if not rd.is_unmapped and not rd.is_reverse: qualrd_b.append(rd) junc_b.append(samfile.getrname(rd.tid)) samfile.close() junc_span=[] junc_span.extend(qualrd_a) junc_span.extend(qualrd_b) junc_name=[] junc_name.extend(junc_a) junc_name.extend(junc_b) #Generate a summary report sumfile=open('%s/%s_%s.GUESS.summary.txt'%(outdir,Gene_A,Gene_B),'w') sumfile.write('%s\t%s\n'%(Gene_A,Gene_B)) sumfile.write('\n') sumfile.write('>discordant\n') for rdname in disc: ia=ar_ids.index(rdname) ib=br_ids.index(rdname) reada=a_reads[ia] readb=b_reads[ib] mm_a=[x[1] for x in reada.tags if x[0]=='NM'][0] mm_b=[x[1] for x in readb.tags if x[0]=='NM'][0] ss1='%s\tF\t%s.%d.mm%d'%(reada.qname,Gene_A,reada.pos+1,mm_a) #pysam use 0-based coordinates ss2='%s\tR\t%s.%d.mm%d'%(readb.qname,Gene_B,readb.pos+1,mm_b) sumfile.write('%s\n'%ss1) sumfile.write('%s\n'%ss2) sumfile.write('\n') sumfile.write('>spanning\n') for i in range(len(junc_span)): rd=junc_span[i] jname=junc_name[i] mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0] ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j) sumfile.write('%s\n'%ss) sumfile.write('\n') sumfile.write('>junction\n') juncol=[] for item in set(junc_name): nn=junc_name.count(item) juncol.append([item,nn]) juncol=sorted(juncol,key=lambda x:x[1],reverse=True) for item in juncol: sumfile.write('%s\t%s\n'%(item[0],item[1])) sumfile.write('\n') sumfile.write('>summary\n') sumfile.write('Number of Discordant Pairs = %d\n'%len(disc)) sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span)) sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name))) sumfile.close() print 'Done: %s'%time.ctime()