Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/prada-guess-if @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/prada-guess-if Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,288 @@ +#!/usr/bin/env python + +#GUESS-if is to find abnormal intragenic fusions. +#It is an extension of the GUESS-ft method. + +import pysam +import subprocess +import os +import time +import sys +import bioclass +import ioprada + +args=sys.argv + +#Get all parameters +help_menu='''\nUsage: prada-guess-if Gene -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir ./ -unmap X\n + **Parameters**: + -conf the configure file. see prada-fusion -conf for details + -inputbam the input bam file + -mm number of mismatch allowed + -minmapq mininum mapping quality for reads to be used in fusion finding + -junL length of exons to be used for junctions. see prada-fusion + -outdir output directory + -unmap the numapped reads. useful if user need to run guess-ft multiple times +''' + +if '-h' in args or '-help' in args or len(args)==1: + print help_menu + sys.exit(0) + +######################################################################### +target=args[1] +if '-inputbam' not in args: + sys.exit('Input BAM needed') +else: + i=args.index('-inputbam') + sourcefile=args[i+1] +if '-outdir' not in args: + outdir='./' +else: + i=args.index('-outdir') + outdir=args[i+1] + if not os.path.exists(outdir): + os.mkdir(outdir) +if '-unmap' not in args: + unmapbam='%s/one.end.unmapped.bam'%outdir + extract_mask=1 +else: + i=args.index('-unmap') + unmapbam=args[i+1] + extract_mask=0 + +if '-mm' not in args: + mm=1 +else: + i=args.index('-mm') + mm=int(args[i+1]) + +#minimum mapping quality for reads as fusion evidences +if '-minmapq' not in args: + minmapq=30 +else: + i=args.index('-minmapq') + minmapq=int(args[i+1]) + +if '-junL' not in args: + sys.exit('-junL needed') +else: + i=args.index('-junL') + junL=int(args[i+1]) + +prada_path=os.path.dirname(os.path.abspath(__file__)) #### +ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line + +if '-conf' in args: + i=args.index('-conf') + reffile=args[i+1] + if os.path.exists(reffile): + pass + else: + for pth in ref_search_path: + new_reffile='%s/%s'%(pth, os.path.basename(reffile)) + if os.path.exists(new_reffile): + reffile=new_reffile + break + else: + sys.exit('ERROR: ref file %s not found'%reffile) +else: + reffile='%s/conf.txt'%prada_path + if not os.path.exists(reffile): + sys.exit('ERROR: No default conf.txt found and none specified') + +#reference files +refdict=ioprada.read_conf(reffile) +ref_anno=refdict['--REF--']['ref_anno'] +ref_map=refdict['--REF--']['ref_map'] +ref_fasta=refdict['--REF--']['ref_fasta'] +featurefile=refdict['--REF--']['feature_file'] + +samtools='%s/tools/samtools-0.1.16/samtools'%prada_path +bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path + +######################################################################### +print 'GUESS-if start: %s'%time.ctime() +print 'CMD: %s'%('\t'.join(args)) + +##get gene position information +gobj=ioprada.read_feature_genes(featurefile,target)[0] +if gobj is None: + sys.exit('%s not found'%target) + +gchr=gobj.strand +gchr_rev=True if gchr=='-1' else False + +#Generate unmapped reads. +if extract_mask: + cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam) + cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) + while True: + if cmdout.poll() is None: + print 'Extracting unmapped reads...' + time.sleep(120) + pass + if cmdout.poll()==0: + print 'Extracted unmapped reads' + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error extracting unmapped reads') + + +#Generate junction db +juncfile='%s.junction.fasta'%target +cmd='perl %s/make_intragenic_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,target,target,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile) +cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) +while True: + if cmdout.poll() is None: + print 'Generating junction db. Waiting...' + time.sleep(20) + pass + if cmdout.poll()==0: + print 'Generated Junction DB.' + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error generated Junction DB.') + +#scan BAM file for mapping reads. +samfile=pysam.Samfile(sourcefile,'rb') +reads_se=[] +reads_as=[] +for alignedread in samfile.fetch(gobj.chr,gobj.start-1,gobj.end): + if alignedread.mapq >= minmapq: + mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] + if mmf <= mm: + if alignedread.is_reverse==gchr_rev: #sense + reads_se.append(alignedread) + else: #anti-sense + reads_as.append(alignedread) +samfile.close() + +seonly,asonly=[],[] +for rd in reads_se: + if rd.mate_is_unmapped: + seonly.append(rd) +for rd in reads_as: + if rd.mate_is_unmapped: + asonly.append(rd) +seunmap=[x.qname for x in seonly] +asunmap=[x.qname for x in asonly] + +#find read sequences +print 'Extracting unmapped read sequences.' +print 'mate unmapped reads for sense strand:',len(seonly) +print 'mate unmapped reads for antisense strand:',len(asonly) +samfile=pysam.Samfile(unmapbam,'rb') +resfq_a=open('%s/%s_se_unmap.fq'%(outdir,target),'w') +resfq_b=open('%s/%s_as_unmap.fq'%(outdir,target),'w') +for item in samfile: + if item.qname in seunmap: + resfq_a.write('@%s\n'%item.qname) + resfq_a.write('%s\n'%item.seq) + resfq_a.write('+\n') + resfq_a.write('%s\n'%item.qual) + if item.qname in asunmap: + resfq_b.write('@%s\n'%item.qname) + resfq_b.write('%s\n'%item.seq) + resfq_b.write('+\n') + resfq_b.write('%s\n'%item.qual) +resfq_a.close() +resfq_b.close() +samfile.close() + +##indexing junction db +print 'Aligning reads to junction db' +cmd='%s index %s/%s'%(bwa,outdir,juncfile) +cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) +while True: + if cmdout.poll() is None: + time.sleep(3) + pass + if cmdout.poll()==0: + print 'Junction DB indexed.' + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error building junction db index') + +taga='%s_se'%target +tagb='%s_as'%target +for rs in [taga,tagb]: + cmd='%s aln -n %d -R 100 %s/%s %s/%s_unmap.fq > %s/%s_unmap.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs) + cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) + while True: + if cmdout.poll() is None: + time.sleep(5) + pass + if cmdout.poll()==0: + print 'Aligned unmapped reads group %s'%rs + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error aligning unmapped reads for group %s'%rs) + + cmd='%s samse -n 1000 %s/%s %s/%s_unmap.sai %s/%s_unmap.fq > %s/%s_unmap.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs) + cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) + while True: + if cmdout.poll() is None: + time.sleep(2) + pass + if cmdout.poll()==0: + print 'Converting to sam for group %s'%rs + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error converting to sam for group %s'%rs) + +qualrd_a=[] +junc_a=[] +samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,taga),'r') +for rd in samfile: + if not rd.is_unmapped and rd.is_reverse: + qualrd_a.append(rd) + junc_a.append(samfile.getrname(rd.tid)) +samfile.close() +qualrd_b=[] +junc_b=[] +samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,tagb),'r') +for rd in samfile: + if not rd.is_unmapped and not rd.is_reverse: + qualrd_b.append(rd) + junc_b.append(samfile.getrname(rd.tid)) +samfile.close() + +junc_span=[] +junc_span.extend(qualrd_a) +junc_span.extend(qualrd_b) + +junc_name=[] +junc_name.extend(junc_a) +junc_name.extend(junc_b) + +#Generate a summary report +sumfile=open('%s/%s.GUESS-IF.summary.txt'%(outdir,target),'w') +sumfile.write('%s\n'%(target)) +sumfile.write('\n') +sumfile.write('>spanning\n') +for i in range(len(junc_span)): + rd=junc_span[i] + jname=junc_name[i] + mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0] + ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j) + sumfile.write('%s\n'%ss) + +sumfile.write('\n') +sumfile.write('>junction\n') +juncol=[] +for item in set(junc_name): + nn=junc_name.count(item) + juncol.append([item,nn]) +juncol=sorted(juncol,key=lambda x:x[1],reverse=True) +for item in juncol: + sumfile.write('%s\t%s\n'%(item[0],item[1])) + +sumfile.write('\n') +sumfile.write('>summary\n') +sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span)) +sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name))) + +sumfile.close() +print 'Done: %s'%time.ctime() +