view pyPRADA_1.2/prada-guess-ft @ 3:f17965495ec9 draft default tip

Uploaded
author siyuan
date Tue, 11 Mar 2014 12:14:01 -0400
parents acc2ca1a3ba4
children
line wrap: on
line source

#!/usr/bin/env python

#GUESS-ft:General UsEr defined Supervised Search for fusion transcript.
#This program is to perform targeted search of gene fusions from RNAseq data.
#It uses samtools, pysam package. It requires a bam file and other reference files.
#Note that it is a less accurate tool than prada-fusion.

import pysam
import subprocess
import os
import time
import sys
import bioclass
import ioprada

args=sys.argv

#Get all parameters
help_menu='''\nUsage: prada-guess-ft GeneA GeneB -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir X -unmap X\n
    **Parameters**:
        -conf		the configure file. see prada-fusion -conf for details
        -inputbam	the input bam file
        -mm		number of mismatch allowed
        -minmapq	mininum mapping quality for reads to be used in fusion finding
        -junL		length of exons to be used for junctions. see prada-fusion
        -outdir		output directory
        -unmap		the numapped reads. useful if user need to run guess-ft multiple times
'''

if '-h' in args or '-help' in args or len(args)==1:
    print help_menu 
    sys.exit(0)

##################################################################################
Gene_A=args[1]
Gene_B=args[2]
if '-inputbam' not in args:
    sys.exit('Input BAM needed')
else:
    i=args.index('-inputbam')
    sourcefile=args[i+1]
if '-outdir' not in args:
    outdir='./'
else:
    i=args.index('-outdir')
    outdir=args[i+1]
    if not os.path.exists(outdir):
	    os.mkdir(outdir)
if '-unmap' not in args:    #get unmapped.bam yourself
	unmapbam='%s/one.end.unmapped.bam'%outdir
	extract_mask=1
else:
	i=args.index('-unmap')
	unmapbam=args[i+1]
	extract_mask=0
if '-mm' not in args:
    mm=1
else:
    i=args.index('-mm')
    mm=int(args[i+1])
#minimum mapping quality for reads as fusion evidences
if '-minmapq' not in args:
    minmapq=30
else:
    i=args.index('-minmapq')
    minmapq=int(args[i+1])

if '-junL' not in args:
    sys.exit('-junL needed')
else:
    i=args.index('-junL')
    junL=int(args[i+1])

prada_path=os.path.dirname(os.path.abspath(__file__))   ####
ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line

if '-conf' in args:
    i=args.index('-conf')
    reffile=args[i+1]
    if os.path.exists(reffile):
        pass
    else:
        for pth in ref_search_path:
            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
            if os.path.exists(new_reffile):
                reffile=new_reffile
                break
        else:
            sys.exit('ERROR: ref file %s not found'%reffile)
else:
    reffile='%s/conf.txt'%prada_path
    if not os.path.exists(reffile):
        sys.exit('ERROR: No default conf.txt found and none specified')

#reference files
refdict=ioprada.read_conf(reffile)
ref_anno=refdict['--REF--']['ref_anno']
ref_map=refdict['--REF--']['ref_map']
ref_fasta=refdict['--REF--']['ref_fasta']
featurefile=refdict['--REF--']['feature_file']

samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path

print 'GUESS start: %s'%time.ctime()
print 'CMD: %s'%('\t'.join(args))

##################################################################################
##get gene position information

gA,gB=ioprada.read_feature_genes(featurefile,Gene_A,Gene_B)
if gA is None:
    sys.exit('%s not found'%Gene_A)
if gB is None: 
    sys.exit('%s not found'%Gene_B)

#Generate unmapped reads.
if extract_mask:
	cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam)
	cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
	while True:
		if cmdout.poll() is None:
			print 'Extracting unmapped reads...'
			time.sleep(120)
			pass
		if cmdout.poll()==0:
			print 'Extracted unmapped reads'
			break
		if cmdout.poll() is not None and cmdout.poll() != 0:
			raise Exception('Error extracting unmapped reads')

#Generate junction db
juncfile='%s_%s.junction.fasta'%(Gene_A,Gene_B)
cmd='perl %s/make_exon_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,Gene_A,Gene_B,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile)
cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
while True:
	if cmdout.poll() is None:
		print 'Generating junction db. Waiting...'
		time.sleep(20)
		pass
	if cmdout.poll()==0:
		print 'Generated Junction DB.'
		break
	if cmdout.poll() is not None and cmdout.poll() != 0:
		raise Exception('Error generated Junction DB.')

#read into the sam file, get gene-strand information
print 'Finding discordant read pairs.'
samfile = pysam.Samfile(sourcefile, "rb" )

#mapping quality based read collection
#strand information considered
#mismatch filter considered
a_reads=[]
b_reads=[]
for alignedread in samfile.fetch(gA.chr,gA.start-1,gA.end):
	if alignedread.mapq >= minmapq:
		readastrd='-1' if alignedread.is_reverse else '1'
		mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0]
		if readastrd==gA.strand and mmf <= mm:
			a_reads.append(alignedread)

for alignedread in samfile.fetch(gB.chr,gB.start-1,gB.end):
	if alignedread.mapq >= minmapq:
		readbstrd='-1' if alignedread.is_reverse else '1'
		mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0]
		if readbstrd != gB.strand and mmf <= mm:
			b_reads.append(alignedread)
samfile.close()

ar_ids=[x.qname for x in a_reads]
br_ids=[x.qname for x in b_reads]
disc=list(set(ar_ids).intersection(set(br_ids)))   #discordant pairs

#detect read pair that one end map to one partner, yet the other does not align
print 'Determining potential junction spanning reads.'
rpaonly=[]    #reads that only map to gene A -- going to map the other end to junctions
for rd in a_reads:
	if rd.mate_is_unmapped:
		rpaonly.append(rd)
rpbonly=[]    #reads that only map to gene B -- going to map the other end to junctions
for rd in b_reads:
    if rd.mate_is_unmapped:
        rpbonly.append(rd)
rpaonly_names=[x.qname for x in rpaonly]
rpbonly_names=[x.qname for x in rpbonly]

#find read sequences
print 'Extracting unmapped read sequences.'
print 'mate unmapped read for gene A:',len(rpaonly)
print 'mate unmapped read for gene B:',len(rpbonly) 
samfile=pysam.Samfile(unmapbam,'rb')
taga='%s-%s_a'%(Gene_A,Gene_B)
tagb='%s-%s_b'%(Gene_A,Gene_B)
resfq_a=open('%s/unmapreads_%s.fq'%(outdir,taga),'w')
resfq_b=open('%s/unmapreads_%s.fq'%(outdir,tagb),'w')
for item in samfile:
	if item.qname in rpaonly_names:
		resfq_a.write('@%s\n'%item.qname)
		resfq_a.write('%s\n'%item.seq)
		resfq_a.write('+\n')
		resfq_a.write('%s\n'%item.qual)
	if item.qname in rpbonly_names:
		resfq_b.write('@%s\n'%item.qname)
		resfq_b.write('%s\n'%item.seq)
		resfq_b.write('+\n')
		resfq_b.write('%s\n'%item.qual)
resfq_a.close()
resfq_b.close()
samfile.close()

##indexing junction db
print 'Aligning reads to junction db'
cmd='%s index %s/%s'%(bwa,outdir,juncfile)
cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
while True:
	if cmdout.poll() is None:
		time.sleep(3)
		pass
	if cmdout.poll()==0:
		print 'Junction DB indexed.'
		break
	if cmdout.poll() is not None and cmdout.poll() != 0:
		raise Exception('Error building junction db index')

##align the unmapped reads to junction database
for rs in [taga,tagb]:
	cmd='%s aln -n %d -R 100 %s/%s %s/unmapreads_%s.fq > %s/%s.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs)
	cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
	while True:
		if cmdout.poll() is None:
			time.sleep(5)
			pass
		if cmdout.poll()==0:
			print 'Aligned unmapped reads group %s'%rs
			break
		if cmdout.poll() is not None and cmdout.poll() != 0:
			raise Exception('Error aligning unmapped reads for group %s'%rs)

	cmd='%s samse -n 1000 %s/%s %s/%s.sai %s/unmapreads_%s.fq > %s/%s.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs)
	cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
	while True:
		if cmdout.poll() is None:
			time.sleep(2)
			pass
		if cmdout.poll()==0:
			print 'Converting to sam for group %s'%rs
			break
		if cmdout.poll() is not None and cmdout.poll() != 0:
			raise Exception('Error converting to sam for group %s'%rs)

#parse results
qualrd_a=[]
junc_a=[]
samfile=pysam.Samfile('%s/%s.sam'%(outdir,taga),'r')
for rd in samfile:
	if not rd.is_unmapped and rd.is_reverse:
		qualrd_a.append(rd)
		junc_a.append(samfile.getrname(rd.tid))
samfile.close()
qualrd_b=[]
junc_b=[]
samfile=pysam.Samfile('%s/%s.sam'%(outdir,tagb),'r')
for rd in samfile:
    if not rd.is_unmapped and not rd.is_reverse:
		qualrd_b.append(rd)
		junc_b.append(samfile.getrname(rd.tid))
samfile.close()

junc_span=[]
junc_span.extend(qualrd_a)
junc_span.extend(qualrd_b)

junc_name=[]
junc_name.extend(junc_a)
junc_name.extend(junc_b)

#Generate a summary report
sumfile=open('%s/%s_%s.GUESS.summary.txt'%(outdir,Gene_A,Gene_B),'w')
sumfile.write('%s\t%s\n'%(Gene_A,Gene_B))
sumfile.write('\n')
sumfile.write('>discordant\n')
for rdname in disc:
	ia=ar_ids.index(rdname)
	ib=br_ids.index(rdname)
	reada=a_reads[ia]
	readb=b_reads[ib]
	mm_a=[x[1] for x in reada.tags if x[0]=='NM'][0]
	mm_b=[x[1] for x in readb.tags if x[0]=='NM'][0]
	ss1='%s\tF\t%s.%d.mm%d'%(reada.qname,Gene_A,reada.pos+1,mm_a)    #pysam use 0-based coordinates
	ss2='%s\tR\t%s.%d.mm%d'%(readb.qname,Gene_B,readb.pos+1,mm_b)	
	sumfile.write('%s\n'%ss1)
	sumfile.write('%s\n'%ss2)

sumfile.write('\n')
sumfile.write('>spanning\n')
for i in range(len(junc_span)):
	rd=junc_span[i]
	jname=junc_name[i]
	mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0]
	ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j)
	sumfile.write('%s\n'%ss)

sumfile.write('\n')
sumfile.write('>junction\n')
juncol=[]
for item in set(junc_name):
	nn=junc_name.count(item)
	juncol.append([item,nn])
juncol=sorted(juncol,key=lambda x:x[1],reverse=True)
for item in juncol:
	sumfile.write('%s\t%s\n'%(item[0],item[1]))

sumfile.write('\n')
sumfile.write('>summary\n')
sumfile.write('Number of Discordant Pairs = %d\n'%len(disc))
sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span))
sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name)))

sumfile.close()
print 'Done: %s'%time.ctime()