view utilities/vcf_compare_OLD.py @ 2:8a739c944dbf draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 16:22:08 -0400
parents 6e75a84e9338
children
line wrap: on
line source

#!/usr/bin/env python
# encoding: utf-8

""" **************************************************

vcf_compare.py

- compare vcf file produced by workflow to golden vcf produced by simulator

Written by:		Zach Stephens
Date:			January 20, 2015
Contact:		zstephe2@illinois.edu

************************************************** """

import sys
import os
import copy
import time
import bisect
import re
import numpy as np
import optparse


EV_BPRANGE = 50		# how far to either side of a particular variant location do we want to check for equivalents?

DEFAULT_QUAL = -666	# if we can't find a qual score, use this instead so we know it's missing

MAX_VAL = 9999999999999	# an unreasonably large value that no reference fasta could concievably be longer than

DESC   = """%prog: vcf comparison script."""
VERS   = 0.1

PARSER = optparse.OptionParser('python %prog [options] -r <ref.fa> -g <golden.vcf> -w <workflow.vcf>',description=DESC,version="%prog v"+str(VERS))

PARSER.add_option('-r', help='* Reference Fasta', dest='REFF', action='store', metavar='<ref.fa>')
PARSER.add_option('-g', help='* Golden VCF',      dest='GVCF', action='store', metavar='<golden.vcf>')
PARSER.add_option('-w', help='* Workflow VCF',    dest='WVCF', action='store', metavar='<workflow.vcf>')
PARSER.add_option('-o', help='* Output Prefix',   dest='OUTF', action='store', metavar='<prefix>')
PARSER.add_option('-m', help='Mappability Track', dest='MTRK', action='store', metavar='<track.bed>')
PARSER.add_option('-M', help='Maptrack Min Len',  dest='MTMM', action='store', metavar='<int>')
PARSER.add_option('-t', help='Targetted Regions', dest='TREG', action='store', metavar='<regions.bed>')
PARSER.add_option('-T', help='Min Region Len',    dest='MTRL', action='store', metavar='<int>')
PARSER.add_option('-c', help='Coverage Filter Threshold [%default]',       dest='DP_THRESH', default=15, action='store', metavar='<int>')
PARSER.add_option('-a', help='Allele Freq Filter Threshold [%default]',    dest='AF_THRESH', default=0.3, action='store', metavar='<float>')

PARSER.add_option('--vcf-out',   help="Output Match/FN/FP variants [%default]",       dest='VCF_OUT', default=False, action='store_true')
PARSER.add_option('--no-plot',   help="No plotting [%default]",                       dest='NO_PLOT', default=False, action='store_true')
PARSER.add_option('--incl-homs', help="Include homozygous ref calls [%default]",      dest='INCL_H',  default=False, action='store_true')
PARSER.add_option('--incl-fail', help="Include calls that failed filters [%default]", dest='INCL_F',  default=False, action='store_true')
PARSER.add_option('--fast',      help="No equivalent variant detection [%default]",   dest='FAST',    default=False, action='store_true')

(OPTS,ARGS) = PARSER.parse_args()

REFERENCE    = OPTS.REFF
GOLDEN_VCF   = OPTS.GVCF
WORKFLOW_VCF = OPTS.WVCF
OUT_PREFIX   = OPTS.OUTF
MAPTRACK     = OPTS.MTRK
MIN_READLEN  = OPTS.MTMM
BEDFILE      = OPTS.TREG
DP_THRESH    = int(OPTS.DP_THRESH)
AF_THRESH    = float(OPTS.AF_THRESH)

VCF_OUT      = OPTS.VCF_OUT
NO_PLOT      = OPTS.NO_PLOT
INCLUDE_HOMS = OPTS.INCL_H
INCLUDE_FAIL = OPTS.INCL_F
FAST         = OPTS.FAST

if len(sys.argv[1:]) == 0:
	PARSER.print_help()
	exit(1)

if OPTS.MTRL != None:
	MINREGIONLEN = int(OPTS.MTRL)
else:
	MINREGIONLEN = None

if MIN_READLEN == None:
	MIN_READLEN = 0
else:
	MIN_READLEN = int(MIN_READLEN)

if REFERENCE == None:
	print 'Error: No reference provided.'
	exit(1)
if GOLDEN_VCF == None:
	print 'Error: No golden VCF provided.'
	exit(1)
if WORKFLOW_VCF == None:
	print 'Error: No workflow VCF provided.'
	exit(1)
if OUT_PREFIX == None:
	print 'Error: No output prefix provided.'
	exit(1)
if (BEDFILE != None and MINREGIONLEN == None) or (BEDFILE == None and MINREGIONLEN != None):
	print 'Error: Both -t and -T must be specified'
	exit(1)

if NO_PLOT == False:
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as mpl
	from matplotlib_venn import venn2, venn3
	import warnings
	warnings.filterwarnings("ignore", category=UserWarning, module='matplotlib_venn')

AF_STEPS = 20
AF_KEYS  = np.linspace(0.0,1.0,AF_STEPS+1)

def quantize_AF(af):
	if af >= 1.0:
		return AF_STEPS
	elif af <= 0.0:
		return 0
	else:
		return int(af*AF_STEPS)

VCF_HEADER = '##fileformat=VCFv4.1\n##reference='+REFERENCE+'##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n'

DP_TOKENS = ['DP','DPU','DPI']	# in the order that we'll look for them

def parseLine(splt,colDict,colSamp):

	#	check if we want to proceed..
	ra = splt[colDict['REF']]
	aa = splt[colDict['ALT']]
	if not(INCLUDE_HOMS) and (aa == '.' or aa == '' or aa == ra):
		return None
	if not(INCLUDE_FAIL) and (splt[colDict['FILTER']] != 'PASS' and splt[colDict['FILTER']] != '.'):
		return None

	#	default vals
	cov  = None
	qual = DEFAULT_QUAL
	alt_alleles = []
	alt_freqs   = [None]

	#	any alt alleles?
	alt_split = aa.split(',')
	if len(alt_split) > 1:
		alt_alleles = alt_split

	#	cov
	for dp_tok in DP_TOKENS:
		#	check INFO for DP first
		if 'INFO' in colDict and dp_tok+'=' in splt[colDict['INFO']]:
			cov = int(re.findall(re.escape(dp_tok)+r"=[0-9]+",splt[colDict['INFO']])[0][3:])
		#	check FORMAT/SAMPLE for DP second:
		elif 'FORMAT' in colDict and len(colSamp):
			format = splt[colDict['FORMAT']]+':'
			if ':'+dp_tok+':' in format:
				dpInd = format.split(':').index(dp_tok)
				cov   = int(splt[colSamp[0]].split(':')[dpInd])
		if cov != None:
			break

	#	check INFO for AF first
	af = None
	if 'INFO' in colDict and ';AF=' in ';'+splt[colDict['INFO']]:
		info = splt[colDict['INFO']]+';'
		af  = re.findall(r"AF=.*?(?=;)",info)[0][3:]
	#	check FORMAT/SAMPLE for AF second:
	elif 'FORMAT' in colDict and len(colSamp):
		format = splt[colDict['FORMAT']]+':'
		if ':AF:' in format:
			afInd = splt[colDict['FORMAT']].split(':').index('AF')
			af    = splt[colSamp[0]].split(':')[afInd]

	if af != None:
		af_splt = af.split(',')
		while(len(af_splt) < len(alt_alleles)):	# are we lacking enough AF values for some reason?
			af_splt.append(af_splt[-1])			# phone it in.
		if len(af_splt) != 0 and af_splt[0] != '.' and af_splt[0] != '':		# missing data, yay
			alt_freqs = [float(n) for n in af_splt]
	else:
		alt_freqs = [None]*max([len(alt_alleles),1])

	#	get QUAL if it's interesting
	if 'QUAL' in colDict and splt[colDict['QUAL']] != '.':
		qual = float(splt[colDict['QUAL']])

	return (cov, qual, alt_alleles, alt_freqs)


def parseVCF(VCF_FILENAME,refName,targRegionsFl,outFile,outBool):
	v_Hashed      = {}
	v_posHash     = {}
	v_Alts        = {}
	v_Cov         = {}
	v_AF          = {}
	v_Qual        = {}
	v_TargLen     = {}
	nBelowMinRLen = 0
	line_unique   = 0	# number of lines in vcf file containing unique variant
	hash_coll     = 0	# number of times we saw a hash collision ("per line" so non-unique alt alleles don't get counted multiple times)
	var_filtered  = 0	# number of variants excluded due to filters (e.g. hom-refs, qual)
	var_merged    = 0	# number of variants we merged into another due to having the same position specified
	colDict       = {}
	colSamp       = []
	for line in open(VCF_FILENAME,'r'):
		if line[0] != '#':
			if len(colDict) == 0:
				print '\n\nError: VCF has no header?\n'+VCF_FILENAME+'\n\n'
				exit(1)
			splt = line[:-1].split('\t')
			if splt[0] == refName:

				var  = (int(splt[1]),splt[3],splt[4])
				targInd = bisect.bisect(targRegionsFl,var[0])

				if targInd%2 == 1:
					targLen = targRegionsFl[targInd]-targRegionsFl[targInd-1]
					if (BEDFILE != None and targLen >= MINREGIONLEN) or BEDFILE == None:
						
						pl_out = parseLine(splt,colDict,colSamp)
						if pl_out == None:
							var_filtered += 1
							continue
						(cov, qual, aa, af) = pl_out

						if var not in v_Hashed:

							vpos = var[0]
							if vpos in v_posHash:
								if len(aa) == 0:
									aa = [var[2]]
								aa.extend([n[2] for n in v_Hashed.keys() if n[0] == vpos])
								var_merged += 1
							v_posHash[vpos] = 1
							
							if len(aa):
								allVars = [(var[0],var[1],n) for n in aa]
								for i in xrange(len(allVars)):
									v_Hashed[allVars[i]] = 1
									#if allVars[i] not in v_Alts:
									#	v_Alts[allVars[i]] = []
									#v_Alts[allVars[i]].extend(allVars)
									v_Alts[allVars[i]] = allVars
							else:
								v_Hashed[var] = 1

							if cov != None:
								v_Cov[var] = cov
							v_AF[var]      = af[0]		# only use first AF, even if multiple. fix this later?
							v_Qual[var]    = qual
							v_TargLen[var] = targLen
							line_unique += 1

						else:
							hash_coll += 1

					else:
						nBelowMinRLen += 1
		else:
			if line[1] != '#':
				cols = line[1:-1].split('\t')
				for i in xrange(len(cols)):
					if 'FORMAT' in colDict:
						colSamp.append(i)
					colDict[cols[i]] = i
				if VCF_OUT and outBool:
					outBool = False
					outFile.write(line)

	return (v_Hashed, v_Alts, v_Cov, v_AF, v_Qual, v_TargLen, nBelowMinRLen, line_unique, var_filtered, var_merged, hash_coll)


def condenseByPos(listIn):
	varListOfInterest = [n for n in listIn]
	indCount = {}
	for n in varListOfInterest:
		c = n[0]
		if c not in indCount:
			indCount[c] = 0
		indCount[c] += 1
	#nonUniqueDict = {n:[] for n in sorted(indCount.keys()) if indCount[n] > 1}		# the python 2.7 way
	nonUniqueDict = {}
	for n in sorted(indCount.keys()):
		if indCount[n] > 1:
			nonUniqueDict[n] = []
	delList = []
	for i in xrange(len(varListOfInterest)):
		if varListOfInterest[i][0] in nonUniqueDict:
			nonUniqueDict[varListOfInterest[i][0]].append(varListOfInterest[i])
			delList.append(i)
	delList = sorted(delList,reverse=True)
	for di in delList:
		del varListOfInterest[di]
	for v in nonUniqueDict.values():
		var = (v[0][0],v[0][1],','.join([n[2] for n in v[::-1]]))
		varListOfInterest.append(var)
	return varListOfInterest


def main():

	ref = []
	f = open(REFERENCE,'r')
	nLines = 0
	prevR = None
	prevP = None
	ref_inds = []
	sys.stdout.write('\nindexing reference fasta... ')
	sys.stdout.flush()
	tt = time.time()
	while 1:
		nLines += 1
		data = f.readline()
		if not data:
			ref_inds.append( (prevR, prevP, f.tell()-len(data)) )
			break
		if data[0] == '>':
			if prevP != None:
				ref_inds.append( (prevR, prevP, f.tell()-len(data)) )
			prevP = f.tell()
			prevR = data[1:-1]
	print '{0:.3f} (sec)'.format(time.time()-tt)
	#ref_inds = [('chrM', 6, 16909), ('chr1', 16915, 254252549), ('chr2', 254252555, 502315916), ('chr3', 502315922, 704298801), ('chr4', 704298807, 899276169), ('chr5', 899276175, 1083809741), ('chr6', 1083809747, 1258347116), ('chr7', 1258347122, 1420668559), ('chr8', 1420668565, 1569959868), ('chr9', 1569959874, 1713997574), ('chr10', 1713997581, 1852243023), ('chr11', 1852243030, 1989949677), ('chr12', 1989949684, 2126478617), ('chr13', 2126478624, 2243951900), ('chr14', 2243951907, 2353448438), ('chr15', 2353448445, 2458030465), ('chr16', 2458030472, 2550192321), ('chr17', 2550192328, 2633011443), ('chr18', 2633011450, 2712650243), ('chr19', 2712650250, 2772961813), ('chr20', 2772961820, 2837247851), ('chr21', 2837247858, 2886340351), ('chr22', 2886340358, 2938671016), ('chrX', 2938671022, 3097046994), ('chrY', 3097047000, 3157608038)]

	ztV = 0	# total golden variants
	ztW = 0	# total workflow variants
	znP = 0	# total perfect matches
	zfP = 0	# total false positives
	znF = 0	# total false negatives
	znE = 0	# total equivalent variants detected
	zgF = 0	# total golden variants that were filtered and excluded
	zgR = 0	# total golden variants that were excluded for being redundant
	zgM = 0	# total golden variants that were merged into a single position
	zwF = 0	# total workflow variants that were filtered and excluded
	zwR = 0	# total workflow variants that were excluded for being redundant
	zwM = 0	# total workflow variants that were merged into a single position
	if BEDFILE != None:
		zbM = 0

	mappability_vs_FN = {0:0, 1:0}	# [0] = # of FNs that were in mappable regions, [1] = # of FNs that were in unmappable regions
	coverage_vs_FN    = {}			# [C] = # of FNs that were covered by C reads
	alleleBal_vs_FN   = {}			# [AF] = # of FNs that were heterozygous genotypes with allele freq AF (quantized to multiples of 1/AF_STEPS)
	for n in AF_KEYS:
		alleleBal_vs_FN[n] = 0

	#
	#	read in mappability track
	#
	mappability_tracks = {}		# indexed by chr string (e.g. 'chr1'), has boolean array
	prevRef            = ''
	relevantRegions    = []
	if MAPTRACK != None:
		mtf = open(MAPTRACK,'r')
		for line in mtf:
			splt = line.strip().split('\t')
			if prevRef != '' and splt[0] != prevRef:
				# do stuff
				if len(relevantRegions):
					myTrack = [0]*(relevantRegions[-1][1]+100)
					for r in relevantRegions:
						for ri in xrange(r[0],r[1]):
							myTrack[ri] = 1
					mappability_tracks[prevRef] = [n for n in myTrack]
				#
				relevantRegions = []
			if int(splt[3]) >= MIN_READLEN:
				relevantRegions.append((int(splt[1]),int(splt[2])))
			prevRef = splt[0]
		mtf.close()
		# do stuff
		if len(relevantRegions):
			myTrack = [0]*(relevantRegions[-1][1]+100)
			for r in relevantRegions:
				for ri in xrange(r[0],r[1]):
					myTrack[ri] = 1
			mappability_tracks[prevRef] = [n for n in myTrack]
		#

	#
	#	init vcf output, if desired
	#
	vcfo2 = None
	vcfo3 = None
	global vcfo2_firstTime
	global vcfo3_firstTime
	vcfo2_firstTime = False
	vcfo3_firstTime = False
	if VCF_OUT:
		vcfo2 = open(OUT_PREFIX+'_FN.vcf','w')
		vcfo3 = open(OUT_PREFIX+'_FP.vcf','w')
		vcfo2_firstTime = True
		vcfo3_firstTime = True

	#
	#	data for plotting FN analysis
	#
	set1 = []
	set2 = []
	set3 = []
	varAdj = 0

	#
	#
	#	For each sequence in reference fasta...
	#
	#
	for n_RI in ref_inds:

		refName = n_RI[0]
		if FAST == False:
			f.seek(n_RI[1])
			print 'reading '+refName+'...',
			myDat  = f.read(n_RI[2]-n_RI[1]).split('\n')
			myLen  = sum([len(m) for m in myDat])
			if sys.version_info >= (2,7):
				print '{:,} bp'.format(myLen)
			else:
				print '{0:} bp'.format(myLen)
			inWidth = len(myDat[0])
			if len(myDat[-1]) == 0:	# if last line is empty, remove it.
				del myDat[-1]
			if inWidth*(len(myDat)-1)+len(myDat[-1]) != myLen:
				print 'fasta column-width not consistent.'
				print myLen, inWidth*(len(myDat)-1)+len(myDat[-1])
				for i in xrange(len(myDat)):
					if len(myDat[i]) != inWidth:
						print i, len(myDat[i]), inWidth
				exit(1)

			myDat = bytearray(''.join(myDat)).upper()
			myLen = len(myDat)

		#
		#	Parse relevant targeted regions
		#
		targRegionsFl = []
		if BEDFILE != None:
			bedfile = open(BEDFILE,'r')
			for line in bedfile:
				splt = line.split('\t')
				if splt[0] == refName:
					targRegionsFl.extend((int(splt[1]),int(splt[2])))
			bedfile.close()
		else:
			targRegionsFl = [-1,MAX_VAL+1]

		#
		#	Parse vcf files
		#
		sys.stdout.write('comparing variation in '+refName+'... ')
		sys.stdout.flush()
		tt = time.time()

		(correctHashed, correctAlts, correctCov, correctAF, correctQual, correctTargLen, correctBelowMinRLen, correctUnique, gFiltered, gMerged, gRedundant)        = parseVCF(GOLDEN_VCF, refName, targRegionsFl, vcfo2, vcfo2_firstTime)
		(workflowHashed, workflowAlts, workflowCov, workflowAF, workflowQual, workflowTarLen, workflowBelowMinRLen, workflowUnique, wFiltered, wMerged, wRedundant) = parseVCF(WORKFLOW_VCF, refName, targRegionsFl, vcfo3, vcfo3_firstTime)
		zgF += gFiltered
		zgR += gRedundant
		zgM += gMerged
		zwF += wFiltered
		zwR += wRedundant
		zwM += wMerged

		#
		#	Deduce which variants are FP / FN
		#
		solvedInds = {}
		for var in correctHashed.keys():
			if var in workflowHashed or var[0] in solvedInds:
				correctHashed[var]  = 2
				workflowHashed[var] = 2
				solvedInds[var[0]] = True
		for var in correctHashed.keys()+workflowHashed.keys():
			if var[0] in solvedInds:
				correctHashed[var]  = 2
				workflowHashed[var] = 2
		nPerfect = len(solvedInds)
		
		# correctHashed[var] = 1: were not found
		#                    = 2: should be discluded because we were found
		#                    = 3: should be discluded because an alt was found
		notFound   = [n for n in sorted(correctHashed.keys()) if correctHashed[n] == 1]
		FPvariants = [n for n in sorted(workflowHashed.keys()) if workflowHashed[n] == 1]

		#
		#	condense all variants who have alternate alleles and were *not* found to have perfect matches
		#	into a single variant again. These will not be included in the candidates for equivalency checking. Sorry!
		#
		notFound   = condenseByPos(notFound)
		FPvariants = condenseByPos(FPvariants)

		#
		#	tally up some values, if there are no golden variants lets save some CPU cycles and move to the next ref
		#
		totalGoldenVariants   = nPerfect + len(notFound)
		totalWorkflowVariants = nPerfect + len(FPvariants)
		if totalGoldenVariants == 0:
			zfP += len(FPvariants)
			ztW += totalWorkflowVariants
			print '{0:.3f} (sec)'.format(time.time()-tt)
			continue

		#
		#	let's check for equivalent variants
		#
		if FAST == False:
			delList_i = []
			delList_j = []
			regionsToCheck = []
			for i in xrange(len(FPvariants)):
				pos = FPvariants[i][0]
				regionsToCheck.append((max([pos-EV_BPRANGE-1,0]),min([pos+EV_BPRANGE,len(myDat)-1])))

			for n in regionsToCheck:
				refSection = myDat[n[0]:n[1]]

				fpWithin = []
				for i in xrange(len(FPvariants)):
					m  = FPvariants[i]
					if (m[0] > n[0] and m[0] < n[1]):
						fpWithin.append((m,i))
				fpWithin = sorted(fpWithin)
				adj = 0
				altSection = copy.deepcopy(refSection)
				for (m,i) in fpWithin:
					lr = len(m[1])
					la = len(m[2])
					dpos = m[0]-n[0]+adj
					altSection = altSection[:dpos-1] + m[2] + altSection[dpos-1+lr:]
					adj += la-lr

				nfWithin = []
				for j in xrange(len(notFound)):
					m = notFound[j]
					if (m[0] > n[0] and m[0] < n[1]):
						nfWithin.append((m,j))
				nfWithin = sorted(nfWithin)
				adj = 0
				altSection2 = copy.deepcopy(refSection)
				for (m,j) in nfWithin:
					lr = len(m[1])
					la = len(m[2])
					dpos = m[0]-n[0]+adj
					altSection2 = altSection2[:dpos-1] + m[2] + altSection2[dpos-1+lr:]
					adj += la-lr

				if altSection == altSection2:
					for (m,i) in fpWithin:
						if i not in delList_i:
							delList_i.append(i)
					for (m,j) in nfWithin:
						if j not in delList_j:
							delList_j.append(j)

			nEquiv = 0
			for i in sorted(list(set(delList_i)),reverse=True):
				del FPvariants[i]
			for j in sorted(list(set(delList_j)),reverse=True):
				del notFound[j]
				nEquiv += 1
			nPerfect += nEquiv

		#
		#	Tally up errors and whatnot
		#
		ztV += totalGoldenVariants
		ztW += totalWorkflowVariants
		znP += nPerfect
		zfP += len(FPvariants)
		znF += len(notFound)
		if FAST == False:
			znE += nEquiv
		if BEDFILE != None:
			zbM += correctBelowMinRLen

		#
		#	try to identify a reason for FN variants:
		#

		venn_data = [[0,0,0] for n in notFound]		# [i] = (unmappable, low cov, low het)
		for i in xrange(len(notFound)):
			var = notFound[i]

			noReason = True

			#	mappability?
			if MAPTRACK != None:
				if refName in mappability_tracks and var[0] < len(mappability_tracks[refName]):
					if mappability_tracks[refName][var[0]]:
						mappability_vs_FN[1] += 1
						venn_data[i][0] = 1
						noReason = False
					else:
						mappability_vs_FN[0] += 1

			#	coverage?
			if var in correctCov:
				c = correctCov[var]
				if c != None:
					if c not in coverage_vs_FN:
						coverage_vs_FN[c] = 0
					coverage_vs_FN[c] += 1
					if c < DP_THRESH:
						venn_data[i][1] = 1
						noReason = False

			#	heterozygous genotype messing things up?
			#if var in correctAF:
			#	a = correctAF[var]
			#	if a != None:
			#		a = AF_KEYS[quantize_AF(a)]
			#		if a not in alleleBal_vs_FN:
			#			alleleBal_vs_FN[a] = 0
			#		alleleBal_vs_FN[a] += 1
			#		if a < AF_THRESH:
			#			venn_data[i][2] = 1

			#	no reason?
			if noReason:
				venn_data[i][2] += 1

		for i in xrange(len(notFound)):
			if venn_data[i][0]: set1.append(i+varAdj)
			if venn_data[i][1]: set2.append(i+varAdj)
			if venn_data[i][2]: set3.append(i+varAdj)
		varAdj += len(notFound)

		#
		#	if desired, write out vcf files.
		#
		notFound   = sorted(notFound)
		FPvariants = sorted(FPvariants)
		if VCF_OUT:
			for line in open(GOLDEN_VCF,'r'):
				if line[0] != '#':
					splt = line.split('\t')
					if splt[0] == refName:
						var  = (int(splt[1]),splt[3],splt[4])
						if var in notFound:
							vcfo2.write(line)
			for line in open(WORKFLOW_VCF,'r'):
				if line[0] != '#':
					splt = line.split('\t')
					if splt[0] == refName:
						var  = (int(splt[1]),splt[3],splt[4])
						if var in FPvariants:
							vcfo3.write(line)

		print '{0:.3f} (sec)'.format(time.time()-tt)

	#
	#	close vcf output
	#
	print ''
	if VCF_OUT:
		print OUT_PREFIX+'_FN.vcf'
		print OUT_PREFIX+'_FP.vcf'
		vcfo2.close()
		vcfo3.close()

	#
	#	plot some FN stuff
	#
	if NO_PLOT == False:
		nDetected = len(set(set1+set2+set3))
		set1 = set(set1)
		set2 = set(set2)
		set3 = set(set3)

		if len(set1): s1 = 'Unmappable'
		else: s1 = ''
		if len(set2): s2 = 'DP < '+str(DP_THRESH)
		else: s2 = ''
		#if len(set3): s3 = 'AF < '+str(AF_THRESH)
		if len(set3): s3 = 'Unknown'
		else: s3 = ''

		mpl.figure(0)
		tstr1 = 'False Negative Variants (Missed Detections)'
		#tstr2 = str(nDetected)+' / '+str(znF)+' FN variants categorized'
		tstr2 = ''
		if MAPTRACK != None:
			v = venn3([set1, set2, set3], (s1, s2, s3))
		else:
			v = venn2([set2, set3], (s2, s3))
		mpl.figtext(0.5,0.95,tstr1,fontdict={'size':14,'weight':'bold'},horizontalalignment='center')
		mpl.figtext(0.5,0.03,tstr2,fontdict={'size':14,'weight':'bold'},horizontalalignment='center')

		ouf = OUT_PREFIX+'_FNvenn.pdf'
		print ouf
		mpl.savefig(ouf)

	#
	#	spit out results to console
	#
	print '\n**********************************\n'
	if BEDFILE != None:
		print 'ONLY CONSIDERING VARIANTS FOUND WITHIN TARGETED REGIONS\n\n'
	print 'Total Golden Variants:  ',ztV,'\t[',zgF,'filtered,',zgM,'merged,',zgR,'redundant ]'
	print 'Total Workflow Variants:',ztW,'\t[',zwF,'filtered,',zwM,'merged,',zwR,'redundant ]'
	print ''
	if ztV > 0 and ztW > 0:
		print 'Perfect Matches:',znP,'({0:.2f}%)'.format(100.*float(znP)/ztV)
		print 'FN variants:    ',znF,'({0:.2f}%)'.format(100.*float(znF)/ztV)
		print 'FP variants:    ',zfP#,'({0:.2f}%)'.format(100.*float(zfP)/ztW)
	if FAST == False:
		print '\nNumber of equivalent variants denoted differently between the two vcfs:',znE
	if BEDFILE != None:
		print '\nNumber of golden variants located in targeted regions that were too small to be sampled from:',zbM
	if FAST:
		print "\nWarning! Running with '--fast' means that identical variants denoted differently between the two vcfs will not be detected! The values above may be lower than the true accuracy."
	#if NO_PLOT:
	if True:
		print '\n#unmappable:  ',len(set1)
		print '#low_coverage:',len(set2)
		print '#unknown:     ',len(set3)
	print '\n**********************************\n'





if __name__ == '__main__':
	main()