annotate utilities/vcf_compare_OLD.py @ 2:8a739c944dbf draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 16:22:08 -0400 (2018-05-15)
parents 6e75a84e9338
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
1 #!/usr/bin/env python
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
2 # encoding: utf-8
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
3
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
4 """ **************************************************
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
5
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
6 vcf_compare.py
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
7
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
8 - compare vcf file produced by workflow to golden vcf produced by simulator
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
9
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
10 Written by: Zach Stephens
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
11 Date: January 20, 2015
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
12 Contact: zstephe2@illinois.edu
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
13
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
14 ************************************************** """
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
15
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
16 import sys
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
17 import os
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
18 import copy
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
19 import time
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
20 import bisect
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
21 import re
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
22 import numpy as np
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
23 import optparse
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
24
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
25
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
26 EV_BPRANGE = 50 # how far to either side of a particular variant location do we want to check for equivalents?
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
27
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
28 DEFAULT_QUAL = -666 # if we can't find a qual score, use this instead so we know it's missing
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
29
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
30 MAX_VAL = 9999999999999 # an unreasonably large value that no reference fasta could concievably be longer than
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
31
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
32 DESC = """%prog: vcf comparison script."""
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
33 VERS = 0.1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
34
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
35 PARSER = optparse.OptionParser('python %prog [options] -r <ref.fa> -g <golden.vcf> -w <workflow.vcf>',description=DESC,version="%prog v"+str(VERS))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
36
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
37 PARSER.add_option('-r', help='* Reference Fasta', dest='REFF', action='store', metavar='<ref.fa>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
38 PARSER.add_option('-g', help='* Golden VCF', dest='GVCF', action='store', metavar='<golden.vcf>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
39 PARSER.add_option('-w', help='* Workflow VCF', dest='WVCF', action='store', metavar='<workflow.vcf>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
40 PARSER.add_option('-o', help='* Output Prefix', dest='OUTF', action='store', metavar='<prefix>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
41 PARSER.add_option('-m', help='Mappability Track', dest='MTRK', action='store', metavar='<track.bed>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
42 PARSER.add_option('-M', help='Maptrack Min Len', dest='MTMM', action='store', metavar='<int>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
43 PARSER.add_option('-t', help='Targetted Regions', dest='TREG', action='store', metavar='<regions.bed>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
44 PARSER.add_option('-T', help='Min Region Len', dest='MTRL', action='store', metavar='<int>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
45 PARSER.add_option('-c', help='Coverage Filter Threshold [%default]', dest='DP_THRESH', default=15, action='store', metavar='<int>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
46 PARSER.add_option('-a', help='Allele Freq Filter Threshold [%default]', dest='AF_THRESH', default=0.3, action='store', metavar='<float>')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
47
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
48 PARSER.add_option('--vcf-out', help="Output Match/FN/FP variants [%default]", dest='VCF_OUT', default=False, action='store_true')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
49 PARSER.add_option('--no-plot', help="No plotting [%default]", dest='NO_PLOT', default=False, action='store_true')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
50 PARSER.add_option('--incl-homs', help="Include homozygous ref calls [%default]", dest='INCL_H', default=False, action='store_true')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
51 PARSER.add_option('--incl-fail', help="Include calls that failed filters [%default]", dest='INCL_F', default=False, action='store_true')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
52 PARSER.add_option('--fast', help="No equivalent variant detection [%default]", dest='FAST', default=False, action='store_true')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
53
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
54 (OPTS,ARGS) = PARSER.parse_args()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
55
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
56 REFERENCE = OPTS.REFF
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
57 GOLDEN_VCF = OPTS.GVCF
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
58 WORKFLOW_VCF = OPTS.WVCF
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
59 OUT_PREFIX = OPTS.OUTF
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
60 MAPTRACK = OPTS.MTRK
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
61 MIN_READLEN = OPTS.MTMM
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
62 BEDFILE = OPTS.TREG
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
63 DP_THRESH = int(OPTS.DP_THRESH)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
64 AF_THRESH = float(OPTS.AF_THRESH)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
65
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
66 VCF_OUT = OPTS.VCF_OUT
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
67 NO_PLOT = OPTS.NO_PLOT
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
68 INCLUDE_HOMS = OPTS.INCL_H
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
69 INCLUDE_FAIL = OPTS.INCL_F
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
70 FAST = OPTS.FAST
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
71
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
72 if len(sys.argv[1:]) == 0:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
73 PARSER.print_help()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
74 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
75
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
76 if OPTS.MTRL != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
77 MINREGIONLEN = int(OPTS.MTRL)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
78 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
79 MINREGIONLEN = None
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
80
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
81 if MIN_READLEN == None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
82 MIN_READLEN = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
83 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
84 MIN_READLEN = int(MIN_READLEN)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
85
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
86 if REFERENCE == None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
87 print 'Error: No reference provided.'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
88 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
89 if GOLDEN_VCF == None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
90 print 'Error: No golden VCF provided.'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
91 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
92 if WORKFLOW_VCF == None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
93 print 'Error: No workflow VCF provided.'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
94 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
95 if OUT_PREFIX == None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
96 print 'Error: No output prefix provided.'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
97 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
98 if (BEDFILE != None and MINREGIONLEN == None) or (BEDFILE == None and MINREGIONLEN != None):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
99 print 'Error: Both -t and -T must be specified'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
100 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
101
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
102 if NO_PLOT == False:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
103 import matplotlib
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
104 matplotlib.use('Agg')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
105 import matplotlib.pyplot as mpl
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
106 from matplotlib_venn import venn2, venn3
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
107 import warnings
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
108 warnings.filterwarnings("ignore", category=UserWarning, module='matplotlib_venn')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
109
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
110 AF_STEPS = 20
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
111 AF_KEYS = np.linspace(0.0,1.0,AF_STEPS+1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
112
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
113 def quantize_AF(af):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
114 if af >= 1.0:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
115 return AF_STEPS
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
116 elif af <= 0.0:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
117 return 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
118 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
119 return int(af*AF_STEPS)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
120
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
121 VCF_HEADER = '##fileformat=VCFv4.1\n##reference='+REFERENCE+'##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
122
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
123 DP_TOKENS = ['DP','DPU','DPI'] # in the order that we'll look for them
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
124
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
125 def parseLine(splt,colDict,colSamp):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
126
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
127 # check if we want to proceed..
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
128 ra = splt[colDict['REF']]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
129 aa = splt[colDict['ALT']]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
130 if not(INCLUDE_HOMS) and (aa == '.' or aa == '' or aa == ra):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
131 return None
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
132 if not(INCLUDE_FAIL) and (splt[colDict['FILTER']] != 'PASS' and splt[colDict['FILTER']] != '.'):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
133 return None
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
134
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
135 # default vals
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
136 cov = None
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
137 qual = DEFAULT_QUAL
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
138 alt_alleles = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
139 alt_freqs = [None]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
140
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
141 # any alt alleles?
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
142 alt_split = aa.split(',')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
143 if len(alt_split) > 1:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
144 alt_alleles = alt_split
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
145
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
146 # cov
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
147 for dp_tok in DP_TOKENS:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
148 # check INFO for DP first
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
149 if 'INFO' in colDict and dp_tok+'=' in splt[colDict['INFO']]:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
150 cov = int(re.findall(re.escape(dp_tok)+r"=[0-9]+",splt[colDict['INFO']])[0][3:])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
151 # check FORMAT/SAMPLE for DP second:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
152 elif 'FORMAT' in colDict and len(colSamp):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
153 format = splt[colDict['FORMAT']]+':'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
154 if ':'+dp_tok+':' in format:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
155 dpInd = format.split(':').index(dp_tok)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
156 cov = int(splt[colSamp[0]].split(':')[dpInd])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
157 if cov != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
158 break
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
159
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
160 # check INFO for AF first
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
161 af = None
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
162 if 'INFO' in colDict and ';AF=' in ';'+splt[colDict['INFO']]:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
163 info = splt[colDict['INFO']]+';'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
164 af = re.findall(r"AF=.*?(?=;)",info)[0][3:]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
165 # check FORMAT/SAMPLE for AF second:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
166 elif 'FORMAT' in colDict and len(colSamp):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
167 format = splt[colDict['FORMAT']]+':'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
168 if ':AF:' in format:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
169 afInd = splt[colDict['FORMAT']].split(':').index('AF')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
170 af = splt[colSamp[0]].split(':')[afInd]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
171
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
172 if af != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
173 af_splt = af.split(',')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
174 while(len(af_splt) < len(alt_alleles)): # are we lacking enough AF values for some reason?
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
175 af_splt.append(af_splt[-1]) # phone it in.
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
176 if len(af_splt) != 0 and af_splt[0] != '.' and af_splt[0] != '': # missing data, yay
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
177 alt_freqs = [float(n) for n in af_splt]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
178 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
179 alt_freqs = [None]*max([len(alt_alleles),1])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
180
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
181 # get QUAL if it's interesting
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
182 if 'QUAL' in colDict and splt[colDict['QUAL']] != '.':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
183 qual = float(splt[colDict['QUAL']])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
184
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
185 return (cov, qual, alt_alleles, alt_freqs)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
186
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
187
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
188 def parseVCF(VCF_FILENAME,refName,targRegionsFl,outFile,outBool):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
189 v_Hashed = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
190 v_posHash = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
191 v_Alts = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
192 v_Cov = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
193 v_AF = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
194 v_Qual = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
195 v_TargLen = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
196 nBelowMinRLen = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
197 line_unique = 0 # number of lines in vcf file containing unique variant
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
198 hash_coll = 0 # number of times we saw a hash collision ("per line" so non-unique alt alleles don't get counted multiple times)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
199 var_filtered = 0 # number of variants excluded due to filters (e.g. hom-refs, qual)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
200 var_merged = 0 # number of variants we merged into another due to having the same position specified
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
201 colDict = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
202 colSamp = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
203 for line in open(VCF_FILENAME,'r'):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
204 if line[0] != '#':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
205 if len(colDict) == 0:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
206 print '\n\nError: VCF has no header?\n'+VCF_FILENAME+'\n\n'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
207 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
208 splt = line[:-1].split('\t')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
209 if splt[0] == refName:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
210
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
211 var = (int(splt[1]),splt[3],splt[4])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
212 targInd = bisect.bisect(targRegionsFl,var[0])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
213
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
214 if targInd%2 == 1:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
215 targLen = targRegionsFl[targInd]-targRegionsFl[targInd-1]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
216 if (BEDFILE != None and targLen >= MINREGIONLEN) or BEDFILE == None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
217
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
218 pl_out = parseLine(splt,colDict,colSamp)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
219 if pl_out == None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
220 var_filtered += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
221 continue
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
222 (cov, qual, aa, af) = pl_out
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
223
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
224 if var not in v_Hashed:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
225
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
226 vpos = var[0]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
227 if vpos in v_posHash:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
228 if len(aa) == 0:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
229 aa = [var[2]]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
230 aa.extend([n[2] for n in v_Hashed.keys() if n[0] == vpos])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
231 var_merged += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
232 v_posHash[vpos] = 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
233
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
234 if len(aa):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
235 allVars = [(var[0],var[1],n) for n in aa]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
236 for i in xrange(len(allVars)):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
237 v_Hashed[allVars[i]] = 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
238 #if allVars[i] not in v_Alts:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
239 # v_Alts[allVars[i]] = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
240 #v_Alts[allVars[i]].extend(allVars)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
241 v_Alts[allVars[i]] = allVars
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
242 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
243 v_Hashed[var] = 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
244
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
245 if cov != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
246 v_Cov[var] = cov
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
247 v_AF[var] = af[0] # only use first AF, even if multiple. fix this later?
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
248 v_Qual[var] = qual
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
249 v_TargLen[var] = targLen
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
250 line_unique += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
251
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
252 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
253 hash_coll += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
254
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
255 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
256 nBelowMinRLen += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
257 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
258 if line[1] != '#':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
259 cols = line[1:-1].split('\t')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
260 for i in xrange(len(cols)):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
261 if 'FORMAT' in colDict:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
262 colSamp.append(i)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
263 colDict[cols[i]] = i
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
264 if VCF_OUT and outBool:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
265 outBool = False
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
266 outFile.write(line)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
267
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
268 return (v_Hashed, v_Alts, v_Cov, v_AF, v_Qual, v_TargLen, nBelowMinRLen, line_unique, var_filtered, var_merged, hash_coll)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
269
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
270
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
271 def condenseByPos(listIn):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
272 varListOfInterest = [n for n in listIn]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
273 indCount = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
274 for n in varListOfInterest:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
275 c = n[0]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
276 if c not in indCount:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
277 indCount[c] = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
278 indCount[c] += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
279 #nonUniqueDict = {n:[] for n in sorted(indCount.keys()) if indCount[n] > 1} # the python 2.7 way
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
280 nonUniqueDict = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
281 for n in sorted(indCount.keys()):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
282 if indCount[n] > 1:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
283 nonUniqueDict[n] = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
284 delList = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
285 for i in xrange(len(varListOfInterest)):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
286 if varListOfInterest[i][0] in nonUniqueDict:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
287 nonUniqueDict[varListOfInterest[i][0]].append(varListOfInterest[i])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
288 delList.append(i)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
289 delList = sorted(delList,reverse=True)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
290 for di in delList:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
291 del varListOfInterest[di]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
292 for v in nonUniqueDict.values():
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
293 var = (v[0][0],v[0][1],','.join([n[2] for n in v[::-1]]))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
294 varListOfInterest.append(var)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
295 return varListOfInterest
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
296
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
297
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
298 def main():
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
299
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
300 ref = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
301 f = open(REFERENCE,'r')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
302 nLines = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
303 prevR = None
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
304 prevP = None
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
305 ref_inds = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
306 sys.stdout.write('\nindexing reference fasta... ')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
307 sys.stdout.flush()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
308 tt = time.time()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
309 while 1:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
310 nLines += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
311 data = f.readline()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
312 if not data:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
313 ref_inds.append( (prevR, prevP, f.tell()-len(data)) )
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
314 break
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
315 if data[0] == '>':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
316 if prevP != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
317 ref_inds.append( (prevR, prevP, f.tell()-len(data)) )
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
318 prevP = f.tell()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
319 prevR = data[1:-1]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
320 print '{0:.3f} (sec)'.format(time.time()-tt)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
321 #ref_inds = [('chrM', 6, 16909), ('chr1', 16915, 254252549), ('chr2', 254252555, 502315916), ('chr3', 502315922, 704298801), ('chr4', 704298807, 899276169), ('chr5', 899276175, 1083809741), ('chr6', 1083809747, 1258347116), ('chr7', 1258347122, 1420668559), ('chr8', 1420668565, 1569959868), ('chr9', 1569959874, 1713997574), ('chr10', 1713997581, 1852243023), ('chr11', 1852243030, 1989949677), ('chr12', 1989949684, 2126478617), ('chr13', 2126478624, 2243951900), ('chr14', 2243951907, 2353448438), ('chr15', 2353448445, 2458030465), ('chr16', 2458030472, 2550192321), ('chr17', 2550192328, 2633011443), ('chr18', 2633011450, 2712650243), ('chr19', 2712650250, 2772961813), ('chr20', 2772961820, 2837247851), ('chr21', 2837247858, 2886340351), ('chr22', 2886340358, 2938671016), ('chrX', 2938671022, 3097046994), ('chrY', 3097047000, 3157608038)]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
322
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
323 ztV = 0 # total golden variants
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
324 ztW = 0 # total workflow variants
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
325 znP = 0 # total perfect matches
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
326 zfP = 0 # total false positives
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
327 znF = 0 # total false negatives
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
328 znE = 0 # total equivalent variants detected
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
329 zgF = 0 # total golden variants that were filtered and excluded
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
330 zgR = 0 # total golden variants that were excluded for being redundant
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
331 zgM = 0 # total golden variants that were merged into a single position
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
332 zwF = 0 # total workflow variants that were filtered and excluded
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
333 zwR = 0 # total workflow variants that were excluded for being redundant
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
334 zwM = 0 # total workflow variants that were merged into a single position
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
335 if BEDFILE != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
336 zbM = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
337
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
338 mappability_vs_FN = {0:0, 1:0} # [0] = # of FNs that were in mappable regions, [1] = # of FNs that were in unmappable regions
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
339 coverage_vs_FN = {} # [C] = # of FNs that were covered by C reads
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
340 alleleBal_vs_FN = {} # [AF] = # of FNs that were heterozygous genotypes with allele freq AF (quantized to multiples of 1/AF_STEPS)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
341 for n in AF_KEYS:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
342 alleleBal_vs_FN[n] = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
343
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
344 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
345 # read in mappability track
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
346 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
347 mappability_tracks = {} # indexed by chr string (e.g. 'chr1'), has boolean array
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
348 prevRef = ''
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
349 relevantRegions = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
350 if MAPTRACK != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
351 mtf = open(MAPTRACK,'r')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
352 for line in mtf:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
353 splt = line.strip().split('\t')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
354 if prevRef != '' and splt[0] != prevRef:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
355 # do stuff
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
356 if len(relevantRegions):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
357 myTrack = [0]*(relevantRegions[-1][1]+100)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
358 for r in relevantRegions:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
359 for ri in xrange(r[0],r[1]):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
360 myTrack[ri] = 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
361 mappability_tracks[prevRef] = [n for n in myTrack]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
362 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
363 relevantRegions = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
364 if int(splt[3]) >= MIN_READLEN:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
365 relevantRegions.append((int(splt[1]),int(splt[2])))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
366 prevRef = splt[0]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
367 mtf.close()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
368 # do stuff
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
369 if len(relevantRegions):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
370 myTrack = [0]*(relevantRegions[-1][1]+100)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
371 for r in relevantRegions:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
372 for ri in xrange(r[0],r[1]):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
373 myTrack[ri] = 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
374 mappability_tracks[prevRef] = [n for n in myTrack]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
375 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
376
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
377 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
378 # init vcf output, if desired
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
379 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
380 vcfo2 = None
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
381 vcfo3 = None
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
382 global vcfo2_firstTime
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
383 global vcfo3_firstTime
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
384 vcfo2_firstTime = False
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
385 vcfo3_firstTime = False
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
386 if VCF_OUT:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
387 vcfo2 = open(OUT_PREFIX+'_FN.vcf','w')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
388 vcfo3 = open(OUT_PREFIX+'_FP.vcf','w')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
389 vcfo2_firstTime = True
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
390 vcfo3_firstTime = True
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
391
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
392 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
393 # data for plotting FN analysis
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
394 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
395 set1 = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
396 set2 = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
397 set3 = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
398 varAdj = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
399
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
400 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
401 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
402 # For each sequence in reference fasta...
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
403 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
404 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
405 for n_RI in ref_inds:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
406
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
407 refName = n_RI[0]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
408 if FAST == False:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
409 f.seek(n_RI[1])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
410 print 'reading '+refName+'...',
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
411 myDat = f.read(n_RI[2]-n_RI[1]).split('\n')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
412 myLen = sum([len(m) for m in myDat])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
413 if sys.version_info >= (2,7):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
414 print '{:,} bp'.format(myLen)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
415 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
416 print '{0:} bp'.format(myLen)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
417 inWidth = len(myDat[0])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
418 if len(myDat[-1]) == 0: # if last line is empty, remove it.
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
419 del myDat[-1]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
420 if inWidth*(len(myDat)-1)+len(myDat[-1]) != myLen:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
421 print 'fasta column-width not consistent.'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
422 print myLen, inWidth*(len(myDat)-1)+len(myDat[-1])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
423 for i in xrange(len(myDat)):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
424 if len(myDat[i]) != inWidth:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
425 print i, len(myDat[i]), inWidth
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
426 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
427
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
428 myDat = bytearray(''.join(myDat)).upper()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
429 myLen = len(myDat)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
430
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
431 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
432 # Parse relevant targeted regions
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
433 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
434 targRegionsFl = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
435 if BEDFILE != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
436 bedfile = open(BEDFILE,'r')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
437 for line in bedfile:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
438 splt = line.split('\t')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
439 if splt[0] == refName:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
440 targRegionsFl.extend((int(splt[1]),int(splt[2])))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
441 bedfile.close()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
442 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
443 targRegionsFl = [-1,MAX_VAL+1]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
444
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
445 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
446 # Parse vcf files
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
447 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
448 sys.stdout.write('comparing variation in '+refName+'... ')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
449 sys.stdout.flush()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
450 tt = time.time()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
451
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
452 (correctHashed, correctAlts, correctCov, correctAF, correctQual, correctTargLen, correctBelowMinRLen, correctUnique, gFiltered, gMerged, gRedundant) = parseVCF(GOLDEN_VCF, refName, targRegionsFl, vcfo2, vcfo2_firstTime)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
453 (workflowHashed, workflowAlts, workflowCov, workflowAF, workflowQual, workflowTarLen, workflowBelowMinRLen, workflowUnique, wFiltered, wMerged, wRedundant) = parseVCF(WORKFLOW_VCF, refName, targRegionsFl, vcfo3, vcfo3_firstTime)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
454 zgF += gFiltered
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
455 zgR += gRedundant
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
456 zgM += gMerged
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
457 zwF += wFiltered
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
458 zwR += wRedundant
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
459 zwM += wMerged
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
460
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
461 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
462 # Deduce which variants are FP / FN
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
463 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
464 solvedInds = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
465 for var in correctHashed.keys():
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
466 if var in workflowHashed or var[0] in solvedInds:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
467 correctHashed[var] = 2
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
468 workflowHashed[var] = 2
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
469 solvedInds[var[0]] = True
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
470 for var in correctHashed.keys()+workflowHashed.keys():
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
471 if var[0] in solvedInds:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
472 correctHashed[var] = 2
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
473 workflowHashed[var] = 2
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
474 nPerfect = len(solvedInds)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
475
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
476 # correctHashed[var] = 1: were not found
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
477 # = 2: should be discluded because we were found
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
478 # = 3: should be discluded because an alt was found
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
479 notFound = [n for n in sorted(correctHashed.keys()) if correctHashed[n] == 1]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
480 FPvariants = [n for n in sorted(workflowHashed.keys()) if workflowHashed[n] == 1]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
481
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
482 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
483 # condense all variants who have alternate alleles and were *not* found to have perfect matches
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
484 # into a single variant again. These will not be included in the candidates for equivalency checking. Sorry!
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
485 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
486 notFound = condenseByPos(notFound)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
487 FPvariants = condenseByPos(FPvariants)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
488
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
489 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
490 # tally up some values, if there are no golden variants lets save some CPU cycles and move to the next ref
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
491 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
492 totalGoldenVariants = nPerfect + len(notFound)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
493 totalWorkflowVariants = nPerfect + len(FPvariants)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
494 if totalGoldenVariants == 0:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
495 zfP += len(FPvariants)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
496 ztW += totalWorkflowVariants
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
497 print '{0:.3f} (sec)'.format(time.time()-tt)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
498 continue
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
499
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
500 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
501 # let's check for equivalent variants
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
502 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
503 if FAST == False:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
504 delList_i = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
505 delList_j = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
506 regionsToCheck = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
507 for i in xrange(len(FPvariants)):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
508 pos = FPvariants[i][0]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
509 regionsToCheck.append((max([pos-EV_BPRANGE-1,0]),min([pos+EV_BPRANGE,len(myDat)-1])))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
510
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
511 for n in regionsToCheck:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
512 refSection = myDat[n[0]:n[1]]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
513
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
514 fpWithin = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
515 for i in xrange(len(FPvariants)):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
516 m = FPvariants[i]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
517 if (m[0] > n[0] and m[0] < n[1]):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
518 fpWithin.append((m,i))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
519 fpWithin = sorted(fpWithin)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
520 adj = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
521 altSection = copy.deepcopy(refSection)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
522 for (m,i) in fpWithin:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
523 lr = len(m[1])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
524 la = len(m[2])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
525 dpos = m[0]-n[0]+adj
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
526 altSection = altSection[:dpos-1] + m[2] + altSection[dpos-1+lr:]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
527 adj += la-lr
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
528
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
529 nfWithin = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
530 for j in xrange(len(notFound)):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
531 m = notFound[j]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
532 if (m[0] > n[0] and m[0] < n[1]):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
533 nfWithin.append((m,j))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
534 nfWithin = sorted(nfWithin)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
535 adj = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
536 altSection2 = copy.deepcopy(refSection)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
537 for (m,j) in nfWithin:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
538 lr = len(m[1])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
539 la = len(m[2])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
540 dpos = m[0]-n[0]+adj
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
541 altSection2 = altSection2[:dpos-1] + m[2] + altSection2[dpos-1+lr:]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
542 adj += la-lr
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
543
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
544 if altSection == altSection2:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
545 for (m,i) in fpWithin:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
546 if i not in delList_i:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
547 delList_i.append(i)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
548 for (m,j) in nfWithin:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
549 if j not in delList_j:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
550 delList_j.append(j)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
551
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
552 nEquiv = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
553 for i in sorted(list(set(delList_i)),reverse=True):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
554 del FPvariants[i]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
555 for j in sorted(list(set(delList_j)),reverse=True):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
556 del notFound[j]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
557 nEquiv += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
558 nPerfect += nEquiv
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
559
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
560 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
561 # Tally up errors and whatnot
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
562 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
563 ztV += totalGoldenVariants
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
564 ztW += totalWorkflowVariants
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
565 znP += nPerfect
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
566 zfP += len(FPvariants)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
567 znF += len(notFound)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
568 if FAST == False:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
569 znE += nEquiv
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
570 if BEDFILE != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
571 zbM += correctBelowMinRLen
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
572
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
573 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
574 # try to identify a reason for FN variants:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
575 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
576
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
577 venn_data = [[0,0,0] for n in notFound] # [i] = (unmappable, low cov, low het)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
578 for i in xrange(len(notFound)):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
579 var = notFound[i]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
580
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
581 noReason = True
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
582
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
583 # mappability?
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
584 if MAPTRACK != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
585 if refName in mappability_tracks and var[0] < len(mappability_tracks[refName]):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
586 if mappability_tracks[refName][var[0]]:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
587 mappability_vs_FN[1] += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
588 venn_data[i][0] = 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
589 noReason = False
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
590 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
591 mappability_vs_FN[0] += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
592
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
593 # coverage?
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
594 if var in correctCov:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
595 c = correctCov[var]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
596 if c != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
597 if c not in coverage_vs_FN:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
598 coverage_vs_FN[c] = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
599 coverage_vs_FN[c] += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
600 if c < DP_THRESH:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
601 venn_data[i][1] = 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
602 noReason = False
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
603
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
604 # heterozygous genotype messing things up?
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
605 #if var in correctAF:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
606 # a = correctAF[var]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
607 # if a != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
608 # a = AF_KEYS[quantize_AF(a)]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
609 # if a not in alleleBal_vs_FN:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
610 # alleleBal_vs_FN[a] = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
611 # alleleBal_vs_FN[a] += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
612 # if a < AF_THRESH:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
613 # venn_data[i][2] = 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
614
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
615 # no reason?
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
616 if noReason:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
617 venn_data[i][2] += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
618
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
619 for i in xrange(len(notFound)):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
620 if venn_data[i][0]: set1.append(i+varAdj)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
621 if venn_data[i][1]: set2.append(i+varAdj)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
622 if venn_data[i][2]: set3.append(i+varAdj)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
623 varAdj += len(notFound)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
624
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
625 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
626 # if desired, write out vcf files.
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
627 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
628 notFound = sorted(notFound)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
629 FPvariants = sorted(FPvariants)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
630 if VCF_OUT:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
631 for line in open(GOLDEN_VCF,'r'):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
632 if line[0] != '#':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
633 splt = line.split('\t')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
634 if splt[0] == refName:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
635 var = (int(splt[1]),splt[3],splt[4])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
636 if var in notFound:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
637 vcfo2.write(line)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
638 for line in open(WORKFLOW_VCF,'r'):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
639 if line[0] != '#':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
640 splt = line.split('\t')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
641 if splt[0] == refName:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
642 var = (int(splt[1]),splt[3],splt[4])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
643 if var in FPvariants:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
644 vcfo3.write(line)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
645
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
646 print '{0:.3f} (sec)'.format(time.time()-tt)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
647
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
648 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
649 # close vcf output
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
650 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
651 print ''
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
652 if VCF_OUT:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
653 print OUT_PREFIX+'_FN.vcf'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
654 print OUT_PREFIX+'_FP.vcf'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
655 vcfo2.close()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
656 vcfo3.close()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
657
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
658 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
659 # plot some FN stuff
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
660 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
661 if NO_PLOT == False:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
662 nDetected = len(set(set1+set2+set3))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
663 set1 = set(set1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
664 set2 = set(set2)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
665 set3 = set(set3)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
666
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
667 if len(set1): s1 = 'Unmappable'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
668 else: s1 = ''
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
669 if len(set2): s2 = 'DP < '+str(DP_THRESH)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
670 else: s2 = ''
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
671 #if len(set3): s3 = 'AF < '+str(AF_THRESH)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
672 if len(set3): s3 = 'Unknown'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
673 else: s3 = ''
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
674
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
675 mpl.figure(0)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
676 tstr1 = 'False Negative Variants (Missed Detections)'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
677 #tstr2 = str(nDetected)+' / '+str(znF)+' FN variants categorized'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
678 tstr2 = ''
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
679 if MAPTRACK != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
680 v = venn3([set1, set2, set3], (s1, s2, s3))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
681 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
682 v = venn2([set2, set3], (s2, s3))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
683 mpl.figtext(0.5,0.95,tstr1,fontdict={'size':14,'weight':'bold'},horizontalalignment='center')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
684 mpl.figtext(0.5,0.03,tstr2,fontdict={'size':14,'weight':'bold'},horizontalalignment='center')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
685
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
686 ouf = OUT_PREFIX+'_FNvenn.pdf'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
687 print ouf
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
688 mpl.savefig(ouf)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
689
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
690 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
691 # spit out results to console
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
692 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
693 print '\n**********************************\n'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
694 if BEDFILE != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
695 print 'ONLY CONSIDERING VARIANTS FOUND WITHIN TARGETED REGIONS\n\n'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
696 print 'Total Golden Variants: ',ztV,'\t[',zgF,'filtered,',zgM,'merged,',zgR,'redundant ]'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
697 print 'Total Workflow Variants:',ztW,'\t[',zwF,'filtered,',zwM,'merged,',zwR,'redundant ]'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
698 print ''
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
699 if ztV > 0 and ztW > 0:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
700 print 'Perfect Matches:',znP,'({0:.2f}%)'.format(100.*float(znP)/ztV)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
701 print 'FN variants: ',znF,'({0:.2f}%)'.format(100.*float(znF)/ztV)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
702 print 'FP variants: ',zfP#,'({0:.2f}%)'.format(100.*float(zfP)/ztW)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
703 if FAST == False:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
704 print '\nNumber of equivalent variants denoted differently between the two vcfs:',znE
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
705 if BEDFILE != None:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
706 print '\nNumber of golden variants located in targeted regions that were too small to be sampled from:',zbM
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
707 if FAST:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
708 print "\nWarning! Running with '--fast' means that identical variants denoted differently between the two vcfs will not be detected! The values above may be lower than the true accuracy."
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
709 #if NO_PLOT:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
710 if True:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
711 print '\n#unmappable: ',len(set1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
712 print '#low_coverage:',len(set2)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
713 print '#unknown: ',len(set3)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
714 print '\n**********************************\n'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
715
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
716
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
717
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
718
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
719
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
720 if __name__ == '__main__':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
721 main()