comparison convert_VCF_info_fields.py @ 12:0f5f4a208660 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
author iuc
date Fri, 17 Sep 2021 20:22:49 +0000
parents 9fb055604648
children 222669c4afb6
comparison
equal deleted inserted replaced
11:2bf63b38ee9b 12:0f5f4a208660
5 # Usage statement: 5 # Usage statement:
6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf 6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf
7 7
8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com 8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com
9 9
10 import re
10 import sys 11 import sys
11 from collections import OrderedDict 12 from collections import OrderedDict
12 from math import log10 13 from math import log10
13 14
14 import scipy
15 import scipy.stats 15 import scipy.stats
16 16
17 17
18 def pval_to_phredqual(pval): 18 def pval_to_phredqual(pval):
19 try: 19 try:
31 info_dict[code] = val 31 info_dict[code] = val
32 return info_dict 32 return info_dict
33 33
34 34
35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): 35 def annotateVCF(in_vcf_filepath, out_vcf_filepath):
36 """Postprocess output of medaka tools annotate.
37
38 Splits multiallelic sites into separate records.
39 Replaces medaka INFO fields that might represent information of the ref
40 and multiple alternate alleles with simple ref, alt allele counterparts.
41 """
42
36 in_vcf = open(in_vcf_filepath, 'r') 43 in_vcf = open(in_vcf_filepath, 'r')
37 out_vcf = open(out_vcf_filepath, 'w') 44 # medaka INFO fields that do not make sense after splitting of
38 to_skip = set(['SC', 'SR']) 45 # multi-allelic records
39 for i, line in enumerate(in_vcf): 46 # DP will be overwritten with the value of DPSP because medaka tools
40 if i == 1: 47 # annotate currently only calculates the latter correctly
41 out_vcf.write("##convert_VCF_info_fields=0.2\n") 48 # (https://github.com/nanoporetech/medaka/issues/192).
42 if line[0:2] == "##": 49 # DPS, which is as unreliable as DP, gets skipped and the code
43 if line[0:11] == "##INFO=<ID=": 50 # calculates the spanning reads equivalent DPSPS instead.
44 id_ = line[11:].split(',')[0] 51 to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'}
45 if id_ in to_skip: 52 struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>')
53 header_lines = []
54 contig_ids = set()
55 contig_ids_simple = set()
56 # parse the metadata lines of the input VCF and drop:
57 # - duplicate lines
58 # - INFO lines declaring keys we are not going to write
59 # - redundant contig information
60 while True:
61 line = in_vcf.readline()
62 if line[:2] != '##':
63 assert line.startswith('#CHROM')
64 break
65 if line in header_lines:
66 # the annotate tool may generate lines already written by
67 # medaka variant again (example: medaka version line)
68 continue
69 match = struct_meta_pat.match(line)
70 if match:
71 match_type, match_id, match_misc = match.groups()
72 if match_type == 'INFO':
73 if match_id == 'DPSP':
74 line = line.replace('DPSP', 'DP')
75 elif match_id in to_skip:
46 continue 76 continue
47 out_vcf.write(line) 77 elif match_type == 'contig':
48 elif line[0] == "#": 78 contig_ids.add(match_id)
49 out_vcf.write('##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Spanning Reads Allele Frequency By Strand">\n') 79 if not match_misc:
50 out_vcf.write('##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n') 80 # the annotate tools writes its own contig info,
51 out_vcf.write('##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n') 81 # which is redundant with contig info generated by
52 out_vcf.write('##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n') 82 # medaka variant, but lacks a length value.
53 out_vcf.write('##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n') 83 # We don't need the incomplete line.
54 out_vcf.write('##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n') 84 contig_ids_simple.add(match_id)
55 out_vcf.write('##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n') 85 continue
56 out_vcf.write(line) 86 header_lines.append(line)
57 else: 87 # Lets check the above assumption about each ID-only contig line
88 # having a more complete counterpart.
89 assert not (contig_ids_simple - contig_ids)
90 header_lines.insert(1, '##convert_VCF_info_fields=0.2\n')
91 header_lines += [
92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n',
93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n',
94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n',
95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n',
96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n',
97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n',
98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n',
99 line
100 ]
101
102 with open(out_vcf_filepath, 'w') as out_vcf:
103 out_vcf.writelines(header_lines)
104 for line in in_vcf:
58 fields = line.split('\t') 105 fields = line.split('\t')
59 info_dict = parseInfoField(fields[7]) 106 info_dict = parseInfoField(fields[7])
60 sr_list = [int(x) for x in info_dict["SR"].split(',')] 107 sr_list = [int(x) for x in info_dict["SR"].split(',')]
61 sc_list = [int(x) for x in info_dict["SC"].split(',')] 108 sc_list = [int(x) for x in info_dict["SC"].split(',')]
62 if len(sr_list) == len(sc_list): 109 if len(sr_list) != len(sc_list):
63 variant_list = fields[4].split(',') 110 print(
64 dpsp = int(info_dict["DPSP"]) 111 'WARNING - SR and SC are different lengths, '
65 ref_fwd, ref_rev = 0, 1 112 'skipping variant'
66 dpspf, dpspr = (int(x) for x in info_dict["AR"].split(',')) 113 )
67 for i in range(0, len(sr_list), 2): 114 print(line.strip()) # Print the line for debugging purposes
68 dpspf += sr_list[i] 115 continue
69 dpspr += sr_list[i + 1] 116 variant_list = fields[4].split(',')
70 for j, i in enumerate(range(2, len(sr_list), 2)): 117 dpsp = int(info_dict['DPSP'])
71 dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1]) 118 ref_fwd, ref_rev = 0, 1
72 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] 119 dpspf, dpspr = (int(x) for x in info_dict['AR'].split(','))
73 _, p_val = scipy.stats.fisher_exact(dp2x2) 120 for i in range(0, len(sr_list), 2):
74 sb = pval_to_phredqual(p_val) 121 dpspf += sr_list[i]
122 dpspr += sr_list[i + 1]
123 for j, i in enumerate(range(2, len(sr_list), 2)):
124 dp4 = (
125 sr_list[ref_fwd],
126 sr_list[ref_rev],
127 sr_list[i],
128 sr_list[i + 1]
129 )
130 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
131 _, p_val = scipy.stats.fisher_exact(dp2x2)
132 sb = pval_to_phredqual(p_val)
75 133
76 as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1]) 134 as_ = (
135 sc_list[ref_fwd],
136 sc_list[ref_rev],
137 sc_list[i],
138 sc_list[i + 1]
139 )
77 140
78 info = [] 141 info = []
79 for code in info_dict: 142 for code in info_dict:
80 if code in to_skip: 143 if code in to_skip:
81 continue 144 continue
82 val = info_dict[code] 145 val = info_dict[code]
83 info.append("%s=%s" % (code, val)) 146 info.append("%s=%s" % (code, val))
84 147
85 info.append("DPSPS=%d,%d" % (dpspf, dpspr)) 148 info.append('DP=%d' % dpsp)
149 info.append('DPSPS=%d,%d' % (dpspf, dpspr))
86 150
87 if dpsp == 0: 151 if dpsp == 0:
88 info.append("AF=NaN") 152 info.append('AF=NaN')
89 else: 153 else:
90 af = (dp4[2] + dp4[3]) / dpsp 154 af = (dp4[2] + dp4[3]) / dpsp
91 info.append("AF=%.6f" % (af)) 155 info.append('AF=%.6f' % af)
92 if dpspf == 0: 156 if dpspf == 0:
93 info.append("FAF=NaN") 157 info.append('FAF=NaN')
94 else: 158 else:
95 faf = dp4[2] / dpspf 159 faf = dp4[2] / dpspf
96 info.append("FAF=%.6f" % (faf)) 160 info.append('FAF=%.6f' % faf)
97 if dpspr == 0: 161 if dpspr == 0:
98 info.append("RAF=NaN") 162 info.append('RAF=NaN')
99 else: 163 else:
100 raf = dp4[3] / dpspr 164 raf = dp4[3] / dpspr
101 info.append("RAF=%.6f" % (raf)) 165 info.append('RAF=%.6f' % raf)
102 info.append("SB=%d" % (sb)) 166 info.append('SB=%d' % sb)
103 info.append("DP4=%d,%d,%d,%d" % (dp4)) 167 info.append('DP4=%d,%d,%d,%d' % dp4)
104 info.append("AS=%d,%d,%d,%d" % (as_)) 168 info.append('AS=%d,%d,%d,%d' % as_)
105 new_info = ';'.join(info) 169 new_info = ';'.join(info)
106 fields[4] = variant_list[j] 170 fields[4] = variant_list[j]
107 fields[7] = new_info 171 fields[7] = new_info
108 out_vcf.write("%s" % ("\t".join(fields))) 172 out_vcf.write('\t'.join(fields))
109 else:
110 print("WARNING - SR and SC are different lengths, skipping variant")
111 print(line.strip()) # Print the line for debugging purposes
112 in_vcf.close() 173 in_vcf.close()
113 out_vcf.close()
114 174
115 175
116 if __name__ == "__main__": 176 if __name__ == "__main__":
117 annotateVCF(sys.argv[1], sys.argv[2]) 177 annotateVCF(sys.argv[1], sys.argv[2])