Mercurial > repos > iuc > medaka_variant
comparison convert_VCF_info_fields.py @ 12:0f5f4a208660 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
author | iuc |
---|---|
date | Fri, 17 Sep 2021 20:22:49 +0000 |
parents | 9fb055604648 |
children | 222669c4afb6 |
comparison
equal
deleted
inserted
replaced
11:2bf63b38ee9b | 12:0f5f4a208660 |
---|---|
5 # Usage statement: | 5 # Usage statement: |
6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf | 6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf |
7 | 7 |
8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com | 8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com |
9 | 9 |
10 import re | |
10 import sys | 11 import sys |
11 from collections import OrderedDict | 12 from collections import OrderedDict |
12 from math import log10 | 13 from math import log10 |
13 | 14 |
14 import scipy | |
15 import scipy.stats | 15 import scipy.stats |
16 | 16 |
17 | 17 |
18 def pval_to_phredqual(pval): | 18 def pval_to_phredqual(pval): |
19 try: | 19 try: |
31 info_dict[code] = val | 31 info_dict[code] = val |
32 return info_dict | 32 return info_dict |
33 | 33 |
34 | 34 |
35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): | 35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): |
36 """Postprocess output of medaka tools annotate. | |
37 | |
38 Splits multiallelic sites into separate records. | |
39 Replaces medaka INFO fields that might represent information of the ref | |
40 and multiple alternate alleles with simple ref, alt allele counterparts. | |
41 """ | |
42 | |
36 in_vcf = open(in_vcf_filepath, 'r') | 43 in_vcf = open(in_vcf_filepath, 'r') |
37 out_vcf = open(out_vcf_filepath, 'w') | 44 # medaka INFO fields that do not make sense after splitting of |
38 to_skip = set(['SC', 'SR']) | 45 # multi-allelic records |
39 for i, line in enumerate(in_vcf): | 46 # DP will be overwritten with the value of DPSP because medaka tools |
40 if i == 1: | 47 # annotate currently only calculates the latter correctly |
41 out_vcf.write("##convert_VCF_info_fields=0.2\n") | 48 # (https://github.com/nanoporetech/medaka/issues/192). |
42 if line[0:2] == "##": | 49 # DPS, which is as unreliable as DP, gets skipped and the code |
43 if line[0:11] == "##INFO=<ID=": | 50 # calculates the spanning reads equivalent DPSPS instead. |
44 id_ = line[11:].split(',')[0] | 51 to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'} |
45 if id_ in to_skip: | 52 struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>') |
53 header_lines = [] | |
54 contig_ids = set() | |
55 contig_ids_simple = set() | |
56 # parse the metadata lines of the input VCF and drop: | |
57 # - duplicate lines | |
58 # - INFO lines declaring keys we are not going to write | |
59 # - redundant contig information | |
60 while True: | |
61 line = in_vcf.readline() | |
62 if line[:2] != '##': | |
63 assert line.startswith('#CHROM') | |
64 break | |
65 if line in header_lines: | |
66 # the annotate tool may generate lines already written by | |
67 # medaka variant again (example: medaka version line) | |
68 continue | |
69 match = struct_meta_pat.match(line) | |
70 if match: | |
71 match_type, match_id, match_misc = match.groups() | |
72 if match_type == 'INFO': | |
73 if match_id == 'DPSP': | |
74 line = line.replace('DPSP', 'DP') | |
75 elif match_id in to_skip: | |
46 continue | 76 continue |
47 out_vcf.write(line) | 77 elif match_type == 'contig': |
48 elif line[0] == "#": | 78 contig_ids.add(match_id) |
49 out_vcf.write('##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Spanning Reads Allele Frequency By Strand">\n') | 79 if not match_misc: |
50 out_vcf.write('##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n') | 80 # the annotate tools writes its own contig info, |
51 out_vcf.write('##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n') | 81 # which is redundant with contig info generated by |
52 out_vcf.write('##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n') | 82 # medaka variant, but lacks a length value. |
53 out_vcf.write('##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n') | 83 # We don't need the incomplete line. |
54 out_vcf.write('##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n') | 84 contig_ids_simple.add(match_id) |
55 out_vcf.write('##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n') | 85 continue |
56 out_vcf.write(line) | 86 header_lines.append(line) |
57 else: | 87 # Lets check the above assumption about each ID-only contig line |
88 # having a more complete counterpart. | |
89 assert not (contig_ids_simple - contig_ids) | |
90 header_lines.insert(1, '##convert_VCF_info_fields=0.2\n') | |
91 header_lines += [ | |
92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n', | |
93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n', | |
94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n', | |
95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n', | |
96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n', | |
97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n', | |
98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n', | |
99 line | |
100 ] | |
101 | |
102 with open(out_vcf_filepath, 'w') as out_vcf: | |
103 out_vcf.writelines(header_lines) | |
104 for line in in_vcf: | |
58 fields = line.split('\t') | 105 fields = line.split('\t') |
59 info_dict = parseInfoField(fields[7]) | 106 info_dict = parseInfoField(fields[7]) |
60 sr_list = [int(x) for x in info_dict["SR"].split(',')] | 107 sr_list = [int(x) for x in info_dict["SR"].split(',')] |
61 sc_list = [int(x) for x in info_dict["SC"].split(',')] | 108 sc_list = [int(x) for x in info_dict["SC"].split(',')] |
62 if len(sr_list) == len(sc_list): | 109 if len(sr_list) != len(sc_list): |
63 variant_list = fields[4].split(',') | 110 print( |
64 dpsp = int(info_dict["DPSP"]) | 111 'WARNING - SR and SC are different lengths, ' |
65 ref_fwd, ref_rev = 0, 1 | 112 'skipping variant' |
66 dpspf, dpspr = (int(x) for x in info_dict["AR"].split(',')) | 113 ) |
67 for i in range(0, len(sr_list), 2): | 114 print(line.strip()) # Print the line for debugging purposes |
68 dpspf += sr_list[i] | 115 continue |
69 dpspr += sr_list[i + 1] | 116 variant_list = fields[4].split(',') |
70 for j, i in enumerate(range(2, len(sr_list), 2)): | 117 dpsp = int(info_dict['DPSP']) |
71 dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1]) | 118 ref_fwd, ref_rev = 0, 1 |
72 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] | 119 dpspf, dpspr = (int(x) for x in info_dict['AR'].split(',')) |
73 _, p_val = scipy.stats.fisher_exact(dp2x2) | 120 for i in range(0, len(sr_list), 2): |
74 sb = pval_to_phredqual(p_val) | 121 dpspf += sr_list[i] |
122 dpspr += sr_list[i + 1] | |
123 for j, i in enumerate(range(2, len(sr_list), 2)): | |
124 dp4 = ( | |
125 sr_list[ref_fwd], | |
126 sr_list[ref_rev], | |
127 sr_list[i], | |
128 sr_list[i + 1] | |
129 ) | |
130 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] | |
131 _, p_val = scipy.stats.fisher_exact(dp2x2) | |
132 sb = pval_to_phredqual(p_val) | |
75 | 133 |
76 as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1]) | 134 as_ = ( |
135 sc_list[ref_fwd], | |
136 sc_list[ref_rev], | |
137 sc_list[i], | |
138 sc_list[i + 1] | |
139 ) | |
77 | 140 |
78 info = [] | 141 info = [] |
79 for code in info_dict: | 142 for code in info_dict: |
80 if code in to_skip: | 143 if code in to_skip: |
81 continue | 144 continue |
82 val = info_dict[code] | 145 val = info_dict[code] |
83 info.append("%s=%s" % (code, val)) | 146 info.append("%s=%s" % (code, val)) |
84 | 147 |
85 info.append("DPSPS=%d,%d" % (dpspf, dpspr)) | 148 info.append('DP=%d' % dpsp) |
149 info.append('DPSPS=%d,%d' % (dpspf, dpspr)) | |
86 | 150 |
87 if dpsp == 0: | 151 if dpsp == 0: |
88 info.append("AF=NaN") | 152 info.append('AF=NaN') |
89 else: | 153 else: |
90 af = (dp4[2] + dp4[3]) / dpsp | 154 af = (dp4[2] + dp4[3]) / dpsp |
91 info.append("AF=%.6f" % (af)) | 155 info.append('AF=%.6f' % af) |
92 if dpspf == 0: | 156 if dpspf == 0: |
93 info.append("FAF=NaN") | 157 info.append('FAF=NaN') |
94 else: | 158 else: |
95 faf = dp4[2] / dpspf | 159 faf = dp4[2] / dpspf |
96 info.append("FAF=%.6f" % (faf)) | 160 info.append('FAF=%.6f' % faf) |
97 if dpspr == 0: | 161 if dpspr == 0: |
98 info.append("RAF=NaN") | 162 info.append('RAF=NaN') |
99 else: | 163 else: |
100 raf = dp4[3] / dpspr | 164 raf = dp4[3] / dpspr |
101 info.append("RAF=%.6f" % (raf)) | 165 info.append('RAF=%.6f' % raf) |
102 info.append("SB=%d" % (sb)) | 166 info.append('SB=%d' % sb) |
103 info.append("DP4=%d,%d,%d,%d" % (dp4)) | 167 info.append('DP4=%d,%d,%d,%d' % dp4) |
104 info.append("AS=%d,%d,%d,%d" % (as_)) | 168 info.append('AS=%d,%d,%d,%d' % as_) |
105 new_info = ';'.join(info) | 169 new_info = ';'.join(info) |
106 fields[4] = variant_list[j] | 170 fields[4] = variant_list[j] |
107 fields[7] = new_info | 171 fields[7] = new_info |
108 out_vcf.write("%s" % ("\t".join(fields))) | 172 out_vcf.write('\t'.join(fields)) |
109 else: | |
110 print("WARNING - SR and SC are different lengths, skipping variant") | |
111 print(line.strip()) # Print the line for debugging purposes | |
112 in_vcf.close() | 173 in_vcf.close() |
113 out_vcf.close() | |
114 | 174 |
115 | 175 |
116 if __name__ == "__main__": | 176 if __name__ == "__main__": |
117 annotateVCF(sys.argv[1], sys.argv[2]) | 177 annotateVCF(sys.argv[1], sys.argv[2]) |