comparison convert_VCF_info_fields.py @ 13:3fbefde449bc draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
author iuc
date Thu, 18 Nov 2021 20:01:04 +0000
parents 597407d61386
children
comparison
equal deleted inserted replaced
12:597407d61386 13:3fbefde449bc
22 ret = 2147483647 # transform pval of 0.0 to max signed 32 bit int 22 ret = 2147483647 # transform pval of 0.0 to max signed 32 bit int
23 return ret 23 return ret
24 24
25 25
26 def parseInfoField(info): 26 def parseInfoField(info):
27 info_fields = info.split(';') 27 info_fields = info.split(";")
28 info_dict = OrderedDict() 28 info_dict = OrderedDict()
29 for info_field in info_fields: 29 for info_field in info_fields:
30 code, val = info_field.split('=') 30 code, val = info_field.split("=")
31 info_dict[code] = val 31 info_dict[code] = val
32 return info_dict 32 return info_dict
33 33
34 34
35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): 35 def annotateVCF(in_vcf_filepath, out_vcf_filepath):
38 Splits multiallelic sites into separate records. 38 Splits multiallelic sites into separate records.
39 Replaces medaka INFO fields that might represent information of the ref 39 Replaces medaka INFO fields that might represent information of the ref
40 and multiple alternate alleles with simple ref, alt allele counterparts. 40 and multiple alternate alleles with simple ref, alt allele counterparts.
41 """ 41 """
42 42
43 in_vcf = open(in_vcf_filepath, 'r') 43 in_vcf = open(in_vcf_filepath, "r")
44 # medaka INFO fields that do not make sense after splitting of 44 # medaka INFO fields that do not make sense after splitting of
45 # multi-allelic records 45 # multi-allelic records
46 # DP will be overwritten with the value of DPSP because medaka tools 46 # DP will be overwritten with the value of DPSP because medaka tools
47 # annotate currently only calculates the latter correctly 47 # annotate currently only calculates the latter correctly
48 # (https://github.com/nanoporetech/medaka/issues/192). 48 # (https://github.com/nanoporetech/medaka/issues/192).
49 # DPS, which is as unreliable as DP, gets skipped and the code 49 # DPS, which is as unreliable as DP, gets skipped and the code
50 # calculates the spanning reads equivalent DPSPS instead. 50 # calculates the spanning reads equivalent DPSPS instead.
51 to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'} 51 to_skip = {"SC", "SR", "AR", "DP", "DPSP", "DPS"}
52 struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>') 52 struct_meta_pat = re.compile("##(.+)=<ID=([^,]+)(,.+)?>")
53 header_lines = [] 53 header_lines = []
54 contig_ids = set() 54 contig_ids = set()
55 contig_ids_simple = set() 55 contig_ids_simple = set()
56 # parse the metadata lines of the input VCF and drop: 56 # parse the metadata lines of the input VCF and drop:
57 # - duplicate lines 57 # - duplicate lines
58 # - INFO lines declaring keys we are not going to write 58 # - INFO lines declaring keys we are not going to write
59 # - redundant contig information 59 # - redundant contig information
60 while True: 60 while True:
61 line = in_vcf.readline() 61 line = in_vcf.readline()
62 if line[:2] != '##': 62 if line[:2] != "##":
63 assert line.startswith('#CHROM') 63 assert line.startswith("#CHROM")
64 break 64 break
65 if line in header_lines: 65 if line in header_lines:
66 # the annotate tool may generate lines already written by 66 # the annotate tool may generate lines already written by
67 # medaka variant again (example: medaka version line) 67 # medaka variant again (example: medaka version line)
68 continue 68 continue
69 match = struct_meta_pat.match(line) 69 match = struct_meta_pat.match(line)
70 if match: 70 if match:
71 match_type, match_id, match_misc = match.groups() 71 match_type, match_id, match_misc = match.groups()
72 if match_type == 'INFO': 72 if match_type == "INFO":
73 if match_id == 'DPSP': 73 if match_id == "DPSP":
74 line = line.replace('DPSP', 'DP') 74 line = line.replace("DPSP", "DP")
75 elif match_id in to_skip: 75 elif match_id in to_skip:
76 continue 76 continue
77 elif match_type == 'contig': 77 elif match_type == "contig":
78 contig_ids.add(match_id) 78 contig_ids.add(match_id)
79 if not match_misc: 79 if not match_misc:
80 # the annotate tools writes its own contig info, 80 # the annotate tools writes its own contig info,
81 # which is redundant with contig info generated by 81 # which is redundant with contig info generated by
82 # medaka variant, but lacks a length value. 82 # medaka variant, but lacks a length value.
85 continue 85 continue
86 header_lines.append(line) 86 header_lines.append(line)
87 # Lets check the above assumption about each ID-only contig line 87 # Lets check the above assumption about each ID-only contig line
88 # having a more complete counterpart. 88 # having a more complete counterpart.
89 assert not (contig_ids_simple - contig_ids) 89 assert not (contig_ids_simple - contig_ids)
90 header_lines.insert(1, '##convert_VCF_info_fields=0.2\n') 90 header_lines.insert(1, "##convert_VCF_info_fields=0.2\n")
91 header_lines += [ 91 header_lines += [
92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n', 92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n',
93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n', 93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n',
94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n', 94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n',
95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n', 95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n',
96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n', 96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n',
97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n', 97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n',
98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n', 98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n',
99 line 99 line,
100 ] 100 ]
101 101
102 with open(out_vcf_filepath, 'w') as out_vcf: 102 with open(out_vcf_filepath, "w") as out_vcf:
103 out_vcf.writelines(header_lines) 103 out_vcf.writelines(header_lines)
104 for line in in_vcf: 104 for line in in_vcf:
105 fields = line.split('\t') 105 fields = line.split("\t")
106 info_dict = parseInfoField(fields[7]) 106 info_dict = parseInfoField(fields[7])
107 sr_list = [int(x) for x in info_dict["SR"].split(',')] 107 sr_list = [int(x) for x in info_dict["SR"].split(",")]
108 sc_list = [int(x) for x in info_dict["SC"].split(',')] 108 sc_list = [int(x) for x in info_dict["SC"].split(",")]
109 if len(sr_list) != len(sc_list): 109 if len(sr_list) != len(sc_list):
110 print( 110 print("WARNING - SR and SC are different lengths, " "skipping variant")
111 'WARNING - SR and SC are different lengths, '
112 'skipping variant'
113 )
114 print(line.strip()) # Print the line for debugging purposes 111 print(line.strip()) # Print the line for debugging purposes
115 continue 112 continue
116 variant_list = fields[4].split(',') 113 variant_list = fields[4].split(",")
117 dpsp = int(info_dict['DPSP']) 114 dpsp = int(info_dict["DPSP"])
118 ref_fwd, ref_rev = 0, 1 115 ref_fwd, ref_rev = 0, 1
119 dpspf, dpspr = (int(x) for x in info_dict['AR'].split(',')) 116 dpspf, dpspr = (int(x) for x in info_dict["AR"].split(","))
120 for i in range(0, len(sr_list), 2): 117 for i in range(0, len(sr_list), 2):
121 dpspf += sr_list[i] 118 dpspf += sr_list[i]
122 dpspr += sr_list[i + 1] 119 dpspr += sr_list[i + 1]
123 for j, i in enumerate(range(2, len(sr_list), 2)): 120 for j, i in enumerate(range(2, len(sr_list), 2)):
124 dp4 = ( 121 dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1])
125 sr_list[ref_fwd],
126 sr_list[ref_rev],
127 sr_list[i],
128 sr_list[i + 1]
129 )
130 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] 122 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
131 _, p_val = scipy.stats.fisher_exact(dp2x2) 123 _, p_val = scipy.stats.fisher_exact(dp2x2)
132 sb = pval_to_phredqual(p_val) 124 sb = pval_to_phredqual(p_val)
133 125
134 as_ = ( 126 as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1])
135 sc_list[ref_fwd],
136 sc_list[ref_rev],
137 sc_list[i],
138 sc_list[i + 1]
139 )
140 127
141 info = [] 128 info = []
142 for code in info_dict: 129 for code in info_dict:
143 if code in to_skip: 130 if code in to_skip:
144 continue 131 continue
145 val = info_dict[code] 132 val = info_dict[code]
146 info.append("%s=%s" % (code, val)) 133 info.append("%s=%s" % (code, val))
147 134
148 info.append('DP=%d' % dpsp) 135 info.append("DP=%d" % dpsp)
149 info.append('DPSPS=%d,%d' % (dpspf, dpspr)) 136 info.append("DPSPS=%d,%d" % (dpspf, dpspr))
150 137
151 if dpsp == 0: 138 if dpsp == 0:
152 info.append('AF=NaN') 139 info.append("AF=NaN")
153 else: 140 else:
154 af = (dp4[2] + dp4[3]) / dpsp 141 af = (dp4[2] + dp4[3]) / dpsp
155 info.append('AF=%.6f' % af) 142 info.append("AF=%.6f" % af)
156 if dpspf == 0: 143 if dpspf == 0:
157 info.append('FAF=NaN') 144 info.append("FAF=NaN")
158 else: 145 else:
159 faf = dp4[2] / dpspf 146 faf = dp4[2] / dpspf
160 info.append('FAF=%.6f' % faf) 147 info.append("FAF=%.6f" % faf)
161 if dpspr == 0: 148 if dpspr == 0:
162 info.append('RAF=NaN') 149 info.append("RAF=NaN")
163 else: 150 else:
164 raf = dp4[3] / dpspr 151 raf = dp4[3] / dpspr
165 info.append('RAF=%.6f' % raf) 152 info.append("RAF=%.6f" % raf)
166 info.append('SB=%d' % sb) 153 info.append("SB=%d" % sb)
167 info.append('DP4=%d,%d,%d,%d' % dp4) 154 info.append("DP4=%d,%d,%d,%d" % dp4)
168 info.append('AS=%d,%d,%d,%d' % as_) 155 info.append("AS=%d,%d,%d,%d" % as_)
169 new_info = ';'.join(info) 156 new_info = ";".join(info)
170 fields[4] = variant_list[j] 157 fields[4] = variant_list[j]
171 fields[7] = new_info 158 fields[7] = new_info
172 out_vcf.write('\t'.join(fields)) 159 out_vcf.write("\t".join(fields))
173 in_vcf.close() 160 in_vcf.close()
174 161
175 162
176 if __name__ == "__main__": 163 if __name__ == "__main__":
177 annotateVCF(sys.argv[1], sys.argv[2]) 164 annotateVCF(sys.argv[1], sys.argv[2])