Mercurial > repos > iuc > medaka_variant_pipeline
comparison convert_VCF_info_fields.py @ 13:3fbefde449bc draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
author | iuc |
---|---|
date | Thu, 18 Nov 2021 20:01:04 +0000 |
parents | 597407d61386 |
children |
comparison
equal
deleted
inserted
replaced
12:597407d61386 | 13:3fbefde449bc |
---|---|
22 ret = 2147483647 # transform pval of 0.0 to max signed 32 bit int | 22 ret = 2147483647 # transform pval of 0.0 to max signed 32 bit int |
23 return ret | 23 return ret |
24 | 24 |
25 | 25 |
26 def parseInfoField(info): | 26 def parseInfoField(info): |
27 info_fields = info.split(';') | 27 info_fields = info.split(";") |
28 info_dict = OrderedDict() | 28 info_dict = OrderedDict() |
29 for info_field in info_fields: | 29 for info_field in info_fields: |
30 code, val = info_field.split('=') | 30 code, val = info_field.split("=") |
31 info_dict[code] = val | 31 info_dict[code] = val |
32 return info_dict | 32 return info_dict |
33 | 33 |
34 | 34 |
35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): | 35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): |
38 Splits multiallelic sites into separate records. | 38 Splits multiallelic sites into separate records. |
39 Replaces medaka INFO fields that might represent information of the ref | 39 Replaces medaka INFO fields that might represent information of the ref |
40 and multiple alternate alleles with simple ref, alt allele counterparts. | 40 and multiple alternate alleles with simple ref, alt allele counterparts. |
41 """ | 41 """ |
42 | 42 |
43 in_vcf = open(in_vcf_filepath, 'r') | 43 in_vcf = open(in_vcf_filepath, "r") |
44 # medaka INFO fields that do not make sense after splitting of | 44 # medaka INFO fields that do not make sense after splitting of |
45 # multi-allelic records | 45 # multi-allelic records |
46 # DP will be overwritten with the value of DPSP because medaka tools | 46 # DP will be overwritten with the value of DPSP because medaka tools |
47 # annotate currently only calculates the latter correctly | 47 # annotate currently only calculates the latter correctly |
48 # (https://github.com/nanoporetech/medaka/issues/192). | 48 # (https://github.com/nanoporetech/medaka/issues/192). |
49 # DPS, which is as unreliable as DP, gets skipped and the code | 49 # DPS, which is as unreliable as DP, gets skipped and the code |
50 # calculates the spanning reads equivalent DPSPS instead. | 50 # calculates the spanning reads equivalent DPSPS instead. |
51 to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'} | 51 to_skip = {"SC", "SR", "AR", "DP", "DPSP", "DPS"} |
52 struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>') | 52 struct_meta_pat = re.compile("##(.+)=<ID=([^,]+)(,.+)?>") |
53 header_lines = [] | 53 header_lines = [] |
54 contig_ids = set() | 54 contig_ids = set() |
55 contig_ids_simple = set() | 55 contig_ids_simple = set() |
56 # parse the metadata lines of the input VCF and drop: | 56 # parse the metadata lines of the input VCF and drop: |
57 # - duplicate lines | 57 # - duplicate lines |
58 # - INFO lines declaring keys we are not going to write | 58 # - INFO lines declaring keys we are not going to write |
59 # - redundant contig information | 59 # - redundant contig information |
60 while True: | 60 while True: |
61 line = in_vcf.readline() | 61 line = in_vcf.readline() |
62 if line[:2] != '##': | 62 if line[:2] != "##": |
63 assert line.startswith('#CHROM') | 63 assert line.startswith("#CHROM") |
64 break | 64 break |
65 if line in header_lines: | 65 if line in header_lines: |
66 # the annotate tool may generate lines already written by | 66 # the annotate tool may generate lines already written by |
67 # medaka variant again (example: medaka version line) | 67 # medaka variant again (example: medaka version line) |
68 continue | 68 continue |
69 match = struct_meta_pat.match(line) | 69 match = struct_meta_pat.match(line) |
70 if match: | 70 if match: |
71 match_type, match_id, match_misc = match.groups() | 71 match_type, match_id, match_misc = match.groups() |
72 if match_type == 'INFO': | 72 if match_type == "INFO": |
73 if match_id == 'DPSP': | 73 if match_id == "DPSP": |
74 line = line.replace('DPSP', 'DP') | 74 line = line.replace("DPSP", "DP") |
75 elif match_id in to_skip: | 75 elif match_id in to_skip: |
76 continue | 76 continue |
77 elif match_type == 'contig': | 77 elif match_type == "contig": |
78 contig_ids.add(match_id) | 78 contig_ids.add(match_id) |
79 if not match_misc: | 79 if not match_misc: |
80 # the annotate tools writes its own contig info, | 80 # the annotate tools writes its own contig info, |
81 # which is redundant with contig info generated by | 81 # which is redundant with contig info generated by |
82 # medaka variant, but lacks a length value. | 82 # medaka variant, but lacks a length value. |
85 continue | 85 continue |
86 header_lines.append(line) | 86 header_lines.append(line) |
87 # Lets check the above assumption about each ID-only contig line | 87 # Lets check the above assumption about each ID-only contig line |
88 # having a more complete counterpart. | 88 # having a more complete counterpart. |
89 assert not (contig_ids_simple - contig_ids) | 89 assert not (contig_ids_simple - contig_ids) |
90 header_lines.insert(1, '##convert_VCF_info_fields=0.2\n') | 90 header_lines.insert(1, "##convert_VCF_info_fields=0.2\n") |
91 header_lines += [ | 91 header_lines += [ |
92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n', | 92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n', |
93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n', | 93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n', |
94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n', | 94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n', |
95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n', | 95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n', |
96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n', | 96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n', |
97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n', | 97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n', |
98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n', | 98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n', |
99 line | 99 line, |
100 ] | 100 ] |
101 | 101 |
102 with open(out_vcf_filepath, 'w') as out_vcf: | 102 with open(out_vcf_filepath, "w") as out_vcf: |
103 out_vcf.writelines(header_lines) | 103 out_vcf.writelines(header_lines) |
104 for line in in_vcf: | 104 for line in in_vcf: |
105 fields = line.split('\t') | 105 fields = line.split("\t") |
106 info_dict = parseInfoField(fields[7]) | 106 info_dict = parseInfoField(fields[7]) |
107 sr_list = [int(x) for x in info_dict["SR"].split(',')] | 107 sr_list = [int(x) for x in info_dict["SR"].split(",")] |
108 sc_list = [int(x) for x in info_dict["SC"].split(',')] | 108 sc_list = [int(x) for x in info_dict["SC"].split(",")] |
109 if len(sr_list) != len(sc_list): | 109 if len(sr_list) != len(sc_list): |
110 print( | 110 print("WARNING - SR and SC are different lengths, " "skipping variant") |
111 'WARNING - SR and SC are different lengths, ' | |
112 'skipping variant' | |
113 ) | |
114 print(line.strip()) # Print the line for debugging purposes | 111 print(line.strip()) # Print the line for debugging purposes |
115 continue | 112 continue |
116 variant_list = fields[4].split(',') | 113 variant_list = fields[4].split(",") |
117 dpsp = int(info_dict['DPSP']) | 114 dpsp = int(info_dict["DPSP"]) |
118 ref_fwd, ref_rev = 0, 1 | 115 ref_fwd, ref_rev = 0, 1 |
119 dpspf, dpspr = (int(x) for x in info_dict['AR'].split(',')) | 116 dpspf, dpspr = (int(x) for x in info_dict["AR"].split(",")) |
120 for i in range(0, len(sr_list), 2): | 117 for i in range(0, len(sr_list), 2): |
121 dpspf += sr_list[i] | 118 dpspf += sr_list[i] |
122 dpspr += sr_list[i + 1] | 119 dpspr += sr_list[i + 1] |
123 for j, i in enumerate(range(2, len(sr_list), 2)): | 120 for j, i in enumerate(range(2, len(sr_list), 2)): |
124 dp4 = ( | 121 dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1]) |
125 sr_list[ref_fwd], | |
126 sr_list[ref_rev], | |
127 sr_list[i], | |
128 sr_list[i + 1] | |
129 ) | |
130 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] | 122 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] |
131 _, p_val = scipy.stats.fisher_exact(dp2x2) | 123 _, p_val = scipy.stats.fisher_exact(dp2x2) |
132 sb = pval_to_phredqual(p_val) | 124 sb = pval_to_phredqual(p_val) |
133 | 125 |
134 as_ = ( | 126 as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1]) |
135 sc_list[ref_fwd], | |
136 sc_list[ref_rev], | |
137 sc_list[i], | |
138 sc_list[i + 1] | |
139 ) | |
140 | 127 |
141 info = [] | 128 info = [] |
142 for code in info_dict: | 129 for code in info_dict: |
143 if code in to_skip: | 130 if code in to_skip: |
144 continue | 131 continue |
145 val = info_dict[code] | 132 val = info_dict[code] |
146 info.append("%s=%s" % (code, val)) | 133 info.append("%s=%s" % (code, val)) |
147 | 134 |
148 info.append('DP=%d' % dpsp) | 135 info.append("DP=%d" % dpsp) |
149 info.append('DPSPS=%d,%d' % (dpspf, dpspr)) | 136 info.append("DPSPS=%d,%d" % (dpspf, dpspr)) |
150 | 137 |
151 if dpsp == 0: | 138 if dpsp == 0: |
152 info.append('AF=NaN') | 139 info.append("AF=NaN") |
153 else: | 140 else: |
154 af = (dp4[2] + dp4[3]) / dpsp | 141 af = (dp4[2] + dp4[3]) / dpsp |
155 info.append('AF=%.6f' % af) | 142 info.append("AF=%.6f" % af) |
156 if dpspf == 0: | 143 if dpspf == 0: |
157 info.append('FAF=NaN') | 144 info.append("FAF=NaN") |
158 else: | 145 else: |
159 faf = dp4[2] / dpspf | 146 faf = dp4[2] / dpspf |
160 info.append('FAF=%.6f' % faf) | 147 info.append("FAF=%.6f" % faf) |
161 if dpspr == 0: | 148 if dpspr == 0: |
162 info.append('RAF=NaN') | 149 info.append("RAF=NaN") |
163 else: | 150 else: |
164 raf = dp4[3] / dpspr | 151 raf = dp4[3] / dpspr |
165 info.append('RAF=%.6f' % raf) | 152 info.append("RAF=%.6f" % raf) |
166 info.append('SB=%d' % sb) | 153 info.append("SB=%d" % sb) |
167 info.append('DP4=%d,%d,%d,%d' % dp4) | 154 info.append("DP4=%d,%d,%d,%d" % dp4) |
168 info.append('AS=%d,%d,%d,%d' % as_) | 155 info.append("AS=%d,%d,%d,%d" % as_) |
169 new_info = ';'.join(info) | 156 new_info = ";".join(info) |
170 fields[4] = variant_list[j] | 157 fields[4] = variant_list[j] |
171 fields[7] = new_info | 158 fields[7] = new_info |
172 out_vcf.write('\t'.join(fields)) | 159 out_vcf.write("\t".join(fields)) |
173 in_vcf.close() | 160 in_vcf.close() |
174 | 161 |
175 | 162 |
176 if __name__ == "__main__": | 163 if __name__ == "__main__": |
177 annotateVCF(sys.argv[1], sys.argv[2]) | 164 annotateVCF(sys.argv[1], sys.argv[2]) |