Mercurial > repos > iuc > medaka_consensus
comparison annotateVCF.py @ 4:84dcccebad3d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit e80b649094384fc6d7a8f917300db3550cc99a44"
| author | iuc |
|---|---|
| date | Tue, 01 Sep 2020 03:07:44 -0400 |
| parents | |
| children | 22f6a0e7424f |
comparison
equal
deleted
inserted
replaced
| 3:a95960a63220 | 4:84dcccebad3d |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # Takes in VCF file and a samtools mpileup output file | |
| 4 # Fills in annotation for the VCF file including AF, DP | |
| 5 # SB, and DP4 | |
| 6 # | |
| 7 # Usage statement: | |
| 8 # python annotateVCF.py in_vcf.vcf in_mpileup.txt out_vcf.vcf | |
| 9 # | |
| 10 # Can generate in_mileup.txt with samtools mpileup (and can restrict which sites to generate pileups for with in_vcf.vcf) | |
| 11 | |
| 12 # 08/24/2020 - Nathan P. Roach, natproach@gmail.com | |
| 13 | |
| 14 import sys | |
| 15 from math import isnan, log10 | |
| 16 | |
| 17 from scipy.stats import fisher_exact | |
| 18 | |
| 19 | |
| 20 def pval_to_phredqual(pval): | |
| 21 return int(round(-10. * log10(pval))) | |
| 22 | |
| 23 | |
| 24 def parseSimpleSNPpileup(fields, ref_base, alt_base): | |
| 25 base_to_idx = { | |
| 26 'A': 0, | |
| 27 'a': 0, | |
| 28 'T': 1, | |
| 29 't': 1, | |
| 30 'C': 2, | |
| 31 'c': 2, | |
| 32 'G': 3, | |
| 33 'g': 3 | |
| 34 } | |
| 35 | |
| 36 base_to_idx_stranded = { | |
| 37 'A': 0, | |
| 38 'T': 1, | |
| 39 'C': 2, | |
| 40 'G': 3, | |
| 41 'a': 4, | |
| 42 't': 5, | |
| 43 'c': 6, | |
| 44 'g': 7 | |
| 45 } | |
| 46 ref_base2 = fields[2] | |
| 47 counts = [0, 0, 0, 0] | |
| 48 stranded_counts = [0, 0, 0, 0, 0, 0, 0, 0] | |
| 49 ref_idx = base_to_idx[fields[2]] | |
| 50 dp = int(fields[3]) | |
| 51 carrot_flag = False | |
| 52 ins_flag = False | |
| 53 ins_str = "" | |
| 54 ins_len = 0 | |
| 55 insertion = "" | |
| 56 del_flag = False | |
| 57 del_str = "" | |
| 58 del_len = 0 | |
| 59 deletion = "" | |
| 60 # dollar_flag = False | |
| 61 for base in fields[4]: | |
| 62 if carrot_flag: | |
| 63 carrot_flag = False | |
| 64 continue | |
| 65 if ins_len > 0: | |
| 66 insertion += base | |
| 67 ins_len -= 1 | |
| 68 continue | |
| 69 if del_len > 0: | |
| 70 deletion += base | |
| 71 del_len -= 1 | |
| 72 continue | |
| 73 if ins_flag: | |
| 74 if base.isdigit(): | |
| 75 ins_str += base | |
| 76 else: | |
| 77 ins_len = int(ins_str) - 1 | |
| 78 insertion = base | |
| 79 ins_flag = False | |
| 80 elif del_flag: | |
| 81 if base.isdigit(): | |
| 82 del_str += base | |
| 83 else: | |
| 84 del_len = int(del_str) - 1 | |
| 85 deletion = base | |
| 86 del_flag = False | |
| 87 else: | |
| 88 if base == '^': | |
| 89 carrot_flag = True | |
| 90 continue | |
| 91 elif base == '$': | |
| 92 continue | |
| 93 elif base == '+': | |
| 94 ins_flag = True | |
| 95 elif base == '-': | |
| 96 del_flag = True | |
| 97 elif base == '.': | |
| 98 counts[ref_idx] += 1 | |
| 99 stranded_counts[base_to_idx_stranded[ref_base2]] += 1 | |
| 100 elif base == ',': | |
| 101 counts[ref_idx] += 1 | |
| 102 stranded_counts[base_to_idx_stranded[ref_base2.lower()]] += 1 | |
| 103 elif base == 'N' or base == 'n': | |
| 104 continue | |
| 105 elif base == '*': | |
| 106 continue | |
| 107 else: | |
| 108 counts[base_to_idx[base]] += 1 | |
| 109 stranded_counts[base_to_idx_stranded[base]] += 1 | |
| 110 af = float(counts[base_to_idx[alt_base]]) / float(sum(counts)) | |
| 111 if float(sum(stranded_counts[0:4])) == 0: | |
| 112 faf = float("nan") | |
| 113 else: | |
| 114 faf = float(stranded_counts[base_to_idx_stranded[alt_base]]) / float(sum(stranded_counts[0:4])) | |
| 115 if float(sum(stranded_counts[4:])) == 0: | |
| 116 raf = float("nan") | |
| 117 else: | |
| 118 raf = float(stranded_counts[base_to_idx_stranded[alt_base.lower()]]) / float(sum(stranded_counts[4:])) | |
| 119 dp4 = [stranded_counts[base_to_idx_stranded[ref_base]], | |
| 120 stranded_counts[base_to_idx_stranded[ref_base.lower()]], | |
| 121 stranded_counts[base_to_idx_stranded[alt_base]], | |
| 122 stranded_counts[base_to_idx_stranded[alt_base.lower()]]] | |
| 123 return (dp, af, faf, raf, dp4) | |
| 124 | |
| 125 | |
| 126 def parseIndelPileup(fields, ref_base, alt_base): | |
| 127 counts = [0, 0, 0, 0, 0, 0, 0, 0, 0] # indel ref match, indel fwd ref match, indel rev ref match, indel alt match, indel fwd alt match, indel rev alt match, other, other fwd, other rev | |
| 128 ref_base2 = fields[2] | |
| 129 | |
| 130 carrot_flag = False | |
| 131 ins_flag = False | |
| 132 ins_str = "" | |
| 133 ins_len = 0 | |
| 134 del_flag = False | |
| 135 del_str = "" | |
| 136 del_len = 0 | |
| 137 first_iter = True | |
| 138 forward_flag = False | |
| 139 last_seq = "" | |
| 140 last_seq_code = 'b' | |
| 141 for base in fields[4]: | |
| 142 if ins_flag: | |
| 143 if base.isdigit(): | |
| 144 ins_str += base | |
| 145 else: | |
| 146 ins_len = int(ins_str) | |
| 147 ins_flag = False | |
| 148 if del_flag: | |
| 149 if base.isdigit(): | |
| 150 del_str += base | |
| 151 else: | |
| 152 del_len = int(del_str) | |
| 153 del_flag = False | |
| 154 if ins_len > 0: | |
| 155 last_seq += base | |
| 156 last_seq_code = 'i' | |
| 157 ins_len -= 1 | |
| 158 continue | |
| 159 if del_len > 0: | |
| 160 last_seq += base | |
| 161 last_seq_code = 'd' | |
| 162 del_len -= 1 | |
| 163 continue | |
| 164 if carrot_flag: | |
| 165 carrot_flag = False | |
| 166 continue | |
| 167 if base == '.' or base == ','\ | |
| 168 or base == 'A' or base == 'a'\ | |
| 169 or base == 'C' or base == 'c'\ | |
| 170 or base == 'G' or base == 'g'\ | |
| 171 or base == 'T' or base == 't'\ | |
| 172 or base == 'N' or base == 'n'\ | |
| 173 or base == '>' or base == '<'\ | |
| 174 or base == '*' or base == '#': | |
| 175 if first_iter: | |
| 176 first_iter = False | |
| 177 else: | |
| 178 if last_seq_code == 'i': | |
| 179 if last_seq.upper() == alt_base.upper(): | |
| 180 counts[3] += 1 | |
| 181 if forward_flag: | |
| 182 counts[4] += 1 | |
| 183 else: | |
| 184 counts[5] += 1 | |
| 185 else: | |
| 186 counts[6] += 1 | |
| 187 if forward_flag: | |
| 188 counts[7] += 1 | |
| 189 else: | |
| 190 counts[8] += 1 | |
| 191 elif last_seq_code == 'd': | |
| 192 if last_seq.upper() == ref_base.upper(): | |
| 193 counts[3] += 1 | |
| 194 if forward_flag: | |
| 195 counts[4] += 1 | |
| 196 else: | |
| 197 counts[5] += 1 | |
| 198 else: | |
| 199 counts[6] += 1 | |
| 200 if forward_flag: | |
| 201 counts[7] += 1 | |
| 202 else: | |
| 203 counts[8] += 1 | |
| 204 elif last_seq_code == 'b': | |
| 205 if last_seq.upper() == ref_base.upper(): | |
| 206 counts[0] += 1 | |
| 207 if forward_flag: | |
| 208 counts[1] += 1 | |
| 209 else: | |
| 210 counts[2] += 1 | |
| 211 elif last_seq.upper() == alt_base.upper(): | |
| 212 counts[3] += 1 | |
| 213 if forward_flag: | |
| 214 counts[4] += 1 | |
| 215 else: | |
| 216 counts[5] += 1 | |
| 217 else: | |
| 218 counts[6] += 1 | |
| 219 if forward_flag: | |
| 220 counts[7] += 1 | |
| 221 else: | |
| 222 counts[8] += 1 | |
| 223 if base == '.': | |
| 224 last_seq = ref_base2 | |
| 225 forward_flag = True | |
| 226 last_seq_code = 'b' | |
| 227 elif base == ',': | |
| 228 last_seq = ref_base2 | |
| 229 forward_flag = False | |
| 230 last_seq_code = 'b' | |
| 231 elif base == '>' or base == '<' or base == '*' or base == '#': | |
| 232 continue | |
| 233 else: | |
| 234 forward_flag = base.isupper() | |
| 235 last_seq = base.upper() | |
| 236 last_seq_code = 'b' | |
| 237 elif base == '+': | |
| 238 ins_flag = True | |
| 239 ins_str = "" | |
| 240 elif base == '-': | |
| 241 del_flag = True | |
| 242 del_str = "" | |
| 243 elif base == '^': | |
| 244 carrot_flag = True | |
| 245 elif base == '$': | |
| 246 continue | |
| 247 if first_iter: | |
| 248 first_iter = False | |
| 249 | |
| 250 if last_seq_code == 'i': | |
| 251 if last_seq.upper() == alt_base.upper(): | |
| 252 counts[3] += 1 | |
| 253 if forward_flag: | |
| 254 counts[4] += 1 | |
| 255 else: | |
| 256 counts[5] += 1 | |
| 257 else: | |
| 258 counts[6] += 1 | |
| 259 if forward_flag: | |
| 260 counts[7] += 1 | |
| 261 else: | |
| 262 counts[8] += 1 | |
| 263 elif last_seq_code == 'd': | |
| 264 if last_seq.upper() == ref_base.upper(): | |
| 265 counts[3] += 1 | |
| 266 if forward_flag: | |
| 267 counts[4] += 1 | |
| 268 else: | |
| 269 counts[5] += 1 | |
| 270 else: | |
| 271 counts[6] += 1 | |
| 272 if forward_flag: | |
| 273 counts[7] += 1 | |
| 274 else: | |
| 275 counts[8] += 1 | |
| 276 elif last_seq_code == 'b': | |
| 277 if last_seq.upper() == ref_base.upper(): | |
| 278 counts[0] += 1 | |
| 279 if forward_flag: | |
| 280 counts[1] += 1 | |
| 281 else: | |
| 282 counts[2] += 1 | |
| 283 elif last_seq.upper() == alt_base.upper(): | |
| 284 counts[3] += 1 | |
| 285 if forward_flag: | |
| 286 counts[4] += 1 | |
| 287 else: | |
| 288 counts[5] += 1 | |
| 289 else: | |
| 290 counts[6] += 1 | |
| 291 if forward_flag: | |
| 292 counts[7] += 1 | |
| 293 else: | |
| 294 counts[8] += 1 | |
| 295 dp = int(fields[3]) | |
| 296 af = float(counts[3]) / float(sum([counts[0], counts[3], counts[6]])) | |
| 297 if sum([counts[1], counts[4], counts[7]]) == 0: | |
| 298 faf = float("nan") | |
| 299 else: | |
| 300 faf = float(counts[4]) / float(sum([counts[1], counts[4], counts[7]])) | |
| 301 if sum([counts[2], counts[5], counts[8]]) == 0: | |
| 302 raf = float("nan") | |
| 303 else: | |
| 304 raf = float(counts[5]) / float(sum([counts[2], counts[5], counts[8]])) | |
| 305 dp4 = [counts[1], counts[2], counts[4], counts[5]] | |
| 306 return (dp, af, faf, raf, dp4) | |
| 307 | |
| 308 | |
| 309 def annotateVCF(in_vcf_filepath, in_mpileup_filepath, out_vcf_filepath): | |
| 310 in_vcf = open(in_vcf_filepath, 'r') | |
| 311 in_mpileup = open(in_mpileup_filepath, 'r') | |
| 312 out_vcf = open(out_vcf_filepath, 'w') | |
| 313 | |
| 314 # First pass parsing of input vcf, output headerlines + new headerlines, add VCF sites we care about to to_examine (limits memory usage for sites that don't need annotation) | |
| 315 to_examine = {} | |
| 316 for line in in_vcf: | |
| 317 if line[0:2] == "##": | |
| 318 out_vcf.write(line) | |
| 319 elif line[0] == "#": | |
| 320 out_vcf.write("##annotateVCFVersion=0.1\n") | |
| 321 out_vcf.write("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw Depth\">\n") | |
| 322 out_vcf.write("##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n") | |
| 323 out_vcf.write("##INFO=<ID=FAF,Number=1,Type=Float,Description=\"Forward Allele Frequency\">\n") | |
| 324 out_vcf.write("##INFO=<ID=RAF,Number=1,Type=Float,Description=\"Reverse Allele Frequency\">\n") | |
| 325 out_vcf.write("##INFO=<ID=SB,Number=1,Type=Integer,Description=\"Phred-scaled strand bias at this position\">\n") | |
| 326 out_vcf.write("##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases\">\n") | |
| 327 out_vcf.write(line) | |
| 328 else: | |
| 329 fields = line.strip().split() | |
| 330 if fields[0] in to_examine: | |
| 331 to_examine[fields[0]][int(fields[1])] = (fields[3], fields[4]) | |
| 332 else: | |
| 333 to_examine[fields[0]] = {int(fields[1]): (fields[3], fields[4])} | |
| 334 in_vcf.close() | |
| 335 data = {} | |
| 336 | |
| 337 # Populate data dictionary, which relates chromosome and position to the following: | |
| 338 # depth of coverage | |
| 339 # allele frequency | |
| 340 # forward strand allele frequency | |
| 341 # reverse strand allele frequency | |
| 342 # dp4 - depth of coverage of ref allele fwd strand, DOC of ref allele rev strand, DOC of alt allele fwd strand, DOC of alt allele rev strand | |
| 343 for line in in_mpileup: | |
| 344 fields = line.strip().split() | |
| 345 if fields[0] not in to_examine: | |
| 346 continue | |
| 347 if int(fields[1]) not in to_examine[fields[0]]: | |
| 348 continue | |
| 349 (ref_base, alt_base) = to_examine[fields[0]][int(fields[1])] | |
| 350 if len(ref_base.split(',')) > 1: # Can't handle multiple ref alleles | |
| 351 continue | |
| 352 if len(alt_base.split(',')) > 1: # Can't handle multiple alt alleles | |
| 353 continue | |
| 354 if len(ref_base) > 1 or len(alt_base) > 1: | |
| 355 if len(ref_base) > 1 and len(alt_base) > 1: # Can't handle complex indels | |
| 356 continue | |
| 357 data[(fields[0], int(fields[1]))] = parseIndelPileup(fields, ref_base, alt_base) | |
| 358 if len(ref_base) == 1 and len(alt_base) == 1: | |
| 359 data[(fields[0], int(fields[1]))] = parseSimpleSNPpileup(fields, ref_base, alt_base) | |
| 360 in_mpileup.close() | |
| 361 # Reopen vcf, this time, skip header, annotate all the sites for which there is an entry in data dictionary | |
| 362 # (Sites without entries have either multiple ref or alt bases, or have complex indels. Not supported (for now), and not reported as a result) | |
| 363 in_vcf = open(in_vcf_filepath, 'r') | |
| 364 for line in in_vcf: | |
| 365 if line[0] == '#': | |
| 366 continue | |
| 367 fields = line.strip().split('\t') | |
| 368 if (fields[0], int(fields[1])) not in data: | |
| 369 continue | |
| 370 (dp, af, faf, raf, dp4) = data[(fields[0], int(fields[1]))] | |
| 371 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] | |
| 372 _, p_val = fisher_exact(dp2x2) | |
| 373 sb = pval_to_phredqual(p_val) | |
| 374 if fields[7] == "": | |
| 375 info = [] | |
| 376 else: | |
| 377 info = fields[7].split(';') | |
| 378 info.append("DP=%d" % (dp)) | |
| 379 info.append("AF=%.6f" % (af)) | |
| 380 if isnan(faf): | |
| 381 info.append("FAF=NaN") | |
| 382 else: | |
| 383 info.append("FAF=%.6f" % (faf)) | |
| 384 if isnan(raf): | |
| 385 info.append("RAF=NaN") | |
| 386 else: | |
| 387 info.append("RAF=%.6f" % (raf)) | |
| 388 info.append("SB=%d" % (sb)) | |
| 389 info.append("DP4=%s" % (','.join([str(x) for x in dp4]))) | |
| 390 new_info = ';'.join(info) | |
| 391 fields[7] = new_info | |
| 392 out_vcf.write("%s\n" % ("\t".join(fields))) | |
| 393 in_vcf.close() | |
| 394 out_vcf.close() | |
| 395 | |
| 396 | |
| 397 if __name__ == "__main__": | |
| 398 annotateVCF(sys.argv[1], sys.argv[2], sys.argv[3]) |
