Mercurial > repos > iuc > medaka_variant_pipeline
annotate convert_VCF_info_fields.py @ 14:1d62240feff3 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 8051b858e0f5c6d7c1092754df02ed8303a53054
author | iuc |
---|---|
date | Mon, 27 Jun 2022 17:30:57 +0000 |
parents | 3fbefde449bc |
children |
rev | line source |
---|---|
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
2 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
3 # Takes in VCF file annotated with medaka tools annotate and converts |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
4 # |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
5 # Usage statement: |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
7 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
9 |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
10 import re |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
11 import sys |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
12 from collections import OrderedDict |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
13 from math import log10 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
14 |
10
7623e5888be9
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 0faf0ade3f13d7c78d93869823ea9fdf25c21b13"
iuc
parents:
8
diff
changeset
|
15 import scipy.stats |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
16 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
17 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
18 def pval_to_phredqual(pval): |
7
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
19 try: |
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
20 ret = round(-10 * log10(pval)) |
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
21 except ValueError: |
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
22 ret = 2147483647 # transform pval of 0.0 to max signed 32 bit int |
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
23 return ret |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
24 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
25 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
26 def parseInfoField(info): |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
27 info_fields = info.split(";") |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
28 info_dict = OrderedDict() |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
29 for info_field in info_fields: |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
30 code, val = info_field.split("=") |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
31 info_dict[code] = val |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
32 return info_dict |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
33 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
34 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
36 """Postprocess output of medaka tools annotate. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
37 |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
38 Splits multiallelic sites into separate records. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
39 Replaces medaka INFO fields that might represent information of the ref |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
40 and multiple alternate alleles with simple ref, alt allele counterparts. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
41 """ |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
42 |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
43 in_vcf = open(in_vcf_filepath, "r") |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
44 # medaka INFO fields that do not make sense after splitting of |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
45 # multi-allelic records |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
46 # DP will be overwritten with the value of DPSP because medaka tools |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
47 # annotate currently only calculates the latter correctly |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
48 # (https://github.com/nanoporetech/medaka/issues/192). |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
49 # DPS, which is as unreliable as DP, gets skipped and the code |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
50 # calculates the spanning reads equivalent DPSPS instead. |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
51 to_skip = {"SC", "SR", "AR", "DP", "DPSP", "DPS"} |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
52 struct_meta_pat = re.compile("##(.+)=<ID=([^,]+)(,.+)?>") |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
53 header_lines = [] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
54 contig_ids = set() |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
55 contig_ids_simple = set() |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
56 # parse the metadata lines of the input VCF and drop: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
57 # - duplicate lines |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
58 # - INFO lines declaring keys we are not going to write |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
59 # - redundant contig information |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
60 while True: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
61 line = in_vcf.readline() |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
62 if line[:2] != "##": |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
63 assert line.startswith("#CHROM") |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
64 break |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
65 if line in header_lines: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
66 # the annotate tool may generate lines already written by |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
67 # medaka variant again (example: medaka version line) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
68 continue |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
69 match = struct_meta_pat.match(line) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
70 if match: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
71 match_type, match_id, match_misc = match.groups() |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
72 if match_type == "INFO": |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
73 if match_id == "DPSP": |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
74 line = line.replace("DPSP", "DP") |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
75 elif match_id in to_skip: |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
76 continue |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
77 elif match_type == "contig": |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
78 contig_ids.add(match_id) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
79 if not match_misc: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
80 # the annotate tools writes its own contig info, |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
81 # which is redundant with contig info generated by |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
82 # medaka variant, but lacks a length value. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
83 # We don't need the incomplete line. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
84 contig_ids_simple.add(match_id) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
85 continue |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
86 header_lines.append(line) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
87 # Lets check the above assumption about each ID-only contig line |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
88 # having a more complete counterpart. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
89 assert not (contig_ids_simple - contig_ids) |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
90 header_lines.insert(1, "##convert_VCF_info_fields=0.2\n") |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
91 header_lines += [ |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n', |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
99 line, |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
100 ] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
101 |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
102 with open(out_vcf_filepath, "w") as out_vcf: |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
103 out_vcf.writelines(header_lines) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
104 for line in in_vcf: |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
105 fields = line.split("\t") |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
106 info_dict = parseInfoField(fields[7]) |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
107 sr_list = [int(x) for x in info_dict["SR"].split(",")] |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
108 sc_list = [int(x) for x in info_dict["SC"].split(",")] |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
109 if len(sr_list) != len(sc_list): |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
110 print("WARNING - SR and SC are different lengths, " "skipping variant") |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
111 print(line.strip()) # Print the line for debugging purposes |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
112 continue |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
113 variant_list = fields[4].split(",") |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
114 dpsp = int(info_dict["DPSP"]) |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
115 ref_fwd, ref_rev = 0, 1 |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
116 dpspf, dpspr = (int(x) for x in info_dict["AR"].split(",")) |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
117 for i in range(0, len(sr_list), 2): |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
118 dpspf += sr_list[i] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
119 dpspr += sr_list[i + 1] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
120 for j, i in enumerate(range(2, len(sr_list), 2)): |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
121 dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1]) |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
122 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
123 _, p_val = scipy.stats.fisher_exact(dp2x2) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
124 sb = pval_to_phredqual(p_val) |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
125 |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
126 as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1]) |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
127 |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
128 info = [] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
129 for code in info_dict: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
130 if code in to_skip: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
131 continue |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
132 val = info_dict[code] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
133 info.append("%s=%s" % (code, val)) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
134 |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
135 info.append("DP=%d" % dpsp) |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
136 info.append("DPSPS=%d,%d" % (dpspf, dpspr)) |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
137 |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
138 if dpsp == 0: |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
139 info.append("AF=NaN") |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
140 else: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
141 af = (dp4[2] + dp4[3]) / dpsp |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
142 info.append("AF=%.6f" % af) |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
143 if dpspf == 0: |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
144 info.append("FAF=NaN") |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
145 else: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
146 faf = dp4[2] / dpspf |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
147 info.append("FAF=%.6f" % faf) |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
148 if dpspr == 0: |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
149 info.append("RAF=NaN") |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
150 else: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
151 raf = dp4[3] / dpspr |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
152 info.append("RAF=%.6f" % raf) |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
153 info.append("SB=%d" % sb) |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
154 info.append("DP4=%d,%d,%d,%d" % dp4) |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
155 info.append("AS=%d,%d,%d,%d" % as_) |
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
156 new_info = ";".join(info) |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
157 fields[4] = variant_list[j] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
158 fields[7] = new_info |
13
3fbefde449bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
iuc
parents:
12
diff
changeset
|
159 out_vcf.write("\t".join(fields)) |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
160 in_vcf.close() |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
161 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
162 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
163 if __name__ == "__main__": |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
164 annotateVCF(sys.argv[1], sys.argv[2]) |