Previous changeset 12:9f70e869f61e (2021-09-12) Next changeset 14:b2aae698b9d3 (2021-11-18) |
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303" |
modified:
convert_VCF_info_fields.py test-data/ref.fasta |
b |
diff -r 9f70e869f61e -r fa11aa8103b2 convert_VCF_info_fields.py --- a/convert_VCF_info_fields.py Sun Sep 12 20:36:05 2021 +0000 +++ b/convert_VCF_info_fields.py Fri Sep 17 20:22:04 2021 +0000 |
[ |
b'@@ -7,11 +7,11 @@\n \n # 10/21/2020 - Nathan P. Roach, natproach@gmail.com\n \n+import re\n import sys\n from collections import OrderedDict\n from math import log10\n \n-import scipy\n import scipy.stats\n \n \n@@ -33,84 +33,144 @@\n \n \n def annotateVCF(in_vcf_filepath, out_vcf_filepath):\n+ """Postprocess output of medaka tools annotate.\n+\n+ Splits multiallelic sites into separate records.\n+ Replaces medaka INFO fields that might represent information of the ref\n+ and multiple alternate alleles with simple ref, alt allele counterparts.\n+ """\n+\n in_vcf = open(in_vcf_filepath, \'r\')\n- out_vcf = open(out_vcf_filepath, \'w\')\n- to_skip = set([\'SC\', \'SR\'])\n- for i, line in enumerate(in_vcf):\n- if i == 1:\n- out_vcf.write("##convert_VCF_info_fields=0.2\\n")\n- if line[0:2] == "##":\n- if line[0:11] == "##INFO=<ID=":\n- id_ = line[11:].split(\',\')[0]\n- if id_ in to_skip:\n+ # medaka INFO fields that do not make sense after splitting of\n+ # multi-allelic records\n+ # DP will be overwritten with the value of DPSP because medaka tools\n+ # annotate currently only calculates the latter correctly\n+ # (https://github.com/nanoporetech/medaka/issues/192).\n+ # DPS, which is as unreliable as DP, gets skipped and the code\n+ # calculates the spanning reads equivalent DPSPS instead.\n+ to_skip = {\'SC\', \'SR\', \'AR\', \'DP\', \'DPSP\', \'DPS\'}\n+ struct_meta_pat = re.compile(\'##(.+)=<ID=([^,]+)(,.+)?>\')\n+ header_lines = []\n+ contig_ids = set()\n+ contig_ids_simple = set()\n+ # parse the metadata lines of the input VCF and drop:\n+ # - duplicate lines\n+ # - INFO lines declaring keys we are not going to write\n+ # - redundant contig information\n+ while True:\n+ line = in_vcf.readline()\n+ if line[:2] != \'##\':\n+ assert line.startswith(\'#CHROM\')\n+ break\n+ if line in header_lines:\n+ # the annotate tool may generate lines already written by\n+ # medaka variant again (example: medaka version line)\n+ continue\n+ match = struct_meta_pat.match(line)\n+ if match:\n+ match_type, match_id, match_misc = match.groups()\n+ if match_type == \'INFO\':\n+ if match_id == \'DPSP\':\n+ line = line.replace(\'DPSP\', \'DP\')\n+ elif match_id in to_skip:\n continue\n- out_vcf.write(line)\n- elif line[0] == "#":\n- out_vcf.write(\'##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Spanning Reads Allele Frequency By Strand">\\n\')\n- out_vcf.write(\'##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\\n\')\n- out_vcf.write(\'##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\\n\')\n- out_vcf.write(\'##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\\n\')\n- out_vcf.write(\'##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\\n\')\n- out_vcf.write(\'##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\\n\')\n- out_vcf.write(\'##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\\n\')\n- out_vcf.write(line)\n- else:\n+ elif match_type == \'contig\':\n+ contig_ids.add(match_id)\n+ if not match_misc:\n+ # the annotate tools writes its own contig info,\n+ # which is redundant with contig info generated by\n+ # medaka variant, but lacks a length value.\n+ # We don\'t need the incomplete line.\n+ '..b' print(line.strip()) # Print the line for debugging purposes\n+ continue\n+ variant_list = fields[4].split(\',\')\n+ dpsp = int(info_dict[\'DPSP\'])\n+ ref_fwd, ref_rev = 0, 1\n+ dpspf, dpspr = (int(x) for x in info_dict[\'AR\'].split(\',\'))\n+ for i in range(0, len(sr_list), 2):\n+ dpspf += sr_list[i]\n+ dpspr += sr_list[i + 1]\n+ for j, i in enumerate(range(2, len(sr_list), 2)):\n+ dp4 = (\n+ sr_list[ref_fwd],\n+ sr_list[ref_rev],\n+ sr_list[i],\n+ sr_list[i + 1]\n+ )\n+ dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]\n+ _, p_val = scipy.stats.fisher_exact(dp2x2)\n+ sb = pval_to_phredqual(p_val)\n \n- as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1])\n-\n- info = []\n- for code in info_dict:\n- if code in to_skip:\n- continue\n- val = info_dict[code]\n- info.append("%s=%s" % (code, val))\n-\n- info.append("DPSPS=%d,%d" % (dpspf, dpspr))\n+ as_ = (\n+ sc_list[ref_fwd],\n+ sc_list[ref_rev],\n+ sc_list[i],\n+ sc_list[i + 1]\n+ )\n \n- if dpsp == 0:\n- info.append("AF=NaN")\n- else:\n- af = (dp4[2] + dp4[3]) / dpsp\n- info.append("AF=%.6f" % (af))\n- if dpspf == 0:\n- info.append("FAF=NaN")\n- else:\n- faf = dp4[2] / dpspf\n- info.append("FAF=%.6f" % (faf))\n- if dpspr == 0:\n- info.append("RAF=NaN")\n- else:\n- raf = dp4[3] / dpspr\n- info.append("RAF=%.6f" % (raf))\n- info.append("SB=%d" % (sb))\n- info.append("DP4=%d,%d,%d,%d" % (dp4))\n- info.append("AS=%d,%d,%d,%d" % (as_))\n- new_info = \';\'.join(info)\n- fields[4] = variant_list[j]\n- fields[7] = new_info\n- out_vcf.write("%s" % ("\\t".join(fields)))\n- else:\n- print("WARNING - SR and SC are different lengths, skipping variant")\n- print(line.strip()) # Print the line for debugging purposes\n+ info = []\n+ for code in info_dict:\n+ if code in to_skip:\n+ continue\n+ val = info_dict[code]\n+ info.append("%s=%s" % (code, val))\n+\n+ info.append(\'DP=%d\' % dpsp)\n+ info.append(\'DPSPS=%d,%d\' % (dpspf, dpspr))\n+\n+ if dpsp == 0:\n+ info.append(\'AF=NaN\')\n+ else:\n+ af = (dp4[2] + dp4[3]) / dpsp\n+ info.append(\'AF=%.6f\' % af)\n+ if dpspf == 0:\n+ info.append(\'FAF=NaN\')\n+ else:\n+ faf = dp4[2] / dpspf\n+ info.append(\'FAF=%.6f\' % faf)\n+ if dpspr == 0:\n+ info.append(\'RAF=NaN\')\n+ else:\n+ raf = dp4[3] / dpspr\n+ info.append(\'RAF=%.6f\' % raf)\n+ info.append(\'SB=%d\' % sb)\n+ info.append(\'DP4=%d,%d,%d,%d\' % dp4)\n+ info.append(\'AS=%d,%d,%d,%d\' % as_)\n+ new_info = \';\'.join(info)\n+ fields[4] = variant_list[j]\n+ fields[7] = new_info\n+ out_vcf.write(\'\\t\'.join(fields))\n in_vcf.close()\n- out_vcf.close()\n \n \n if __name__ == "__main__":\n' |
b |
diff -r 9f70e869f61e -r fa11aa8103b2 test-data/ref.fasta --- a/test-data/ref.fasta Sun Sep 12 20:36:05 2021 +0000 +++ b/test-data/ref.fasta Fri Sep 17 20:22:04 2021 +0000 |
b |
@@ -1,2 +1,2 @@ >NC_045512.2 -ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC +attaaaggtttataccttcccaggtaacaaaccaaccaactttcgatctcttgtagatctgttctctaaacgaactttaaaatctgtgtggctGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC |