changeset 12:0f5f4a208660 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
author iuc
date Fri, 17 Sep 2021 20:22:49 +0000
parents 2bf63b38ee9b
children 222669c4afb6
files convert_VCF_info_fields.py test-data/ref.fasta variant.xml
diffstat 3 files changed, 159 insertions(+), 135 deletions(-) [+]
line wrap: on
line diff
--- a/convert_VCF_info_fields.py	Sun Sep 12 20:35:26 2021 +0000
+++ b/convert_VCF_info_fields.py	Fri Sep 17 20:22:49 2021 +0000
@@ -7,11 +7,11 @@
 
 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com
 
+import re
 import sys
 from collections import OrderedDict
 from math import log10
 
-import scipy
 import scipy.stats
 
 
@@ -33,84 +33,144 @@
 
 
 def annotateVCF(in_vcf_filepath, out_vcf_filepath):
+    """Postprocess output of medaka tools annotate.
+
+    Splits multiallelic sites into separate records.
+    Replaces medaka INFO fields that might represent information of the ref
+    and multiple alternate alleles with simple ref, alt allele counterparts.
+    """
+
     in_vcf = open(in_vcf_filepath, 'r')
-    out_vcf = open(out_vcf_filepath, 'w')
-    to_skip = set(['SC', 'SR'])
-    for i, line in enumerate(in_vcf):
-        if i == 1:
-            out_vcf.write("##convert_VCF_info_fields=0.2\n")
-        if line[0:2] == "##":
-            if line[0:11] == "##INFO=<ID=":
-                id_ = line[11:].split(',')[0]
-                if id_ in to_skip:
+    # medaka INFO fields that do not make sense after splitting of
+    # multi-allelic records
+    # DP will be overwritten with the value of DPSP because medaka tools
+    # annotate currently only calculates the latter correctly
+    # (https://github.com/nanoporetech/medaka/issues/192).
+    # DPS, which is as unreliable as DP, gets skipped and the code
+    # calculates the spanning reads equivalent DPSPS instead.
+    to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'}
+    struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>')
+    header_lines = []
+    contig_ids = set()
+    contig_ids_simple = set()
+    # parse the metadata lines of the input VCF and drop:
+    # - duplicate lines
+    # - INFO lines declaring keys we are not going to write
+    # - redundant contig information
+    while True:
+        line = in_vcf.readline()
+        if line[:2] != '##':
+            assert line.startswith('#CHROM')
+            break
+        if line in header_lines:
+            # the annotate tool may generate lines already written by
+            # medaka variant again (example: medaka version line)
+            continue
+        match = struct_meta_pat.match(line)
+        if match:
+            match_type, match_id, match_misc = match.groups()
+            if match_type == 'INFO':
+                if match_id == 'DPSP':
+                    line = line.replace('DPSP', 'DP')
+                elif match_id in to_skip:
                     continue
-            out_vcf.write(line)
-        elif line[0] == "#":
-            out_vcf.write('##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Spanning Reads Allele Frequency By Strand">\n')
-            out_vcf.write('##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n')
-            out_vcf.write('##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n')
-            out_vcf.write('##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n')
-            out_vcf.write('##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n')
-            out_vcf.write('##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n')
-            out_vcf.write('##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n')
-            out_vcf.write(line)
-        else:
+            elif match_type == 'contig':
+                contig_ids.add(match_id)
+                if not match_misc:
+                    # the annotate tools writes its own contig info,
+                    # which is redundant with contig info generated by
+                    # medaka variant, but lacks a length value.
+                    # We don't need the incomplete line.
+                    contig_ids_simple.add(match_id)
+                    continue
+        header_lines.append(line)
+    # Lets check the above assumption about each ID-only contig line
+    # having a more complete counterpart.
+    assert not (contig_ids_simple - contig_ids)
+    header_lines.insert(1, '##convert_VCF_info_fields=0.2\n')
+    header_lines += [
+        '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n',
+        '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n',
+        '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n',
+        '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n',
+        '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n',
+        '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n',
+        '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n',
+        line
+    ]
+
+    with open(out_vcf_filepath, 'w') as out_vcf:
+        out_vcf.writelines(header_lines)
+        for line in in_vcf:
             fields = line.split('\t')
             info_dict = parseInfoField(fields[7])
             sr_list = [int(x) for x in info_dict["SR"].split(',')]
             sc_list = [int(x) for x in info_dict["SC"].split(',')]
-            if len(sr_list) == len(sc_list):
-                variant_list = fields[4].split(',')
-                dpsp = int(info_dict["DPSP"])
-                ref_fwd, ref_rev = 0, 1
-                dpspf, dpspr = (int(x) for x in info_dict["AR"].split(','))
-                for i in range(0, len(sr_list), 2):
-                    dpspf += sr_list[i]
-                    dpspr += sr_list[i + 1]
-                for j, i in enumerate(range(2, len(sr_list), 2)):
-                    dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1])
-                    dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
-                    _, p_val = scipy.stats.fisher_exact(dp2x2)
-                    sb = pval_to_phredqual(p_val)
+            if len(sr_list) != len(sc_list):
+                print(
+                    'WARNING - SR and SC are different lengths, '
+                    'skipping variant'
+                )
+                print(line.strip())  # Print the line for debugging purposes
+                continue
+            variant_list = fields[4].split(',')
+            dpsp = int(info_dict['DPSP'])
+            ref_fwd, ref_rev = 0, 1
+            dpspf, dpspr = (int(x) for x in info_dict['AR'].split(','))
+            for i in range(0, len(sr_list), 2):
+                dpspf += sr_list[i]
+                dpspr += sr_list[i + 1]
+            for j, i in enumerate(range(2, len(sr_list), 2)):
+                dp4 = (
+                    sr_list[ref_fwd],
+                    sr_list[ref_rev],
+                    sr_list[i],
+                    sr_list[i + 1]
+                )
+                dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
+                _, p_val = scipy.stats.fisher_exact(dp2x2)
+                sb = pval_to_phredqual(p_val)
 
-                    as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1])
-
-                    info = []
-                    for code in info_dict:
-                        if code in to_skip:
-                            continue
-                        val = info_dict[code]
-                        info.append("%s=%s" % (code, val))
-
-                    info.append("DPSPS=%d,%d" % (dpspf, dpspr))
+                as_ = (
+                    sc_list[ref_fwd],
+                    sc_list[ref_rev],
+                    sc_list[i],
+                    sc_list[i + 1]
+                )
 
-                    if dpsp == 0:
-                        info.append("AF=NaN")
-                    else:
-                        af = (dp4[2] + dp4[3]) / dpsp
-                        info.append("AF=%.6f" % (af))
-                    if dpspf == 0:
-                        info.append("FAF=NaN")
-                    else:
-                        faf = dp4[2] / dpspf
-                        info.append("FAF=%.6f" % (faf))
-                    if dpspr == 0:
-                        info.append("RAF=NaN")
-                    else:
-                        raf = dp4[3] / dpspr
-                        info.append("RAF=%.6f" % (raf))
-                    info.append("SB=%d" % (sb))
-                    info.append("DP4=%d,%d,%d,%d" % (dp4))
-                    info.append("AS=%d,%d,%d,%d" % (as_))
-                    new_info = ';'.join(info)
-                    fields[4] = variant_list[j]
-                    fields[7] = new_info
-                    out_vcf.write("%s" % ("\t".join(fields)))
-            else:
-                print("WARNING - SR and SC are different lengths, skipping variant")
-                print(line.strip())  # Print the line for debugging purposes
+                info = []
+                for code in info_dict:
+                    if code in to_skip:
+                        continue
+                    val = info_dict[code]
+                    info.append("%s=%s" % (code, val))
+
+                info.append('DP=%d' % dpsp)
+                info.append('DPSPS=%d,%d' % (dpspf, dpspr))
+
+                if dpsp == 0:
+                    info.append('AF=NaN')
+                else:
+                    af = (dp4[2] + dp4[3]) / dpsp
+                    info.append('AF=%.6f' % af)
+                if dpspf == 0:
+                    info.append('FAF=NaN')
+                else:
+                    faf = dp4[2] / dpspf
+                    info.append('FAF=%.6f' % faf)
+                if dpspr == 0:
+                    info.append('RAF=NaN')
+                else:
+                    raf = dp4[3] / dpspr
+                    info.append('RAF=%.6f' % raf)
+                info.append('SB=%d' % sb)
+                info.append('DP4=%d,%d,%d,%d' % dp4)
+                info.append('AS=%d,%d,%d,%d' % as_)
+                new_info = ';'.join(info)
+                fields[4] = variant_list[j]
+                fields[7] = new_info
+                out_vcf.write('\t'.join(fields))
     in_vcf.close()
-    out_vcf.close()
 
 
 if __name__ == "__main__":
--- a/test-data/ref.fasta	Sun Sep 12 20:35:26 2021 +0000
+++ b/test-data/ref.fasta	Fri Sep 17 20:22:49 2021 +0000
@@ -1,2 +1,2 @@
 >NC_045512.2
-ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC
+attaaaggtttataccttcccaggtaacaaaccaaccaactttcgatctcttgtagatctgttctctaaacgaactttaaaatctgtgtggctGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC
--- a/variant.xml	Sun Sep 12 20:35:26 2021 +0000
+++ b/variant.xml	Fri Sep 17 20:22:49 2021 +0000
@@ -1,28 +1,11 @@
 <?xml version="1.0"?>
-<tool id="medaka_variant" name="medaka variant tool" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@">
-    <description>Probability decoding</description>
+<tool id="medaka_variant" name="medaka variant tool" version="@TOOL_VERSION@+galaxy1" profile="@PROFILE@">
+    <description>decodes variant calls from medaka consensus output</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements"/>
-
     <expand macro="version_command"/>
-
-    <configfiles>
-        <configfile name="convert_fasta">
-import sys
-infile = open(sys.argv[1], 'r')
-outfile = open(sys.argv[2], 'w')
-for line in infile:
-    if line[0] == '>':
-        outfile.write(line)
-    else:
-        outfile.write(line.upper())
-infile.close()
-outfile.close()
-        </configfile>
-    </configfiles>
-
     <command detect_errors="exit_code"><![CDATA[
 ## initialize
 @REF_FASTA@
@@ -44,8 +27,6 @@
     #for $current in $pool.inputs
         '$current'
     #end for
-    '$out_result' ## output
-    2>&1 | tee '$out_log'
 #elif $pool.pool_mode == "No":
     ## run
     medaka variant
@@ -60,16 +41,17 @@
     ## required
     reference.fa
     '$pool.input'
-    '$out_result' ##output
+#end if
+#if str($output_annotated.output_annotated_select) == 'false':
+    '$out_variants' ##output
     2>&1 | tee '$out_log'
-#end if
-#if $out_annotated:
-    ## medaka annotate errors out if the reference is lower case at a position it's annotating because it checks vs the ref base in the vcf
-    && python '$convert_fasta' reference.fa  upper_reference.fa
+#else
+    raw.vcf ##output of medaka variant
+    2>&1 | tee '$out_log'
     && ln -s '$output_annotated.in_bam' in.bam
     && ln -s '$output_annotated.in_bam.metadata.bam_index' in.bai
-    && medaka tools annotate --dpsp --pad $output_annotated.pad '$out_result' upper_reference.fa in.bam tmp.vcf
-    && python '$__tool_directory__/convert_VCF_info_fields.py' tmp.vcf '$out_annotated'
+    && medaka tools annotate --dpsp --pad $output_annotated.pad raw.vcf reference.fa in.bam tmp.vcf
+    && python '$__tool_directory__/convert_VCF_info_fields.py' tmp.vcf '$out_variants'
 #end if
     ]]></command>
     <inputs>
@@ -99,39 +81,38 @@
         <param argument="--ambig_ref" type="boolean" truevalue="--ambig_ref" falsevalue="" label="Decode variants at ambiguous reference positions?" checked="false"/>
         <param argument="--gvcf" type="boolean" truevalue="--gvcf" falsevalue="" label="Output VCF records for reference loci predicted to be non-variant?" checked="false"/>
         <conditional name="output_annotated">
-            <param name="output_annotated_select" type="select" label="Output annotated VCF?" help="Annotate allele frequency, depth of coverage, etc for each variant (requires BAM file)">
-                <option value="true" selected="true">Output annotated VCF</option>
-                <option value="false">Don't output annotated VCF</option>
+            <param name="output_annotated_select" type="select"
+            label="Type of VCF to generate"
+            help="Variant INFO fields in the VCF can be extended to include allele frequency, depth of coverage, etc., but this requires a BAM dataset to calculate those values from.">
+                <option value="true" selected="true">Write annotated VCF with extended INFO</option>
+                <option value="false">Write original decoded VCF with minimal INFO field</option>
             </param>
             <when value="true">
-                <param name="in_bam" type="data" format="bam" optional="false" label="BAM to annotate the VCF"/>
-                <param name="pad" type="integer" min="1" value="25" label="Padding width on either side of variant for realignment in medaka tools anntotate, used to calculate DPSP, DPSPS, AF, FAF, RAF, SB, DP4, and AS in the output annotated VCF"/>
+                <param name="in_bam" type="data" format="bam" optional="false" label="BAM to caclulate additional INFO fields from"/>
+                <param name="pad" type="integer" min="1" value="25"
+                label="Padding width on either side of variant for realignment"
+                help="To calculate the additional INFO fields the tool will run medaka tools anntotate, which performs local realignment of the region +- this width around each variant. All calculated new fields will depend on the width chosen, so only change this value if you know what you are doing." />
             </when>
             <when value="false"/>
         </conditional>
         <param name="output_log_bool" type="boolean" label="Output log file?" checked="true"/>
     </inputs>
     <outputs>
-        <!-- standard -->
-        <data name="out_result" format="vcf" label="${tool.name} on ${on_string}: Result"/>
-        <!-- optional -->
-        <data name="out_annotated" format="vcf" label="${tool.name} on ${on_string}: Annotated">
-            <filter>output_annotated['output_annotated_select']!='false'</filter>
-        </data>
+        <data name="out_variants" format="vcf" label="${tool.name} on ${on_string}: called variants"/>
         <data name="out_log" format="tabular" label="${tool.name} on ${on_string}: Log">
             <filter>output_log_bool</filter>
         </data>
     </outputs>
     <tests>
         <!-- #1 default -->
-        <test expect_num_outputs="3">
+        <test expect_num_outputs="2">
             <conditional name="pool">
                 <param name="pool_mode" value="Yes"/>
                 <param name="inputs" value="medaka_test.hdf,medaka_test.hdf"/>
             </conditional>
             <conditional name="reference_source">
                 <param name="reference_source_selector" value="history"/>
-                <param name="ref_file" value="ref.fasta.gz"/>
+                <param name="ref_file" value="ref.fasta"/>
             </conditional>
             <param name="ambig_ref" value="true"/>
             <conditional name="output_annotated">
@@ -139,17 +120,9 @@
                 <param name="in_bam" value="medaka_test.bam"/>
             </conditional>
             <param name="output_log_bool" value="true"/>
-            
-            <output name="out_result">
+            <output name="out_variants">
                 <assert_contents>
-                    <has_n_lines n="9"/>
-                    <has_line line="##fileformat=VCFv4.1" />
-                    <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
-                </assert_contents>
-            </output>
-            <output name="out_annotated">
-                <assert_contents>
-                    <has_n_lines n="23"/>
+                    <has_n_lines n="18"/>
                     <has_line line="##fileformat=VCFv4.1" />
                     <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
                     <has_line_matching expression="##convert_VCF_info_fields=[0-9]+\.[0-9]+" />
@@ -162,7 +135,7 @@
             </output>
         </test>
         <!--No pooling-->
-        <test expect_num_outputs="3">
+        <test expect_num_outputs="2">
             <conditional name="pool">
                 <param name="pool_mode" value="No"/>
                 <param name="input" value="medaka_test.hdf"/>
@@ -177,17 +150,9 @@
                 <param name="in_bam" value="medaka_test.bam"/>
             </conditional>
             <param name="output_log_bool" value="true"/>
-            
-            <output name="out_result">
+            <output name="out_variants">
                 <assert_contents>
-                    <has_n_lines n="9"/>
-                    <has_line line="##fileformat=VCFv4.1" />
-                    <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
-                </assert_contents>
-            </output>
-            <output name="out_annotated">
-                <assert_contents>
-                    <has_n_lines n="23"/>
+                    <has_n_lines n="18"/>
                     <has_line line="##fileformat=VCFv4.1" />
                     <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
                     <has_line_matching expression="##convert_VCF_info_fields=[0-9]+\.[0-9]+" />
@@ -213,8 +178,7 @@
                 <param name="output_annotated_select" value="false"/>
             </conditional>
             <param name="output_log_bool" value="false"/>
-            
-            <output name="out_result">
+            <output name="out_variants">
                 <assert_contents>
                     <has_n_lines n="9"/>
                     <has_line line="##fileformat=VCFv4.1" />