Mercurial > repos > iuc > medaka_variant
changeset 12:0f5f4a208660 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
author | iuc |
---|---|
date | Fri, 17 Sep 2021 20:22:49 +0000 |
parents | 2bf63b38ee9b |
children | 222669c4afb6 |
files | convert_VCF_info_fields.py test-data/ref.fasta variant.xml |
diffstat | 3 files changed, 159 insertions(+), 135 deletions(-) [+] |
line wrap: on
line diff
--- a/convert_VCF_info_fields.py Sun Sep 12 20:35:26 2021 +0000 +++ b/convert_VCF_info_fields.py Fri Sep 17 20:22:49 2021 +0000 @@ -7,11 +7,11 @@ # 10/21/2020 - Nathan P. Roach, natproach@gmail.com +import re import sys from collections import OrderedDict from math import log10 -import scipy import scipy.stats @@ -33,84 +33,144 @@ def annotateVCF(in_vcf_filepath, out_vcf_filepath): + """Postprocess output of medaka tools annotate. + + Splits multiallelic sites into separate records. + Replaces medaka INFO fields that might represent information of the ref + and multiple alternate alleles with simple ref, alt allele counterparts. + """ + in_vcf = open(in_vcf_filepath, 'r') - out_vcf = open(out_vcf_filepath, 'w') - to_skip = set(['SC', 'SR']) - for i, line in enumerate(in_vcf): - if i == 1: - out_vcf.write("##convert_VCF_info_fields=0.2\n") - if line[0:2] == "##": - if line[0:11] == "##INFO=<ID=": - id_ = line[11:].split(',')[0] - if id_ in to_skip: + # medaka INFO fields that do not make sense after splitting of + # multi-allelic records + # DP will be overwritten with the value of DPSP because medaka tools + # annotate currently only calculates the latter correctly + # (https://github.com/nanoporetech/medaka/issues/192). + # DPS, which is as unreliable as DP, gets skipped and the code + # calculates the spanning reads equivalent DPSPS instead. + to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'} + struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>') + header_lines = [] + contig_ids = set() + contig_ids_simple = set() + # parse the metadata lines of the input VCF and drop: + # - duplicate lines + # - INFO lines declaring keys we are not going to write + # - redundant contig information + while True: + line = in_vcf.readline() + if line[:2] != '##': + assert line.startswith('#CHROM') + break + if line in header_lines: + # the annotate tool may generate lines already written by + # medaka variant again (example: medaka version line) + continue + match = struct_meta_pat.match(line) + if match: + match_type, match_id, match_misc = match.groups() + if match_type == 'INFO': + if match_id == 'DPSP': + line = line.replace('DPSP', 'DP') + elif match_id in to_skip: continue - out_vcf.write(line) - elif line[0] == "#": - out_vcf.write('##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Spanning Reads Allele Frequency By Strand">\n') - out_vcf.write('##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n') - out_vcf.write('##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n') - out_vcf.write('##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n') - out_vcf.write('##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n') - out_vcf.write('##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n') - out_vcf.write('##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n') - out_vcf.write(line) - else: + elif match_type == 'contig': + contig_ids.add(match_id) + if not match_misc: + # the annotate tools writes its own contig info, + # which is redundant with contig info generated by + # medaka variant, but lacks a length value. + # We don't need the incomplete line. + contig_ids_simple.add(match_id) + continue + header_lines.append(line) + # Lets check the above assumption about each ID-only contig line + # having a more complete counterpart. + assert not (contig_ids_simple - contig_ids) + header_lines.insert(1, '##convert_VCF_info_fields=0.2\n') + header_lines += [ + '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n', + '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n', + '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n', + '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n', + '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n', + '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n', + '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n', + line + ] + + with open(out_vcf_filepath, 'w') as out_vcf: + out_vcf.writelines(header_lines) + for line in in_vcf: fields = line.split('\t') info_dict = parseInfoField(fields[7]) sr_list = [int(x) for x in info_dict["SR"].split(',')] sc_list = [int(x) for x in info_dict["SC"].split(',')] - if len(sr_list) == len(sc_list): - variant_list = fields[4].split(',') - dpsp = int(info_dict["DPSP"]) - ref_fwd, ref_rev = 0, 1 - dpspf, dpspr = (int(x) for x in info_dict["AR"].split(',')) - for i in range(0, len(sr_list), 2): - dpspf += sr_list[i] - dpspr += sr_list[i + 1] - for j, i in enumerate(range(2, len(sr_list), 2)): - dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1]) - dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] - _, p_val = scipy.stats.fisher_exact(dp2x2) - sb = pval_to_phredqual(p_val) + if len(sr_list) != len(sc_list): + print( + 'WARNING - SR and SC are different lengths, ' + 'skipping variant' + ) + print(line.strip()) # Print the line for debugging purposes + continue + variant_list = fields[4].split(',') + dpsp = int(info_dict['DPSP']) + ref_fwd, ref_rev = 0, 1 + dpspf, dpspr = (int(x) for x in info_dict['AR'].split(',')) + for i in range(0, len(sr_list), 2): + dpspf += sr_list[i] + dpspr += sr_list[i + 1] + for j, i in enumerate(range(2, len(sr_list), 2)): + dp4 = ( + sr_list[ref_fwd], + sr_list[ref_rev], + sr_list[i], + sr_list[i + 1] + ) + dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] + _, p_val = scipy.stats.fisher_exact(dp2x2) + sb = pval_to_phredqual(p_val) - as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1]) - - info = [] - for code in info_dict: - if code in to_skip: - continue - val = info_dict[code] - info.append("%s=%s" % (code, val)) - - info.append("DPSPS=%d,%d" % (dpspf, dpspr)) + as_ = ( + sc_list[ref_fwd], + sc_list[ref_rev], + sc_list[i], + sc_list[i + 1] + ) - if dpsp == 0: - info.append("AF=NaN") - else: - af = (dp4[2] + dp4[3]) / dpsp - info.append("AF=%.6f" % (af)) - if dpspf == 0: - info.append("FAF=NaN") - else: - faf = dp4[2] / dpspf - info.append("FAF=%.6f" % (faf)) - if dpspr == 0: - info.append("RAF=NaN") - else: - raf = dp4[3] / dpspr - info.append("RAF=%.6f" % (raf)) - info.append("SB=%d" % (sb)) - info.append("DP4=%d,%d,%d,%d" % (dp4)) - info.append("AS=%d,%d,%d,%d" % (as_)) - new_info = ';'.join(info) - fields[4] = variant_list[j] - fields[7] = new_info - out_vcf.write("%s" % ("\t".join(fields))) - else: - print("WARNING - SR and SC are different lengths, skipping variant") - print(line.strip()) # Print the line for debugging purposes + info = [] + for code in info_dict: + if code in to_skip: + continue + val = info_dict[code] + info.append("%s=%s" % (code, val)) + + info.append('DP=%d' % dpsp) + info.append('DPSPS=%d,%d' % (dpspf, dpspr)) + + if dpsp == 0: + info.append('AF=NaN') + else: + af = (dp4[2] + dp4[3]) / dpsp + info.append('AF=%.6f' % af) + if dpspf == 0: + info.append('FAF=NaN') + else: + faf = dp4[2] / dpspf + info.append('FAF=%.6f' % faf) + if dpspr == 0: + info.append('RAF=NaN') + else: + raf = dp4[3] / dpspr + info.append('RAF=%.6f' % raf) + info.append('SB=%d' % sb) + info.append('DP4=%d,%d,%d,%d' % dp4) + info.append('AS=%d,%d,%d,%d' % as_) + new_info = ';'.join(info) + fields[4] = variant_list[j] + fields[7] = new_info + out_vcf.write('\t'.join(fields)) in_vcf.close() - out_vcf.close() if __name__ == "__main__":
--- a/test-data/ref.fasta Sun Sep 12 20:35:26 2021 +0000 +++ b/test-data/ref.fasta Fri Sep 17 20:22:49 2021 +0000 @@ -1,2 +1,2 @@ >NC_045512.2 -ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC +attaaaggtttataccttcccaggtaacaaaccaaccaactttcgatctcttgtagatctgttctctaaacgaactttaaaatctgtgtggctGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC
--- a/variant.xml Sun Sep 12 20:35:26 2021 +0000 +++ b/variant.xml Fri Sep 17 20:22:49 2021 +0000 @@ -1,28 +1,11 @@ <?xml version="1.0"?> -<tool id="medaka_variant" name="medaka variant tool" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@"> - <description>Probability decoding</description> +<tool id="medaka_variant" name="medaka variant tool" version="@TOOL_VERSION@+galaxy1" profile="@PROFILE@"> + <description>decodes variant calls from medaka consensus output</description> <macros> <import>macros.xml</import> </macros> <expand macro="requirements"/> - <expand macro="version_command"/> - - <configfiles> - <configfile name="convert_fasta"> -import sys -infile = open(sys.argv[1], 'r') -outfile = open(sys.argv[2], 'w') -for line in infile: - if line[0] == '>': - outfile.write(line) - else: - outfile.write(line.upper()) -infile.close() -outfile.close() - </configfile> - </configfiles> - <command detect_errors="exit_code"><![CDATA[ ## initialize @REF_FASTA@ @@ -44,8 +27,6 @@ #for $current in $pool.inputs '$current' #end for - '$out_result' ## output - 2>&1 | tee '$out_log' #elif $pool.pool_mode == "No": ## run medaka variant @@ -60,16 +41,17 @@ ## required reference.fa '$pool.input' - '$out_result' ##output +#end if +#if str($output_annotated.output_annotated_select) == 'false': + '$out_variants' ##output 2>&1 | tee '$out_log' -#end if -#if $out_annotated: - ## medaka annotate errors out if the reference is lower case at a position it's annotating because it checks vs the ref base in the vcf - && python '$convert_fasta' reference.fa upper_reference.fa +#else + raw.vcf ##output of medaka variant + 2>&1 | tee '$out_log' && ln -s '$output_annotated.in_bam' in.bam && ln -s '$output_annotated.in_bam.metadata.bam_index' in.bai - && medaka tools annotate --dpsp --pad $output_annotated.pad '$out_result' upper_reference.fa in.bam tmp.vcf - && python '$__tool_directory__/convert_VCF_info_fields.py' tmp.vcf '$out_annotated' + && medaka tools annotate --dpsp --pad $output_annotated.pad raw.vcf reference.fa in.bam tmp.vcf + && python '$__tool_directory__/convert_VCF_info_fields.py' tmp.vcf '$out_variants' #end if ]]></command> <inputs> @@ -99,39 +81,38 @@ <param argument="--ambig_ref" type="boolean" truevalue="--ambig_ref" falsevalue="" label="Decode variants at ambiguous reference positions?" checked="false"/> <param argument="--gvcf" type="boolean" truevalue="--gvcf" falsevalue="" label="Output VCF records for reference loci predicted to be non-variant?" checked="false"/> <conditional name="output_annotated"> - <param name="output_annotated_select" type="select" label="Output annotated VCF?" help="Annotate allele frequency, depth of coverage, etc for each variant (requires BAM file)"> - <option value="true" selected="true">Output annotated VCF</option> - <option value="false">Don't output annotated VCF</option> + <param name="output_annotated_select" type="select" + label="Type of VCF to generate" + help="Variant INFO fields in the VCF can be extended to include allele frequency, depth of coverage, etc., but this requires a BAM dataset to calculate those values from."> + <option value="true" selected="true">Write annotated VCF with extended INFO</option> + <option value="false">Write original decoded VCF with minimal INFO field</option> </param> <when value="true"> - <param name="in_bam" type="data" format="bam" optional="false" label="BAM to annotate the VCF"/> - <param name="pad" type="integer" min="1" value="25" label="Padding width on either side of variant for realignment in medaka tools anntotate, used to calculate DPSP, DPSPS, AF, FAF, RAF, SB, DP4, and AS in the output annotated VCF"/> + <param name="in_bam" type="data" format="bam" optional="false" label="BAM to caclulate additional INFO fields from"/> + <param name="pad" type="integer" min="1" value="25" + label="Padding width on either side of variant for realignment" + help="To calculate the additional INFO fields the tool will run medaka tools anntotate, which performs local realignment of the region +- this width around each variant. All calculated new fields will depend on the width chosen, so only change this value if you know what you are doing." /> </when> <when value="false"/> </conditional> <param name="output_log_bool" type="boolean" label="Output log file?" checked="true"/> </inputs> <outputs> - <!-- standard --> - <data name="out_result" format="vcf" label="${tool.name} on ${on_string}: Result"/> - <!-- optional --> - <data name="out_annotated" format="vcf" label="${tool.name} on ${on_string}: Annotated"> - <filter>output_annotated['output_annotated_select']!='false'</filter> - </data> + <data name="out_variants" format="vcf" label="${tool.name} on ${on_string}: called variants"/> <data name="out_log" format="tabular" label="${tool.name} on ${on_string}: Log"> <filter>output_log_bool</filter> </data> </outputs> <tests> <!-- #1 default --> - <test expect_num_outputs="3"> + <test expect_num_outputs="2"> <conditional name="pool"> <param name="pool_mode" value="Yes"/> <param name="inputs" value="medaka_test.hdf,medaka_test.hdf"/> </conditional> <conditional name="reference_source"> <param name="reference_source_selector" value="history"/> - <param name="ref_file" value="ref.fasta.gz"/> + <param name="ref_file" value="ref.fasta"/> </conditional> <param name="ambig_ref" value="true"/> <conditional name="output_annotated"> @@ -139,17 +120,9 @@ <param name="in_bam" value="medaka_test.bam"/> </conditional> <param name="output_log_bool" value="true"/> - - <output name="out_result"> + <output name="out_variants"> <assert_contents> - <has_n_lines n="9"/> - <has_line line="##fileformat=VCFv4.1" /> - <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" /> - </assert_contents> - </output> - <output name="out_annotated"> - <assert_contents> - <has_n_lines n="23"/> + <has_n_lines n="18"/> <has_line line="##fileformat=VCFv4.1" /> <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" /> <has_line_matching expression="##convert_VCF_info_fields=[0-9]+\.[0-9]+" /> @@ -162,7 +135,7 @@ </output> </test> <!--No pooling--> - <test expect_num_outputs="3"> + <test expect_num_outputs="2"> <conditional name="pool"> <param name="pool_mode" value="No"/> <param name="input" value="medaka_test.hdf"/> @@ -177,17 +150,9 @@ <param name="in_bam" value="medaka_test.bam"/> </conditional> <param name="output_log_bool" value="true"/> - - <output name="out_result"> + <output name="out_variants"> <assert_contents> - <has_n_lines n="9"/> - <has_line line="##fileformat=VCFv4.1" /> - <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" /> - </assert_contents> - </output> - <output name="out_annotated"> - <assert_contents> - <has_n_lines n="23"/> + <has_n_lines n="18"/> <has_line line="##fileformat=VCFv4.1" /> <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" /> <has_line_matching expression="##convert_VCF_info_fields=[0-9]+\.[0-9]+" /> @@ -213,8 +178,7 @@ <param name="output_annotated_select" value="false"/> </conditional> <param name="output_log_bool" value="false"/> - - <output name="out_result"> + <output name="out_variants"> <assert_contents> <has_n_lines n="9"/> <has_line line="##fileformat=VCFv4.1" />