Mercurial > repos > iuc > medaka_variant

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_VCF_info_fields.py	Wed Feb 10 08:28:04 2021 +0000
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+
+# Takes in VCF file annotated with medaka tools annotate and converts
+#
+# Usage statement:
+# python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf
+
+# 10/21/2020 - Nathan P. Roach, natproach@gmail.com
+
+import sys
+from collections import OrderedDict
+from math import log10
+
+from scipy.stats import fisher_exact
+
+
+def pval_to_phredqual(pval):
+    return round(-10 * log10(pval))
+
+
+def parseInfoField(info):
+    info_fields = info.split(';')
+    info_dict = OrderedDict()
+    for info_field in info_fields:
+        code, val = info_field.split('=')
+        info_dict[code] = val
+    return info_dict
+
+
+def annotateVCF(in_vcf_filepath, out_vcf_filepath):
+    in_vcf = open(in_vcf_filepath, 'r')
+    out_vcf = open(out_vcf_filepath, 'w')
+    to_skip = set(['SC', 'SR'])
+    for i, line in enumerate(in_vcf):
+        if i == 1:
+            out_vcf.write("##convert_VCF_info_fields=0.1\n")
+        if line[0:2] == "##":
+            if line[0:11] == "##INFO=<ID=":
+                id_ = line[11:].split(',')[0]
+                if id_ in to_skip:
+                    continue
+            out_vcf.write(line)
+        elif line[0] == "#":
+            out_vcf.write('##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Spanning Reads Allele Frequency By Strand">\n')
+            out_vcf.write('##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n')
+            out_vcf.write('##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n')
+            out_vcf.write('##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n')
+            out_vcf.write('##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n')
+            out_vcf.write('##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n')
+            out_vcf.write('##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n')
+            out_vcf.write(line)
+        else:
+            fields = line.split('\t')
+            info_dict = parseInfoField(fields[7])
+            sr_list = [int(x) for x in info_dict["SR"].split(',')]
+            sc_list = [int(x) for x in info_dict["SC"].split(',')]
+            if len(sr_list) == len(sc_list):
+                variant_list = fields[4].split(',')
+                dpsp = int(info_dict["DPSP"])
+                ref_fwd, ref_rev = 0, 1
+                dpspf, dpspr = (int(x) for x in info_dict["AR"].split(','))
+                for i in range(0, len(sr_list), 2):
+                    dpspf += sr_list[i]
+                    dpspr += sr_list[i + 1]
+                for j, i in enumerate(range(2, len(sr_list), 2)):
+                    dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1])
+                    dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
+                    _, p_val = fisher_exact(dp2x2)
+                    sb = pval_to_phredqual(p_val)
+
+                    as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1])
+
+                    info = []
+                    for code in info_dict:
+                        if code in to_skip:
+                            continue
+                        val = info_dict[code]
+                        info.append("%s=%s" % (code, val))
+
+                    info.append("DPSPS=%d,%d" % (dpspf, dpspr))
+
+                    if dpsp == 0:
+                        info.append("AF=NaN")
+                    else:
+                        af = dp4[2] + dp4[3] / dpsp
+                        info.append("AF=%.6f" % (af))
+                    if dpspf == 0:
+                        info.append("FAF=NaN")
+                    else:
+                        faf = dp4[2] / dpspf
+                        info.append("FAF=%.6f" % (faf))
+                    if dpspr == 0:
+                        info.append("RAF=NaN")
+                    else:
+                        raf = dp4[3] / dpspr
+                        info.append("RAF=%.6f" % (raf))
+                    info.append("SB=%d" % (sb))
+                    info.append("DP4=%d,%d,%d,%d" % (dp4))
+                    info.append("AS=%d,%d,%d,%d" % (as_))
+                    new_info = ';'.join(info)
+                    fields[4] = variant_list[j]
+                    fields[7] = new_info
+                    out_vcf.write("%s" % ("\t".join(fields)))
+            else:
+                print("WARNING - SR and SC are different lengths, skipping variant")
+                print(line.strip())  # Print the line for debugging purposes
+    in_vcf.close()
+    out_vcf.close()
+
+
+if __name__ == "__main__":
+    annotateVCF(sys.argv[1], sys.argv[2])
--- a/variant.xml	Fri Oct 16 17:05:21 2020 +0000
+++ b/variant.xml	Wed Feb 10 08:28:04 2021 +0000
@@ -1,5 +1,5 @@
 <?xml version="1.0"?>
-<tool id="medaka_variant" name="medaka variant tool" version="@TOOL_VERSION@+galaxy3" profile="@PROFILE@">
+<tool id="medaka_variant" name="medaka variant tool" version="@TOOL_VERSION@+galaxy4" profile="@PROFILE@">
     <description>Probability decoding</description>
     <macros>
         <import>macros.xml</import>
@@ -47,12 +47,11 @@
     2>&1 | tee '$out_log'
 #end if

-#if $out_annotated
-    && samtools faidx reference.fa
-    && grep -v "^#" '$out_result' > tmp.txt
-    && cut -f 1,2 tmp.txt > positions.txt
-    && samtools mpileup -d 0 --positions positions.txt -Q $output_annotated.min_basequal -q $output_annotated.min_mapqual -f reference.fa '$output_annotated.in_bam' > mpileup.txt
-    && python '$__tool_directory__/annotateVCF.py' '$out_result' mpileup.txt '$out_annotated'
+#if $out_annotated:
+    && ln -s '$output_annotated.in_bam' in.bam
+    && ln -s '$output_annotated.in_bam.metadata.bam_index' in.bai
+    && medaka tools annotate --pad $output_annotated.pad '$out_result' reference.fa in.bam tmp.vcf
+    && '$__tool_directory__/convert_VCF_info_fields.py' tmp.vcf '$out_annotated'
 #end if
     ]]></command>
     <inputs>
@@ -85,9 +84,8 @@
                 <option value="false">Don't output annotated VCF</option>
             </param>
             <when value="true">
-                <param name="in_bam" type="data" format="bam" optional="false" label="BAM to annotate the VCF" />
-                <param name="min_basequal" type="integer" value="0" min="0" max="50" label="Minimum base quality" help="The minimum base quality (default: 0) at a given variant position required to include a read in the calculation of variant statistics (samtools mpileup -Q)" />
-                <param name="min_mapqual" type="integer" value="0" min="0" max="60" label="Minimum mapping quality" help="The minimum mapping quality (default: 0) required for a read to be considered in calculation of variant statistics (samtools mpileup -q)" />
+                <param name="in_bam" type="data" format="bam" optional="false" label="BAM to annotate the VCF"/>
+                <param name="pad" type="integer" min="1" value="25" label="Padding width on either side of variant for realignment in medaka tools anntotate, used to calculate DPSP, DPSPS, AF, FAF, RAF, SB, DP4, and AS in the output annotated VCF"/>
             </when>
             <when value="false"/>
         </conditional>
@@ -98,7 +96,7 @@
         <data name="out_result" format="vcf" label="${tool.name} on ${on_string}: Result"/>
         <!-- optional -->
         <data name="out_annotated" format="vcf" label="${tool.name} on ${on_string}: Annotated">
-            <filter>output_annotated_select</filter>
+            <filter>output_annotated['output_annotated_select']!='false'</filter>
         </data>
         <data name="out_log" format="tabular" label="${tool.name} on ${on_string}: Log">
             <filter>output_log_bool</filter>
@@ -125,15 +123,15 @@
                 <assert_contents>
                     <has_n_lines n="9"/>
                     <has_line line="##fileformat=VCFv4.1" />
-                    <has_line line="##medaka_version=1.0.3" />
+                    <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
                 </assert_contents>
             </output>
             <output name="out_annotated">
                 <assert_contents>
-                    <has_n_lines n="16"/>
+                    <has_n_lines n="22"/>
                     <has_line line="##fileformat=VCFv4.1" />
-                    <has_line line="##medaka_version=1.0.3" />
-                    <has_line line="##annotateVCFVersion=0.2" />
+                    <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
+                    <has_line_matching expression="##convert_VCF_info_fields=[0-9]+\.[0-9]+" />
                 </assert_contents>
             </output>
             <output name="out_log">
@@ -162,15 +160,15 @@
                 <assert_contents>
                     <has_n_lines n="9"/>
                     <has_line line="##fileformat=VCFv4.1" />
-                    <has_line line="##medaka_version=1.0.3" />
+                    <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
                 </assert_contents>
             </output>
             <output name="out_annotated">
                 <assert_contents>
-                    <has_n_lines n="16"/>
+                    <has_n_lines n="22"/>
                     <has_line line="##fileformat=VCFv4.1" />
                     <has_line line="##medaka_version=1.0.3" />
-                    <has_line line="##annotateVCFVersion=0.2" />
+                    <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
                 </assert_contents>
             </output>
             <output name="out_log">
@@ -179,6 +177,29 @@
                 </assert_contents>
             </output>
         </test>
+        <!--No annotation or log-->
+        <test expect_num_outputs="1">
+            <conditional name="pool">
+                <param name="pool_mode" value="No"/>
+                <param name="input" value="medaka_test.hdf"/>
+            </conditional>
+            <conditional name="reference_source">
+                <param name="reference_source_selector" value="history"/>
+                <param name="ref_file" value="ref.fasta"/>
+            </conditional>
+            <conditional name="output_annotated">
+                <param name="output_annotated_select" value="false"/>
+            </conditional>
+            <param name="output_log_bool" value="false"/>
+
+            <output name="out_result">
+                <assert_contents>
+                    <has_n_lines n="9"/>
+                    <has_line line="##fileformat=VCFv4.1" />
+                    <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help><![CDATA[
 .. class:: infomark