# HG changeset patch # User mheinzl # Date 1658481584 0 # Node ID fdfe9a919ff77fabf6f9001e95ff230a0c04025a # Parent 1797e461d674925776920c3ff12daf51e5eb7f7f planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty diff -r 1797e461d674 -r fdfe9a919ff7 mut2read.py --- a/mut2read.py Mon Mar 29 09:22:57 2021 +0000 +++ b/mut2read.py Fri Jul 22 09:19:44 2022 +0000 @@ -20,6 +20,7 @@ import argparse import json import os +import re import sys import numpy as np @@ -68,6 +69,7 @@ # get tags tag_dict = {} cvrg_dict = {} + tag_dict_ref = {} for variant in VCF(file1): chrom = variant.CHROM @@ -77,54 +79,108 @@ continue else: alt = variant.ALT[0] + alt = alt.upper() + ref = ref.upper() + if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) + continue chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt dcs_len = [] - if len(ref) == len(alt): - for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): - if pileupcolumn.reference_pos == stop_pos: - count_alt = 0 - count_ref = 0 - count_indel = 0 - count_n = 0 - count_other = 0 - count_lowq = 0 - print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), - "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) - for pileupread in pileupcolumn.pileups: - if not pileupread.is_del and not pileupread.is_refskip: - # query position is None if is_del or is_refskip is set. - nuc = pileupread.alignment.query_sequence[pileupread.query_position] - dcs_len.append(len(pileupread.alignment.query_sequence)) - if nuc == alt: - count_alt += 1 - tag = pileupread.alignment.query_name - if tag in tag_dict: - tag_dict[tag][chrom_stop_pos] = alt - else: - tag_dict[tag] = {} - tag_dict[tag][chrom_stop_pos] = alt - elif nuc == ref: - count_ref += 1 - elif nuc == "N": - count_n += 1 - elif nuc == "lowQ": - count_lowq += 1 + for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): + if pileupcolumn.reference_pos == stop_pos: + count_alt = 0 + count_ref = 0 + count_indel = 0 + count_n = 0 + count_other = 0 + count_lowq = 0 + for pileupread in pileupcolumn.pileups: + if not pileupread.is_refskip: + if pileupread.is_del: + p = pileupread.query_position_or_next + e = p + len(alt) - 1 + else: + p = pileupread.query_position + e = p + len(alt) + s = p + split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) + if len(ref) < len(alt): + if "I" in split_cigar: + all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): # if position in read matches and length of insertion + nuc = pileupread.alignment.query_sequence[s:e] + break + else: + nuc = pileupread.alignment.query_sequence[s] else: - count_other += 1 - else: - count_indel += 1 - dcs_median = np.median(np.array(dcs_len)) - cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) + nuc = pileupread.alignment.query_sequence[s] + elif len(ref) > len(alt): + ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] + if "D" in 
split_cigar: + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele + if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): + nuc = pileupread.alignment.query_sequence[s:e] + if nuc == "": + nuc = str(alt) + break + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there + nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] + if nuc.upper() == ref[:len(nuc)]: + nuc = str(ref) + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + else: # SNV: query position is None if is_del or is_refskip is set. + nuc = pileupread.alignment.query_sequence[s] - print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % - (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, - count_indel, count_lowq, dcs_median)) - else: - print("indels are currently not evaluated") + nuc = nuc.upper() + tag = pileupread.alignment.query_name + if "_" in tag: + tag = re.split('_', tag)[0] + + if nuc == alt: + count_alt += 1 + if tag in tag_dict: + tag_dict[tag][chrom_stop_pos] = alt + else: + tag_dict[tag] = {} + tag_dict[tag][chrom_stop_pos] = alt + elif nuc == ref: + count_ref += 1 + if tag in tag_dict_ref: + tag_dict_ref[tag][chrom_stop_pos] = ref + else: + tag_dict_ref[tag] = {} + tag_dict_ref[tag][chrom_stop_pos] = ref + elif nuc == "N": + count_n += 1 + elif nuc == "lowQ": + count_lowq += 1 + else: + count_other += 1 + dcs_len.append(len(pileupread.alignment.query_sequence)) + + dcs_median = np.median(np.array(dcs_len)) + cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) + print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % + (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, + count_indel, count_lowq, dcs_median)) bam.close() with open(json_file, "w") as f: - json.dump((tag_dict, cvrg_dict), f) + json.dump((tag_dict, cvrg_dict, tag_dict_ref), f) # create fastq from aligned reads with open(outfile, 'w') as out: @@ -134,12 +190,11 @@ splits = line.split('\t') tag = splits[0] - if tag in tag_dict: + if tag in tag_dict or tag in tag_dict_ref: str1 = splits[4] curr_seq = str1.replace("-", "") str2 = splits[5] curr_qual = str2.replace(" ", "") - out.write("@" + splits[0] + "." + splits[1] + "." 
+ splits[2] + "\n") out.write(curr_seq + "\n") out.write("+" + "\n") diff -r 1797e461d674 -r fdfe9a919ff7 mut2read.xml --- a/mut2read.xml Mon Mar 29 09:22:57 2021 +0000 +++ b/mut2read.xml Fri Jul 22 09:19:44 2022 +0000 @@ -1,5 +1,5 @@ - + Extracts all tags that carry a mutation in the duplex consensus sequence (DCS) va_macros.xml @@ -17,7 +17,7 @@ ]]> - + @@ -27,11 +27,11 @@ - + - - + + `_ or `LoFreq `_ variant caller. @@ -57,7 +57,7 @@ **Output** -The output is a json file containing dictonaries of the tags of reads containing mutations +The output is a json file containing dictionaries of the tags of reads containing mutations in the DCS and a fastq file of all reads of these tags. ]]> diff -r 1797e461d674 -r fdfe9a919ff7 mut2sscs.py --- a/mut2sscs.py Mon Mar 29 09:22:57 2021 +0000 +++ b/mut2sscs.py Fri Jul 22 09:19:44 2022 +0000 @@ -23,6 +23,7 @@ import argparse import json import os +import re import sys import numpy as np @@ -71,60 +72,111 @@ continue else: alt = variant.ALT[0] + alt = alt.upper() + ref = ref.upper() + if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) + continue chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt - if len(ref) == len(alt): - for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): - if pileupcolumn.reference_pos == stop_pos: - count_alt = 0 - count_ref = 0 - count_indel = 0 - print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), - "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) - for pileupread in pileupcolumn.pileups: - if not pileupread.is_del and not pileupread.is_refskip: - tag = pileupread.alignment.query_name - abba = tag[-2:] - # query position is None if is_del or is_refskip is set. 
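# --- Illustrative sketch, not part of the patch: the indel-aware branches added in
# --- this changeset (mut2read.py, mut2sscs.py, read2mut.py) all rely on the same
# --- idiom: re.split with a capturing group turns the CIGAR string into alternating
# --- length/operation tokens, and summing the numeric tokens that precede an "I" or
# --- "D" locates that indel within the read. The helper below is hypothetical and
# --- only demonstrates the idea.
import re

def indel_offsets(cigar, op="I"):
    """Return (sum_of_preceding_lengths, op_length) for every `op` in a CIGAR string."""
    tokens = re.split(r"(\d+)", cigar)  # "5M2I93M" -> ['', '5', 'M', '2', 'I', '93', 'M']
    hits = []
    for idx, tok in enumerate(tokens):
        if tok == op:
            length = int(tokens[idx - 1])
            preceding = sum(int(t) for t in tokens[:idx - 1] if t.isdigit())
            hits.append((preceding, length))
    return hits

print(indel_offsets("5M2I93M"))                # [(5, 2)]
print(indel_offsets("10M1D20M3D70M", op="D"))  # [(10, 1), (31, 3)]
# The naive sum still counts earlier deletions, which consume no read bases; this is
# why the patch strips previous "D" runs from the sum (split_cigar_no_prev) before
# comparing it against the pileup query position.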
- if pileupread.alignment.query_sequence[pileupread.query_position] == alt: - count_alt += 1 - if chrom_stop_pos in mut_pos_dict: - if abba in mut_pos_dict[chrom_stop_pos]: - mut_pos_dict[chrom_stop_pos][abba] += 1 + for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): + if pileupcolumn.reference_pos == stop_pos: + count_alt = 0 + count_ref = 0 + count_indel = 0 + for pileupread in pileupcolumn.pileups: + if not pileupread.is_refskip: + tag = pileupread.alignment.query_name + abba = tag[-2:] + if pileupread.is_del: + p = pileupread.query_position_or_next + e = p + len(alt) - 1 + else: + p = pileupread.query_position + e = p + len(alt) + tag = pileupread.alignment.query_name + if "_" in tag: + tag = re.split('_', tag)[0] + s = p + split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) + if len(ref) < len(alt): # insertion + if "I" in split_cigar: + all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): + nuc = pileupread.alignment.query_sequence[s:e] + break else: - mut_pos_dict[chrom_stop_pos][abba] = 1 - else: - mut_pos_dict[chrom_stop_pos] = {} - mut_pos_dict[chrom_stop_pos][abba] = 1 - if chrom_stop_pos not in ref_pos_dict: - ref_pos_dict[chrom_stop_pos] = {} - ref_pos_dict[chrom_stop_pos][abba] = 0 + nuc = pileupread.alignment.query_sequence[s] + else: + nuc = pileupread.alignment.query_sequence[s] + elif len(ref) > len(alt): # deletion + ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] + if "D" in split_cigar: + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele + if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): + nuc = pileupread.alignment.query_sequence[s:e] + if nuc == "": + nuc = str(alt) + break + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there + nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] + if nuc.upper() == ref[:len(nuc)]: + nuc = str(ref) + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + else: # SNV: query position is None if is_del or is_refskip is set. 
+ nuc = pileupread.alignment.query_sequence[s] - elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: - count_ref += 1 - if chrom_stop_pos in ref_pos_dict: - if abba in ref_pos_dict[chrom_stop_pos]: - ref_pos_dict[chrom_stop_pos][abba] += 1 - else: - ref_pos_dict[chrom_stop_pos][abba] = 1 + nuc = nuc.upper() + + if nuc == alt: + count_alt += 1 + if chrom_stop_pos in mut_pos_dict: + if abba in mut_pos_dict[chrom_stop_pos]: + mut_pos_dict[chrom_stop_pos][abba] += 1 else: - ref_pos_dict[chrom_stop_pos] = {} + mut_pos_dict[chrom_stop_pos][abba] = 1 + else: + mut_pos_dict[chrom_stop_pos] = {} + mut_pos_dict[chrom_stop_pos][abba] = 1 + if chrom_stop_pos not in ref_pos_dict: + ref_pos_dict[chrom_stop_pos] = {} + ref_pos_dict[chrom_stop_pos][abba] = 0 + elif nuc == ref: + count_ref += 1 + if chrom_stop_pos in ref_pos_dict: + if abba in ref_pos_dict[chrom_stop_pos]: + ref_pos_dict[chrom_stop_pos][abba] += 1 + else: ref_pos_dict[chrom_stop_pos][abba] = 1 else: - count_indel += 1 - - print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % - (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) + ref_pos_dict[chrom_stop_pos] = {} + ref_pos_dict[chrom_stop_pos][abba] = 1 + else: + count_indel += 1 + print("coverage at pos %s = %s, ref = %s, alt = %s, other = %s,\n" % + (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) - # if mutation is in DCS file but not in SSCS, then set counts to NA - if chrom_stop_pos not in mut_pos_dict.keys(): - mut_pos_dict[chrom_stop_pos] = {} - mut_pos_dict[chrom_stop_pos]["ab"] = 0 - mut_pos_dict[chrom_stop_pos]["ba"] = 0 - ref_pos_dict[chrom_stop_pos] = {} - ref_pos_dict[chrom_stop_pos]["ab"] = 0 - ref_pos_dict[chrom_stop_pos]["ba"] = 0 - else: - print("indels are currently not evaluated") + # if mutation is in DCS file but not in SSCS, then set counts to NA + if chrom_stop_pos not in mut_pos_dict.keys(): + mut_pos_dict[chrom_stop_pos] = {} + mut_pos_dict[chrom_stop_pos]["ab"] = 0 + mut_pos_dict[chrom_stop_pos]["ba"] = 0 + ref_pos_dict[chrom_stop_pos] = {} + ref_pos_dict[chrom_stop_pos]["ab"] = 0 + ref_pos_dict[chrom_stop_pos]["ba"] = 0 bam.close() # save counts diff -r 1797e461d674 -r fdfe9a919ff7 mut2sscs.xml --- a/mut2sscs.xml Mon Mar 29 09:22:57 2021 +0000 +++ b/mut2sscs.xml Fri Jul 22 09:19:44 2022 +0000 @@ -1,6 +1,6 @@ - - Extracts all tags from the single stranded consensus sequence (SSCS) bam file that carry a mutation at the same position a mutation is called in the duplex consensus sequence (DCS) and calculates their frequencies + + Extracts all tags from the single-stranded consensus sequence (SSCS) bam file that carries a mutation at the same position a mutation is called in the duplex consensus sequence (DCS) and calculates their frequencies va_macros.xml @@ -15,7 +15,7 @@ ]]> - + @@ -25,29 +25,29 @@ - + `_ or `LoFreq `_ variant caller. -**Dataset 2:** BAM file of aligned single stranded consensus sequence (SSCS) +**Dataset 2:** BAM file of the aligned single stranded consensus sequence (SSCS) reads. This file can be obtained by the tool `Map with BWA-MEM `_. **Output** -The output is a json file containing dictonaries with stats of tags that carry a mutation in the SSCS -at the same position a mutation is called in the DCS. +The output is a json file containing dictionaries with stats of tags that carry a mutation in the SSCS +at the same position, a mutation is called in the DCS. 
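For orientation, the JSON written by this tool is a two-element list holding the
dictionaries ``mut_pos_dict`` and ``ref_pos_dict``: counts of SSCS reads supporting the
alternate or the reference allele, keyed by ``chrom#pos#ref#alt`` and split by strand
orientation (``ab``/``ba``). A minimal, illustrative way to read it downstream (file
name and values are made up)::

    import json
    with open("sscs_counts.json") as f:
        mut_pos_dict, ref_pos_dict = json.load(f)
    # e.g. mut_pos_dict["chr12#25398284#C#A"] -> {"ab": 3, "ba": 2}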
]]> diff -r 1797e461d674 -r fdfe9a919ff7 read2mut.py --- a/read2mut.py Mon Mar 29 09:22:57 2021 +0000 +++ b/read2mut.py Fri Jul 22 09:19:44 2022 +0000 @@ -87,6 +87,7 @@ outfile2 = args.outputFile2 outfile3 = args.outputFile3 outputFile_csv = args.outputFile_csv + thresh = args.thresh phred_score = args.phred trim = args.trim @@ -111,7 +112,7 @@ # load dicts with open(json_file, "r") as f: - (tag_dict, cvrg_dict) = json.load(f) + (tag_dict, cvrg_dict, tag_dict_ref) = json.load(f) with open(sscs_json, "r") as f: (mut_pos_dict, ref_pos_dict) = json.load(f) @@ -138,106 +139,207 @@ continue else: alt = variant.ALT[0] - chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt + + alt = alt.upper() + ref = ref.upper() + if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) + continue - if len(ref) == len(alt): - mut_array.append([chrom, stop_pos, ref, alt]) - i += 1 - mut_dict[chrom_stop_pos] = {} - mut_read_pos_dict[chrom_stop_pos] = {} - reads_dict[chrom_stop_pos] = {} - mut_read_cigar_dict[chrom_stop_pos] = {} - real_start_end[chrom_stop_pos] = {} + chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt + mut_array.append([chrom, stop_pos, ref, alt]) + i += 1 + mut_dict[chrom_stop_pos] = {} + mut_read_pos_dict[chrom_stop_pos] = {} + reads_dict[chrom_stop_pos] = {} + mut_read_cigar_dict[chrom_stop_pos] = {} + real_start_end[chrom_stop_pos] = {} + for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): + if pileupcolumn.reference_pos == stop_pos: + count_alt = 0 + count_ref = 0 + count_indel = 0 + count_n = 0 + count_other = 0 + count_lowq = 0 + n = 0 + for pileupread in pileupcolumn.pileups: + n += 1 + if not pileupread.is_refskip: + if pileupread.is_del: + p = pileupread.query_position_or_next + e = p + len(alt) - 1 + else: + p = pileupread.query_position + e = p + len(alt) + s = p + tag = pileupread.alignment.query_name + split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) - for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): - if pileupcolumn.reference_pos == stop_pos: - count_alt = 0 - count_ref = 0 - count_indel = 0 - count_n = 0 - count_other = 0 - count_lowq = 0 - n = 0 - for pileupread in pileupcolumn.pileups: - n += 1 - if not pileupread.is_del and not pileupread.is_refskip: - tag = pileupread.alignment.query_name - nuc = pileupread.alignment.query_sequence[pileupread.query_position] - phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 - # if read is softclipped, store real position in reference - if "S" in pileupread.alignment.cigarstring: - # spftclipped at start - if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): - start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) - end = pileupread.alignment.reference_end - # softclipped at end - elif re.search(r"S$", pileupread.alignment.cigarstring): - end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) - start = pileupread.alignment.reference_start - else: + if len(ref) < len(alt): + if "I" in split_cigar: + all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if 
"I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) >= len(alt) - 1: # if pe read matches exatcly to insertion + nuc = pileupread.alignment.query_sequence[s:e] + phred = ord(pileupread.alignment.qual[s]) - 33 + break + elif "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) < len(alt) - 1: # if pe read has shorter insertion -- not alt + nuc = pileupread.alignment.query_sequence[s:s+int(ins_count)+1] + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: # insertion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + elif "D" in split_cigar: # if deletion in pe read, don't count + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + if "D" in split_cigar and sum(del_index) == p + 1: # if deletion at that position + nuc = "D" + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + else: # insertion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + + elif len(ref) > len(alt): + ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] + if "D" in split_cigar: + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + del_count = split_cigar[ai - 1] # deletion on that position but does not necesserily match in the length + if "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) >= len(ref) - 1: # if pe read matches exatcly to deletion + nuc = pileupread.alignment.query_sequence[s:e] + phred = ord(pileupread.alignment.qual[s]) - 33 + if nuc == "": + nuc = str(alt) + break + elif "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) < len(ref) - 1: # if pe read has shorter deletion --> not alt + nuc = str(ref)[:int(del_count)+1] + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: # deletion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + phred = ord(pileupread.alignment.qual[s]) - 33 + elif "I" in split_cigar: # if pe read has insertion --> not count + all_insertions = [inser_i for 
inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if "I" in split_cigar and sum(ins_index) == p + 1: + nuc = "I" + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there + nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] + phred = ord(pileupread.alignment.qual[s]) - 33 + if nuc.upper() == ref[:len(nuc)]: + nuc = str(ref) + else: # deletion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + phred = ord(pileupread.alignment.qual[s]) - 33 + else: # SNV: query position is None if is_del or is_refskip is set. + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + + nuc = nuc.upper() + + # if read is softclipped, store real position in reference + if "S" in pileupread.alignment.cigarstring: + # spftclipped at start + if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): + start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) end = pileupread.alignment.reference_end + # softclipped at end + elif re.search(r"S$", pileupread.alignment.cigarstring): + end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) start = pileupread.alignment.reference_start - - if phred < phred_score: - nuc = "lowQ" - if tag not in mut_dict[chrom_stop_pos]: - mut_dict[chrom_stop_pos][tag] = {} - if nuc in mut_dict[chrom_stop_pos][tag]: - mut_dict[chrom_stop_pos][tag][nuc] += 1 - else: - mut_dict[chrom_stop_pos][tag][nuc] = 1 - if tag not in mut_read_pos_dict[chrom_stop_pos]: - mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] - reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] - mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] - real_start_end[chrom_stop_pos][tag] = [(start, end)] + else: + end = pileupread.alignment.reference_end + start = pileupread.alignment.reference_start + if phred < phred_score: + nuc = "lowQ" + if tag not in mut_dict[chrom_stop_pos]: + mut_dict[chrom_stop_pos][tag] = {} + if nuc in mut_dict[chrom_stop_pos][tag]: + mut_dict[chrom_stop_pos][tag][nuc] += 1 + else: + mut_dict[chrom_stop_pos][tag][nuc] = 1 + if tag not in mut_read_pos_dict[chrom_stop_pos]: + mut_read_pos_dict[chrom_stop_pos][tag] = [s + 1] + reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] + mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] + real_start_end[chrom_stop_pos][tag] = [(start, end)] + else: + mut_read_pos_dict[chrom_stop_pos][tag].append(s + 1) + reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) + mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) + real_start_end[chrom_stop_pos][tag].append((start, end)) + if nuc == alt: + count_alt += 1 + if tag not in mut_read_dict: + mut_read_dict[tag] = {} + mut_read_dict[tag][chrom_stop_pos] = (alt, ref) else: - mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) - 
reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) - mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) - real_start_end[chrom_stop_pos][tag].append((start, end)) - if nuc == alt: - count_alt += 1 - if tag not in mut_read_dict: - mut_read_dict[tag] = {} - mut_read_dict[tag][chrom_stop_pos] = (alt, ref) - else: - mut_read_dict[tag][chrom_stop_pos] = (alt, ref) - elif nuc == ref: - count_ref += 1 - elif nuc == "N": - count_n += 1 - elif nuc == "lowQ": - count_lowq += 1 - else: - count_other += 1 + mut_read_dict[tag][chrom_stop_pos] = (alt, ref) + elif nuc == ref: + count_ref += 1 + elif nuc == "N": + count_n += 1 + elif nuc == "lowQ": + count_lowq += 1 else: - count_indel += 1 + count_other += 1 + else: + count_indel += 1 mut_array = np.array(mut_array) for read in bam.fetch(until_eof=True): if read.is_unmapped: pure_tag = read.query_name[:-5] nuc = "na" - for key in tag_dict[pure_tag].keys(): - if key not in mut_dict: - mut_dict[key] = {} - if read.query_name not in mut_dict[key]: - mut_dict[key][read.query_name] = {} - if nuc in mut_dict[key][read.query_name]: - mut_dict[key][read.query_name][nuc] += 1 - else: - mut_dict[key][read.query_name][nuc] = 1 + if pure_tag in tag_dict.keys(): # stored all ref and alt reads --> get only alt reads + for key in tag_dict[pure_tag].keys(): + if key not in mut_dict: + mut_dict[key] = {} + if read.query_name not in mut_dict[key]: + mut_dict[key][read.query_name] = {} + if nuc in mut_dict[key][read.query_name]: + mut_dict[key][read.query_name][nuc] += 1 + else: + mut_dict[key][read.query_name][nuc] = 1 bam.close() - # create pure_tags_dict pure_tags_dict = {} + pure_tags_dict_ref = {} for key1, value1 in sorted(mut_dict.items()): i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] ref = mut_array[i, 2] alt = mut_array[i, 3] pure_tags_dict[key1] = {} + pure_tags_dict_ref[key1] = {} for key2, value2 in sorted(value1.items()): for key3, value3 in value2.items(): pure_tag = key2[:-5] @@ -247,6 +349,12 @@ else: pure_tags_dict[key1][pure_tag] = 1 + if key3 == ref: + if pure_tag in pure_tags_dict_ref[key1]: + pure_tags_dict_ref[key1][pure_tag] += 1 + else: + pure_tags_dict_ref[key1][pure_tag] = 1 + # create pure_tags_dict_short with thresh if thresh > 0: pure_tags_dict_short = {} @@ -279,7 +387,7 @@ format23 = workbook3.add_format({'bg_color': '#FFC7CE'}) # red format33 = workbook3.add_format({'bg_color': '#FACC2E'}) # yellow - header_line = ('variant ID', 'tier', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab', + header_line = ('variant ID', 'tier', 'allele', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab', 'read median length.ba', 'DCS median length', 'FS.ab', 'FS.ba', 'FSqc.ab', 'FSqc.ba', 'ref.ab', 'ref.ba', 'alt.ab', 'alt.ba', 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. 
alt.ba', @@ -288,6 +396,7 @@ 'in phase', 'chimeric tag') ws1.write_row(0, 0, header_line) csv_writer.writerow(header_line) + counter_tier11 = 0 counter_tier12 = 0 counter_tier21 = 0 @@ -308,12 +417,14 @@ row = 1 tier_dict = {} + tier_dict_ref = {} chimera_dict = {} for key1, value1 in sorted(mut_dict.items()): counts_mut = 0 chimeric_tag_list = [] chimeric_tag = {} - if key1 in pure_tags_dict_short.keys(): + if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()): # ref or alt + change_tier_after_print = [] i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] @@ -323,11 +434,13 @@ whole_array = list(pure_tags_dict_short[key1].keys()) tier_dict[key1] = {} + tier_dict_ref[key1] = {} values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 2.5", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0), ("tier 6", 0), ("tier 7", 0)] for k, v in values_tier_dict: tier_dict[key1][k] = v + tier_dict_ref[key1][k] = v used_keys = [] if 'ab' in mut_pos_dict[key1].keys(): @@ -349,7 +462,16 @@ for key2, value2 in sorted(value1.items()): add_mut14 = "" add_mut23 = "" - if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()): + if key2[:-5] not in tag_dict.keys() and key2[:-5] not in tag_dict_ref.keys(): # skip reads that have not alt or ref + continue + + if ((key2[:-5] in tag_dict.keys() and key2[:-5] in pure_tags_dict_short[key1].keys() and key1 in tag_dict[key2[:-5]].keys()) or (key2[:-5] in tag_dict_ref.keys() and key2[:-5] in pure_tags_dict_ref[key1].keys() and key1 in tag_dict_ref[key2[:-5]].keys())) and (key2[:-5] not in used_keys): + + if key2[:-5] in pure_tags_dict_short[key1].keys(): + variant_type = "alt" + elif key2[:-5] in pure_tags_dict_ref[key1].keys(): + variant_type = "ref" + if key2[:-5] + '.ab.1' in mut_dict[key1].keys(): total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values()) if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): @@ -550,36 +672,59 @@ used_keys.append(key2[:-5]) counts_mut += 1 - if (alt1f + alt2f + alt3f + alt4f) > 0.5: + if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)): + if variant_type == "alt": + tier1ff, tier2ff, tier3ff, tier4ff = alt1f, alt2f, alt3f, alt4f + tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = alt1f, alt2f, alt3f, alt4f + elif variant_type == "ref": + tier1ff, tier2ff, tier3ff, tier4ff = ref1f, ref2f, ref3f, ref4f + tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = ref1f, ref2f, ref3f, ref4f + total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new if total1new == 0: ref1f = alt1f = None alt1ff = -1 alt1ff_trim = -1 + tier1ff = -1 + tier1ff_trim = -1 else: alt1ff = alt1f alt1ff_trim = alt1f + tier1ff = tier1ff + tier1ff_trim = tier1ff_trim if total2new == 0: ref2f = alt2f = None alt2ff = -1 alt2ff_trim = -1 + tier2ff = -1 + tier2ff_trim = -1 else: alt2ff = alt2f alt2ff_trim = alt2f + tier2ff = tier2ff + tier2ff_trim = tier2ff_trim if total3new == 0: ref3f = alt3f = None alt3ff = -1 alt3ff_trim = -1 + tier3ff = -1 + tier3ff_trim = -1 else: alt3ff = alt3f alt3ff_trim = alt3f + tier3ff = tier3ff + tier3ff_trim = tier3ff_trim if total4new == 0: ref4f = 
alt4f = None alt4ff = -1 alt4ff_trim = -1 + tier4ff = -1 + tier4ff_trim = -1 else: alt4ff = alt4f alt4ff_trim = alt4f + tier4ff = tier4ff + tier4ff_trim = tier4ff_trim beg1 = beg4 = beg2 = beg3 = 0 @@ -605,7 +750,7 @@ # mate 1 - SSCS ab softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] safe_div_result = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) - if (safe_div_result == None): + if (safe_div_result is None): ratio1 = False else: ratio1 = safe_div_result >= threshold_reads @@ -629,7 +774,7 @@ # mate 1 - SSCS ba softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] safe_div_result = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) - if (safe_div_result == None): + if (safe_div_result is None): ratio4 = False else: ratio4 = safe_div_result >= threshold_reads @@ -653,7 +798,7 @@ # mate 2 - SSCS ab softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] safe_div_result = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) - if (safe_div_result == None): + if (safe_div_result is None): ratio2 = False else: ratio2 = safe_div_result >= threshold_reads @@ -677,7 +822,7 @@ # mate 2 - SSCS ba softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] safe_div_result = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) - if (safe_div_result == None): + if (safe_div_result is None): ratio3 = False else: ratio3 = safe_div_result >= threshold_reads @@ -698,21 +843,21 @@ ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads - if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant - all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | - (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & - all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + if ((all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) & # contradictory variant + all(float(ij) == 0. for ij in [tier2ff, tier3ff])) | + (all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]) & + all(float(ij) == 0. for ij in [tier1ff, tier4ff]))): + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = True # softclipping tiers # information of both mates available --> all reads for both mates and SSCS are softclipped elif (ratio1 & ratio4 & ratio2 & ratio3 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + all(float(ij) > 0. 
for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = True softclipped_mutation_oneOfTwoMates = False @@ -720,16 +865,16 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = False - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False # information of both mates available --> only one mate softclipped elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue @@ -763,10 +908,10 @@ read_len_median3 = read_len_median3 - n_spacer_barcode else: softclipped_mutation_oneOfTwoMates = True - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False softclipped_mutation_allMates = False @@ -780,6 +925,7 @@ total1new = 0 alt1ff = 0 alt1f = 0 + tier1ff = 0 trimmed = True if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): @@ -787,6 +933,7 @@ total4new = 0 alt4ff = 0 alt4f = 0 + tier4ff = 0 trimmed = True if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): @@ -794,6 +941,7 @@ total2new = 0 alt2ff = 0 alt2f = 0 + tier2ff = 0 trimmed = True if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): @@ -801,6 +949,7 @@ total3new = 0 alt3ff = 0 alt3f = 0 + tier3ff = 0 trimmed = True details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) @@ -808,7 +957,7 @@ # information of both mates available --> only one mate softclipped elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + all(float(ij) > 0. 
for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False @@ -816,18 +965,18 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = False - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False # information of one mate available --> all reads of one mate are softclipped elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & - all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | + all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff])) | (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & - all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available + all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) > 0. for ij in [tier2ff, tier3ff]))): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False @@ -835,17 +984,17 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = True softclipped_mutation_oneMateOneSSCS = False - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False # information of one mate available --> only one SSCS is softclipped elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & - (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | + (all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff]))) | (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & - (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available + (all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) < 0. 
for ij in [tier2ff, tier3ff])))): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False @@ -853,10 +1002,10 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = True - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False @@ -866,6 +1015,7 @@ total1new = 0 alt1ff = 0 alt1f = 0 + tier1ff = 0 trimmed = True if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): @@ -873,6 +1023,7 @@ total4new = 0 alt4ff = 0 alt4f = 0 + tier4ff = 0 trimmed = True if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): @@ -880,6 +1031,7 @@ total2new = 0 alt2ff = 0 alt2f = 0 + tier2ff = 0 trimmed = True if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): @@ -887,117 +1039,145 @@ total3new = 0 alt3ff = 0 alt3f = 0 + tier3ff = 0 trimmed = True details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) # assign tiers if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): tier = "1.1" counter_tier11 += 1 - tier_dict[key1]["tier 1.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 1.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 1.1"] += 1 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): + all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): tier = "1.2" counter_tier12 += 1 - tier_dict[key1]["tier 1.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 1.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 1.2"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): tier = "2.1" counter_tier21 += 1 - tier_dict[key1]["tier 2.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.1"] += 1 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): + all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): tier = "2.2" counter_tier22 += 1 - tier_dict[key1]["tier 2.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.2"] += 1 + elif variant_type == 
"ref": + tier_dict_ref[key1]["tier 2.2"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & - any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]) & + any(float(ij) >= 0.75 for ij in [tier2ff, tier3ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & - any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]) & + any(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]))): tier = "2.3" counter_tier23 += 1 - tier_dict[key1]["tier 2.3"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.3"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.3"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): tier = "2.4" counter_tier24 += 1 - tier_dict[key1]["tier 2.4"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.4"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.4"] += 1 elif ((len(pure_tags_dict_short[key1]) > 1) & - (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | - all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): + (all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) | + all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))): tier = "3.1" counter_tier31 += 1 - tier_dict[key1]["tier 3.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 3.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 3.1"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & - all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & - all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))): tier = "3.2" counter_tier32 += 1 - tier_dict[key1]["tier 3.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 3.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 3.2"] += 1 elif (trimmed): tier = "4" counter_tier4 += 1 - tier_dict[key1]["tier 4"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 4"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 4"] += 1 # assign tiers if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): trimmed_actual_high_tier = True elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])): trimmed_actual_high_tier = True elif 
((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): trimmed_actual_high_tier = True elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])): trimmed_actual_high_tier = True elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]) & - any(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]) & + any(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim])) | (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]) & - any(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]) & + any(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]))): trimmed_actual_high_tier = True elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): trimmed_actual_high_tier = True else: trimmed_actual_high_tier = False @@ -1005,37 +1185,58 @@ elif softclipped_mutation_allMates: tier = "5.1" counter_tier51 += 1 - tier_dict[key1]["tier 5.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.1"] += 1 elif softclipped_mutation_oneOfTwoMates: tier = "5.2" counter_tier52 += 1 - tier_dict[key1]["tier 5.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.2"] += 1 elif softclipped_mutation_oneOfTwoSSCS: tier = "5.3" counter_tier53 += 1 - tier_dict[key1]["tier 5.3"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.3"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.3"] += 1 elif softclipped_mutation_oneMate: tier = "5.4" counter_tier54 += 1 - tier_dict[key1]["tier 5.4"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.4"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.4"] += 1 elif softclipped_mutation_oneMateOneSSCS: tier = "5.5" counter_tier55 += 1 - tier_dict[key1]["tier 5.5"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.5"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.5"] += 1 elif (contradictory): tier = "6" counter_tier6 += 1 - tier_dict[key1]["tier 6"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 6"] += 1 + elif variant_type == "ref": + 
tier_dict_ref[key1]["tier 6"] += 1 else: tier = "7" counter_tier7 += 1 - tier_dict[key1]["tier 7"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 7"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 7"] += 1 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) @@ -1114,13 +1315,12 @@ read_pos2 = read_len_median2 = None if (read_pos3 == -1): read_pos3 = read_len_median3 = None - line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) + line = (var_id, tier, variant_type, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) # ws1.write_row(row, 0, line) # csv_writer.writerow(line) - line2 = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) + line2 = ("", "", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) # ws1.write_row(row + 1, 0, line2) # csv_writer.writerow(line2) - # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), # {'type': 'formula', # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), @@ -1155,12 +1355,19 @@ # write to file # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) + sum_highTiers_ref = sum([tier_dict_ref[key1][ij] for ij in list(sorted(tier_dict_ref[key1].keys()))[:6]]) correct_tier = False + correct_tier_ref = False if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] tier_dict[key1]["tier 4"] = 0 correct_tier = True + if tier_dict_ref[key1]["tier 4"] > 0 and sum_highTiers_ref > 0: + tier_dict_ref[key1]["tier 2.5"] = tier_dict_ref[key1]["tier 4"] + tier_dict_ref[key1]["tier 4"] = 0 + correct_tier_ref = True + for sample in change_tier_after_print: row_number = sample[0] line1 = sample[1] @@ -1168,44 +1375,47 @@ actual_high_tier = sample[3] current_tier = list(line1)[1] - if correct_tier and (current_tier == "4") and actual_high_tier: + if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier: + line1 = list(line1) + line1[1] = "2.5" + line1 = tuple(line1) + counter_tier25 += 1 + counter_tier4 -= 1 + if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier: line1 = list(line1) line1[1] = "2.5" line1 = tuple(line1) counter_tier25 += 1 counter_tier4 -= 1 + ws1.write_row(row_number, 0, line1) csv_writer.writerow(line1) ws1.write_row(row_number + 1, 0, line2) csv_writer.writerow(line2) + if line1[2] == "alt": + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", 
$B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + elif line1[2] == "ref": + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), - 'format': format1, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), - 'format': format3, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=$B${}>="3"'.format(row_number + 1), - 'format': format2, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) # sheet 2 if chimera_correction: - header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', - 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', - 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', - 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') + header_line2 = 
('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', + 'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)', + 'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)', + 'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)', + 'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)' + ) else: - header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', - 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', - 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', - 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') - + header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', + 'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)', + 'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)', + 'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)', + 'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)' + ) ws2.write_row(0, 0, header_line2) row = 0 @@ -1220,9 +1430,12 @@ alt_count = cvrg_dict[key1][1] cvrg = ref_count + alt_count + ref_tiers = tier_dict_ref[key1] + var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) lst = [var_id, cvrg] used_tiers = [] + used_tiers_ref = [t for k, t in sorted(ref_tiers.items())] cum_af = [] for key2, value2 in sorted(value1.items()): # calculate cummulative AF @@ -1233,24 +1446,25 @@ if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place continue lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) - lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))]) + lst.extend([(sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])), sum(used_tiers_ref[0:7]), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])))]) if chimera_correction: chimeras_all = chimera_dict[key1][1] new_alt = sum(used_tiers[0:7]) - chimeras_all fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7]))) if fraction_chimeras is None: fraction_chimeras = 0. 
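# --- Editorial sketch, not part of the patch ------------------------------
# The changed lines around this point redefine the tier 1.1-2.5 coverage as
# the sum of ref- and alt-supporting counts in those tiers (instead of the
# old "cvrg - sum(used_tiers[-10:])") and apply the chimera correction to
# that sum. A minimal, self-contained restatement with illustrative names:
def tier_restricted_af(used_tiers, used_tiers_ref, chimeras_all=0):
    alt_high = sum(used_tiers[0:7])       # AC alt, tiers 1.1-2.5
    ref_high = sum(used_tiers_ref[0:7])   # AC ref, tiers 1.1-2.5
    cvrg_high = ref_high + alt_high       # cvrg (tiers 1.1-2.5)
    af_high = alt_high / cvrg_high if cvrg_high else None
    frac_chim = chimeras_all / alt_high if alt_high else 0.0
    new_cvrg = cvrg_high * (1.0 - frac_chim)   # chimera-corrected cvrg
    new_alt = alt_high - chimeras_all          # chimeras removed from AC alt
    af_corrected = new_alt / new_cvrg if new_cvrg else None
    return cvrg_high, ref_high, alt_high, af_high, new_cvrg, af_corrected
# ---------------------------------------------------------------------------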
- new_cvrg = (cvrg - sum(used_tiers[-10:])) * (1. - fraction_chimeras) + new_cvrg = (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])) * (1. - fraction_chimeras) lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)]) lst.extend([alt_count, safe_div(alt_count, cvrg)]) lst.extend(used_tiers) - lst.extend(cum_af) + lst.extend(used_tiers_ref) + # lst.extend(cum_af) lst = tuple(lst) ws2.write_row(row + 1, 0, lst) if chimera_correction: - ws2.conditional_format('M{}:N{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$M$1="tier 1.1"', 'format': format12, 'multi_range': 'M{}:N{} M1:N1'.format(row + 2, row + 2)}) - ws2.conditional_format('O{}:S{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$O$1="tier 2.1"', 'format': format32, 'multi_range': 'O{}:S{} O1:S1'.format(row + 2, row + 2)}) - ws2.conditional_format('T{}:AC{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$T$1="tier 3.1"', 'format': format22, 'multi_range': 'T{}:AC{} T1:AC1'.format(row + 2, row + 2)}) + ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'N{}:O{} N1:O1 AE{}:AF{} AE1:AF1'.format(row + 2, row + 2, row + 2, row + 2)}) + ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'P{}:T{} P1:T1 AG{}:AK{} AG1:AK1'.format(row + 2, row + 2, row + 2, row + 2)}) + ws2.conditional_format('U{}:AD{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$U$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'U{}:AD{} U1:AD1 AL{}:AU{} AL1:AU1'.format(row + 2, row + 2, row + 2, row + 2)}) else: ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)}) ws2.conditional_format('L{}:P{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:P{} L1:P1'.format(row + 2, row + 2)}) diff -r 1797e461d674 -r fdfe9a919ff7 read2mut.xml --- a/read2mut.xml Mon Mar 29 09:22:57 2021 +0000 +++ b/read2mut.xml Fri Jul 22 09:19:44 2022 +0000 @@ -1,6 +1,6 @@ - - Looks for reads with mutation at known positions and calculates frequencies and stats. + + Looks for reads with a mutation at known positions and calculates frequencies and stats. va_macros.xml @@ -28,12 +28,12 @@ ]]> - + - - + + @@ -57,10 +57,10 @@ - - - - + + + + `_ or `LoFreq `_ variant caller. **Dataset 2:** BAM file of aligned raw reads. This file can be obtained by the tool `Map with BWA-MEM `_. **Dataset 3:** JSON file generated by the **DCS mutations to tags/reads** tool -containing dictonaries of the tags of reads containing mutations +containing dictionaries of the tags of reads containing mutations in the DCS. **Dataset 4:** JSON file generated by the **DCS mutations to SSCS stats** tool -stats of tags that carry a mutation in the SSCS at the same position a mutation +stats of tags that carry a mutation and the reference allele in the SSCS at the same position a mutation is called in the DCS. **Output** -The output are three XLSX files containing frequencies stats for DCS mutations based -on information from the raw reads and a CSV file containing the summary information without color-coding. In addition to that a tier based -classification is provided based on the amout of support for a true variant call. 
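Editorial note: the worksheet colour rules changed in the hunks above use xlsxwriter's conditional_format() with a 'formula' criterion, and its multi_range option extends one rule over non-contiguous cell ranges (here the alt tier columns, the ref tier columns, and the header row). A minimal standalone sketch follows; the file name, colour, and column layout are illustrative, not the tool's actual ones:

import xlsxwriter

wb = xlsxwriter.Workbook("demo.xlsx")                 # illustrative output file
ws = wb.add_worksheet()
tier1_fmt = wb.add_format({"bg_color": "#C6EFCE"})    # hypothetical tier-1 colour

ws.write_row(0, 0, ("variant ID", "tier"))
ws.write_row(1, 0, ("chr1-100-C-A", "1.1"))

# Colour cell B2 when the tier is 1.1 or 1.2; multi_range makes the same
# rule also apply to the header cell B1 without a second call.
ws.conditional_format("B2:B2", {
    "type": "formula",
    "criteria": '=OR($B$2="1.1", $B$2="1.2")',
    "format": tier1_fmt,
    "multi_range": "B2:B2 B1:B1",
})
wb.close()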
+The output is three XLSX files containing frequencies stats for DCS mutations based +on information from the raw reads and a CSV file containing the summary information without color-coding. In addition to that, a tier-based +classification is provided based on the amount of support for a true variant call. ]]> diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Interesting_Reads_test.trim.bai Binary file test-data/Interesting_Reads_test.trim.bai has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Interesting_Reads_test.trim.bam Binary file test-data/Interesting_Reads_test.trim.bam has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Variant_Analyzer_allele_frequencies_test.xlsx Binary file test-data/Variant_Analyzer_allele_frequencies_test.xlsx has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Variant_Analyzer_summary_test.csv --- a/test-data/Variant_Analyzer_summary_test.csv Mon Mar 29 09:22:57 2021 +0000 +++ b/test-data/Variant_Analyzer_summary_test.csv Fri Jul 22 09:19:44 2022 +0000 @@ -1,7 +1,11 @@ -variant ID,tier,tag,mate,read pos.ab,read pos.ba,read median length.ab,read median length.ba,DCS median length,FS.ab,FS.ba,FSqc.ab,FSqc.ba,ref.ab,ref.ba,alt.ab,alt.ba,rel. ref.ab,rel. ref.ba,rel. alt.ab,rel. alt.ba,na.ab,na.ba,lowq.ab,lowq.ba,trim.ab,trim.ba,SSCS alt.ab,SSCS alt.ba,SSCS ref.ab,SSCS ref.ba,in phase,chimeric tag -ACH_TDII_5regions-505-C-A,2.1,GATAACCTTGCTTCGTGATTAATC,ab1.ba2,132.0,131.0,264.0,263.0,173.0,1,3,1,3,0,0,1,3,0,0,1.0,1.0,0,0,0,0,0,0,2,1,1,1,, -,,GATAACCTTGCTTCGTGATTAATC,ab2.ba1,,,,,173.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,2,1,1,1,, -ACH_TDII_5regions-571-C-T,1.1,GATTGGATAACGTTGTGGCAATTG,ab1.ba2,129.0,218.0,195.0,284.0,143.0,4,5,4,5,0,0,4,5,0,0,1.0,1.0,0,0,0,0,0,0,1,1,2,1,, -,,GATTGGATAACGTTGTGGCAATTG,ab2.ba1,264.0,264.0,278.5,283.5,143.0,2,2,2,2,0,0,2,2,0,0,1.0,1.0,0,0,0,0,0,0,1,1,2,1,, -ACH_TDII_5regions-958-T-C,2.4,CCTCCCGGCAGTGCGAAAATGTCA,ab1.ba2,,,,,195.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,1,1,1,0,, -,,CCTCCCGGCAGTGCGAAAATGTCA,ab2.ba1,97.0,91.0,283.0,277.0,195.0,1,1,1,1,0,0,1,1,0,0,1.0,1.0,0,0,0,0,0,0,1,1,1,0,, +variant ID,tier,allele,tag,mate,read pos.ab,read pos.ba,read median length.ab,read median length.ba,DCS median length,FS.ab,FS.ba,FSqc.ab,FSqc.ba,ref.ab,ref.ba,alt.ab,alt.ba,rel. ref.ab,rel. ref.ba,rel. alt.ab,rel. 
alt.ba,na.ab,na.ba,lowq.ab,lowq.ba,trim.ab,trim.ba,SSCS alt.ab,SSCS alt.ba,SSCS ref.ab,SSCS ref.ba,in phase,chimeric tag +ACH_TDII_5regions-505-C-A,2.1,alt,GATAACCTTGCTTCGTGATTAATC,ab1.ba2,132.0,131.0,264.0,263.0,173.0,1,3,1,3,0,0,1,3,0,0,1.0,1.0,0,0,0,0,0,0,2,1,1,1,, +,,,GATAACCTTGCTTCGTGATTAATC,ab2.ba1,,,,,173.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,2,1,1,1,, +ACH_TDII_5regions-505-C-A,1.1,ref,GATTGGATAACGTTGTGGCAATTG,ab1.ba2,73.0,152.0,205.0,284.0,173.0,3,5,3,5,3,5,0,0,1.0,1.0,0,0,0,0,0,0,0,0,2,1,1,1,, +,,,GATTGGATAACGTTGTGGCAATTG,ab2.ba1,198.0,198.0,263.5,283.0,173.0,4,3,4,3,4,3,0,0,1.0,1.0,0,0,0,0,0,0,0,0,2,1,1,1,, +ACH_TDII_5regions-571-C-T,2.1,ref,GATAACCTTGCTTCGTGATTAATC,ab1.ba2,198.0,197.0,264.0,263.0,143.0,1,3,1,3,1,3,0,0,1.0,1.0,0,0,0,0,0,0,0,0,1,1,2,1,, +,,,GATAACCTTGCTTCGTGATTAATC,ab2.ba1,,,,,143.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,1,1,2,1,, +ACH_TDII_5regions-571-C-T,1.1,alt,GATTGGATAACGTTGTGGCAATTG,ab1.ba2,129.0,218.0,195.0,284.0,143.0,4,5,4,5,0,0,4,5,0,0,1.0,1.0,0,0,0,0,0,0,1,1,2,1,, +,,,GATTGGATAACGTTGTGGCAATTG,ab2.ba1,264.0,264.0,278.5,283.5,143.0,2,2,2,2,0,0,2,2,0,0,1.0,1.0,0,0,0,0,0,0,1,1,2,1,, +ACH_TDII_5regions-958-T-C,2.4,alt,CCTCCCGGCAGTGCGAAAATGTCA,ab1.ba2,,,,,195.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,1,1,1,0,, +,,,CCTCCCGGCAGTGCGAAAATGTCA,ab2.ba1,97.0,91.0,283.0,277.0,195.0,1,1,1,1,0,0,1,1,0,0,1.0,1.0,0,0,0,0,0,0,1,1,1,0,, diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Variant_Analyzer_summary_test.xlsx Binary file test-data/Variant_Analyzer_summary_test.xlsx has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Variant_Analyzer_tiers_test.xlsx Binary file test-data/Variant_Analyzer_tiers_test.xlsx has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/tag_count_dict_test.json --- a/test-data/tag_count_dict_test.json Mon Mar 29 09:22:57 2021 +0000 +++ b/test-data/tag_count_dict_test.json Fri Jul 22 09:19:44 2022 +0000 @@ -1,1 +1,1 @@ -[{"GATAACCTTGCTTCGTGATTAATC": {"ACH_TDII_5regions#504#C#A": "A"}, "GATTGGATAACGTTGTGGCAATTG": {"ACH_TDII_5regions#570#C#T": "T"}, "CCTCCCGGCAGTGCGAAAATGTCA": {"ACH_TDII_5regions#957#T#C": "C"}}, {"ACH_TDII_5regions#957#T#C": [0, 1, 195.0], "ACH_TDII_5regions#504#C#A": [1, 1, 173.0], "ACH_TDII_5regions#570#C#T": [1, 1, 143.0]}] \ No newline at end of file +[{"GATAACCTTGCTTCGTGATTAATC": {"ACH_TDII_5regions#504#C#A": "A"}, "GATTGGATAACGTTGTGGCAATTG": {"ACH_TDII_5regions#570#C#T": "T"}, "CCTCCCGGCAGTGCGAAAATGTCA": {"ACH_TDII_5regions#957#T#C": "C"}}, {"ACH_TDII_5regions#957#T#C": [0, 1, 195.0], "ACH_TDII_5regions#504#C#A": [1, 1, 173.0], "ACH_TDII_5regions#570#C#T": [1, 1, 143.0]}, {"GATAACCTTGCTTCGTGATTAATC": {"ACH_TDII_5regions#570#C#T": "C"}, "GATTGGATAACGTTGTGGCAATTG": {"ACH_TDII_5regions#504#C#A": "C"}}] \ No newline at end of file
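Editorial note: the updated tag_count_dict_test.json above now serialises a list of three dictionaries rather than two; the added third entry maps read tags to the reference allele observed at each variant position. A minimal sketch of consuming that file, assuming the test-data file name and that the second dictionary's values are (ref count, alt count, median DCS length) as the test values suggest:

import json

with open("tag_count_dict_test.json") as f:
    tag_dict, cvrg_dict, tag_dict_ref = json.load(f)

# tag_dict:     tag -> {"chrom#pos#ref#alt": alt base}  (alt-supporting DCS)
# tag_dict_ref: tag -> {"chrom#pos#ref#alt": ref base}  (ref-supporting DCS)
for variant_key, (ref_count, alt_count, dcs_median) in cvrg_dict.items():
    print(variant_key, ref_count, alt_count, dcs_median)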