changeset 29:6b15b3b6405c draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 5b3ab8c6467fe3a52e89f5a7d175bd8a0189018a-dirty
author mheinzl
date Wed, 24 Jul 2019 05:58:15 -0400
parents 1fa7342a140d
children 46bfbec0f9e6
files hd.py hd.xml test-data/hd_chimeras_output.tab test-data/hd_output.pdf test-data/hd_output.tab
diffstat 5 files changed, 502 insertions(+), 265 deletions(-)
--- a/hd.py	Mon Jun 03 05:37:01 2019 -0400
+++ b/hd.py	Wed Jul 24 05:58:15 2019 -0400
@@ -5,16 +5,17 @@
 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
 # Contact: monika.heinzl@edumail.at
 #
-# Takes at least one TABULAR file with tags before the alignment to the SSCS and optionally a second TABULAR file as input.
-# The program produces a plot which shows a histogram of Hamming distances separated after family sizes,
-# a family size distribution separated after Hamming distances for all (sample_size=0) or a given sample of SSCSs or SSCSs, which form a DCS.
-# In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads
-# and finally a CSV file with the data of the plots.
-# It is also possible to perform the HD analysis with shortened tags with given sizes as input.
+# Takes at least one TABULAR file with tags before the alignment to the SSCS and
+# optionally a second TABULAR file as input. The program produces a plot showing a histogram of Hamming distances
+# separated by family size, and a family size distribution separated by Hamming distance, either for all SSCSs
+# (sample_size=0), for a given sample of SSCSs, or for SSCSs that form a DCS. In addition, the tool produces HD and
+# FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads, and finally a
+# CSV file with the data of the plots. It is also possible to perform the HD analysis with shortened tags of given
+# sizes as input.
 # The tool can run on a certain number of processors, which can be defined by the user.
 
 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int /
-#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular
+#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int
+#        --nr_above_bars True/False --output_tabular outputfile_name_tabular
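+#
+# Example invocation (a sketch with a hypothetical input file name; the output names follow the test-data files of
+# this changeset, and the input TABULAR file is assumed to list one tag per line with its family size and ab/ba
+# direction):
+# python hd.py --inputFile tags_before_SSCS.tabular --inputName1 tags_before_SSCS.tabular --sample_size 1000 \
+#              --minFS 1 --maxFS 0 --nproc 8 --nr_above_bars --output_tabular hd_output.tab \
+#              --output_pdf hd_output.pdf --output_chimeras_tabular hd_chimeras_output.tab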
 
 import argparse
 import itertools
@@ -34,7 +35,7 @@
 
 
 def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
-                   title_file1, subtitle, pdf, relative=False, diff=True):
+                   subtitle, pdf, relative=False, diff=True, rel_freq=False):
     if diff is False:
         colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
         labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"]
@@ -48,36 +49,37 @@
     fig = plt.figure(figsize=(6, 7))
     ax = fig.add_subplot(111)
     plt.subplots_adjust(bottom=0.1)
-    p1 = numpy.bincount(numpy.concatenate((familySizeList1)))
+    p1 = numpy.bincount(numpy.concatenate(familySizeList1))
     maximumY = numpy.amax(p1)
 
     if len(range(minimumXFS, maximumXFS)) == 0:
         range1 = range(minimumXFS - 1, minimumXFS + 2)
     else:
         range1 = range(0, maximumXFS + 2)
-    counts = plt.hist(familySizeList1, label=labels,
-                      color=colors, stacked=True,
-                      rwidth=0.8, alpha=1, align="left",
-                      edgecolor="None", bins=range1)
+
+    if rel_freq:
+        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1]
+        counts = plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True,
+                          rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
+        plt.ylabel("Relative Frequency", fontsize=14)
+        plt.ylim((0, (float(maximumY) / sum(p1)) * 1.1))
+    else:
+        counts = plt.hist(familySizeList1, label=labels, color=colors, stacked=True,
+                          rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
+        if len(numpy.concatenate(familySizeList1)) != 0:
+            plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        plt.ylim((0, maximumY * 1.2))
     plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
-
-    # plt.title(title_file1, fontsize=12)
     plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
     plt.xlabel("Family size", fontsize=14)
-    plt.ylabel("Absolute Frequency", fontsize=14)
-
     ticks = numpy.arange(0, maximumXFS + 1, 1)
     ticks1 = map(str, ticks)
     if maximumXFS >= 20:
         ticks1[len(ticks1) - 1] = ">=20"
     plt.xticks(numpy.array(ticks), ticks1)
     [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
-
     plt.xlim((0, maximumXFS + 1))
-    if len(numpy.concatenate(familySizeList1)) != 0:
-        plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
-
-    plt.ylim((0, maximumY * 1.2))
     legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: "
     plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)
 
@@ -86,17 +88,17 @@
         max_count = ">= 20"
     else:
         max_count = max(originalCounts)
-    legend1 = "{}\n{}\n{:.5f}".format(max_count, count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
+    legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1))
     plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
-    legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), float(counts[0][len(counts[0]) - 1][1]) / sum(counts[0][len(counts[0]) - 1]))
+    legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1))
     plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
     plt.grid(b=True, which='major', color='#424242', linestyle=':')
-
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
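
For reference, the relative-frequency mode of these plotting functions boils down to per-data-point weights of 1/N passed to plt.hist; a minimal sketch (the `groups` data are hypothetical):

    import numpy
    import matplotlib.pyplot as plt

    groups = [numpy.array([1, 1, 2]), numpy.array([2, 3, 3, 4])]           # hypothetical per-class family sizes
    n_total = len(numpy.concatenate(groups))
    w = [numpy.zeros_like(g, dtype=float) + 1. / n_total for g in groups]  # one weight per data point
    plt.hist(groups, weights=w, stacked=True)                              # stacked bar heights sum to 1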
 
 
-def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True, nr_unique_chimeras=0, len_sample=0):
+def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
+                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
     if relative is True:
         step = 0.1
     else:
@@ -104,70 +106,153 @@
 
     fig = plt.figure(figsize=(6, 8))
     plt.subplots_adjust(bottom=0.1)
-    con_list1 = numpy.concatenate(list1)
-    p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())])
+    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
     maximumY = numpy.amax(p1)
-    maximumX = int(maximumX)
-    print("max X", maximumX )
-
     if relative is True:  # relative difference
         bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
     else:
         bin1 = maximumX + 1
 
-    counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
-                      label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10",
-                             "FS>10"], rwidth=0.8,
-                      color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
-                      stacked=True, alpha=1,
-                      align="left",
-                      range=(0, maximumX + 1))
+    if rel_freq:
+        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
+        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
+                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
+                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
+                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+        plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
+        plt.ylabel("Relative Frequency", fontsize=14)
+        bins = counts[1]  # width of bins
+        counts = numpy.array(map(float, counts[0][5]))
+
+    else:
+        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
+                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
+                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
+                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+        maximumY = numpy.amax(p1)
+        plt.ylim((0, maximumY * 1.2))
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        bins = counts[1]  # width of bins
+        counts = numpy.array(map(int, counts[0][5]))
+
     plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
-    bins = counts[1]  # width of bins
-    counts = numpy.array(map(int, counts[0][5]))
     plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
-    # plt.title(title_file1, fontsize=12)
     plt.xlabel(xlabel, fontsize=14)
-    plt.ylabel("Absolute Frequency", fontsize=14)
-
     plt.grid(b=True, which='major', color='#424242', linestyle=':')
-    plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
+    plt.xlim((minimumX - step, maximumX + step))
+    # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
     plt.xticks(numpy.arange(0, maximumX + step, step))
 
-    plt.ylim((0, maximumY * 1.2))
-
-    if nr_above_bars is True:
+    if nr_above_bars:
         bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
         for x_label, label in zip(counts, bin_centers):  # labels for values
             if x_label == 0:
                 continue
             else:
-                plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1),
-                             xy=(label, x_label + len(con_list1) * 0.01),
-                             xycoords="data", color="#000066", fontsize=10)
+                if rel_freq:
+                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
+                                                       float(x_label)),
+                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
+                                 xycoords="data", color="#000066", fontsize=10)
+                else:
+                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
+                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
+                                 xycoords="data", color="#000066", fontsize=10)
+
+    if nr_unique_chimeras != 0:
+        if (relative and ((counts[len(counts)-1] / nr_unique_chimeras) == 2)) or \
+                (sum(counts) / nr_unique_chimeras) == 2:
+            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
+                .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
+        else:
+            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
+                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
+    else:
+        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
+            lenTags, len_sample, len(numpy.concatenate(list1)))
 
-    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, sum(counts))
-    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
+    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
+    pdf.savefig(fig, bbox_inches="tight")
+    plt.close("all")
+    plt.clf()
+
+
+def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
+                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
+    step = 1
+    fig = plt.figure(figsize=(6, 8))
+    plt.subplots_adjust(bottom=0.1)
+    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
+    maximumY = numpy.amax(p1)
+    bin1 = maximumX + 1
+    if rel_freq:
+        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
+        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
+                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
+                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+        plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
+        plt.ylabel("Relative Frequency", fontsize=14)
+        bins = counts[1]  # width of bins
+        counts = numpy.array(map(float, counts[0][2]))
 
-    # if nr_unique_chimeras != 0 and len_sample != 0:
-    #     if relative == True:
-    #         legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras,
-    #                                                                      int(nr_unique_chimeras) / float(len_sample))
-    #     else:
-    #         legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample))
-    #     plt.text(0.14, -0.09, legend, size=12, transform=plt.gcf().transFigure)
+    else:
+        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
+                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
+                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+        plt.ylim((0, maximumY * 1.2))
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        bins = counts[1]  # width of bins
+        counts = numpy.array(map(int, counts[0][2]))
+
+    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
+    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
+    plt.xlabel(xlabel, fontsize=14)
+    plt.grid(b=True, which='major', color='#424242', linestyle=':')
+    plt.xlim((minimumX - step, maximumX + step))
+    # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
+    plt.xticks(numpy.arange(0, maximumX + step, step))
+
+    if nr_above_bars:
+        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
+        for x_label, label in zip(counts, bin_centers):  # labels for values
+            if x_label == 0:
+                continue
+            else:
+                if rel_freq:
+                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
+                                                       float(x_label)),
+                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
+                                 xycoords="data", color="#000066", fontsize=10)
+                else:
+                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
+                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
+                                 xycoords="data", color="#000066", fontsize=10)
+
+    if nr_unique_chimeras != 0:
+        if (sum(counts) / nr_unique_chimeras) == 2:
+            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
+                format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
+        else:
+            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
+                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
+    else:
+        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
+            lenTags, len_sample, len(numpy.concatenate(list1)))
+    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
+
+    legend2 = "SSCS ab = {:,}\nSSCS ba = {:,}\nDCS = {:,}".format(len(list1[1]), len(list1[2]), len(list1[0]))
+    plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)
 
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
     plt.clf()
 
 
-def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf, len_sample):
+def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
     fig = plt.figure(figsize=(6, 8))
     plt.subplots_adjust(bottom=0.1)
 
     ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # new hd within tags
-
     maximumX = numpy.amax(numpy.concatenate(ham_partial))
     minimumX = numpy.amin(numpy.concatenate(ham_partial))
     maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial))))
@@ -177,21 +262,30 @@
     else:
         range1 = range(minimumX, maximumX + 2)
 
-    plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1)
+    if rel_freq:
+        w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
+        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
+                 label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
+                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
+                 edgecolor='black', linewidth=1)
+        plt.ylabel("Relative Frequency", fontsize=14)
+        # plt.ylim(-0.1, (float(maximumY) / len(numpy.concatenate(ham_partial))) * 1.2)
+    else:
+        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
+                 label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
+                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
+                 edgecolor='black', linewidth=1)
+        plt.ylabel("Absolute Frequency", fontsize=14)
 
     plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
     plt.suptitle('Hamming distances within tags', fontsize=14)
-    # plt.title(title_file1, fontsize=12)
     plt.xlabel("HD", fontsize=14)
-    plt.ylabel("Absolute Frequency", fontsize=14)
     plt.grid(b=True, which='major', color='#424242', linestyle=':')
-
-    plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
+    plt.xlim((minimumX - 1, maximumX + 1))
+    # plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
     plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
-    # plt.ylim(0, maximumY * 1.2)
-    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, len(numpy.concatenate(ham_partial)))
-
-    # legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags)
+    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
+        lenTags, len_sample, len(numpy.concatenate(ham_partial)))
     plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
@@ -206,7 +300,6 @@
         count = numpy.zeros((len(uniqueFS), 6))
     else:
         count = numpy.zeros((len(uniqueFS), 7))
-
     state = 1
     for i in list1:
         counts = list(Counter(i).items())
@@ -218,57 +311,48 @@
             continue
         else:
             if state == 1:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 0] = j[1]
             if state == 2:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 1] = j[1]
-
             if state == 3:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 2] = j[1]
-
             if state == 4:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 3] = j[1]
-
             if state == 5:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 4] = j[1]
-
             if state == 6:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 5] = j[1]
-
             if state == 7:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 6] = j[1]
             state = state + 1
-
         sumRow = count.sum(axis=1)
         sumCol = count.sum(axis=0)
-
     uniqueFS = uniqueFS.astype(str)
     if uniqueFS[len(uniqueFS) - 1] == "20":
         uniqueFS[len(uniqueFS) - 1] = ">20"
-
     first = ["FS={}".format(i) for i in uniqueFS]
     final = numpy.column_stack((first, count, sumRow))
-
     return (final, sumCol)
 
 
@@ -276,13 +360,15 @@
     output_file.write(name)
     output_file.write("\n")
     if diff is False:
-        output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
+        output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(
+            sep, sep, sep, sep, sep, sep, sep, sep))
     else:
         if rel is False:
-            output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
+            output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(
+                sep, sep, sep, sep, sep, sep, sep, sep, sep))
         else:
-            output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
-
+            output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".
+                              format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
     for item in summary:
         for nr in item:
             if "FS" not in nr and "diff" not in nr:
@@ -314,46 +400,40 @@
             continue
         else:
             if state == 1:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 0] = j[1]
             if state == 2:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 1] = j[1]
-
             if state == 3:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 2] = j[1]
-
             if state == 4:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 3] = j[1]
-
             if state == 5:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 4] = j[1]
-
             if state == 6:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 5] = j[1]
             state = state + 1
-
         sumRow = count.sum(axis=1)
         sumCol = count.sum(axis=0)
         first = ["{}{}".format(row_label, i) for i in uniqueHD]
         final = numpy.column_stack((first, count, sumRow))
-
     return (final, sumCol)
 
 
@@ -362,7 +442,6 @@
     uniqueHD = numpy.unique(selfAB)
     nr = numpy.arange(0, len(uniqueHD), 1)
     count = numpy.zeros((len(uniqueHD), 5))
-
     state = 1
     for i in list1:
         counts = list(Counter(i).items())
@@ -374,45 +453,81 @@
             continue
         else:
             if state == 1:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 0] = j[1]
             if state == 2:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 1] = j[1]
             if state == 3:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 2] = j[1]
             if state == 4:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 3] = j[1]
             if state == 5:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 4] = j[1]
-
             state = state + 1
-
         sumRow = count.sum(axis=1)
         sumCol = count.sum(axis=0)
         first = ["HD={}".format(i) for i in uniqueHD]
         final = numpy.column_stack((first, count, sumRow))
+    return (final, sumCol)
 
+
+def createTableHDwithDCS(list1):
+    selfAB = numpy.concatenate(list1)
+    uniqueHD = numpy.unique(selfAB)
+    nr = numpy.arange(0, len(uniqueHD), 1)
+    count = numpy.zeros((len(uniqueHD), len(list1)))
+    state = 1
+    for i in list1:
+        counts = list(Counter(i).items())
+        hd = [item[0] for item in counts]
+        c = [item[1] for item in counts]
+        table = numpy.column_stack((hd, c))
+        if len(table) == 0:
+            state = state + 1
+            continue
+        else:
+            if state == 1:
+                for k, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 0] = j[1]
+            if state == 2:
+                for k, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 1] = j[1]
+            if state == 3:
+                for k, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 2] = j[1]
+            state = state + 1
+        sumRow = count.sum(axis=1)
+        sumCol = count.sum(axis=0)
+        first = ["HD={}".format(i) for i in uniqueHD]
+        final = numpy.column_stack((first, count, sumRow))
     return (final, sumCol)
 
 
 def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
     output_file.write(name)
     output_file.write("\n")
-    output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
+    output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(
+        sep, sep, sep, sep, sep, sep, sep, sep))
     for item in summary:
         for nr in item:
             if "HD" not in nr and "diff" not in nr:
@@ -428,10 +543,29 @@
     output_file.write("\n\n")
 
 
+def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep):
+    output_file.write(name)
+    output_file.write("\n")
+    output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep))
+    for item in summary:
+        for nr in item:
+            if "HD" not in nr:
+                nr = nr.astype(float)
+                nr = nr.astype(int)
+            output_file.write("{}{}".format(nr, sep))
+        output_file.write("\n")
+    output_file.write("sum{}".format(sep))
+    sumCol = map(int, sumCol)
+    for el in sumCol:
+        output_file.write("{}{}".format(el, sep))
+    output_file.write("{}{}".format(overallSum.astype(int), sep))
+    output_file.write("\n\n")
+
+
 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
     output_file.write(name)
     output_file.write("\n")
-    output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
+    output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b', a'+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
     for item in summary:
         for nr in item:
             if "HD" not in nr:
@@ -454,17 +588,14 @@
     for a in array1:
         dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2])  # fastest
         res[i] = numpy.amin(dist[dist > 0])  # pick min distance greater than zero
-        # print(i)
         i += 1
     return res
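
For reference, the per-position Hamming distance computed here reduces to the following minimal sketch (Python 2, matching the script's use of itertools.imap; the two tags are hypothetical equal-length strings):

    import itertools
    import operator

    tag_a = "AAACCTGGGTAA"
    tag_b = "AAACGTGGGTAT"
    hd = sum(itertools.imap(operator.ne, tag_a, tag_b))  # number of mismatching positions
    print(hd)  # 2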
 
 
 def hamming_difference(array1, array2, mate_b):
     array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
-
     array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1])  # mate1 part1
     array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1])  # mate1 part 2
-
     array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2])  # mate2 part1
     array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2])  # mate2 part2
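
The splitting into tag halves above amounts to the following minimal sketch (Python 2 integer division; the 24-nt tag is hypothetical):

    tag = "AAAACCCCGGGGTTTTACGTACGT"     # hypothetical 24-nt duplex tag
    half_a = tag[0:len(tag) / 2]         # "AAAACCCCGGGG"
    half_b = tag[len(tag) / 2:len(tag)]  # "TTTTACGTACGT"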
 
@@ -499,6 +630,7 @@
         half2_mate1 = array1_half
         half1_mate2 = array2_half2
         half2_mate2 = array2_half
+
     # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True)
     # print(len(half1_mate1))
     # half2_mate1 = half2_mate1[index_halves]
@@ -514,16 +646,18 @@
         array2_half_withoutSame = half1_mate2[index_withoutSame]
         array2_half2_withoutSame = half2_mate2[index_withoutSame]
         array2_withoutSame = array2[index_withoutSame]  # whole tag (= not split into 2 halves)
-
+        # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
         dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
-                            array2_half_withoutSame])  # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
+                            array2_half_withoutSame])
         min_index = numpy.where(dist == dist.min())[0]  # get index of min HD
         min_value = dist.min()
         # min_value = dist[min_index]  # get minimum HDs
-        min_tag_half2 = array2_half2_withoutSame[min_index]  # get all "b's" of the tag or all "a's" of the tag with minimum HD
+        # get all "b's" of the tag or all "a's" of the tag with minimum HD
+        min_tag_half2 = array2_half2_withoutSame[min_index]
         min_tag_array2 = array2_withoutSame[min_index]  # get whole tag with min HD
 
-        dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
+        dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
+                                        min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
         max_value = dist_second_half.max()
         max_index = numpy.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
         max_tag = min_tag_array2[max_index]
@@ -548,14 +682,11 @@
             min_tagsList_zeros.append(numpy.array(tag))
             difference1_zeros = abs(min_value - max_value)  # hd of non-identical part
             diff11_zeros.append(difference1_zeros)
-            max_tag_list.append(max_tag)
+            max_tag_list.append(numpy.array(max_tag))
         else:
             min_tagsList_zeros.append(None)
             diff11_zeros.append(None)
-            max_tag_list.append(numpy.array(["None"]))
-
-            # max_tag_list.append(numpy.array(max_tag))
-
+            max_tag_list.append(None)
         i += 1
 
     # print(i)
@@ -567,7 +698,8 @@
     # relativeDiffList = [st for st in relativeDiffList if st != 999]
     # diff11_zeros = [st for st in diff11_zeros if st != 999]
     # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
-    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min, max_tag_list])
+    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros,
+             min_tagsList_zeros, ham1min, ham2min, max_tag_list])
 
 
 def readFileReferenceFree(file):
@@ -608,7 +740,6 @@
 def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
     hammingDistances = numpy.unique(ham)
     fs = numpy.asarray(fs)
-
     ham = numpy.asarray(ham)
     bigFamilies2 = numpy.where(fs > 19)[0]
     if len(bigFamilies2) != 0:
@@ -663,6 +794,53 @@
     return(list1, hammingDistances, maximum, minimum)
 
 
+def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array):
+    diff_zeros = numpy.array(diff_zeros)
+    maximum = numpy.amax(diff_zeros)
+    minimum = numpy.amin(diff_zeros)
+    minHD_tags_zeros = numpy.array(minHD_tags_zeros)
+
+    idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros])
+    subset_data = data_array[idx, :]
+
+    seq = numpy.array(subset_data[:, 1])
+
+    # find all unique tags and get the indices for ALL tags, but only once
+    u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+    DCS_tags = u[c == 2]
+    rest_tags = u[c == 1]
+
+    dcs = numpy.repeat("DCS", len(DCS_tags))
+    idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags])
+    sscs = subset_data[idx_sscs, 2]
+
+    all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])),
+                                   numpy.concatenate((dcs, sscs))))
+    hd_DCS = []
+    ab_SSCS = []
+    ba_SSCS = []
+
+    for i in range(len(all_tags)):
+        tag = all_tags[i, :]
+        hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]]
+
+        if tag[1] == "DCS":
+            hd_DCS.append(hd)
+        elif tag[1] == "ab":
+            ab_SSCS.append(hd)
+        elif tag[1] == "ba":
+            ba_SSCS.append(hd)
+
+    if len(hd_DCS) != 0:
+        hd_DCS = numpy.concatenate(hd_DCS)
+    if len(ab_SSCS) != 0:
+        ab_SSCS = numpy.concatenate(ab_SSCS)
+    if len(ba_SSCS) != 0:
+        ba_SSCS = numpy.concatenate(ba_SSCS)
+    list1 = [hd_DCS, ab_SSCS, ba_SSCS]  # list for plotting
+    return(list1, maximum, minimum)
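
The DCS/SSCS split in this function relies on counting how often a tag sequence occurs in the ab/ba data; a minimal sketch with hypothetical tags:

    import numpy

    tags = numpy.array(["AAAATTTT", "AAAATTTT", "CCCCGGGG", "GGGGCCCC"])  # hypothetical tag column
    u, c = numpy.unique(tags, return_counts=True)
    dcs_tags = u[c == 2]   # tag present as both ab and ba -> forms a DCS
    sscs_tags = u[c == 1]  # tag present only once         -> single SSCS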
+
+
 def make_argparser():
     parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
     parser.add_argument('--inputFile',
@@ -678,11 +856,15 @@
                         help='Only tags of the DCSs are included in the HD analysis')
 
     parser.add_argument('--minFS', default=1, type=int,
-                        help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis')
+                        help='Only tags with a family size greater than or equal to the specified value '
+                             'are included in the HD analysis')
     parser.add_argument('--maxFS', default=0, type=int,
-                        help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis')
+                        help='Only tags with a family size smaller than or equal to the specified value '
+                             'are included in the HD analysis')
     parser.add_argument('--nr_above_bars', action="store_true",
-                        help='If no, values above bars in the histograms are removed')
+                        help='If set, the values are displayed above the bars in the histograms')
+    parser.add_argument('--rel_freq', action="store_false",
+                        help='If True, the relative frequencies are displayed.')
 
     parser.add_argument('--output_tabular', default="data.tabular", type=str,
                         help='Name of the tabular file.')
@@ -695,36 +877,33 @@
 
 
 def Hamming_Distance_Analysis(argv):
-
+# def Hamming_Distance_Analysis(file1, name1, index_size, title_savedFile_pdf, title_savedFile_csv,
+#                               output_chimeras_tabular, onlyDuplicates, minFS=1, maxFS=0, nr_above_bars=True,
+#                               subset=False, nproc=12, rel_freq=False):
     parser = make_argparser()
     args = parser.parse_args(argv[1:])
-
     file1 = args.inputFile
     name1 = args.inputName1
-
     index_size = args.sample_size
     title_savedFile_pdf = args.output_pdf
     title_savedFile_csv = args.output_tabular
     output_chimeras_tabular = args.output_chimeras_tabular
-
-    sep = "\t"
     onlyDuplicates = args.only_DCS
+    rel_freq = args.rel_freq
     minFS = args.minFS
     maxFS = args.maxFS
     nr_above_bars = args.nr_above_bars
-
     subset = args.subset_tag
     nproc = args.nproc
+    sep = "\t"
 
     # input checks
     if index_size < 0:
         print("index_size is a negative integer.")
         exit(2)
-
     if nproc <= 0:
         print("nproc is smaller than or equal to zero.")
         exit(3)
-
     if subset < 0:
         print("subset_tag is a negative integer.")
         exit(5)
@@ -735,22 +914,31 @@
     plt.rcParams['ytick.labelsize'] = 14
     plt.rcParams['patch.edgecolor'] = "#000000"
     plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
-
     name1 = name1.split(".tabular")[0]
 
     with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
         print("dataset: ", name1)
         integers, data_array = readFileReferenceFree(file1)
         data_array = numpy.array(data_array)
-        print("total nr of tags with Ns:", len(data_array))
-        n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x]
-        if len(n) != 0:  # delete tags with N in the tag from data
-            print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array))
+        print("total nr of tags:", len(data_array))
+
+        # filter out tags that contain any character other than A, T, C or G
+        valid_bases = ["A", "T", "G", "C"]
+        tagsToDelete = []
+        for idx, t in enumerate(data_array[:, 1]):
+            for char in t:
+                if char not in valid_bases:
+                    tagsToDelete.append(idx)
+                    break
+
+        if len(tagsToDelete) != 0:  # delete tags with non-ACGT characters from the data
+            print("nr of tags with characters other than A, T, C, G:", len(tagsToDelete),
+                  float(len(tagsToDelete)) / len(data_array))
             index_whole_array = numpy.arange(0, len(data_array), 1)
-            index_withoutN_inTag = numpy.delete(index_whole_array, n)
+            index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete)
             data_array = data_array[index_withoutN_inTag, :]
             integers = integers[index_withoutN_inTag]
-            print("total nr of tags without Ns:", len(data_array))
+            print("total nr of tags after filtering:", len(data_array))
 
         int_f = numpy.array(data_array[:, 0]).astype(int)
         data_array = data_array[numpy.where(int_f >= minFS)]
@@ -768,7 +956,7 @@
 
             # find all unique tags and get the indices for ALL tags, but only once
             u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
-            d = u[c > 1]
+            d = u[c == 2]
 
             # get family sizes, tag for duplicates
             duplTags_double = integers[numpy.in1d(seq, d)]
@@ -779,9 +967,9 @@
             duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags
 
             if minFS > 1:
-                duplTags_tag = duplTags_tag[(duplTags >= 3) & (duplTagsBA >= 3)]
-                duplTags_seq = duplTags_seq[(duplTags >= 3) & (duplTagsBA >= 3)]
-                duplTags = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)]  # ab+ba with FS>=3
+                duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)]
+                duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)]
+                duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)]  # ab+ba with FS >= minFS
 
             data_array = numpy.column_stack((duplTags, duplTags_seq))
             data_array = numpy.column_stack((data_array, duplTags_tag))
@@ -819,15 +1007,32 @@
         else:
             numpy.random.shuffle(data_array)
             unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True)  # get only unique tags
-            result = numpy.random.choice(unique_indices, size=index_size, replace=False)  # array of random sequences of size=index.size
+            result = numpy.random.choice(unique_indices, size=index_size,
+                                         replace=False)  # array of random sequences of size=index.size
 
             # result = numpy.random.choice(len(integers), size=index_size,
             #                             replace=False)  # array of random sequences of size=index.size
             # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0]
 
-        # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
+        # with open("index_result.pkl", "wb") as o:
         #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
 
+        # save counts
+        # with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f:
+        #     pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
+        # with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1:
+        #     pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL)
+        #
+        # with open(data_folder + "index_sampleTags100.pkl", "rb") as f:
+        #     result = pickle.load(f)
+        #
+        # with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1:
+        #     data_array = pickle.load(f1)
+
+        # with open(data_folder + "index_result.txt", "w") as t:
+        #     for text in result:
+        #         t.write("{}\n".format(text))
+
         # comparison random tags to whole dataset
         result1 = data_array[result, 1]  # random tags
         result2 = data_array[:, 1]  # all tags
@@ -873,10 +1078,9 @@
         minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
         minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
         minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])
-        chim_tags = [item[10] for item in diff_list_a]
-        chim_tags2 = [item[10] for item in diff_list_b]
-        chimera_tags1 = [ii if isinstance(i, list) else i for i in chim_tags for ii in i]
-        chimera_tags2 = [ii if isinstance(i, list) else i for i in chim_tags2 for ii in i]
+
+        chimera_tags1 = sum([item[10] for item in diff_list_a], [])
+        chimera_tags2 = numpy.concatenate([item[10] for item in diff_list_b])
 
         rel_Diff = []
         diff_zeros = []
@@ -884,7 +1088,8 @@
         diff = []
         chimera_tags = []
         for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
-                zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, chimera_tags1, chimera_tags2):
+                zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2,
+                    chimera_tags1, chimera_tags2):
             rel_Diff.append(max(rel1, rel2))
             diff.append(max(d1, d2))
 
@@ -903,7 +1108,7 @@
                 chimera_tags.append(ctag2)
 
         chimera_tags_new = chimera_tags
-        #data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
+        data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
         # chimeras_dic = defaultdict(list)
         #
         # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
@@ -911,71 +1116,63 @@
         #         t2 = numpy.concatenate(t2)
         #     chimeras_dic[t1].append(t2)
 
+        checked_tags = []
+        stat_maxTags = []
+
         with open(output_chimeras_tabular, "w") as output_file1:
-            output_file1.write("chimera tag\tsimilar tag with HD=0\n")
-            for i in range(len(minHD_tags_zeros)):
-                tag1 = minHD_tags_zeros[i]
+            output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with HD=0\n")
+            for i in range(len(data_chimeraAnalysis)):
+                tag1 = data_chimeraAnalysis[i, 0]
+
+                info_tag1 = data_array[data_array[:, 1] == tag1, :]
+                fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1]
+
+                if tag1 in checked_tags:  # skip tag if already written to file
+                    continue
+
                 sample_half_a = tag1[0:(len(tag1)) / 2]
                 sample_half_b = tag1[len(tag1) / 2:len(tag1)]
 
-                max_tags = chimera_tags_new[i]
-                if isinstance(max_tags, list) and len(max_tags) > 1:
+                max_tags = data_chimeraAnalysis[i, 1]
+                if len(max_tags) > 1 and type(max_tags) is not numpy.ndarray:
                     max_tags = numpy.concatenate(max_tags)
-                #if isinstance(max_tags, list): #and type(max_tags) is not numpy.ndarray:
-                #    print(max_tags)
-                #    max_tags = numpy.concatenate(max_tags)
                 max_tags = numpy.unique(max_tags)
+                stat_maxTags.append(len(max_tags))
 
-                chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags])  # mate1 part1
-                chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags])  # mate1 part 2
+                info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags]
+
+                chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags])  # mate1 part1
+                chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags])  # mate1 part 2
 
                 new_format = []
                 for j in range(len(max_tags)):
+                    fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]]
+
                     if sample_half_a == chimera_half_a[j]:
-                        max_tag = "*{}* {}".format(chimera_half_a[j], chimera_half_b[j])
+                        max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                         new_format.append(max_tag)
 
                     elif sample_half_b == chimera_half_b[j]:
-                        max_tag = "{} *{}*".format(chimera_half_a[j], chimera_half_b[j])
+                        max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                         new_format.append(max_tag)
+                    checked_tags.append(max_tags[j])
 
-                sample_tag = "{} {}".format(sample_half_a, sample_half_b)
+                sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1))
                 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
-            output_file1.write(
-                "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
-                "The tags were separated by an empty space into their halves and the * marks the identical half.")
+
+                checked_tags.append(tag1)
 
-            # unique_chimeras = numpy.array(minHD_tags_zeros)
-            #
-            # sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras])  # mate1 part1
-            # sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras])  # mate1 part 2
-            #
-            # output_file1.write("sample tag\tsimilar tag\n")
-            # for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b):
-            #     max_tags = numpy.concatenate(chimeras_dic.get(tag1))
-            #     max_tags = numpy.unique(max_tags)
-            #
-            #     chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags])  # mate1 part1
-            #     chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags])  # mate1 part 2
-            #
-            #     new_format = []
-            #     for i in range(len(max_tags)):
-            #         if a == chimera_half_a[i]:
-            #             max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i])
-            #             new_format.append(max_tag)
-            #
-            #         elif b == chimera_half_b[i]:
-            #             max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i])
-            #             new_format.append(max_tag)
-            #
-            #     sample_tag = "{} {}".format(a, b)
-            #     output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
-            # output_file1.write(
-            #     "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
-            #     "The tags were separated by an empty space into their halves and the * marks the identical half.")
-
-        nr_chimeric_tags = len(minHD_tags_zeros)
-        print("nr of unique chimeras", nr_chimeric_tags)
+            output_file1.write(
+                "This file contains all tags identified as chimeras in the first column, their family sizes and "
+                "read directions in the second column, and in the third column the corresponding tags that "
+                "returned a Hamming distance of zero in either the first or the second half of the sample tag.\n"
+                "The tags are separated by an empty space into their halves and the * marks the identical half.")
+            output_file1.write("\n\nStatistics of nr. of tags that returned the max. HD (3rd column)\n")
+            output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags))))
+            output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags))))
+            output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags))))
+            output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags))))
+            output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags))))
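
A data row of the resulting chimeras file then looks roughly like the following (hypothetical tags and family sizes; the three columns are tab-separated and the * marks the identical half):

    chimera tag                    family size, read direction    similar tag with HD=0
    AAAACCCCGGGG TTTTACGTACGT      1 ba                           *AAAACCCCGGGG* GGGGTTTTCCCC 5 ab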
 
         lenTags = len(data_array)
         len_sample = len(result1)
@@ -992,6 +1189,9 @@
             rel_Diff = numpy.tile(rel_Diff, 2)
             diff_zeros = numpy.tile(diff_zeros, 2)
 
+        nr_chimeric_tags = len(data_chimeraAnalysis)
+        print("nr of chimeras", nr_chimeric_tags)
+
         # prepare data for different kinds of plots
         # distribution of FSs separated after HD
         familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
@@ -1011,8 +1211,8 @@
             lst_minHD_tags.append(seqDic.get(i))
 
         if onlyDuplicates:
-            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], [item_b[1] for item_b in lst_minHD_tags])).astype(int)
-
+            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
+                                                [item_b[1] for item_b in lst_minHD_tags])).astype(int)
         # histogram with absolute and relative difference between HDs of both parts of the tag
         listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
         listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
@@ -1023,43 +1223,55 @@
             for i in minHD_tags_zeros:
                 lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
             if onlyDuplicates:
-                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
+                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
+                                                          [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
 
             # histogram with HD of non-identical half
-            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros)
+            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
+                lst_minHD_tags_zeros, diff_zeros)
+
+            if onlyDuplicates is False:
+                listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros,
+                                                                                             diff_zeros, data_array)
 
         # plot Hamming Distance with Family size distribution
-        plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
-                      subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags,
+        plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq,
+                      subtitle="Hamming distance separated by family size", lenTags=lenTags,
                       xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample)
 
         # Plot FSD with separation after
-        plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
+        plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq,
                        originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
-                       pdf=pdf, relative=False, title_file1=name1, diff=False)
+                       pdf=pdf, relative=False, diff=False)
 
         # Plot HD within tags
-        plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
-                             title_file1=name1, len_sample=len_sample)
+        plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
+                        rel_freq=rel_freq, len_sample=len_sample)
 
         # Plot difference between HD's separated after FSD
         plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
-                      subtitle="Delta Hamming distance within tags",
-                      title_file1=name1, lenTags=lenTags,
+                      subtitle="Delta Hamming distance within tags", lenTags=lenTags, rel_freq=rel_freq,
                       xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)
 
         plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
-                      subtitle="Chimera Analysis: relative delta Hamming distances",
-                      title_file1=name1, lenTags=lenTags,
-                      xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
+                      subtitle="Chimera Analysis: relative delta Hamming distance", lenTags=lenTags, rel_freq=rel_freq,
+                      xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars,
+                      nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
 
         # plots for chimeric reads
         if len(minHD_tags_zeros) != 0:
             # HD
-            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, subtitle="Hamming distance of chimeras",
-                          title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False,
+            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
+                          subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq,
+                          lenTags=lenTags, xlabel="HD", relative=False,
                           nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
 
+            if onlyDuplicates is False:
+                plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf,
+                              subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq,
+                              lenTags=lenTags, xlabel="HD", relative=False,
+                              nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
+
         # print all data to a CSV file
         # HD
         summary, sumCol = createTableHD(list1, "HD=")
@@ -1087,9 +1299,12 @@
             summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")
             overallSum15 = sum(sumCol15)
 
+            if onlyDuplicates is False:
+                summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros)
+                overallSum16 = sum(sumCol16)
+
         output_file.write("{}\n".format(name1))
-        output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
-            numpy.concatenate(list1)), lenTags, lenTags))
+        output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample))
 
         # HD
         createFileHD(summary, sumCol, overallSum, output_file,
@@ -1109,27 +1324,36 @@
 
         # HD within tags
         output_file.write(
-            "The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).\n"
-            "For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').\n"
-            "Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n"
-            "These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD  For simplicity we used the maximum value between the delta values in the end.\n"
-            "When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.\n")
+            "The Hamming distances were calculated by comparing the first halve against all halves and selected the "
+            "minimum value (HD a).\nFor the second half of the tag, we compared them against all tags which resulted "
+            "in the minimum HD of the previous step and selected the maximum value (HD b').\nFinally, it was possible "
+            "to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n"
+            "These calculations were repeated, but starting with the second half in the first step to find all "
+            "possible chimeras in the data (HD b and HD  For simplicity we used the maximum value between the delta "
+            "values in the end.\nWhen only tags that can form DCS were allowed in the analysis, family sizes for the "
+            "forward and reverse (ab and ba) will be included in the plots.\n")
 
-        output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
+        output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2))
 
         createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
                               "Hamming distance of each half in the tag", sep)
         createFileHD(summary11, sumCol11, overallSum11, output_file,
-                     "Absolute delta Hamming distances within the tag", sep)
+                     "Absolute delta Hamming distance within the tag", sep)
 
         createFileHD(summary13, sumCol13, overallSum13, output_file,
-                     "Chimera analysis: relative delta Hamming distances", sep)
+                     "Chimera analysis: relative delta Hamming distance", sep)
 
         if len(minHD_tags_zeros) != 0:
             output_file.write(
-                "Chimeras:\nAll tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances of the chimeric tags are shown.\n")
+                "All tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, "
+                "had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances "
+                "of the chimeric tags are shown.\n")
             createFileHD(summary15, sumCol15, overallSum15, output_file,
-                         "Hamming distances of chimeras", sep)
+                         "Hamming distance of chimeric families separated after FS", sep)
+
+            if onlyDuplicates is False:
+                createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
+                                    "Hamming distance of chimeric families separated after DCS and single SSCS", sep)
 
         output_file.write("\n")
 
--- a/hd.xml	Mon Jun 03 05:37:01 2019 -0400
+++ b/hd.xml	Wed Jul 24 05:58:15 2019 -0400
@@ -1,12 +1,12 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="hd" name="HD:" version="1.0.0">
+<tool id="hd" name="HD:" version="1.0.1">
     <description>hamming distance analysis of duplex tags</description>
     <requirements>
         <requirement type="package" version="2.7">python</requirement>
         <requirement type="package" version="1.4.0">matplotlib</requirement>
     </requirements>
     <command>
-        python2 '$__tool_directory__/hd.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS
+        python2 '$__tool_directory__/hd.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS $rel_freq --minFS $minFS --maxFS $maxFS
 		$nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular --output_chimeras_tabular $output_chimeras_tabular
     </command>
     <inputs>
@@ -15,6 +15,7 @@
         <param name="minFS" type="integer" label="minimum family size of the tags" min="1" value="1" help="filters the tags after their family size: Families with a smaller size are skipped. Default: min. family size = 1."/>
         <param name="maxFS" type="integer" label="max family size of the tags" min="0" value="0" help="filters the tags after their family size: Families with a larger size are skipped. If max. family size is 0, no upper bound is defined and the maximum family size in the analysis will be the maximum family size of the whole dataset. Default: max. family size = 0."/>
         <param name="onlyDCS" type="boolean" label="only DCS in the analysis?" truevalue="" falsevalue="--only_DCS" checked="False" help="Only tags, which have a partner tag (ab and ba) in the dataset, are included in the analysis."/>
+        <param name="rel_freq" type="boolean" label="relative frequency?" truevalue="" falsevalue="--rel_freq" checked="False" help="If True, the relative frequencies instead of the absolute values are displayed in the plots."/>
         <param name="subsetTag" type="integer" label="shorten tag in the analysis?" value="0" help="By this parameter an analysis with shorter tag length is simulated. If this parameter is 0 (by default), the tags with its original length are used in the analysis."/>
         <param name="nproc" type="integer" label="number of processors" value="8" help="Number of processor used for computing."/>
         <param name="nr_above_bars" type="boolean" label="include numbers above bars?" truevalue="--nr_above_bars" falsevalue="" checked="True" help="The absolute and relative values of the data can be included or removed from the plots. "/>
@@ -93,7 +94,9 @@
     - Higher relative differences occur either due to low total HDs and/or larger absolute differences, both things that indicate that 2 tags were originally the same tag.
     - A relative difference of 1 means that one part of the tags is identical. Since it is very unlikely that by chance two different tags have a HD of 0 between one of their parts, the HDs in the other part are probably artificially introduced (chimeric reads).
 
- 6. The last page contains a graph representing the **HD of the chimeric tags** which is at the same time the HD of the non-identical halves of the tags with a relative difference of 1 from the previous page.
+ 6. The next page contains a graph representing the **HD of the chimeric tags**, which is at the same time the HD of the non-identical halves of the chimeric tags with a relative difference of 1 from the previous page.
+
+ 7. The last page is only generated when the parameter "only DCS in the analysis?" is **False**. The graph again represents the **HD of the chimeric tags**, i.e. the HD of their non-identical halves, and indicates whether or not these tags can form a DCS.
 
  .. class:: infomark
 
--- a/test-data/hd_chimeras_output.tab	Mon Jun 03 05:37:01 2019 -0400
+++ b/test-data/hd_chimeras_output.tab	Wed Jul 24 05:58:15 2019 -0400
@@ -1,23 +1,22 @@
-chimera tag	similar tag with HD=0
-AAAAAAAAAAAA AACCAAAACTTC	*AAAAAAAAAAAA* TCTTTCTTTGAG
-AAAAAAAAAAAA ACCAGGCGTCGA	*AAAAAAAAAAAA* AACCAAAACTTC, *AAAAAAAAAAAA* AGCTCCACGTTG, *AAAAAAAAAAAA* CAGTGTTGAGAC, *AAAAAAAAAAAA* TCTTTCTTTGAG, *AAAAAAAAAAAA* TTGGGTTCCTTA
-AAAAAAAAAAAA AGCTCCACGTTG	*AAAAAAAAAAAA* CAGTGTTGAGAC, *AAAAAAAAAAAA* CCGCTCCTCACA
-AAAAAAAAAAAA ATCGTGGTTTGT	*AAAAAAAAAAAA* CAGTGTTGAGAC, AAAAAAAAAAAG *ATCGTGGTTTGT*
-AAAAAAAAAAAA ATTCACCCTTGT	*AAAAAAAAAAAA* CAGTGTTGAGAC, AAAAAAAAAAAT *ATTCACCCTTGT*
-AAAAAAAAAAAA CACACTTAACTT	*AAAAAAAAAAAA* ATTCACCCTTGT, *AAAAAAAAAAAA* CCGCTCCTCACA, *AAAAAAAAAAAA* TCTTTCTTTGAG
-AAAAAAAAAAAA CAGTGTTGAGAC	*AAAAAAAAAAAA* ATCGTGGTTTGT, *AAAAAAAAAAAA* ATTCACCCTTGT, *AAAAAAAAAAAA* CACACTTAACTT
-AAAAAAAAAAAA CCGCTCCTCACA	*AAAAAAAAAAAA* AGCTCCACGTTG, *AAAAAAAAAAAA* CACACTTAACTT
-AAAAAAAAAAAA GGCAACACAGAA	*AAAAAAAAAAAA* ATCGTGGTTTGT, AAAAAAAAAAAG *GGCAACACAGAA*
-AAAAAAAAAAAA TCTTTCTTTGAG	*AAAAAAAAAAAA* AACCAAAACTTC, AAAAAAAAAAAG *TCTTTCTTTGAG*
-AAAAAAAAAAAA TTGGGTTCCTTA	*AAAAAAAAAAAA* ACCAGGCGTCGA, *AAAAAAAAAAAA* GGCAACACAGAA, *AAAAAAAAAAAA* TCTTTCTTTGAG
-AAAAAAAAAAAG AGTCGCACCCAG	*AAAAAAAAAAAG* ATCGTGGTTTGT
-AAAAAAAAAAAG ATCGTGGTTTGT	AAAAAAAAAAAA *ATCGTGGTTTGT*, *AAAAAAAAAAAG* TAGCCCTAAACG
-AAAAAAAAAAAG CGCAACACAGAA	*AAAAAAAAAAAG* ATCGTGGTTTGT
-AAAAAAAAAAAG GGCAACACAGAA	AAAAAAAAAAAA *GGCAACACAGAA*, *AAAAAAAAAAAG* ATCGTGGTTTGT
-AAAAAAAAAAAG TAGCCCTAAACG	*AAAAAAAAAAAG* ATCGTGGTTTGT
-AAAAAAAAAAAG TCTTTCTTTGAG	AAAAAAAAAAAA *TCTTTCTTTGAG*, *AAAAAAAAAAAG* ATCGTGGTTTGT, *AAAAAAAAAAAG* CGCAACACAGAA, *AAAAAAAAAAAG* GGCAACACAGAA
-AAAAAAAAAAAT ATCATAGACTCT	*AAAAAAAAAAAT* ATTCACCCTTGT
-AAAAAAAAAAAT ATTCACCCTTGT	AAAAAAAAAAAA *ATTCACCCTTGT*, *AAAAAAAAAAAT* ATCATAGACTCT
-AAAAAAAAAAAT ATTCGAAAGTTA	*AAAAAAAAAAAT* ATCATAGACTCT, *AAAAAAAAAAAT* ATTCACCCTTGT
+chimera tag	family size, read direction	similar tag with HD=0
+AAAAAAAAAAAA AACCAAAACTTC	1 ba	*AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab
+AAAAAAAAAAAA ACCAGGCGTCGA	1 ba	*AAAAAAAAAAAA* AACCAAAACTTC 1 ba, *AAAAAAAAAAAA* AGCTCCACGTTG 1 ba, *AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab, *AAAAAAAAAAAA* TTGGGTTCCTTA 1 ab
+AAAAAAAAAAAA ATCGTGGTTTGT	1 ba	*AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, AAAAAAAAAAAG *ATCGTGGTTTGT* 4 ba
+AAAAAAAAAAAA ATTCACCCTTGT	1 ba	*AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, AAAAAAAAAAAT *ATTCACCCTTGT* 6 ba
+AAAAAAAAAAAA CACACTTAACTT	7 ba	*AAAAAAAAAAAA* ATTCACCCTTGT 1 ba, *AAAAAAAAAAAA* CCGCTCCTCACA 4 ba, *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab
+AAAAAAAAAAAA GGCAACACAGAA	1 ab	*AAAAAAAAAAAA* ATCGTGGTTTGT 1 ba, AAAAAAAAAAAG *GGCAACACAGAA* 3 ab
+AAAAAAAAAAAG AGTCGCACCCAG	1 ba	*AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba
+AAAAAAAAAAAG CGCAACACAGAA	1 ab	*AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba
+AAAAAAAAAAAG TAGCCCTAAACG	1 ab	*AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba
+AAAAAAAAAAAG TCTTTCTTTGAG	1 ab	AAAAAAAAAAAA *TCTTTCTTTGAG* 2 ab, *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba, *AAAAAAAAAAAG* CGCAACACAGAA 1 ab, *AAAAAAAAAAAG* GGCAACACAGAA 3 ab
+AAAAAAAAAAAT ATCATAGACTCT	1 ab	*AAAAAAAAAAAT* ATTCACCCTTGT 6 ba
+AAAAAAAAAAAT ATTCGAAAGTTA	1 ba	*AAAAAAAAAAAT* ATCATAGACTCT 1 ab, *AAAAAAAAAAAT* ATTCACCCTTGT 6 ba
 This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.
- The tags were separated by an empty space into their halves and the * marks the identical half.
\ No newline at end of file
+The tags were separated by an empty space into their halves and the * marks the identical half.
+
+Statistics of nr. of tags that returned max. HD (2nd column)
+minimum	1	tag(s)
+mean	2.08333333333	tag(s)
+median	2.0	tag(s)
+maximum	5	tag(s)
+sum	25	tag(s)
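
The appended statistics block can be reproduced from the partner counts listed in the rows above (1, 5, 2, 2, 3, 2, 1, 1, 1, 4, 1, 2 tags with HD=0 per chimeric tag). A small numpy sketch, assuming the summary is nothing more than min/mean/median/max/sum over those counts:

    import numpy

    # tags returning HD=0 in one half, per chimeric tag (read off the rows above)
    partners = numpy.array([1, 5, 2, 2, 3, 2, 1, 1, 1, 4, 1, 2])

    print("minimum\t%d" % partners.min())          # 1
    print("mean\t%s" % partners.mean())            # 2.0833...
    print("median\t%s" % numpy.median(partners))   # 2.0
    print("maximum\t%d" % partners.max())          # 5
    print("sum\t%d" % partners.sum())              # 25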
Binary file test-data/hd_output.pdf has changed
--- a/test-data/hd_output.tab	Mon Jun 03 05:37:01 2019 -0400
+++ b/test-data/hd_output.tab	Wed Jul 24 05:58:15 2019 -0400
@@ -1,5 +1,6 @@
 hd_data.tab
-number of tags per file	20 (from 20) against 20
+nr of tags	20
+sample size	20
 
 Hamming distance separated by family size
 	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
@@ -29,10 +30,11 @@
 Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).
 These calculations were repeated, but starting with the second half in the first step, to find all possible chimeras in the data (HD b and HD a'). For simplicity, we used the maximum of the two delta values in the end.
 When only tags that can form a DCS are allowed in the analysis, family sizes for the forward and reverse reads (ab and ba) are included in the plots.
-length of one part of the tag = 12
+
+length of one half of the tag	12
 
 Hamming distance of each half in the tag
-	HD a	HD b'	HD b	HD a'	HD a+b	sum	
+	HD DCS	HD b'	HD b	HD a'	HD a+b', a'+b	sum	
 HD=0	20	0	8	1	0	29	
 HD=1	0	0	1	19	8	28	
 HD=2	0	0	0	0	1	1	
@@ -46,7 +48,7 @@
 HD=12	0	7	0	0	7	14	
 sum	20	20	20	20	40	120	
 
-Absolute delta Hamming distances within the tag
+Absolute delta Hamming distance within the tag
 	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
 diff=7	1	0	0	0	0	0	1	
 diff=8	1	0	0	0	1	0	2	
@@ -56,15 +58,14 @@
 diff=12	5	1	0	1	0	0	7	
 sum	14	1	1	2	2	0	20	
 
-Chimera analysis: relative delta Hamming distances
+Chimera analysis: relative delta Hamming distance
 	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
 diff=1.0	14	1	1	2	2	0	20	
 sum	14	1	1	2	2	0	20	
 
-Chimeras:
 All tags were filtered: only those tags in which at least one half was identical (HD=0), and which therefore had a relative delta of 1, were kept. These tags are considered chimeric.
 The Hamming distances of these chimeric tags are shown below.
-Hamming distances of chimeras
+Hamming distance of chimeric families separated by FS
 	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
 HD=7	1	0	0	0	0	0	1	
 HD=8	1	0	0	0	1	0	2	
@@ -74,4 +75,14 @@
 HD=12	5	1	0	1	0	0	7	
 sum	14	1	1	2	2	0	20	
 
+Hamming distance of chimeric families separated by DCS and single SSCS
+	DCS	SSCS ab	SSCS ba	sum	
+HD=7.0	0	0	1	1	
+HD=8.0	0	1	1	2	
+HD=9.0	0	1	0	1	
+HD=10.0	0	1	1	2	
+HD=11.0	0	3	4	7	
+HD=12.0	0	2	5	7	
+sum	0	8	12	20	
 
+
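
The new table above separates the chimeric families by whether the tag also occurs with a partner family in the opposite read direction (DCS column) or only as a single SSCS in ab or ba orientation. The exact rule is implemented by hammingDistanceWithDCS / createTableHDwithDCS in hd.py and is not visible in this excerpt; a plausible sketch of the classification, with the families mapping assumed purely for illustration:

    def classify_chimera(tag, families):
        # families: dict mapping a tag sequence to the set of read directions
        # observed for it, e.g. {"ab"}, {"ba"} or {"ab", "ba"} (assumed layout)
        directions = families.get(tag, set())
        if {"ab", "ba"} <= directions:
            return "DCS"        # both strands present: the family can form a duplex
        if "ab" in directions:
            return "SSCS ab"    # forward single-strand consensus only
        return "SSCS ba"        # reverse single-strand consensus only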