diff hd.py @ 29:6b15b3b6405c draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 5b3ab8c6467fe3a52e89f5a7d175bd8a0189018a-dirty
author mheinzl
date Wed, 24 Jul 2019 05:58:15 -0400
parents 1fa7342a140d
children 46bfbec0f9e6
--- a/hd.py	Mon Jun 03 05:37:01 2019 -0400
+++ b/hd.py	Wed Jul 24 05:58:15 2019 -0400
@@ -5,16 +5,17 @@
 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
 # Contact: monika.heinzl@edumail.at
 #
-# Takes at least one TABULAR file with tags before the alignment to the SSCS and optionally a second TABULAR file as input.
-# The program produces a plot which shows a histogram of Hamming distances separated after family sizes,
-# a family size distribution separated after Hamming distances for all (sample_size=0) or a given sample of SSCSs or SSCSs, which form a DCS.
-# In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads
-# and finally a CSV file with the data of the plots.
-# It is also possible to perform the HD analysis with shortened tags with given sizes as input.
+# Takes at least one TABULAR file with tags before the alignment to the SSCS and
+# optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances
+# separated by family size, a family size distribution separated by Hamming distance for all SSCSs (sample_size=0),
+# for a given sample of SSCSs, or for SSCSs which form a DCS. In addition, the tool produces HD and FSD plots for the
+# difference between the HDs of both parts of the tags and for the chimeric reads, and finally a CSV file with the
+# data of the plots. It is also possible to perform the HD analysis with shortened tags of given sizes as input.
 # The tool can run on a certain number of processors, which can be defined by the user.
 
 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int /
-#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular
+#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int
+#        --nr_above_bars True/False --output_tabular outputfile_name_tabular
 
 import argparse
 import itertools
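
As a rough illustration of the USAGE comment above, a hypothetical invocation could look like the following (all file
names and parameter values are placeholders, not taken from this commit; only flags that appear in the usage comment
or in the argument parser below are used):

    python hd.py --inputFile tags_before_SSCS.tabular --inputName1 tags_before_SSCS.tabular \
        --sample_size 1000 --minFS 1 --maxFS 0 --nproc 8 --nr_above_bars \
        --output_tabular hd_data.tabular
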
@@ -34,7 +35,7 @@
 
 
 def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
-                   title_file1, subtitle, pdf, relative=False, diff=True):
+                   subtitle, pdf, relative=False, diff=True, rel_freq=False):
     if diff is False:
         colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
         labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"]
@@ -48,36 +49,37 @@
     fig = plt.figure(figsize=(6, 7))
     ax = fig.add_subplot(111)
     plt.subplots_adjust(bottom=0.1)
-    p1 = numpy.bincount(numpy.concatenate((familySizeList1)))
+    p1 = numpy.bincount(numpy.concatenate(familySizeList1))
     maximumY = numpy.amax(p1)
 
     if len(range(minimumXFS, maximumXFS)) == 0:
         range1 = range(minimumXFS - 1, minimumXFS + 2)
     else:
         range1 = range(0, maximumXFS + 2)
-    counts = plt.hist(familySizeList1, label=labels,
-                      color=colors, stacked=True,
-                      rwidth=0.8, alpha=1, align="left",
-                      edgecolor="None", bins=range1)
+
+    if rel_freq:
+        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1]
+        counts = plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True,
+                          rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
+        plt.ylabel("Relative Frequency", fontsize=14)
+        plt.ylim((0, (float(maximumY) / sum(p1)) * 1.1))
+    else:
+        counts = plt.hist(familySizeList1, label=labels, color=colors, stacked=True,
+                          rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
+        if len(numpy.concatenate(familySizeList1)) != 0:
+            plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        plt.ylim((0, maximumY * 1.2))
     plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
-
-    # plt.title(title_file1, fontsize=12)
     plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
     plt.xlabel("Family size", fontsize=14)
-    plt.ylabel("Absolute Frequency", fontsize=14)
-
     ticks = numpy.arange(0, maximumXFS + 1, 1)
     ticks1 = map(str, ticks)
     if maximumXFS >= 20:
         ticks1[len(ticks1) - 1] = ">=20"
     plt.xticks(numpy.array(ticks), ticks1)
     [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
-
     plt.xlim((0, maximumXFS + 1))
-    if len(numpy.concatenate(familySizeList1)) != 0:
-        plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
-
-    plt.ylim((0, maximumY * 1.2))
     legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: "
     plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)
 
@@ -86,17 +88,17 @@
         max_count = ">= 20"
     else:
         max_count = max(originalCounts)
-    legend1 = "{}\n{}\n{:.5f}".format(max_count, count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
+    legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1))
     plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
-    legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), float(counts[0][len(counts[0]) - 1][1]) / sum(counts[0][len(counts[0]) - 1]))
+    legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1))
     plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
     plt.grid(b=True, which='major', color='#424242', linestyle=':')
-
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
 
 
-def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True, nr_unique_chimeras=0, len_sample=0):
+def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
+                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
     if relative is True:
         step = 0.1
     else:
@@ -104,70 +106,153 @@
 
     fig = plt.figure(figsize=(6, 8))
     plt.subplots_adjust(bottom=0.1)
-    con_list1 = numpy.concatenate(list1)
-    p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())])
+    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
     maximumY = numpy.amax(p1)
-    maximumX = int(maximumX)
-    print("max X", maximumX )
-
     if relative is True:  # relative difference
         bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
     else:
         bin1 = maximumX + 1
 
-    counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
-                      label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10",
-                             "FS>10"], rwidth=0.8,
-                      color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
-                      stacked=True, alpha=1,
-                      align="left",
-                      range=(0, maximumX + 1))
+    if rel_freq:
+        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
+        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
+                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
+                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
+                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+        plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
+        plt.ylabel("Relative Frequency", fontsize=14)
+        bins = counts[1]  # width of bins
+        counts = numpy.array(map(float, counts[0][5]))
+
+    else:
+        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
+                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
+                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
+                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+        maximumY = numpy.amax(p1)
+        plt.ylim((0, maximumY * 1.2))
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        bins = counts[1]  # width of bins
+        counts = numpy.array(map(int, counts[0][5]))
+
     plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
-    bins = counts[1]  # width of bins
-    counts = numpy.array(map(int, counts[0][5]))
     plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
-    # plt.title(title_file1, fontsize=12)
     plt.xlabel(xlabel, fontsize=14)
-    plt.ylabel("Absolute Frequency", fontsize=14)
-
     plt.grid(b=True, which='major', color='#424242', linestyle=':')
-    plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
+    plt.xlim((minimumX - step, maximumX + step))
+    # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
     plt.xticks(numpy.arange(0, maximumX + step, step))
 
-    plt.ylim((0, maximumY * 1.2))
-
-    if nr_above_bars is True:
+    if nr_above_bars:
         bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
         for x_label, label in zip(counts, bin_centers):  # labels for values
             if x_label == 0:
                 continue
             else:
-                plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1),
-                             xy=(label, x_label + len(con_list1) * 0.01),
-                             xycoords="data", color="#000066", fontsize=10)
+                if rel_freq:
+                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
+                                                       float(x_label)),
+                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
+                                 xycoords="data", color="#000066", fontsize=10)
+                else:
+                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
+                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
+                                 xycoords="data", color="#000066", fontsize=10)
+
+    if nr_unique_chimeras != 0:
+        if (relative and ((counts[len(counts)-1] / nr_unique_chimeras) == 2)) or \
+                (sum(counts) / nr_unique_chimeras) == 2:
+            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
+                .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
+        else:
+            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
+                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
+    else:
+        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
+            lenTags, len_sample, len(numpy.concatenate(list1)))
 
-    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, sum(counts))
-    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
+    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
+    pdf.savefig(fig, bbox_inches="tight")
+    plt.close("all")
+    plt.clf()
+
+
+def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
+                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
+    step = 1
+    fig = plt.figure(figsize=(6, 8))
+    plt.subplots_adjust(bottom=0.1)
+    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
+    maximumY = numpy.amax(p1)
+    bin1 = maximumX + 1
+    if rel_freq:
+        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
+        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
+                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
+                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+        plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
+        plt.ylabel("Relative Frequency", fontsize=14)
+        bins = counts[1]  # width of bins
+        counts = numpy.array(map(float, counts[0][2]))
 
-    # if nr_unique_chimeras != 0 and len_sample != 0:
-    #     if relative == True:
-    #         legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras,
-    #                                                                      int(nr_unique_chimeras) / float(len_sample))
-    #     else:
-    #         legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample))
-    #     plt.text(0.14, -0.09, legend, size=12, transform=plt.gcf().transFigure)
+    else:
+        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
+                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
+                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+        plt.ylim((0, maximumY * 1.2))
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        bins = counts[1]  # width of bins
+        counts = numpy.array(map(int, counts[0][2]))
+
+    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
+    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
+    plt.xlabel(xlabel, fontsize=14)
+    plt.grid(b=True, which='major', color='#424242', linestyle=':')
+    plt.xlim((minimumX - step, maximumX + step))
+    # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
+    plt.xticks(numpy.arange(0, maximumX + step, step))
+
+    if nr_above_bars:
+        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
+        for x_label, label in zip(counts, bin_centers):  # labels for values
+            if x_label == 0:
+                continue
+            else:
+                if rel_freq:
+                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
+                                                       float(x_label)),
+                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
+                                 xycoords="data", color="#000066", fontsize=10)
+                else:
+                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
+                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
+                                 xycoords="data", color="#000066", fontsize=10)
+
+    if nr_unique_chimeras != 0:
+        if (sum(counts) / nr_unique_chimeras) == 2:
+            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
+                format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
+        else:
+            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
+                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
+    else:
+        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
+            lenTags, len_sample, len(numpy.concatenate(list1)))
+    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
+
+    legend2 = "SSCS ab = {:,}\nSSCS ba = {:,}\nDCS = {:,}".format(len(list1[1]), len(list1[2]), len(list1[0]))
+    plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)
 
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
     plt.clf()
 
 
-def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf, len_sample):
+def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
     fig = plt.figure(figsize=(6, 8))
     plt.subplots_adjust(bottom=0.1)
 
     ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # new hd within tags
-
     maximumX = numpy.amax(numpy.concatenate(ham_partial))
     minimumX = numpy.amin(numpy.concatenate(ham_partial))
     maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial))))
@@ -177,21 +262,30 @@
     else:
         range1 = range(minimumX, maximumX + 2)
 
-    plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1)
+    if rel_freq:
+        w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
+        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
+                 label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
+                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
+                 edgecolor='black', linewidth=1)
+        plt.ylabel("Relative Frequency", fontsize=14)
+        # plt.ylim(-0.1, (float(maximumY) / len(numpy.concatenate(ham_partial))) * 1.2)
+    else:
+        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
+                 label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
+                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
+                 edgecolor='black', linewidth=1)
+        plt.ylabel("Absolute Frequency", fontsize=14)
 
     plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
     plt.suptitle('Hamming distances within tags', fontsize=14)
-    # plt.title(title_file1, fontsize=12)
     plt.xlabel("HD", fontsize=14)
-    plt.ylabel("Absolute Frequency", fontsize=14)
     plt.grid(b=True, which='major', color='#424242', linestyle=':')
-
-    plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
+    plt.xlim((minimumX - 1, maximumX + 1))
+    # plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
     plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
-    # plt.ylim(0, maximumY * 1.2)
-    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, len(numpy.concatenate(ham_partial)))
-
-    # legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags)
+    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
+        lenTags, len_sample, len(numpy.concatenate(ham_partial)))
     plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
@@ -206,7 +300,6 @@
         count = numpy.zeros((len(uniqueFS), 6))
     else:
         count = numpy.zeros((len(uniqueFS), 7))
-
     state = 1
     for i in list1:
         counts = list(Counter(i).items())
@@ -218,57 +311,48 @@
             continue
         else:
             if state == 1:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 0] = j[1]
             if state == 2:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 1] = j[1]
-
             if state == 3:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 2] = j[1]
-
             if state == 4:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 3] = j[1]
-
             if state == 5:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 4] = j[1]
-
             if state == 6:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 5] = j[1]
-
             if state == 7:
-                for i, l in zip(uniqueFS, nr):
+                for k, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 6] = j[1]
             state = state + 1
-
         sumRow = count.sum(axis=1)
         sumCol = count.sum(axis=0)
-
     uniqueFS = uniqueFS.astype(str)
     if uniqueFS[len(uniqueFS) - 1] == "20":
         uniqueFS[len(uniqueFS) - 1] = ">20"
-
     first = ["FS={}".format(i) for i in uniqueFS]
     final = numpy.column_stack((first, count, sumRow))
-
     return (final, sumCol)
 
 
@@ -276,13 +360,15 @@
     output_file.write(name)
     output_file.write("\n")
     if diff is False:
-        output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
+        output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(
+            sep, sep, sep, sep, sep, sep, sep, sep))
     else:
         if rel is False:
-            output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
+            output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(
+                sep, sep, sep, sep, sep, sep, sep, sep, sep))
         else:
-            output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
-
+            output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".
+                              format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
     for item in summary:
         for nr in item:
             if "FS" not in nr and "diff" not in nr:
@@ -314,46 +400,40 @@
             continue
         else:
             if state == 1:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 0] = j[1]
             if state == 2:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 1] = j[1]
-
             if state == 3:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 2] = j[1]
-
             if state == 4:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 3] = j[1]
-
             if state == 5:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 4] = j[1]
-
             if state == 6:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 5] = j[1]
             state = state + 1
-
         sumRow = count.sum(axis=1)
         sumCol = count.sum(axis=0)
         first = ["{}{}".format(row_label, i) for i in uniqueHD]
         final = numpy.column_stack((first, count, sumRow))
-
     return (final, sumCol)
 
 
@@ -362,7 +442,6 @@
     uniqueHD = numpy.unique(selfAB)
     nr = numpy.arange(0, len(uniqueHD), 1)
     count = numpy.zeros((len(uniqueHD), 5))
-
     state = 1
     for i in list1:
         counts = list(Counter(i).items())
@@ -374,45 +453,81 @@
             continue
         else:
             if state == 1:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 0] = j[1]
             if state == 2:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 1] = j[1]
             if state == 3:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 2] = j[1]
             if state == 4:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 3] = j[1]
             if state == 5:
-                for i, l in zip(uniqueHD, nr):
+                for k, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 4] = j[1]
-
             state = state + 1
-
         sumRow = count.sum(axis=1)
         sumCol = count.sum(axis=0)
         first = ["HD={}".format(i) for i in uniqueHD]
         final = numpy.column_stack((first, count, sumRow))
+    return (final, sumCol)
 
+
+def createTableHDwithDCS(list1):
+    selfAB = numpy.concatenate(list1)
+    uniqueHD = numpy.unique(selfAB)
+    nr = numpy.arange(0, len(uniqueHD), 1)
+    count = numpy.zeros((len(uniqueHD), len(list1)))
+    state = 1
+    for i in list1:
+        counts = list(Counter(i).items())
+        hd = [item[0] for item in counts]
+        c = [item[1] for item in counts]
+        table = numpy.column_stack((hd, c))
+        if len(table) == 0:
+            state = state + 1
+            continue
+        else:
+            if state == 1:
+                for k, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 0] = j[1]
+            if state == 2:
+                for k, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 1] = j[1]
+            if state == 3:
+                for k, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 2] = j[1]
+            state = state + 1
+        sumRow = count.sum(axis=1)
+        sumCol = count.sum(axis=0)
+        first = ["HD={}".format(i) for i in uniqueHD]
+        final = numpy.column_stack((first, count, sumRow))
     return (final, sumCol)
 
 
 def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
     output_file.write(name)
     output_file.write("\n")
-    output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
+    output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(
+        sep, sep, sep, sep, sep, sep, sep, sep))
     for item in summary:
         for nr in item:
             if "HD" not in nr and "diff" not in nr:
@@ -428,10 +543,29 @@
     output_file.write("\n\n")
 
 
+def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep):
+    output_file.write(name)
+    output_file.write("\n")
+    output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep))
+    for item in summary:
+        for nr in item:
+            if "HD" not in nr:
+                nr = nr.astype(float)
+                nr = nr.astype(int)
+            output_file.write("{}{}".format(nr, sep))
+        output_file.write("\n")
+    output_file.write("sum{}".format(sep))
+    sumCol = map(int, sumCol)
+    for el in sumCol:
+        output_file.write("{}{}".format(el, sep))
+    output_file.write("{}{}".format(overallSum.astype(int), sep))
+    output_file.write("\n\n")
+
+
 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
     output_file.write(name)
     output_file.write("\n")
-    output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
+    output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b', a'+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
     for item in summary:
         for nr in item:
             if "HD" not in nr:
@@ -454,17 +588,14 @@
     for a in array1:
         dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2])  # fastest
         res[i] = numpy.amin(dist[dist > 0])  # pick min distance greater than zero
-        # print(i)
         i += 1
     return res
 
 
 def hamming_difference(array1, array2, mate_b):
     array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
-
     array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1])  # mate1 part1
     array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1])  # mate1 part 2
-
     array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2])  # mate2 part1
     array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2])  # mate2 part2
 
@@ -499,6 +630,7 @@
         half2_mate1 = array1_half
         half1_mate2 = array2_half2
         half2_mate2 = array2_half
+
     # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True)
     # print(len(half1_mate1))
     # half2_mate1 = half2_mate1[index_halves]
@@ -514,16 +646,18 @@
         array2_half_withoutSame = half1_mate2[index_withoutSame]
         array2_half2_withoutSame = half2_mate2[index_withoutSame]
         array2_withoutSame = array2[index_withoutSame]  # whole tag (= not split into two halves)
-
+        # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
         dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
-                            array2_half_withoutSame])  # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
+                            array2_half_withoutSame])
         min_index = numpy.where(dist == dist.min())[0]  # get index of min HD
         min_value = dist.min()
         # min_value = dist[min_index]  # get minimum HDs
-        min_tag_half2 = array2_half2_withoutSame[min_index]  # get all "b's" of the tag or all "a's" of the tag with minimum HD
+        # get all "b's" of the tag or all "a's" of the tag with minimum HD
+        min_tag_half2 = array2_half2_withoutSame[min_index]
         min_tag_array2 = array2_withoutSame[min_index]  # get whole tag with min HD
 
-        dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
+        dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
+                                        min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
         max_value = dist_second_half.max()
         max_index = numpy.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
         max_tag = min_tag_array2[max_index]
@@ -548,14 +682,11 @@
             min_tagsList_zeros.append(numpy.array(tag))
             difference1_zeros = abs(min_value - max_value)  # hd of non-identical part
             diff11_zeros.append(difference1_zeros)
-            max_tag_list.append(max_tag)
+            max_tag_list.append(numpy.array(max_tag))
         else:
             min_tagsList_zeros.append(None)
             diff11_zeros.append(None)
-            max_tag_list.append(numpy.array(["None"]))
-
-            # max_tag_list.append(numpy.array(max_tag))
-
+            max_tag_list.append(None)
         i += 1
 
     # print(i)
@@ -567,7 +698,8 @@
     # relativeDiffList = [st for st in relativeDiffList if st != 999]
     # diff11_zeros = [st for st in diff11_zeros if st != 999]
     # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
-    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min, max_tag_list])
+    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros,
+             min_tagsList_zeros, ham1min, ham2min, max_tag_list])
 
 
 def readFileReferenceFree(file):
@@ -608,7 +740,6 @@
 def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
     hammingDistances = numpy.unique(ham)
     fs = numpy.asarray(fs)
-
     ham = numpy.asarray(ham)
     bigFamilies2 = numpy.where(fs > 19)[0]
     if len(bigFamilies2) != 0:
@@ -663,6 +794,53 @@
     return(list1, hammingDistances, maximum, minimum)
 
 
+def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array):
+    diff_zeros = numpy.array(diff_zeros)
+    maximum = numpy.amax(diff_zeros)
+    minimum = numpy.amin(diff_zeros)
+    minHD_tags_zeros = numpy.array(minHD_tags_zeros)
+
+    idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros])
+    subset_data = data_array[idx, :]
+
+    seq = numpy.array(subset_data[:, 1])
+
+    # find all unique tags and get the indices for ALL tags, but only once
+    u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+    DCS_tags = u[c == 2]
+    rest_tags = u[c == 1]
+
+    dcs = numpy.repeat("DCS", len(DCS_tags))
+    idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags])
+    sscs = subset_data[idx_sscs, 2]
+
+    all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])),
+                                   numpy.concatenate((dcs, sscs))))
+    hd_DCS = []
+    ab_SSCS = []
+    ba_SSCS = []
+
+    for i in range(len(all_tags)):
+        tag = all_tags[i, :]
+        hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]]
+
+        if tag[1] == "DCS":
+            hd_DCS.append(hd)
+        elif tag[1] == "ab":
+            ab_SSCS.append(hd)
+        elif tag[1] == "ba":
+            ba_SSCS.append(hd)
+
+    if len(hd_DCS) != 0:
+        hd_DCS = numpy.concatenate(hd_DCS)
+    if len(ab_SSCS) != 0:
+        ab_SSCS = numpy.concatenate(ab_SSCS)
+    if len(ba_SSCS) != 0:
+        ba_SSCS = numpy.concatenate(ba_SSCS)
+    list1 = [hd_DCS, ab_SSCS, ba_SSCS]  # list for plotting
+    return(list1, maximum, minimum)
+
+
 def make_argparser():
     parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
     parser.add_argument('--inputFile',
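
Both the new hammingDistanceWithDCS() above and the onlyDuplicates branch further down rely on numpy.unique with
return_counts to decide whether a tag was seen in both read directions (DCS, count == 2) or only once (single SSCS,
count == 1). A small sketch with a hypothetical tag list:

    import numpy

    seq = numpy.array(["AAAC", "AAAC", "GGGT", "CCCA"])  # "AAAC" occurs as ab and ba
    u, first_idx, c = numpy.unique(seq, return_index=True, return_counts=True)
    dcs_tags = u[c == 2]   # tags observed twice -> can form a DCS (here: ["AAAC"])
    sscs_tags = u[c == 1]  # tags observed once  -> single SSCS only (here: ["CCCA", "GGGT"])
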
@@ -678,11 +856,15 @@
                         help='Only tags of the DCSs are included in the HD analysis')
 
     parser.add_argument('--minFS', default=1, type=int,
-                        help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis')
+                        help='Only tags with a family size greater than or equal to the specified value '
+                             'are included in the HD analysis')
     parser.add_argument('--maxFS', default=0, type=int,
-                        help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis')
+                        help='Only tags with a family size less than or equal to the specified value '
+                             'are included in the HD analysis')
     parser.add_argument('--nr_above_bars', action="store_true",
-                        help='If no, values above bars in the histograms are removed')
+                        help='If False, values above bars in the histograms are removed')
+    parser.add_argument('--rel_freq', action="store_false",
+                        help='If True, the relative frequencies are displayed.')
 
     parser.add_argument('--output_tabular', default="data.tabular", type=str,
                         help='Name of the tabular file.')
@@ -695,36 +877,33 @@
 
 
 def Hamming_Distance_Analysis(argv):
-
+# def Hamming_Distance_Analysis(file1, name1, index_size, title_savedFile_pdf, title_savedFile_csv,
+#                               output_chimeras_tabular, onlyDuplicates, minFS=1, maxFS=0, nr_above_bars=True,
+#                               subset=False, nproc=12, rel_freq=False):
     parser = make_argparser()
     args = parser.parse_args(argv[1:])
-
     file1 = args.inputFile
     name1 = args.inputName1
-
     index_size = args.sample_size
     title_savedFile_pdf = args.output_pdf
     title_savedFile_csv = args.output_tabular
     output_chimeras_tabular = args.output_chimeras_tabular
-
-    sep = "\t"
     onlyDuplicates = args.only_DCS
+    rel_freq = args.rel_freq
     minFS = args.minFS
     maxFS = args.maxFS
     nr_above_bars = args.nr_above_bars
-
     subset = args.subset_tag
     nproc = args.nproc
+    sep = "\t"
 
     # input checks
     if index_size < 0:
         print("index_size is a negative integer.")
         exit(2)
-
     if nproc <= 0:
         print("nproc is smaller or equal zero")
         exit(3)
-
     if subset < 0:
         print("subset_tag is smaller or equal zero.")
         exit(5)
@@ -735,22 +914,31 @@
     plt.rcParams['ytick.labelsize'] = 14
     plt.rcParams['patch.edgecolor'] = "#000000"
     plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
-
     name1 = name1.split(".tabular")[0]
 
     with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
         print("dataset: ", name1)
         integers, data_array = readFileReferenceFree(file1)
         data_array = numpy.array(data_array)
-        print("total nr of tags with Ns:", len(data_array))
-        n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x]
-        if len(n) != 0:  # delete tags with N in the tag from data
-            print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array))
+        print("total nr of tags:", len(data_array))
+
+        # filter tags out which contain any other character than ATCG
+        valid_bases = ["A", "T", "G", "C"]
+        tagsToDelete = []
+        for idx, t in enumerate(data_array[:, 1]):
+            for char in t:
+                if char not in valid_bases:
+                    tagsToDelete.append(idx)
+                    break
+
+        if len(tagsToDelete) != 0:  # delete tags that contain non-ATCG characters from the data
+            print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete),
+                  float(len(tagsToDelete)) / len(data_array))
             index_whole_array = numpy.arange(0, len(data_array), 1)
-            index_withoutN_inTag = numpy.delete(index_whole_array, n)
+            index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete)
             data_array = data_array[index_withoutN_inTag, :]
             integers = integers[index_withoutN_inTag]
-            print("total nr of tags without Ns:", len(data_array))
+            print("total nr of filtered tags:", len(data_array))
 
         int_f = numpy.array(data_array[:, 0]).astype(int)
         data_array = data_array[numpy.where(int_f >= minFS)]
@@ -768,7 +956,7 @@
 
             # find all unique tags and get the indices for ALL tags, but only once
             u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
-            d = u[c > 1]
+            d = u[c == 2]
 
             # get family sizes, tag for duplicates
             duplTags_double = integers[numpy.in1d(seq, d)]
@@ -779,9 +967,9 @@
             duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags
 
             if minFS > 1:
-                duplTags_tag = duplTags_tag[(duplTags >= 3) & (duplTagsBA >= 3)]
-                duplTags_seq = duplTags_seq[(duplTags >= 3) & (duplTagsBA >= 3)]
-                duplTags = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)]  # ab+ba with FS>=3
+                duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)]
+                duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)]
+                duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)]  # ab+ba with FS>=minFS
 
             data_array = numpy.column_stack((duplTags, duplTags_seq))
             data_array = numpy.column_stack((data_array, duplTags_tag))
@@ -819,15 +1007,32 @@
         else:
             numpy.random.shuffle(data_array)
             unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True)  # get only unique tags
-            result = numpy.random.choice(unique_indices, size=index_size, replace=False)  # array of random sequences of size=index.size
+            result = numpy.random.choice(unique_indices, size=index_size,
+                                         replace=False)  # array of random sequences of size=index.size
 
             # result = numpy.random.choice(len(integers), size=index_size,
             #                             replace=False)  # array of random sequences of size=index.size
             # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0]
 
-        # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
+        # with open("index_result.pkl", "wb") as o:
         #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
 
+        # save counts
+        # with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f:
+        #     pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
+        # with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1:
+        #     pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL)
+        #
+        # with open(data_folder + "index_sampleTags100.pkl", "rb") as f:
+        #     result = pickle.load(f)
+        #
+        # with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1:
+        #     data_array = pickle.load(f1)
+
+        # with open(data_folder + "index_result.txt", "w") as t:
+        #     for text in result:
+        #         t.write("{}\n".format(text))
+
         # comparison random tags to whole dataset
         result1 = data_array[result, 1]  # random tags
         result2 = data_array[:, 1]  # all tags
@@ -873,10 +1078,9 @@
         minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
         minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
         minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])
-        chim_tags = [item[10] for item in diff_list_a]
-        chim_tags2 = [item[10] for item in diff_list_b]
-        chimera_tags1 = [ii if isinstance(i, list) else i for i in chim_tags for ii in i]
-        chimera_tags2 = [ii if isinstance(i, list) else i for i in chim_tags2 for ii in i]
+
+        chimera_tags1 = sum([item[10] for item in diff_list_a], [])
+        chimera_tags2 = numpy.concatenate([item[10] for item in diff_list_b])
 
         rel_Diff = []
         diff_zeros = []
@@ -884,7 +1088,8 @@
         diff = []
         chimera_tags = []
         for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
-                zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, chimera_tags1, chimera_tags2):
+                zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2,
+                    chimera_tags1, chimera_tags2):
             rel_Diff.append(max(rel1, rel2))
             diff.append(max(d1, d2))
 
@@ -903,7 +1108,7 @@
                 chimera_tags.append(ctag2)
 
         chimera_tags_new = chimera_tags
-        #data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
+        data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
         # chimeras_dic = defaultdict(list)
         #
         # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
@@ -911,71 +1116,63 @@
         #         t2 = numpy.concatenate(t2)
         #     chimeras_dic[t1].append(t2)
 
+        checked_tags = []
+        stat_maxTags = []
+
         with open(output_chimeras_tabular, "w") as output_file1:
-            output_file1.write("chimera tag\tsimilar tag with HD=0\n")
-            for i in range(len(minHD_tags_zeros)):
-                tag1 = minHD_tags_zeros[i]
+            output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with HD=0\n")
+            for i in range(len(data_chimeraAnalysis)):
+                tag1 = data_chimeraAnalysis[i, 0]
+
+                info_tag1 = data_array[data_array[:, 1] == tag1, :]
+                fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1]
+
+                if tag1 in checked_tags:  # skip tag if already written to file
+                    continue
+
                 sample_half_a = tag1[0:(len(tag1)) / 2]
                 sample_half_b = tag1[len(tag1) / 2:len(tag1)]
 
-                max_tags = chimera_tags_new[i]
-                if isinstance(max_tags, list) and len(max_tags) > 1:
+                max_tags = data_chimeraAnalysis[i, 1]
+                if len(max_tags) > 1 and type(max_tags) is not numpy.ndarray:
                     max_tags = numpy.concatenate(max_tags)
-                #if isinstance(max_tags, list): #and type(max_tags) is not numpy.ndarray:
-                #    print(max_tags)
-                #    max_tags = numpy.concatenate(max_tags)
                 max_tags = numpy.unique(max_tags)
+                stat_maxTags.append(len(max_tags))
 
-                chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags])  # mate1 part1
-                chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags])  # mate1 part 2
+                info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags]
+
+                chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags])  # mate1 part1
+                chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags])  # mate1 part 2
 
                 new_format = []
                 for j in range(len(max_tags)):
+                    fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]]
+
                     if sample_half_a == chimera_half_a[j]:
-                        max_tag = "*{}* {}".format(chimera_half_a[j], chimera_half_b[j])
+                        max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                         new_format.append(max_tag)
 
                     elif sample_half_b == chimera_half_b[j]:
-                        max_tag = "{} *{}*".format(chimera_half_a[j], chimera_half_b[j])
+                        max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                         new_format.append(max_tag)
+                    checked_tags.append(max_tags[j])
 
-                sample_tag = "{} {}".format(sample_half_a, sample_half_b)
+                sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1))
                 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
-            output_file1.write(
-                "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
-                "The tags were separated by an empty space into their halves and the * marks the identical half.")
+
+                checked_tags.append(tag1)
 
-            # unique_chimeras = numpy.array(minHD_tags_zeros)
-            #
-            # sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras])  # mate1 part1
-            # sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras])  # mate1 part 2
-            #
-            # output_file1.write("sample tag\tsimilar tag\n")
-            # for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b):
-            #     max_tags = numpy.concatenate(chimeras_dic.get(tag1))
-            #     max_tags = numpy.unique(max_tags)
-            #
-            #     chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags])  # mate1 part1
-            #     chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags])  # mate1 part 2
-            #
-            #     new_format = []
-            #     for i in range(len(max_tags)):
-            #         if a == chimera_half_a[i]:
-            #             max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i])
-            #             new_format.append(max_tag)
-            #
-            #         elif b == chimera_half_b[i]:
-            #             max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i])
-            #             new_format.append(max_tag)
-            #
-            #     sample_tag = "{} {}".format(a, b)
-            #     output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
-            # output_file1.write(
-            #     "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
-            #     "The tags were separated by an empty space into their halves and the * marks the identical half.")
-
-        nr_chimeric_tags = len(minHD_tags_zeros)
-        print("nr of unique chimeras", nr_chimeric_tags)
+            output_file1.write(
+                "This file contains all tags that were identified as chimeric in the first column, their family "
+                "sizes and read directions in the second column, and the corresponding tags that returned a Hamming "
+                "distance of zero in either the first or the second half of the sample tag in the third column.\n"
+                "The tags are separated into their halves by an empty space and the * marks the identical half.")
+            output_file1.write("\n\nStatistics of the nr. of tags that returned the max. HD (third column)\n")
+            output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags))))
+            output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags))))
+            output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags))))
+            output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags))))
+            output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags))))
 
         lenTags = len(data_array)
         len_sample = len(result1)
@@ -992,6 +1189,9 @@
             rel_Diff = numpy.tile(rel_Diff, 2)
             diff_zeros = numpy.tile(diff_zeros, 2)
 
+        nr_chimeric_tags = len(data_chimeraAnalysis)
+        print("nr of chimeras", nr_chimeric_tags)
+
         # prepare data for different kinds of plots
         # distribution of FSs separated after HD
         familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
@@ -1011,8 +1211,8 @@
             lst_minHD_tags.append(seqDic.get(i))
 
         if onlyDuplicates:
-            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], [item_b[1] for item_b in lst_minHD_tags])).astype(int)
-
+            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
+                                                [item_b[1] for item_b in lst_minHD_tags])).astype(int)
         # histogram with absolute and relative difference between HDs of both parts of the tag
         listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
         listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
@@ -1023,43 +1223,55 @@
             for i in minHD_tags_zeros:
                 lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
             if onlyDuplicates:
-                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
+                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
+                                                          [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
 
             # histogram with HD of non-identical half
-            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros)
+            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
+                lst_minHD_tags_zeros, diff_zeros)
+
+            if onlyDuplicates is False:
+                listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros,
+                                                                                             diff_zeros, data_array)
 
         # plot Hamming Distance with Family size distribution
-        plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
-                      subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags,
+        plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq,
+                      subtitle="Hamming distance separated by family size", lenTags=lenTags,
                       xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample)
 
         # Plot FSD with separation after
-        plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
+        plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq,
                        originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
-                       pdf=pdf, relative=False, title_file1=name1, diff=False)
+                       pdf=pdf, relative=False, diff=False)
 
         # Plot HD within tags
-        plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
-                             title_file1=name1, len_sample=len_sample)
+        plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
+                        rel_freq=rel_freq, len_sample=len_sample)
 
         # Plot difference between HD's separated after FSD
         plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
-                      subtitle="Delta Hamming distance within tags",
-                      title_file1=name1, lenTags=lenTags,
+                      subtitle="Delta Hamming distance within tags", lenTags=lenTags, rel_freq=rel_freq,
                       xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)
 
         plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
-                      subtitle="Chimera Analysis: relative delta Hamming distances",
-                      title_file1=name1, lenTags=lenTags,
-                      xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
+                      subtitle="Chimera Analysis: relative delta Hamming distance", lenTags=lenTags, rel_freq=rel_freq,
+                      xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars,
+                      nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
 
         # plots for chimeric reads
         if len(minHD_tags_zeros) != 0:
             # HD
-            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, subtitle="Hamming distance of chimeras",
-                          title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False,
+            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
+                          subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq,
+                          lenTags=lenTags, xlabel="HD", relative=False,
                           nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
 
+            if onlyDuplicates is False:
+                plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf,
+                              subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq,
+                              lenTags=lenTags, xlabel="HD", relative=False,
+                              nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
+
         # print all data to a CSV file
         # HD
         summary, sumCol = createTableHD(list1, "HD=")
@@ -1087,9 +1299,12 @@
             summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")
             overallSum15 = sum(sumCol15)
 
+            if onlyDuplicates is False:
+                summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros)
+                overallSum16 = sum(sumCol16)
+
         output_file.write("{}\n".format(name1))
-        output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
-            numpy.concatenate(list1)), lenTags, lenTags))
+        output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample))
 
         # HD
         createFileHD(summary, sumCol, overallSum, output_file,
@@ -1109,27 +1324,36 @@
 
         # HD within tags
         output_file.write(
-            "The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).\n"
-            "For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').\n"
-            "Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n"
-            "These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD  For simplicity we used the maximum value between the delta values in the end.\n"
-            "When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.\n")
+            "The Hamming distances were calculated by comparing the first half of a tag against all first halves and "
+            "selecting the minimum value (HD a).\nFor the second half of the tag, we compared it against all tags "
+            "that resulted in the minimum HD of the previous step and selected the maximum value (HD b').\nFinally, "
+            "the absolute and relative differences between the HDs were calculated (absolute and relative delta HD).\n"
+            "These calculations were repeated, but starting with the second half in the first step, to find all "
+            "possible chimeras in the data (HD b and HD a').\nFor simplicity, we used the maximum of the delta values "
+            "in the end.\nWhen only tags that can form a DCS were allowed in the analysis, family sizes for the "
+            "forward and reverse reads (ab and ba) are included in the plots.\n")
 
-        output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
+        output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2))
 
         createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
                               "Hamming distance of each half in the tag", sep)
         createFileHD(summary11, sumCol11, overallSum11, output_file,
-                     "Absolute delta Hamming distances within the tag", sep)
+                     "Absolute delta Hamming distance within the tag", sep)
 
         createFileHD(summary13, sumCol13, overallSum13, output_file,
-                     "Chimera analysis: relative delta Hamming distances", sep)
+                     "Chimera analysis: relative delta Hamming distance", sep)
 
         if len(minHD_tags_zeros) != 0:
             output_file.write(
-                "Chimeras:\nAll tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances of the chimeric tags are shown.\n")
+                "All tags were filtered: only those tags where at least one half was identical (HD=0), and which "
+                "therefore had a relative delta of 1, were kept. These tags are considered chimeric.\nThe Hamming "
+                "distances of these chimeric tags are shown below.\n")
             createFileHD(summary15, sumCol15, overallSum15, output_file,
-                         "Hamming distances of chimeras", sep)
+                         "Hamming distance of chimeric families separated after FS", sep)
+
+            if onlyDuplicates is False:
+                createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
+                                    "Hamming distance of chimeric families separated after DCS and single SSCS", sep)
 
         output_file.write("\n")
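
To make the written explanation of the HD calculation above concrete, a small hand-worked illustration with a
hypothetical 2 x 16 bp tag, using the same itertools/operator idiom (Python 2) as the functions in this file:

    import itertools
    import operator

    tag_a = "AAAATTTTGGGGCCCC"    # first half (a) of the sample tag
    tag_b = "ACGTACGTACGTACGT"    # second half (b) of the sample tag
    other_a = "AAAATTTTGGGGCCCC"  # candidate first half, identical to tag_a
    other_b = "ACGTACGTACGTAAAA"  # matching second half of that candidate, 3 mismatches

    hd_a = sum(itertools.imap(operator.ne, tag_a, other_a))        # -> 0 (HD a: minimum over all first halves)
    hd_b_prime = sum(itertools.imap(operator.ne, tag_b, other_b))  # -> 3 (HD b': maximum among the step-1 candidates)
    delta = abs(hd_a - hd_b_prime)                                 # absolute delta HD = 3
    # One identical half (HD a = 0) implies a relative delta of 1, so a tag like this
    # would be reported as chimeric in the plots and tables written above.
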