Mercurial > repos > mheinzl > hd

--- a/hd.py	Wed May 23 14:47:43 2018 -0400
+++ b/hd.py	Mon Oct 08 05:56:04 2018 -0400
@@ -13,34 +13,35 @@
 # It is also possible to perform the HD analysis with shortened tags with given sizes as input.
 # The tool can run on a certain number of processors, which can be defined by the user.

-# USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
-#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False--output_csv outptufile_name_csv --output_pdf outptufile_name_pdf
+# USAGE: python hd.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 /
+#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf

-import numpy
+import argparse
 import itertools
 import operator
-import matplotlib.pyplot as plt
-import os.path
-import cPickle as pickle
-from multiprocessing.pool import Pool
+import sys
+from collections import Counter
 from functools import partial
-import argparse
-import sys
-import os
+from multiprocessing.pool import Pool
+
+import matplotlib.pyplot as plt
+import numpy
 from matplotlib.backends.backend_pdf import PdfPages
-from collections import Counter
+
+plt.switch_backend('agg')

-def plotFSDwithHD2(familySizeList1,maximumXFS,minimumXFS, originalCounts,
-                   title_file1, subtitle, pdf, relative=False, diff = True):
+
+def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
+                   title_file1, subtitle, pdf, relative=False, diff=True):
     if diff is False:
         colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
-        labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8","HD>8"]
+        labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"]
     else:
         colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
         if relative is True:
             labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
         else:
-            labels = ["d=0","d=1", "d=2", "d=3", "d=4", "d=5-8","d>8"]
+            labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]

     fig = plt.figure(figsize=(6, 7))
     ax = fig.add_subplot(111)
@@ -54,11 +55,11 @@
         range1 = range(0, maximumXFS + 2)
     counts = plt.hist(familySizeList1, label=labels,
                       color=colors, stacked=True,
-                      rwidth=0.8,alpha=1, align="left",
-                      edgecolor="None",bins=range1)
+                      rwidth=0.8, alpha=1, align="left",
+                      edgecolor="None", bins=range1)
     plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))

-    #plt.title(title_file1, fontsize=12)
+    # plt.title(title_file1, fontsize=12)
     plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
     plt.xlabel("Family size", fontsize=14)
     plt.ylabel("Absolute Frequency", fontsize=14)
@@ -79,20 +80,17 @@
     plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)

     count = numpy.bincount(originalCounts)  # original counts
-    legend1 = "{}\n{}\n{:.5f}" \
-        .format(max(originalCounts), count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
+    legend1 = "{}\n{}\n{:.5f}".format(max(originalCounts), count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
     plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
-    legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]),
-                                                float(counts[0][len(counts[0]) - 1][1]) / sum(
-                                                    counts[0][len(counts[0]) - 1]))
+    legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), float(counts[0][len(counts[0]) - 1][1]) / sum(counts[0][len(counts[0]) - 1]))
     plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
     plt.grid(b=True, which='major', color='#424242', linestyle=':')

     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
-
-def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf,
-                   xlabel,relative=False, nr_above_bars = True):
+
+
+def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True):
     if relative is True:
         step = 0.1
     else:
@@ -120,7 +118,7 @@
     bins = counts[1]  # width of bins
     counts = numpy.array(map(int, counts[0][5]))
     plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
-   # plt.title(title_file1, fontsize=12)
+    # plt.title(title_file1, fontsize=12)
     plt.xlabel(xlabel, fontsize=14)
     plt.ylabel("Absolute Frequency", fontsize=14)

@@ -138,8 +136,8 @@
             else:
                 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1),
                              xy=(label, x_label + len(con_list1) * 0.01),
-                             xycoords="data", color="#000066",fontsize=10)
-
+                             xycoords="data", color="#000066", fontsize=10)
+
     legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags)
     plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)

@@ -147,47 +145,42 @@
     plt.close("all")
     plt.clf()

-def plotHDwithinSeq_Sum2(sum1, sum2,sum1min, sum2min, min_value, lenTags, title_file1, pdf):
+
+def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf):
     fig = plt.figure(figsize=(6, 8))
     plt.subplots_adjust(bottom=0.1)

-    #ham = [sum1, sum2,numpy.array(min_value)]  # new hd within tags
-    ham = [sum1, sum2, sum1min, sum2min, numpy.array(min_value)]  # new hd within tags
-
+    ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # new hd within tags

-    maximumX = numpy.amax(numpy.concatenate(ham))
-    minimumX = numpy.amin(numpy.concatenate(ham))
-    maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham)))
+    maximumX = numpy.amax(numpy.concatenate(ham_partial))
+    minimumX = numpy.amin(numpy.concatenate(ham_partial))
+    maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham_partial))))

     if len(range(minimumX, maximumX)) == 0:
         range1 = minimumX
     else:
         range1 = range(minimumX, maximumX + 2)

-    counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False,
-                     # label=[ "HD a", "HD b","HD a+b"],
-                     label=[ "HD a","HD b'", "HD b", "HD a'", "HD a+b"],
-                      #bins=range1, color=[ "#58ACFA", "#FA5858","#585858"],
-                      color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
-                       edgecolor='black', linewidth=1)
+    plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=[ "HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1)
+
     plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
     plt.suptitle('Hamming distances within tags', fontsize=14)
-    #plt.title(title_file1, fontsize=12)
+    # plt.title(title_file1, fontsize=12)
     plt.xlabel("HD", fontsize=14)
     plt.ylabel("Absolute Frequency", fontsize=14)
     plt.grid(b=True, which='major', color='#424242', linestyle=':')

-
-    plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1))
+    plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
     plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
-    plt.ylim((0, maximumY * 1.2))
+    # plt.ylim(0, maximumY * 1.2)

-    legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags)
+    legend = "sample size= {:,} against {:,}".format(sum(ham_partial[4]), lenTags)
     plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
     plt.clf()

+
 def createTableFSD2(list1, diff=True):
     selfAB = numpy.concatenate(list1)
     uniqueFS = numpy.unique(selfAB)
@@ -208,7 +201,7 @@
             continue
         else:
             if state == 1:
-                for i, l  in zip(uniqueFS, nr):
+                for i, l in zip(uniqueFS, nr):
                     for j in table:
                         if j[0] == uniqueFS[l]:
                             count[l, 0] = j[1]
@@ -261,31 +254,33 @@

     return (final, sumCol)

-def createFileFSD2(summary, sumCol, overallSum, output_file, name,sep, rel=False, diff=True):
+
+def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True):
     output_file.write(name)
     output_file.write("\n")
     if diff is False:
-        output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep))
+        output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
     else:
         if rel is False:
-            output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep))
+            output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
         else:
-            output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep))
+            output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))

     for item in summary:
         for nr in item:
             if "FS" not in nr and "diff" not in nr:
                 nr = nr.astype(float)
                 nr = nr.astype(int)
-            output_file.write("{}{}".format(nr,sep))
+            output_file.write("{}{}".format(nr, sep))
         output_file.write("\n")
     output_file.write("sum{}".format(sep))
     sumCol = map(int, sumCol)
     for el in sumCol:
-        output_file.write("{}{}".format(el,sep))
-    output_file.write("{}{}".format(overallSum.astype(int),sep))
+        output_file.write("{}{}".format(el, sep))
+    output_file.write("{}{}".format(overallSum.astype(int), sep))
     output_file.write("\n\n")

+
 def createTableHD(list1, row_label):
     selfAB = numpy.concatenate(list1)
     uniqueHD = numpy.unique(selfAB)
@@ -302,7 +297,7 @@
             continue
         else:
             if state == 1:
-                for i, l  in zip(uniqueHD, nr):
+                for i, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 0] = j[1]
@@ -339,16 +334,17 @@

         sumRow = count.sum(axis=1)
         sumCol = count.sum(axis=0)
-        first = ["{}{}".format(row_label,i) for i in uniqueHD]
+        first = ["{}{}".format(row_label, i) for i in uniqueHD]
         final = numpy.column_stack((first, count, sumRow))

     return (final, sumCol)

+
 def createTableHDwithTags(list1):
     selfAB = numpy.concatenate(list1)
     uniqueHD = numpy.unique(selfAB)
     nr = numpy.arange(0, len(uniqueHD), 1)
-    count = numpy.zeros((len(uniqueHD), 3))
+    count = numpy.zeros((len(uniqueHD), 5))

     state = 1
     for i in list1:
@@ -361,7 +357,7 @@
             continue
         else:
             if state == 1:
-                for i, l  in zip(uniqueHD, nr):
+                for i, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 0] = j[1]
@@ -370,12 +366,22 @@
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 1] = j[1]
-
             if state == 3:
                 for i, l in zip(uniqueHD, nr):
                     for j in table:
                         if j[0] == uniqueHD[l]:
                             count[l, 2] = j[1]
+            if state == 4:
+                for i, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 3] = j[1]
+            if state == 5:
+                for i, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 4] = j[1]
+
             state = state + 1

         sumRow = count.sum(axis=1)
@@ -385,42 +391,45 @@

     return (final, sumCol)

-def createFileHD(summary, sumCol, overallSum, output_file, name,sep):
+
+def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
     output_file.write(name)
     output_file.write("\n")
-    output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep))
+    output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
     for item in summary:
         for nr in item:
             if "HD" not in nr and "diff" not in nr:
                 nr = nr.astype(float)
                 nr = nr.astype(int)
-            output_file.write("{}{}".format(nr,sep))
+            output_file.write("{}{}".format(nr, sep))
         output_file.write("\n")
     output_file.write("sum{}".format(sep))
     sumCol = map(int, sumCol)
     for el in sumCol:
-        output_file.write("{}{}".format(el,sep))
-    output_file.write("{}{}".format(overallSum.astype(int),sep))
+        output_file.write("{}{}".format(el, sep))
+    output_file.write("{}{}".format(overallSum.astype(int), sep))
     output_file.write("\n\n")

-def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep):
+
+def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
     output_file.write(name)
     output_file.write("\n")
-    output_file.write("{}HD a+b;HD a{}HD b{}sum{}\n".format(sep,sep,sep,sep))
+    output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
     for item in summary:
         for nr in item:
             if "HD" not in nr:
                 nr = nr.astype(float)
                 nr = nr.astype(int)
-            output_file.write("{}{}".format(nr,sep))
+            output_file.write("{}{}".format(nr, sep))
         output_file.write("\n")
     output_file.write("sum{}".format(sep))
     sumCol = map(int, sumCol)
     for el in sumCol:
-        output_file.write("{}{}".format(el,sep))
-    output_file.write("{}{}".format(overallSum.astype(int),sep))
+        output_file.write("{}{}".format(el, sep))
+    output_file.write("{}{}".format(overallSum.astype(int), sep))
     output_file.write("\n\n")
-
+
+
 def hamming(array1, array2):
     res = 99 * numpy.ones(len(array1))
     i = 0
@@ -428,28 +437,28 @@
     for a in array1:
         dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2])  # fastest
         res[i] = numpy.amin(dist[dist > 0])  # pick min distance greater than zero
-        #print(i)
+        # print(i)
         i += 1
     return res

+
 def hamming_difference(array1, array2, mate_b):
     array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
-    array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1
-    array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2
+    array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1])  # mate1 part1
+    array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1])  # mate1 part 2

-    array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1
+    array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2])  # mate2 part1
     array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2])  # mate2 part2

-    #diff11 = 999 * numpy.ones(len(array2))
-    #relativeDiffList = 999 * numpy.ones(len(array2))
-    #ham1 = 999 * numpy.ones(len(array2))
-    #ham2 = 999 * numpy.ones(len(array2))
-    #min_valueList = 999 * numpy.ones(len(array2))
-    #min_tagsList = 999 * numpy.ones(len(array2))
-    #diff11_zeros = 999 * numpy.ones(len(array2))
-    #min_tagsList_zeros = 999 * numpy.ones(len(array2))
-
-
+    # diff11 = 999 * numpy.ones(len(array2))
+    # relativeDiffList = 999 * numpy.ones(len(array2))
+    # ham1 = 999 * numpy.ones(len(array2))
+    # ham2 = 999 * numpy.ones(len(array2))
+    # min_valueList = 999 * numpy.ones(len(array2))
+    # min_tagsList = 999 * numpy.ones(len(array2))
+    # diff11_zeros = 999 * numpy.ones(len(array2))
+    # min_tagsList_zeros = 999 * numpy.ones(len(array2))
+
     diff11 = []
     relativeDiffList = []
     ham1 = []
@@ -460,20 +469,20 @@
     min_tagsList = []
     diff11_zeros = []
     min_tagsList_zeros = []
-    i = 0 # counter, only used to see how many HDs of tags were already calculated
-    if mate_b is False: # HD calculation for all a's
+    i = 0  # counter, only used to see how many HDs of tags were already calculated
+    if mate_b is False:  # HD calculation for all a's
         half1_mate1 = array1_half
         half2_mate1 = array1_half2
         half1_mate2 = array2_half
         half2_mate2 = array2_half2
-    elif mate_b is True: # HD calculation for all b's
+    elif mate_b is True:  # HD calculation for all b's
         half1_mate1 = array1_half2
         half2_mate1 = array1_half
         half1_mate2 = array2_half2
         half2_mate2 = array2_half

     for a, b, tag in zip(half1_mate1, half2_mate1, array1):
-        ## exclude identical tag from array2, to prevent comparison to itself
+        # exclude identical tag from array2, to prevent comparison to itself
         sameTag = numpy.where(array2 == tag)
         indexArray2 = numpy.arange(0, len(array2), 1)
         index_withoutSame = numpy.delete(indexArray2, sameTag)  # delete identical tag from the data
@@ -481,63 +490,59 @@
         # all tags without identical tag
         array2_half_withoutSame = half1_mate2[index_withoutSame]
         array2_half2_withoutSame = half2_mate2[index_withoutSame]
-        #array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs)
+        # array2_withoutSame = array2[index_withoutSame]  # whole tag (=not splitted into 2 halfs)

         dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
                             array2_half_withoutSame])  # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
-        min_index = numpy.where(dist == dist.min()) # get index of min HD
+        min_index = numpy.where(dist == dist.min())  # get index of min HD
         min_value = dist[min_index]  # get minimum HDs
         min_tag_half2 = array2_half2_withoutSame[min_index]  # get all "b's" of the tag or all "a's" of the tag with minimum HD
-        #min_tag = array2_withoutSame[min_index] # get whole tag with min HD
+        # min_tag = array2_withoutSame[min_index]  # get whole tag with min HD

         dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
                              min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
-        for d_1, d_2 in zip(min_value, dist2):
+        for d, d2 in zip(min_value, dist2):
             if mate_b is True:  # half2, corrects the variable of the HD from both halfs if it is a or b
-                d = d_2
-                d2 = d_1
                 ham2.append(d)
                 ham2min.append(d2)
             else:  # half1, corrects the variable of the HD from both halfs if it is a or b
-                d = d_1
-                d2 = d_2
                 ham1.append(d)
                 ham1min.append(d2)
-
+
             min_valueList.append(d + d2)
             min_tagsList.append(tag)
-           # ham1.append(d)
-        #    ham2.append(d2)
             difference1 = abs(d - d2)
             diff11.append(difference1)
             rel_difference = round(float(difference1) / (d + d2), 1)
             relativeDiffList.append(rel_difference)

-            #### tags which have identical parts:
+            # tags which have identical parts:
             if d == 0 or d2 == 0:
                 min_tagsList_zeros.append(tag)
                 difference1_zeros = abs(d - d2)
                 diff11_zeros.append(difference1_zeros)
         i += 1
-
-        #print(i)
-    #diff11 = [st for st in diff11 if st != 999]
-    #ham1 = [st for st in ham1 if st != 999]
-    #ham2 = [st for st in ham2 if st != 999]
-    #min_valueList = [st for st in min_valueList if st != 999]
-    #min_tagsList = [st for st in min_tagsList if st != 999]
-    #relativeDiffList = [st for st in relativeDiffList if st != 999]
-    #diff11_zeros = [st for st in diff11_zeros if st != 999]
-    #min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
+
+    # print(i)
+    # diff11 = [st for st in diff11 if st != 999]
+    # ham1 = [st for st in ham1 if st != 999]
+    # ham2 = [st for st in ham2 if st != 999]
+    # min_valueList = [st for st in min_valueList if st != 999]
+    # min_tagsList = [st for st in min_tagsList if st != 999]
+    # relativeDiffList = [st for st in relativeDiffList if st != 999]
+    # diff11_zeros = [st for st in diff11_zeros if st != 999]
+    # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]

     return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min])

+
 def readFileReferenceFree(file):
     with open(file, 'r') as dest_f:
         data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
         integers = numpy.array(data_array[:, 0]).astype(int)
         return(integers, data_array)

+
 def hammingDistanceWithFS(fs, ham):
     fs = numpy.asarray(fs)
     maximum = max(ham)
@@ -565,7 +570,8 @@
     list1 = [data, data2, data3, data4, data5, data6]
     return(list1, maximum, minimum)

-def familySizeDistributionWithHD(fs, ham, diff=False, rel = True):
+
+def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
     hammingDistances = numpy.unique(ham)
     fs = numpy.asarray(fs)

@@ -616,49 +622,49 @@
     data6 = fs[hd6]

     if diff is True:
-        list1 = [data0,data, data2, data3, data4, data5, data6]
+        list1 = [data0, data, data2, data3, data4, data5, data6]
     else:
         list1 = [data, data2, data3, data4, data5, data6]

     return(list1, hammingDistances, maximum, minimum)

+
 def make_argparser():
     parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
     parser.add_argument('--inputFile',
                         help='Tabular File with three columns: ab or ba, tag and family size.')
     parser.add_argument('--inputName1')
-    parser.add_argument('--inputFile2',default=None,
+    parser.add_argument('--inputFile2', default=None,
                         help='Tabular File with three columns: ab or ba, tag and family size.')
     parser.add_argument('--inputName2')
-    parser.add_argument('--sample_size', default=1000,type=int,
+    parser.add_argument('--sample_size', default=1000, type=int,
                         help='Sample size of Hamming distance analysis.')
-    parser.add_argument('--sep', default=",",
-                        help='Separator in the csv file.')
-    parser.add_argument('--subset_tag', default=0,type=int,
+    parser.add_argument('--subset_tag', default=0, type=int,
                         help='The tag is shortened to the given number.')
-    parser.add_argument('--nproc', default=4,type=int,
+    parser.add_argument('--nproc', default=4, type=int,
                         help='The tool runs with the given number of processors.')
-    parser.add_argument('--only_DCS', action="store_false",  # default=False, type=bool,
+    parser.add_argument('--only_DCS', action="store_false",
                         help='Only tags of the DCSs are included in the HD analysis')

     parser.add_argument('--minFS', default=1, type=int,
                         help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis')
     parser.add_argument('--maxFS', default=0, type=int,
                         help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis')
-    parser.add_argument('--nr_above_bars', action="store_true",  # default=False, type=bool,
+    parser.add_argument('--nr_above_bars', action="store_true",
                         help='If no, values above bars in the histrograms are removed')
-
-    parser.add_argument('--output_csv', default="data.csv", type=str,
-                        help='Name of the csv file.')
+
+    parser.add_argument('--output_tabular', default="data.tabular", type=str,
+                        help='Name of the tabular file.')
     parser.add_argument('--output_pdf', default="data.pdf", type=str,
                         help='Name of the pdf file.')
     parser.add_argument('--output_pdf2', default="data2.pdf", type=str,
                         help='Name of the pdf file.')
-    parser.add_argument('--output_csv2', default="data2.csv", type=str,
-                        help='Name of the csv file.')
+    parser.add_argument('--output_tabular2', default="data2.tabular", type=str,
+                        help='Name of the tabular file.')

     return parser

+
 def Hamming_Distance_Analysis(argv):
     parser = make_argparser()
     args = parser.parse_args(argv[1:])
@@ -673,20 +679,19 @@
     title_savedFile_pdf = args.output_pdf
     title_savedFile_pdf2 = args.output_pdf2

-    title_savedFile_csv = args.output_csv
-    title_savedFile_csv2 = args.output_csv2
+    title_savedFile_csv = args.output_tabular
+    title_savedFile_csv2 = args.output_tabular2

-    sep = args.sep
+    sep = "\t"
     onlyDuplicates = args.only_DCS
     minFS = args.minFS
     maxFS = args.maxFS
     nr_above_bars = args.nr_above_bars
-

     subset = args.subset_tag
     nproc = args.nproc

-    ### input checks
+    # input checks
     if index_size < 0:
         print("index_size is a negative integer.")
         exit(2)
@@ -695,15 +700,11 @@
         print("nproc is smaller or equal zero")
         exit(3)

-    if type(sep) is not str or len(sep)>1:
-        print("sep must be a single character.")
-        exit(4)
-
     if subset < 0:
         print("subset_tag is smaller or equal zero.")
         exit(5)

-    ### PLOT ###
+    # PLOT
     plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
     plt.rcParams['xtick.labelsize'] = 14
     plt.rcParams['ytick.labelsize'] = 14
@@ -763,7 +764,7 @@
                 integers = numpy.array(data_array[:, 0]).astype(int)
                 print("DCS in whole dataset", len(data_array))

-            ## HD analysis for a subset of the tag
+            # HD analysis for a subset of the tag
             if subset > 0:
                 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
                 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])
@@ -789,11 +790,10 @@
             if index_size == 0:
                 result = numpy.arange(0, len(data_array), 1)
             else:
-                result = numpy.random.choice(len(integers), size=index_size,
-                                             replace=False)  # array of random sequences of size=index.size
+                result = numpy.random.choice(len(integers), size=index_size, replace=False)  # array of random sequences of size=index.size

-           # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
-            #    pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
+            # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
+            #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)

             # comparison random tags to whole dataset
             result1 = data_array[result, 1]  # random tags
@@ -808,9 +808,9 @@
             proc_pool.close()
             proc_pool.join()
             ham = numpy.concatenate(ham).astype(int)
-          #  with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
-           #     for h, tag in zip(ham, result1):
-            #        output_file1.write("{}\t{}\n".format(tag, h))
+            # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
+            # for h, tag in zip(ham, result1):
+            #     output_file1.write("{}\t{}\n".format(tag, h))

             # HD analysis for chimeric reads
             proc_pool_b = Pool(nproc)
@@ -834,14 +834,9 @@
                                             numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
             minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
                                                   numpy.concatenate([item_b[7] for item_b in diff_list_b])))
-            HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
-                                             numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
+            HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
             HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
                                             numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
-         #   with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2:
-          #      for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags):
-           #         output_file2.write(
-            #            "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d))

             lenTags = len(data_array)

@@ -856,10 +851,10 @@

             # prepare data for different kinds of plots
             # distribution of FSs separated after HD
-            familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham,rel=False)
+            familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
             list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS
-
-            ## get FS for all tags with min HD of analysis of chimeric reads
+
+            # get FS for all tags with min HD of analysis of chimeric reads
             # there are more tags than sample size in the plot, because one tag can have multiple minimas
             seqDic = dict(zip(seq, quant))
             lst_minHD_tags = []
@@ -883,34 +878,23 @@
                 for i in minHD_tags_zeros:
                     lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads

-                # histogram with HD of non-identical half
-                listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
-                    lst_minHD_tags_zeros, diff_zeros)
-                # family size distribution of non-identical half
-                familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(
-                    lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)
-
-            #####################################################################################################################
-            ##################         plot Hamming Distance with Family size distribution         ##############################
-            #####################################################################################################################
-            plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
-                          subtitle="Hamming distance separated by family size", title_file1=name_file,
-                          lenTags=lenTags,xlabel="HD", nr_above_bars=nr_above_bars)
+            # histogram with HD of non-identical half
+            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros)
+            # family size distribution of non-identical half
+            familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)

-            ##########################       Plot FSD with separation after        ###############################################
-            ######################################################################################################################
+            # plot Hamming Distance with Family size distribution
+            plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, subtitle="Hamming distance separated by family size", title_file1=name_file, lenTags=lenTags, xlabel="HD", nr_above_bars=nr_above_bars)
+
+            # Plot FSD separated by Hamming distance
             plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
                            originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
-                           pdf=pdf,relative=False, title_file1=name_file, diff=False)
+                           pdf=pdf, relative=False, title_file1=name_file, diff=False)

-            ##########################       Plot HD within tags          ########################################################
-            ######################################################################################################################
-           # plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)
-            plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2min , HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)
-
-
-            ##########################       Plot difference between HD's separated after FSD ####################################
-            ######################################################################################################################
+            # Plot HD within tags
+            plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)
+
+            # Plot difference between HD's separated after FSD
             plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
                           subtitle="Delta Hamming distance within tags",
                           title_file1=name_file, lenTags=lenTags,
@@ -921,44 +905,27 @@
                           title_file1=name_file, lenTags=lenTags,
                           xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars)

-            ####################       Plot FSD separated after difference between HD's        #####################################
-            ########################################################################################################################
-          #  plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff,
-           #                subtitle="Family size distribution separated by delta Hamming distances within the tags",
-            #               pdf=pdf,relative=False, diff=True, title_file1=name_file, originalCounts=quant)
-
-         #   plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, originalCounts=quant, pdf=pdf,
-          #                 subtitle="Family size distribution separated by delta Hamming distances within the tags",
-#                           relative=True, diff=True, title_file1=name_file)
-
-
             # plots for chimeric reads
             if len(minHD_tags_zeros) != 0:
-                ## HD
+                # HD
                 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
                               subtitle="Hamming distance of the non-identical half of chimeras",
-                              title_file1=name_file, lenTags=lenTags,xlabel="HD", relative=False, nr_above_bars=nr_above_bars)
+                              title_file1=name_file, lenTags=lenTags, xlabel="HD", relative=False, nr_above_bars=nr_above_bars)

-                ## FSD
-           #     plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros,
-            #                   originalCounts=quant, pdf=pdf,
-             #                  subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras",
-              #                 relative=False, diff=False, title_file1=name_file)
-
-            ### print all data to a CSV file
-            #### HD ####
+            # write all data to the tabular output file
+            # HD
             summary, sumCol = createTableHD(list1, "HD=")
             overallSum = sum(sumCol)  # sum of columns in table

-            #### FSD ####
+            # FSD
             summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
             overallSum5 = sum(sumCol5)

-            ### HD of both parts of the tag ####
-            summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf2,numpy.array(minHDs)])
+            # HD of both parts of the tag
+            summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)])
             overallSum9 = sum(sumCol9)

-            ## HD
+            # HD
             # absolute difference
             summary11, sumCol11 = createTableHD(listDifference1, "diff=")
             overallSum11 = sum(sumCol11)
@@ -966,44 +933,33 @@
             summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
             overallSum13 = sum(sumCol13)

-            ## FSD
-            # absolute difference
-        #    summary19, sumCol19 = createTableFSD2(familySizeList1_diff)
-        #    overallSum19 = sum(sumCol19)
-            # relative difference
-         #   summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff)
-          #  overallSum21 = sum(sumCol21)
-
             # chimeric reads
             if len(minHD_tags_zeros) != 0:
                 # absolute difference and tags where at least one half has HD=0
                 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")
                 overallSum15 = sum(sumCol15)
-                # absolute difference and tags where at least one half has HD=0
-           #     summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False)
-            #    overallSum23 = sum(sumCol23)

             output_file.write("{}\n".format(name_file))
             output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
                 numpy.concatenate(list1)), lenTags, lenTags))

-            ### HD ###
+            # HD
             createFileHD(summary, sumCol, overallSum, output_file,
                          "Hamming distance separated by family size", sep)
-            ### FSD ###
+            # FSD
             createFileFSD2(summary5, sumCol5, overallSum5, output_file,
                            "Family size distribution separated by Hamming distance", sep,
                            diff=False)

             count = numpy.bincount(quant)
-            #output_file.write("{}{}\n".format(sep, name_file))
+            # output_file.write("{}{}\n".format(sep, name_file))
             output_file.write("\n")
             output_file.write("max. family size:{}{}\n".format(sep, max(quant)))
             output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
             output_file.write(
                 "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))

-            ### HD within tags ###
+            # HD within tags
             output_file.write(
                 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
                 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
@@ -1019,24 +975,13 @@
             createFileHD(summary13, sumCol13, overallSum13, output_file,
                          "Chimera analysis: relative delta Hamming distances", sep)

-        #    createFileFSD2(summary19, sumCol19, overallSum19, output_file,
-         #                  "Family size distribution separated by absolute delta Hamming distance",
-          #                 sep)
-          #  createFileFSD2(summary21, sumCol21, overallSum21, output_file,
-           #                "Family size distribution separated by relative delta Hamming distance",
-            #               sep, rel=True)
-
             if len(minHD_tags_zeros) != 0:
                 output_file.write(
                     "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
                 createFileHD(summary15, sumCol15, overallSum15, output_file,
                              "Hamming distances of non-zero half", sep)
-         #       createFileFSD2(summary23, sumCol23, overallSum23, output_file,
-          #                     "Family size distribution separated by Hamming distance of non-zero half",
-           #                    sep, diff=False)
             output_file.write("\n")


-
 if __name__ == '__main__':
     sys.exit(Hamming_Distance_Analysis(sys.argv))
--- a/hd.xml	Wed May 23 14:47:43 2018 -0400
+++ b/hd.xml	Mon Oct 08 05:56:04 2018 -0400
@@ -1,33 +1,32 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="hd" name="Duplex Sequencing Analysis: hd" version="0.0.19">
+<tool id="hd" name="Duplex Sequencing Analysis: hd" version="1.0.0">
+    <description>Hamming distance (HD) analysis of tags</description>
     <requirements>
         <requirement type="package" version="2.7">python</requirement>
-        <requirement type="package" version="1.4">matplotlib</requirement>
+        <requirement type="package" version="1.4.0">matplotlib</requirement>
     </requirements>
-    <description>Hamming distance (HD) analysis of tags</description>
     <command>
-        python2 $__tool_directory__/hd.py --inputFile "$inputFile" --inputName1 "$inputFile.name" --inputFile2 "$inputFile2" --inputName2 "$inputFile2.name" --sample_size $sampleSize --sep $separator --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS
-		$nr_above_bars --output_pdf $output_pdf --output_csv $output_csv
+        python2 '$__tool_directory__/hd.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --inputFile2 '$inputFile2' --inputName2 '$inputFile2.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS
+		$nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular
         #if $inputFile2:
-        --output_pdf2 $output_pdf2 --output_csv2 $output_csv2
+        --output_pdf2 $output_pdf2 --output_tabular2 $output_tabular2
         #end if
     </command>
     <inputs>
         <param name="inputFile" type="data" format="tabular" label="Dataset 1: input tags" optional="false"/>
-        <param name="inputFile2" type="data" format="tabular" label="Dataset 2: input tags" optional="true" help="Input in tabular format with the family size, tags and the direction of the strand ('ab' or 'ba') for each family."/>
+        <param name="inputFile2" type="data" format="tabular" label="Dataset 2: input tags" optional="true" help="Input in tabular format with the family size, tag and the direction of the strand ('ab' or 'ba') for each family."/>
         <param name="sampleSize" type="integer" label="number of tags in the sample" value="1000" min="0" help="specifies the number of tags in one analysis. If sample size is 0, all tags of the dataset are compared against all tags."/>
-        <param name="minFS" type="integer" label="minimum family size of the tags" min="1" value="1" help="filters the tags after their family size: Families with smaller size are skipped. Default: min. family size = 1."/>
-        <param name="maxFS" type="integer" label="max family size of the tags" min="0" value="0" help="filters the tags after their family size: Families with larger size are skipped. If max. family size is 0, no upper bound is defined and the maximum family size in the analysis will be the maximum family size of the whole dataset. Default: max. family size = 0."/>
-        <param name="separator" type="text" label="Separator of the CSV file." help="can be a single character" value=","/>
-        <param name="onlyDCS" type="boolean" label="only DCS in the analysis?" truevalue="" falsevalue="--only_DCS" checked="False" help="Only tags, which have a partner tag in the dataset, are included in the analysis."/>
-        <param name="subsetTag" type="integer" label="shorten tag in the analysis?" value="0" help="An analysis with shorter tag length, which is specified by this parameter, is simulated. If this parameter is 0 (by default), the tag with its original length is used in the analysis."/>
+        <param name="minFS" type="integer" label="minimum family size of the tags" min="1" value="1" help="filters the tags after their family size: Families with a smaller size are skipped. Default: min. family size = 1."/>
+        <param name="maxFS" type="integer" label="max family size of the tags" min="0" value="0" help="filters the tags after their family size: Families with a larger size are skipped. If max. family size is 0, no upper bound is defined and the maximum family size in the analysis will be the maximum family size of the whole dataset. Default: max. family size = 0."/>
+        <param name="onlyDCS" type="boolean" label="only DCS in the analysis?" truevalue="" falsevalue="--only_DCS" checked="False" help="Only tags, which have a partner tag (ab and ba) in the dataset, are included in the analysis."/>
+        <param name="subsetTag" type="integer" label="shorten tag in the analysis?" value="0" help="This parameter simulates an analysis with a shorter tag length. If this parameter is 0 (the default), the tags are used with their original length."/>
         <param name="nproc" type="integer" label="number of processors" value="8" help="Number of processor used for computing."/>
-        <param name="nr_above_bars" type="boolean" label="include numbers above bars?" truevalue="--nr_above_bars" falsevalue="" checked="True" help="The absolute and relative values of the bar can be included or removed in the plot. "/>
+        <param name="nr_above_bars" type="boolean" label="include numbers above bars?" truevalue="--nr_above_bars" falsevalue="" checked="True" help="The absolute and relative values of the data can be included or removed from the plots. "/>

     </inputs>
     <outputs>
-        <data name="output_csv" format="csv"/>
-        <data name="output_csv2" format="csv">
+        <data name="output_tabular" format="tabular"/>
+        <data name="output_tabular2" format="tabular">
             <filter>inputFile2</filter>
         </data>
         <data name="output_pdf" format="pdf" />
@@ -35,19 +34,30 @@
             <filter>inputFile2</filter>
         </data>
     </outputs>
+    <tests>
+        <test>
+            <param name="inputFile" value="Test_data.tabular"/>
+            <param name="inputFile2" value="Test_data2.tabular"/>
+            <param name="sampleSize" value="0"/>
+            <output name="output_pdf" file="output_file.pdf" lines_diff="6"/>
+            <output name="output_tabular" file="output_file.tabular"/>
+            <output name="output_pdf2" file="output_file2.pdf" lines_diff="6"/>
+            <output name="output_tabular2" file="output_file2.tabular"/>
+        </test>
+    </tests>
     <help> <![CDATA[
 **What it does**

     This tool calculates the Hamming distance for the tags by comparing them to all tags in the dataset and finally searches for the minimum Hamming distance.
     The Hamming distance is shown in a histogram separated by the family sizes or in a family size distribution separated by the Hamming distances.
     This similarity measure was calculated for each tag to distinguish whether similar tags truly stem from different molecules or occurred due to sequencing or PCR errors.
-    In addition the tags of chimeric reads can be identified by calculating the Hamming distance for each half of the tag.
+    In addition, the tags of chimeric reads can be identified by calculating the Hamming distance for each half of the tag.
     This analysis can be performed on only a sample (by default: sample size=1000) or on the whole dataset (sample size=0).
-    It is also possible to select on only those tags, which have a partner tag in the dataset (DCSs) or to filter the dataset after the tag's family size.
+    It is also possible to select only those tags that have a partner tag (ab and ba) in the dataset (DCSs), or to filter the dataset by the tags' family size.

 **Input**

-    This tools expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands.
+    This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. It is possible to upload two files, which allows two analyses to be performed at the same time.

     +-----+----------------------------+----+
     | 1   | AAAAAAAAAAAATGTTGGAATCTT   | ba |
@@ -60,7 +70,7 @@

 **Output**

-    The output is one PDF file with the plots of the Hamming distance and a CSV with the data of the plot for each dataset.
+    The output is one PDF file with the plots of the Hamming distance and a tabular file with the data of the plot for each dataset.


 **About Author**
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data.tabular	Mon Oct 08 05:56:04 2018 -0400
@@ -0,0 +1,20 @@
+1	AAAAAAAAAAAAAACCAAAACTTC	ba
+1	AAAAAAAAAAAAACCAGGCGTCGA	ba
+1	AAAAAAAAAAAAAGCTCCACGTTG	ba
+1	AAAAAAAAAAAAATCGTGGTTTGT	ba
+1	AAAAAAAAAAAAATTCACCCTTGT	ba
+7	AAAAAAAAAAAACACACTTAACTT	ba
+1	AAAAAAAAAAAACAGTGTTGAGAC	ba
+4	AAAAAAAAAAAACCGCTCCTCACA	ba
+1	AAAAAAAAAAAAGGCAACACAGAA	ab
+2	AAAAAAAAAAAATCTTTCTTTGAG	ab
+1	AAAAAAAAAAAATTGGGTTCCTTA	ab
+1	AAAAAAAAAAAGAGTCGCACCCAG	ba
+4	AAAAAAAAAAAGATCGTGGTTTGT	ba
+1	AAAAAAAAAAAGCGCAACACAGAA	ab
+3	AAAAAAAAAAAGGGCAACACAGAA	ab
+1	AAAAAAAAAAAGTAGCCCTAAACG	ab
+1	AAAAAAAAAAAGTCTTTCTTTGAG	ab
+1	AAAAAAAAAAATATCATAGACTCT	ab
+6	AAAAAAAAAAATATTCACCCTTGT	ba
+1	AAAAAAAAAAATATTCGAAAGTTA	ba
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data2.tabular	Mon Oct 08 05:56:04 2018 -0400
@@ -0,0 +1,20 @@
+1	AAAAAAAACCGCCCAACTGCCGGT	ab
+5	AAAAAAAACCTCTCAACCCCAAAT	ba
+7	AAAAAAAACCTCTTGCGATGTTGT	ab
+1	AAAAAAAACCTCTTGCGCTGTTGT	ab
+1	AAAAAAAACCTCTTGTGATGTTGT	ab
+12	AAAAAAAACCTGAGCAATGGTTCC	ab
+3	AAAAAAAACCTTGACCCTCACATG	ba
+6	AAAAAAAACCTTGCACTCGTCCTA	ba
+9	AAAAAAAACGAAATAAAAAAACCT	ba
+1	AAAAAAAACGACCGGCCTTAGACA	ba
+4	AAAAAAAACGCCACCACCCCCTTT	ab
+12	AAAAAAAACGCCACGGGCACTATT	ba
+13	AAAAAAAACGTATCAGTAGATCCT	ab
+1	AAAAAAAACTAGTAGGATTTCATG	ba
+3	AAAAAAAACTATAGAAAATCCATT	ba
+1	AAAAAAAACTATTCTATTTCCGAT	ba
+13	AAAAAAAACTGATCTGCTTGGCGG	ba
+8	AAAAAAAACTTGCGAATAGCATCG	ba
+4	AAAAAAAACTTGTTATCAAAACGT	ab
+1	AAAAAAAAGAAAAGTTCAACACGC	ba
\ No newline at end of file
Binary file test-data/output_file.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_file.tabular	Mon Oct 08 05:56:04 2018 -0400
@@ -0,0 +1,85 @@
+Test_data
+number of tags per file	20 (from 20) against 20
+
+Hamming distance separated by family size
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+HD=1	5	1	1	1	1	0	9
+HD=6	3	0	0	0	0	0	3
+HD=7	4	0	0	0	1	0	5
+HD=8	2	0	0	1	0	0	3
+sum	14	1	1	2	2	0	20
+
+Family size distribution separated by Hamming distance
+	HD=1	HD=2	HD=3	HD=4	HD=5-8	HD>8	sum
+FS=1	5	0	0	0	9	0	14
+FS=2	1	0	0	0	0	0	1
+FS=3	1	0	0	0	0	0	1
+FS=4	1	0	0	0	1	0	2
+FS=6	1	0	0	0	0	0	1
+FS=7	0	0	0	0	1	0	1
+sum	9	0	0	0	11	0	20
+
+
+max. family size:	7
+absolute frequency:	1
+relative frequency:	0.05
+
+The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.
+It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.
+actual number of tags with min HD = 171 (sample size by user = 20)
+length of one part of the tag = 12
+
+Hamming distance of each half in the tag
+	HD a	HD b'	HD b	HD a'	HD a+b	sum
+HD=0	146	0	8	4	0	158
+HD=1	0	2	2	21	11	36
+HD=2	0	0	0	0	1	1
+HD=5	0	0	4	0	0	4
+HD=6	0	2	2	0	6	10
+HD=7	0	16	9	0	21	46
+HD=8	0	20	0	0	26	46
+HD=9	0	50	0	0	50	100
+HD=10	0	30	0	0	30	60
+HD=11	0	18	0	0	18	36
+HD=12	0	8	0	0	8	16
+sum	146	146	25	25	171	513
+
+Absolute delta Hamming distances within the tag
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+diff=0	1	0	0	0	0	0	1
+diff=1	6	1	2	1	1	0	11
+diff=4	4	0	0	0	0	0	4
+diff=5	2	0	0	0	0	0	2
+diff=6	6	0	0	1	1	0	8
+diff=7	15	0	1	0	3	0	19
+diff=8	15	2	0	1	2	0	20
+diff=9	37	4	1	4	4	0	50
+diff=10	22	2	1	4	1	0	30
+diff=11	8	1	1	5	3	0	18
+diff=12	6	1	0	1	0	0	8
+sum	122	11	6	17	15	0	171
+
+Chimera analysis: relative delta Hamming distances
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+diff=0.0	1	0	0	0	0	0	1
+diff=0.7	6	0	0	0	0	0	6
+diff=0.8	4	0	0	1	1	0	6
+diff=1.0	111	11	6	16	14	0	158
+sum	122	11	6	17	15	0	171
+
+Chimeras:
+All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.
+So the hamming distance of the non-identical half is compared.
+Hamming distances of non-zero half
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+HD=1	6	1	2	1	1	0	11
+HD=6	2	0	0	0	0	0	2
+HD=7	15	0	1	0	3	0	19
+HD=8	15	2	0	1	2	0	20
+HD=9	37	4	1	4	4	0	50
+HD=10	22	2	1	4	1	0	30
+HD=11	8	1	1	5	3	0	18
+HD=12	6	1	0	1	0	0	8
+sum	111	11	6	16	14	0	158
+
+
Binary file test-data/output_file2.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_file2.tabular	Mon Oct 08 05:56:04 2018 -0400
@@ -0,0 +1,97 @@
+Test_data2
+number of tags per file	20 (from 20) against 20
+
+Hamming distance separated by family size
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+HD=1	2	0	0	0	1	0	3
+HD=6	0	0	0	1	0	1	2
+HD=7	2	0	1	1	2	1	7
+HD=8	1	0	1	0	2	1	5
+HD=9	1	0	0	0	0	1	2
+HD=10	1	0	0	0	0	0	1
+sum	7	0	2	2	5	4	20
+
+Family size distribution separated by Hamming distance
+	HD=1	HD=2	HD=3	HD=4	HD=5-8	HD>8	sum
+FS=1	2	0	0	0	3	2	7
+FS=3	0	0	0	0	2	0	2
+FS=4	0	0	0	0	2	0	2
+FS=5	0	0	0	0	1	0	1
+FS=6	0	0	0	0	1	0	1
+FS=7	1	0	0	0	0	0	1
+FS=8	0	0	0	0	1	0	1
+FS=9	0	0	0	0	1	0	1
+FS=12	0	0	0	0	2	0	2
+FS=13	0	0	0	0	1	1	2
+sum	3	0	0	0	14	3	20
+
+
+max. family size:	13
+absolute frequency:	2
+relative frequency:	0.1
+
+The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.
+It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.
+actual number of tags with min HD = 79 (sample size by user = 20)
+length of one part of the tag = 12
+
+Hamming distance of each half in the tag
+	HD a	HD b'	HD b	HD a'	HD a+b	sum
+HD=0	20	0	0	5	0	25
+HD=1	22	4	4	3	8	41
+HD=2	9	2	0	9	2	22
+HD=3	0	0	0	10	0	10
+HD=4	0	0	2	1	0	3
+HD=5	0	0	5	0	0	5
+HD=6	0	5	7	0	3	15
+HD=7	0	7	10	0	10	27
+HD=8	0	6	0	0	10	16
+HD=9	0	7	0	0	17	24
+HD=10	0	11	0	0	13	24
+HD=11	0	8	0	0	7	15
+HD=12	0	1	0	0	5	6
+HD=13	0	0	0	0	4	4
+sum	51	51	28	28	79	237
+
+Absolute delta Hamming distances within the tag
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+diff=1	5	0	0	1	5	0	11
+diff=2	4	0	0	0	0	0	4
+diff=3	1	0	2	1	1	0	5
+diff=4	1	0	1	0	2	1	5
+diff=5	2	0	0	0	4	6	12
+diff=6	1	0	0	1	1	7	10
+diff=7	2	0	1	0	0	0	3
+diff=8	0	0	1	0	1	3	5
+diff=9	6	0	0	1	3	4	14
+diff=10	4	0	0	0	3	2	9
+diff=11	0	0	0	0	0	1	1
+sum	26	0	5	4	20	24	79
+
+Chimera analysis: relative delta Hamming distances
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+diff=0.1	1	0	0	1	1	0	3
+diff=0.3	3	0	2	0	0	0	5
+diff=0.4	1	0	0	1	3	0	5
+diff=0.5	0	0	1	0	0	1	2
+diff=0.6	1	0	0	0	3	7	11
+diff=0.7	1	0	0	0	1	5	7
+diff=0.8	10	0	0	0	2	9	21
+diff=1.0	9	0	2	2	10	2	25
+sum	26	0	5	4	20	24	79
+
+Chimeras:
+All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.
+So the hamming distance of the non-identical half is compared.
+Hamming distances of non-zero half
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+HD=1	4	0	0	0	4	0	8
+HD=2	2	0	0	0	0	0	2
+HD=6	0	0	0	1	0	2	3
+HD=7	1	0	1	0	0	0	2
+HD=8	0	0	1	0	1	0	2
+HD=9	1	0	0	1	2	0	4
+HD=10	1	0	0	0	3	0	4
+sum	9	0	2	2	10	2	25
+
+