changeset 20:b084b6a8e3ac draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit e76960d95c059a78d880ed5ecd6202f54b091025
author mheinzl
date Fri, 14 Dec 2018 04:31:21 -0500
parents 2e9f7ea7ae93
children 9919024d7778
files hd.py hd.xml test-data/Test_data2.tabular test-data/output_file2.pdf test-data/output_file2.tabular
diffstat 5 files changed, 301 insertions(+), 400 deletions(-)
--- a/hd.py	Mon Oct 08 05:56:04 2018 -0400
+++ b/hd.py	Fri Dec 14 04:31:21 2018 -0500
@@ -13,8 +13,8 @@
 # It is also possible to perform the HD analysis with shortened tags with given sizes as input.
 # The tool can run on a certain number of processors, which can be defined by the user.
 
-# USAGE: python hd.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
-#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular --output_pdf outputfile_name_pdf
+# USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
+#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf
 
 import argparse
 import itertools
@@ -174,7 +174,7 @@
     plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
     # plt.ylim(0, maximumY * 1.2)
 
-    legend = "sample size= {:,} against {:,}".format(sum(ham_partial[4]), lenTags)
+    legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags)
     plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
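A minimal sketch of the distinction between the two legend expressions above, assuming ham_partial holds one array of Hamming distances per family-size bin (the bin layout and values below are hypothetical): summing a single bin adds up distance values, while the length of the concatenation counts the sampled tags::

    import numpy

    # hypothetical Hamming distances per family-size bin (e.g. FS=1, 2, 3, 4, 5-10, >10)
    ham_partial = [numpy.array([1, 2]), numpy.array([3]), numpy.array([4]),
                   numpy.array([5, 6]), numpy.array([7, 8]), numpy.array([9])]

    print(sum(ham_partial[4]))                  # 15: sum of the HD values in one bin
    print(len(numpy.concatenate(ham_partial)))  # 9: number of sampled tags across all bins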
@@ -634,9 +634,9 @@
     parser.add_argument('--inputFile',
                         help='Tabular File with three columns: ab or ba, tag and family size.')
     parser.add_argument('--inputName1')
-    parser.add_argument('--inputFile2', default=None,
-                        help='Tabular File with three columns: ab or ba, tag and family size.')
-    parser.add_argument('--inputName2')
+    # parser.add_argument('--inputFile2', default=None,
+    #                     help='Tabular File with three columns: ab or ba, tag and family size.')
+    # parser.add_argument('--inputName2')
     parser.add_argument('--sample_size', default=1000, type=int,
                         help='Sample size of Hamming distance analysis.')
     parser.add_argument('--subset_tag', default=0, type=int,
@@ -657,10 +657,10 @@
                         help='Name of the tabular file.')
     parser.add_argument('--output_pdf', default="data.pdf", type=str,
                         help='Name of the pdf file.')
-    parser.add_argument('--output_pdf2', default="data2.pdf", type=str,
-                        help='Name of the pdf file.')
-    parser.add_argument('--output_tabular2', default="data2.tabular", type=str,
-                        help='Name of the tabular file.')
+    # parser.add_argument('--output_pdf2', default="data2.pdf", type=str,
+    #                     help='Name of the pdf file.')
+    # parser.add_argument('--output_tabular2', default="data2.tabular", type=str,
+    #                     help='Name of the tabular file.')
 
     return parser
 
@@ -672,15 +672,15 @@
     file1 = args.inputFile
     name1 = args.inputName1
 
-    file2 = args.inputFile2
-    name2 = args.inputName2
+    # file2 = args.inputFile2
+    # name2 = args.inputName2
 
     index_size = args.sample_size
     title_savedFile_pdf = args.output_pdf
-    title_savedFile_pdf2 = args.output_pdf2
+    # title_savedFile_pdf2 = args.output_pdf2
 
     title_savedFile_csv = args.output_tabular
-    title_savedFile_csv2 = args.output_tabular2
+    # title_savedFile_csv2 = args.output_tabular2
 
     sep = "\t"
     onlyDuplicates = args.only_DCS
@@ -711,276 +711,284 @@
     plt.rcParams['patch.edgecolor'] = "#000000"
     plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
 
-    if file2 != str(None):
-        files = [file1, file2]
-        name1 = name1.split(".tabular")[0]
-        name2 = name2.split(".tabular")[0]
-        names = [name1, name2]
-        pdf_files = [title_savedFile_pdf, title_savedFile_pdf2]
-        csv_files = [title_savedFile_csv, title_savedFile_csv2]
-    else:
-        files = [file1]
-        name1 = name1.split(".tabular")[0]
-        names = [name1]
-        pdf_files = [title_savedFile_pdf]
-        csv_files = [title_savedFile_csv]
+    # if file2 != str(None):
+    #     files = [file1, file2]
+    #     name1 = name1.split(".tabular")[0]
+    #     name2 = name2.split(".tabular")[0]
+    #     names = [name1, name2]
+    #     pdf_files = [title_savedFile_pdf, title_savedFile_pdf2]
+    #     csv_files = [title_savedFile_csv, title_savedFile_csv2]
+    # else:
+    # f = file1
+    name1 = name1.split(".tabular")[0]
+    # name_file = name1
+    # pdf_f = title_savedFile_pdf
+    # csv_f = title_savedFile_csv
 
-    for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files):
-        with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf:
-            print("dataset: ", name_file)
-            integers, data_array = readFileReferenceFree(f)
-            data_array = numpy.array(data_array)
-            int_f = numpy.array(data_array[:, 0]).astype(int)
-            data_array = data_array[numpy.where(int_f >= minFS)]
-            integers = integers[integers >= minFS]
+    #for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files):
+    with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
+        print("dataset: ", name1)
+        integers, data_array = readFileReferenceFree(file1)
+        data_array = numpy.array(data_array)
+        int_f = numpy.array(data_array[:, 0]).astype(int)
+        data_array = data_array[numpy.where(int_f >= minFS)]
+        integers = integers[integers >= minFS]
 
-            # select family size for tags
-            if maxFS > 0:
-                int_f2 = numpy.array(data_array[:, 0]).astype(int)
-                data_array = data_array[numpy.where(int_f2 <= maxFS)]
-                integers = integers[integers <= maxFS]
+        # select family size for tags
+        if maxFS > 0:
+            int_f2 = numpy.array(data_array[:, 0]).astype(int)
+            data_array = data_array[numpy.where(int_f2 <= maxFS)]
+            integers = integers[integers <= maxFS]
 
-            print("min FS", min(integers))
-            print("max FS", max(integers))
+        print("min FS", min(integers))
+        print("max FS", max(integers))
 
-            tags = data_array[:, 2]
-            seq = data_array[:, 1]
+        tags = data_array[:, 2]
+        seq = data_array[:, 1]
 
-            if onlyDuplicates is True:
-                # find all unique tags and get the indices for ALL tags, but only once
-                u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
-                d = u[c > 1]
+        if onlyDuplicates is True:
+            # find all unique tags and get the indices for ALL tags, but only once
+            u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+            d = u[c > 1]
 
-                # get family sizes, tag for duplicates
-                duplTags_double = integers[numpy.in1d(seq, d)]
-                duplTags = duplTags_double[0::2]  # ab of DCS
-                duplTagsBA = duplTags_double[1::2]  # ba of DCS
+            # get family sizes, tag for duplicates
+            duplTags_double = integers[numpy.in1d(seq, d)]
+            duplTags = duplTags_double[0::2]  # ab of DCS
+            duplTagsBA = duplTags_double[1::2]  # ba of DCS
 
-                duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
-                duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags
+            duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
+            duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags
 
-                data_array = numpy.column_stack((duplTags, duplTags_seq))
-                data_array = numpy.column_stack((data_array, duplTags_tag))
-                integers = numpy.array(data_array[:, 0]).astype(int)
-                print("DCS in whole dataset", len(data_array))
+            data_array = numpy.column_stack((duplTags, duplTags_seq))
+            data_array = numpy.column_stack((data_array, duplTags_tag))
+            integers = numpy.array(data_array[:, 0]).astype(int)
+            print("DCS in whole dataset", len(data_array))
 
-            # HD analysis for a subset of the tag
-            if subset > 0:
-                tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
-                tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])
+        # HD analysis for a subset of the tag
+        if subset > 0:
+            tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
+            tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])
 
-                flanking_region_float = float((len(tag1[0]) - subset)) / 2
-                flanking_region = int(flanking_region_float)
-                if flanking_region_float % 2 == 0:
-                    tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
-                    tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
-                else:
-                    flanking_region_rounded = int(round(flanking_region, 1))
-                    flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
-                    tag1_shorten = numpy.array(
-                        [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
-                    tag2_shorten = numpy.array(
-                        [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])
+            flanking_region_float = float((len(tag1[0]) - subset)) / 2
+            flanking_region = int(flanking_region_float)
+            if flanking_region_float % 2 == 0:
+                tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
+                tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
+            else:
+                flanking_region_rounded = int(round(flanking_region, 1))
+                flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
+                tag1_shorten = numpy.array(
+                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
+                tag2_shorten = numpy.array(
+                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])
 
-                data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
-                data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))
+            data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
+            data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))
 
-            print("length of tag= ", len(data_array[0, 1]))
-            # select sample: if no size given --> all vs. all comparison
-            if index_size == 0:
-                result = numpy.arange(0, len(data_array), 1)
-            else:
-                result = numpy.random.choice(len(integers), size=index_size, replace=False)  # array of random sequences of size=index.size
+        print("length of tag= ", len(data_array[0, 1]))
+        # select sample: if no size given --> all vs. all comparison
+        if index_size == 0:
+            result = numpy.arange(0, len(data_array), 1)
+        else:
+            result = numpy.random.choice(len(integers), size=index_size,
+                                         replace=False)  # array of random sequences of size=index.size
 
-            # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
-            #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
+        # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
+        #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
 
-            # comparison random tags to whole dataset
-            result1 = data_array[result, 1]  # random tags
-            result2 = data_array[:, 1]  # all tags
-            print("size of the whole dataset= ", len(result2))
-            print("sample size= ", len(result1))
+        # comparison random tags to whole dataset
+        result1 = data_array[result, 1]  # random tags
+        result2 = data_array[:, 1]  # all tags
+        print("size of the whole dataset= ", len(result2))
+        print("sample size= ", len(result1))
 
-            # HD analysis of whole tag
-            proc_pool = Pool(nproc)
-            chunks_sample = numpy.array_split(result1, nproc)
-            ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
-            proc_pool.close()
-            proc_pool.join()
-            ham = numpy.concatenate(ham).astype(int)
-            # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
-            # for h, tag in zip(ham, result1):
-            #     output_file1.write("{}\t{}\n".format(tag, h))
+        # HD analysis of whole tag
+        proc_pool = Pool(nproc)
+        chunks_sample = numpy.array_split(result1, nproc)
+        ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
+        proc_pool.close()
+        proc_pool.join()
+        ham = numpy.concatenate(ham).astype(int)
+        # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
+        # for h, tag in zip(ham, result1):
+        #     output_file1.write("{}\t{}\n".format(tag, h))
 
-            # HD analysis for chimeric reads
-            proc_pool_b = Pool(nproc)
-            diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
-            diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
-            proc_pool_b.close()
-            proc_pool_b.join()
-            diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]),
-                                      numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int)
-            HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
-                                         numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
-            HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
-                                         numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
-            minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
-                                        numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
-            minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]),
-                                            numpy.concatenate([item_b[4] for item_b in diff_list_b])))
-            rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]),
-                                          numpy.concatenate([item_b[5] for item_b in diff_list_b])))
-            diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
-                                            numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
-            minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
-                                                  numpy.concatenate([item_b[7] for item_b in diff_list_b])))
-            HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
-            HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
-                                            numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
+        # HD analysis for chimeric reads
+        proc_pool_b = Pool(nproc)
+        diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
+        diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
+        proc_pool_b.close()
+        proc_pool_b.join()
+        diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]),
+                                  numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int)
+        HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
+                                     numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
+        HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
+                                     numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
+        minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
+                                    numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
+        minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]),
+                                        numpy.concatenate([item_b[4] for item_b in diff_list_b])))
+        rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]),
+                                      numpy.concatenate([item_b[5] for item_b in diff_list_b])))
+        diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
+                                        numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
+        minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
+                                              numpy.concatenate([item_b[7] for item_b in diff_list_b])))
+        HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
+                                        numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
+        HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
+                                        numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
 
-            lenTags = len(data_array)
+        lenTags = len(data_array)
 
-            quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
-            seq = numpy.array(data_array[result, 1])  # tags of sample
-            ham = numpy.asarray(ham)  # HD for sample of tags
+        quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
+        seq = numpy.array(data_array[result, 1])  # tags of sample
+        ham = numpy.asarray(ham)  # HD for sample of tags
 
-            if onlyDuplicates is True:  # ab and ba strands of DCSs
-                quant = numpy.concatenate((quant, duplTagsBA[result]))
-                seq = numpy.tile(seq, 2)
-                ham = numpy.tile(ham, 2)
+        if onlyDuplicates is True:  # ab and ba strands of DCSs
+            quant = numpy.concatenate((quant, duplTagsBA[result]))
+            seq = numpy.tile(seq, 2)
+            ham = numpy.tile(ham, 2)
 
-            # prepare data for different kinds of plots
-            # distribution of FSs separated after HD
-            familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
-            list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS
-
-            # get FS for all tags with min HD of analysis of chimeric reads
-            # there are more tags than sample size in the plot, because one tag can have multiple minimas
-            seqDic = dict(zip(seq, quant))
-            lst_minHD_tags = []
-            for i in minHD_tags:
-                lst_minHD_tags.append(seqDic.get(i))
+        # prepare data for different kinds of plots
+        # distribution of FSs separated after HD
+        familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
+        list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS
 
-            # histogram with absolute and relative difference between HDs of both parts of the tag
-            listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
-            listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
-                                                                                                     rel_Diff)
+        # get FS for all tags with min HD from the chimeric-read analysis
+        # there can be more tags than the sample size in the plot, because one tag can have multiple minima
+        seqDic = dict(zip(seq, quant))
+        lst_minHD_tags = []
+        for i in minHD_tags:
+            lst_minHD_tags.append(seqDic.get(i))
 
-            # family size distribution separated after the difference between HDs of both parts of the tag
-            familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
-                lst_minHD_tags, diff, diff=True, rel=False)
-            familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
-                lst_minHD_tags, rel_Diff, diff=True, rel=True)
+        # histogram with absolute and relative difference between HDs of both parts of the tag
+        listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
+        listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
+                                                                                                 rel_Diff)
 
-            # chimeric read analysis: tags which have HD=0 in one of the halfs
-            if len(minHD_tags_zeros) != 0:
-                lst_minHD_tags_zeros = []
-                for i in minHD_tags_zeros:
-                    lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
+        # family size distribution separated after the difference between HDs of both parts of the tag
+        familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
+            lst_minHD_tags, diff, diff=True, rel=False)
+        familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
+            lst_minHD_tags, rel_Diff, diff=True, rel=True)
 
-            # histogram with HD of non-identical half
-            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros)
-            # family size distribution of non-identical half
-            familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)
-
-            # plot Hamming Distance with Family size distribution
-            plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, subtitle="Hamming distance separated by family size", title_file1=name_file, lenTags=lenTags, xlabel="HD", nr_above_bars=nr_above_bars)
+        # chimeric read analysis: tags which have HD=0 in one of the halves
+        if len(minHD_tags_zeros) != 0:
+            lst_minHD_tags_zeros = []
+            for i in minHD_tags_zeros:
+                lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
 
-            # Plot FSD with separation after
-            plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
-                           originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
-                           pdf=pdf, relative=False, title_file1=name_file, diff=False)
+        # histogram with HD of non-identical half
+        listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
+            lst_minHD_tags_zeros, diff_zeros)
+        # family size distribution of non-identical half
+        familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(
+            lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)
 
-            # Plot HD within tags
-            plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)
+        # plot Hamming Distance with Family size distribution
+        plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
+                      subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags,
+                      xlabel="HD", nr_above_bars=nr_above_bars)
 
-            # Plot difference between HD's separated after FSD
-            plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
-                          subtitle="Delta Hamming distance within tags",
-                          title_file1=name_file, lenTags=lenTags,
-                          xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars)
+        # Plot FSD separated by HD
+        plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
+                       originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
+                       pdf=pdf, relative=False, title_file1=name1, diff=False)
+
+        # Plot HD within tags
+        plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
+                             title_file1=name1)
 
-            plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
-                          subtitle="Chimera Analysis: relative delta Hamming distances",
-                          title_file1=name_file, lenTags=lenTags,
-                          xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars)
+        # Plot difference between HDs separated by FS
+        plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
+                      subtitle="Delta Hamming distance within tags",
+                      title_file1=name1, lenTags=lenTags,
+                      xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars)
 
-            # plots for chimeric reads
-            if len(minHD_tags_zeros) != 0:
-                # HD
-                plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
-                              subtitle="Hamming distance of the non-identical half of chimeras",
-                              title_file1=name_file, lenTags=lenTags, xlabel="HD", relative=False, nr_above_bars=nr_above_bars)
+        plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
+                      subtitle="Chimera Analysis: relative delta Hamming distances",
+                      title_file1=name1, lenTags=lenTags,
+                      xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars)
 
-            # print all data to a CSV file
-            # HD
-            summary, sumCol = createTableHD(list1, "HD=")
-            overallSum = sum(sumCol)  # sum of columns in table
-
-            # FSD
-            summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
-            overallSum5 = sum(sumCol5)
-
-            # HD of both parts of the tag
-            summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)])
-            overallSum9 = sum(sumCol9)
-
-            # HD
-            # absolute difference
-            summary11, sumCol11 = createTableHD(listDifference1, "diff=")
-            overallSum11 = sum(sumCol11)
-            # relative difference and all tags
-            summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
-            overallSum13 = sum(sumCol13)
-
-            # chimeric reads
-            if len(minHD_tags_zeros) != 0:
-                # absolute difference and tags where at least one half has HD=0
-                summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")
-                overallSum15 = sum(sumCol15)
-
-            output_file.write("{}\n".format(name_file))
-            output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
-                numpy.concatenate(list1)), lenTags, lenTags))
-
+        # plots for chimeric reads
+        if len(minHD_tags_zeros) != 0:
             # HD
-            createFileHD(summary, sumCol, overallSum, output_file,
-                         "Hamming distance separated by family size", sep)
-            # FSD
-            createFileFSD2(summary5, sumCol5, overallSum5, output_file,
-                           "Family size distribution separated by Hamming distance", sep,
-                           diff=False)
+            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
+                          subtitle="Hamming distance of the non-identical half of chimeras",
+                          title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False,
+                          nr_above_bars=nr_above_bars)
+
+        # print all data to a CSV file
+        # HD
+        summary, sumCol = createTableHD(list1, "HD=")
+        overallSum = sum(sumCol)  # sum of columns in table
+
+        # FSD
+        summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
+        overallSum5 = sum(sumCol5)
+
+        # HD of both parts of the tag
+        summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)])
+        overallSum9 = sum(sumCol9)
+
+        # HD
+        # absolute difference
+        summary11, sumCol11 = createTableHD(listDifference1, "diff=")
+        overallSum11 = sum(sumCol11)
+        # relative difference and all tags
+        summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
+        overallSum13 = sum(sumCol13)
+
+        # chimeric reads
+        if len(minHD_tags_zeros) != 0:
+            # absolute difference and tags where at least one half has HD=0
+            summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")
+            overallSum15 = sum(sumCol15)
 
-            count = numpy.bincount(quant)
-            # output_file.write("{}{}\n".format(sep, name_file))
-            output_file.write("\n")
-            output_file.write("max. family size:{}{}\n".format(sep, max(quant)))
-            output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
-            output_file.write(
-                "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
+        output_file.write("{}\n".format(name1))
+        output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
+            numpy.concatenate(list1)), lenTags, lenTags))
+
+        # HD
+        createFileHD(summary, sumCol, overallSum, output_file,
+                     "Hamming distance separated by family size", sep)
+        # FSD
+        createFileFSD2(summary5, sumCol5, overallSum5, output_file,
+                       "Family size distribution separated by Hamming distance", sep,
+                       diff=False)
 
-            # HD within tags
+        count = numpy.bincount(quant)
+        # output_file.write("{}{}\n".format(sep, name1))
+        output_file.write("\n")
+        output_file.write("max. family size:{}{}\n".format(sep, max(quant)))
+        output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
+        output_file.write(
+            "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
+
+        # HD within tags
+        output_file.write(
+            "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
+            "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
+        output_file.write(
+            "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
+                len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
+        output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
+
+        createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
+                              "Hamming distance of each half in the tag", sep)
+        createFileHD(summary11, sumCol11, overallSum11, output_file,
+                     "Absolute delta Hamming distances within the tag", sep)
+        createFileHD(summary13, sumCol13, overallSum13, output_file,
+                     "Chimera analysis: relative delta Hamming distances", sep)
+
+        if len(minHD_tags_zeros) != 0:
             output_file.write(
-                "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
-                "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
-            output_file.write(
-                "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
-                    len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
-            output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
-
-            createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
-                                  "Hamming distance of each half in the tag", sep)
-            createFileHD(summary11, sumCol11, overallSum11, output_file,
-                         "Absolute delta Hamming distances within the tag", sep)
-            createFileHD(summary13, sumCol13, overallSum13, output_file,
-                         "Chimera analysis: relative delta Hamming distances", sep)
-
-            if len(minHD_tags_zeros) != 0:
-                output_file.write(
-                    "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
-                createFileHD(summary15, sumCol15, overallSum15, output_file,
-                             "Hamming distances of non-zero half", sep)
-            output_file.write("\n")
+                "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
+            createFileHD(summary15, sumCol15, overallSum15, output_file,
+                         "Hamming distances of non-zero half", sep)
+        output_file.write("\n")
 
 
 if __name__ == '__main__':
--- a/hd.xml	Mon Oct 08 05:56:04 2018 -0400
+++ b/hd.xml	Fri Dec 14 04:31:21 2018 -0500
@@ -1,20 +1,16 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="hd" name="Duplex Sequencing Analysis: hd" version="1.0.0">
-    <description>Hamming distance (HD) analysis of tags</description>
+<tool id="hd" name="HD:" version="1.0.0">
+    <description>Hamming distance analysis of duplex tags</description>
     <requirements>
         <requirement type="package" version="2.7">python</requirement>
         <requirement type="package" version="1.4.0">matplotlib</requirement>
     </requirements>
     <command>
-        python2 '$__tool_directory__/hd.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --inputFile2 '$inputFile2' --inputName2 '$inputFile2.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS
-		$nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular 
-        #if $inputFile2:
-        --output_pdf2 $output_pdf2 --output_tabular2 $output_tabular2
-        #end if
+        python2 '$__tool_directory__/hd.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS
+		$nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular
     </command>
     <inputs>
-        <param name="inputFile" type="data" format="tabular" label="Dataset 1: input tags" optional="false"/>
-        <param name="inputFile2" type="data" format="tabular" label="Dataset 2: input tags" optional="true" help="Input in tabular format with the family size, tag and the direction of the strand ('ab' or 'ba') for each family."/>
+        <param name="inputFile" type="data" format="tabular" label="Dataset 1: input tags" optional="false" help="Input in tabular format with the family size, tag and the direction of the strand ('ab' or 'ba') for each family."/>
         <param name="sampleSize" type="integer" label="number of tags in the sample" value="1000" min="0" help="specifies the number of tags in one analysis. If sample size is 0, all tags of the dataset are compared against all tags."/>
         <param name="minFS" type="integer" label="minimum family size of the tags" min="1" value="1" help="filters the tags after their family size: Families with a smaller size are skipped. Default: min. family size = 1."/>
         <param name="maxFS" type="integer" label="max family size of the tags" min="0" value="0" help="filters the tags after their family size: Families with a larger size are skipped. If max. family size is 0, no upper bound is defined and the maximum family size in the analysis will be the maximum family size of the whole dataset. Default: max. family size = 0."/>
@@ -26,51 +22,65 @@
     </inputs>
     <outputs>
         <data name="output_tabular" format="tabular"/>
-        <data name="output_tabular2" format="tabular">
-            <filter>inputFile2</filter>
-        </data>
         <data name="output_pdf" format="pdf" />
-        <data name="output_pdf2" format="pdf" >
-            <filter>inputFile2</filter>
-        </data>
     </outputs>
     <tests>
         <test>
             <param name="inputFile" value="Test_data.tabular"/>
-            <param name="inputFile2" value="Test_data2.tabular"/>
             <param name="sampleSize" value="0"/>
             <output name="output_pdf" file="output_file.pdf" lines_diff="6"/>
             <output name="output_tabular" file="output_file.tabular"/>
-            <output name="output_pdf2" file="output_file2.pdf" lines_diff="6"/>
-            <output name="output_tabular2" file="output_file2.tabular"/>
         </test>
     </tests>
     <help> <![CDATA[
 **What it does**
     
-    This tool calculates the Hamming distance for the tags by comparing them to all tags in the dataset and finally searches for the minimum Hamming distance. 
-    The Hamming distance is shown in a histogram separated by the family sizes or in a family size distribution separated by the Hamming distances. 
-    This similarity measure was calculated for each tag to distinguish whether similar tags truly stem from different molecules or occured due to sequencing or PCR errros. 
-    In addition, the tags of chimeric reads can be identified by calculating the Hamming distance for each half of the tag. 
-    This analysis can be performed on only a sample (by default: sample size=1000) or on the whole dataset (sample size=0). 
-    It is also possible to select on only those tags, which have a partner tag (ab and ba) in the dataset (DCSs) or to filter the dataset after the tag's family size. 
+This tool calculates the Hamming distance (HD) for each tag by comparing it to all tags in the dataset and then reports the minimum Hamming distance.
+The Hamming distance is shown in a histogram separated by the family sizes or in a family size distribution separated by the Hamming distances.
+This similarity measure is calculated for each tag to distinguish whether similar tags truly stem from different molecules or occurred due to sequencing or PCR errors.
+In addition, the tags of chimeric reads can be identified by calculating the Hamming distance for each half of the tag.
+This analysis can be performed on a sample only (by default: sample size = 1000) or on the whole dataset (sample size = 0).
+It is also possible to select only those tags which have a partner tag (ab and ba) in the dataset (DCSs) or to filter the dataset by the tag's family size.
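As an illustration of the comparison described above (a sketch only, with made-up tags and a throwaway hamming() helper, not the tool's actual implementation)::

    # minimum Hamming distance of each tag against all other tags
    def hamming(a, b):
        return sum(1 for x, y in zip(a, b) if x != y)

    tags = ["AAAATGTT", "AAAATGCA", "CCCGGGTA"]
    for t in tags:
        min_hd = min(hamming(t, other) for other in tags if other != t)
        print("{}\tmin HD={}".format(t, min_hd))

    # chimera check: compute the HD separately for each half of the tag;
    # one identical half (HD=0) next to a clearly different half hints at a chimera
    half = len(tags[0]) // 2
    print(hamming(tags[0][:half], tags[1][:half]))   # 0 (identical first halves)
    print(hamming(tags[0][half:], tags[1][half:]))   # 2 (different second halves)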
     
 **Input**
     
-    This tools expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. It is possible to upload two files which allows the performance of two analyses at the same time.
+This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands::
     
-    +-----+----------------------------+----+
-    | 1   | AAAAAAAAAAAATGTTGGAATCTT   | ba |
-    +-----+----------------------------+----+
-    | 10  | AAAAAAAAAAAGGCGGTCCACCCC   | ab |
-    +-----+----------------------------+----+
-    | 28  | AAAAAAAAAAATGGTATGGACCGA   | ab |
-    +-----+----------------------------+----+
+    1  AAAAAAAAAAAATGTTGGAATCTT ba
+   10  AAAAAAAAAAAGGCGGTCCACCCC ab
+   28  AAAAAAAAAAATGGTATGGACCGA ab
+
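A minimal sketch of loading such a three-column file (family size, tag, ab/ba) into numpy arrays, roughly mirroring what the script does internally; the file name is a placeholder::

    import numpy

    rows = [line.split() for line in open("tags.tabular") if line.strip()]
    data_array = numpy.array(rows)                 # columns: family size, tag, ab/ba
    family_sizes = data_array[:, 0].astype(int)
    tag_seqs = data_array[:, 1]
    strands = data_array[:, 2]
    print("{} tags, max family size {}".format(len(tag_seqs), family_sizes.max()))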
+**How to generate the input**
+
+The first step of the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_ is the **Make Families** tool that produces output in this form::
+
+    1                        2  3     4
+    ------------------------------------------------------
+    AAAAAAAAAAAAAAATAGCTCGAT ba read1 CGCTACGTGACTGGGTCATG
+    AAAAAAAAAAAAAAATAGCTCGAT ba read2 CGCTACGTGACTGGGTCATG
+    AAAAAAAAAAAAAAATAGCTCGAT ba read3 CGCTACGTGACTGGGTCATG
+
+We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool::
+
+    1                        2 
+    ---------------------------
+    AAAAAAAAAAAAAAATAGCTCGAT ba
+    AAAAAAAAAAAAAAATAGCTCGAT ba
+    AAAAAAAAAAAAAAATAGCTCGAT ba
+
+Now one needs to count the number of unique occurrences of each tag. This is done using the **Unique lines** tool, which adds an additional column containing the counts (column 1)::
+
+
+    1 2                        3 
+    -----------------------------
+    3 AAAAAAAAAAAAAAATAGCTCGAT ba
+ 
+These data can now be used in this tool.
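Outside Galaxy, the same Cut/Unique-lines steps can be approximated with a small Python snippet (a sketch assuming tab-separated input; both file names are placeholders)::

    from collections import Counter

    # keep only tag (column 1) and strand (column 2) of the Make Families output
    pairs = []
    with open("make_families_output.tabular") as handle:
        for line in handle:
            fields = line.rstrip("\n").split("\t")
            pairs.append((fields[0], fields[1]))

    # count identical tag/strand pairs and write: family size, tag, ab/ba
    with open("tags.tabular", "w") as out:
        for (tag, strand), count in Counter(pairs).items():
            out.write("{}\t{}\t{}\n".format(count, tag, strand))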
     
     
 **Output**
     
-    The output is one PDF file with the plots of the Hamming distance and a tabular file with the data of the plot for each dataset.
+The output is one PDF file with the plots of the Hamming distance analysis and a tabular file with the underlying data of the plots.
     
     
 **About Author**
--- a/test-data/Test_data2.tabular	Mon Oct 08 05:56:04 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,20 +0,0 @@
-1	AAAAAAAACCGCCCAACTGCCGGT	ab
-5	AAAAAAAACCTCTCAACCCCAAAT	ba
-7	AAAAAAAACCTCTTGCGATGTTGT	ab
-1	AAAAAAAACCTCTTGCGCTGTTGT	ab
-1	AAAAAAAACCTCTTGTGATGTTGT	ab
-12	AAAAAAAACCTGAGCAATGGTTCC	ab
-3	AAAAAAAACCTTGACCCTCACATG	ba
-6	AAAAAAAACCTTGCACTCGTCCTA	ba
-9	AAAAAAAACGAAATAAAAAAACCT	ba
-1	AAAAAAAACGACCGGCCTTAGACA	ba
-4	AAAAAAAACGCCACCACCCCCTTT	ab
-12	AAAAAAAACGCCACGGGCACTATT	ba
-13	AAAAAAAACGTATCAGTAGATCCT	ab
-1	AAAAAAAACTAGTAGGATTTCATG	ba
-3	AAAAAAAACTATAGAAAATCCATT	ba
-1	AAAAAAAACTATTCTATTTCCGAT	ba
-13	AAAAAAAACTGATCTGCTTGGCGG	ba
-8	AAAAAAAACTTGCGAATAGCATCG	ba
-4	AAAAAAAACTTGTTATCAAAACGT	ab
-1	AAAAAAAAGAAAAGTTCAACACGC	ba
\ No newline at end of file
Binary file test-data/output_file2.pdf has changed
--- a/test-data/output_file2.tabular	Mon Oct 08 05:56:04 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-Test_data2
-number of tags per file	20 (from 20) against 20
-
-Hamming distance separated by family size
-	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
-HD=1	2	0	0	0	1	0	3	
-HD=6	0	0	0	1	0	1	2	
-HD=7	2	0	1	1	2	1	7	
-HD=8	1	0	1	0	2	1	5	
-HD=9	1	0	0	0	0	1	2	
-HD=10	1	0	0	0	0	0	1	
-sum	7	0	2	2	5	4	20	
-
-Family size distribution separated by Hamming distance
-	HD=1	HD=2	HD=3	HD=4	HD=5-8	HD>8	sum	
-FS=1	2	0	0	0	3	2	7	
-FS=3	0	0	0	0	2	0	2	
-FS=4	0	0	0	0	2	0	2	
-FS=5	0	0	0	0	1	0	1	
-FS=6	0	0	0	0	1	0	1	
-FS=7	1	0	0	0	0	0	1	
-FS=8	0	0	0	0	1	0	1	
-FS=9	0	0	0	0	1	0	1	
-FS=12	0	0	0	0	2	0	2	
-FS=13	0	0	0	0	1	1	2	
-sum	3	0	0	0	14	3	20	
-
-
-max. family size:	13
-absolute frequency:	2
-relative frequency:	0.1
-
-The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.
-It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.
-actual number of tags with min HD = 79 (sample size by user = 20)
-length of one part of the tag = 12
-
-Hamming distance of each half in the tag
-	HD a	HD b'	HD b	HD a'	HD a+b	sum	
-HD=0	20	0	0	5	0	25	
-HD=1	22	4	4	3	8	41	
-HD=2	9	2	0	9	2	22	
-HD=3	0	0	0	10	0	10	
-HD=4	0	0	2	1	0	3	
-HD=5	0	0	5	0	0	5	
-HD=6	0	5	7	0	3	15	
-HD=7	0	7	10	0	10	27	
-HD=8	0	6	0	0	10	16	
-HD=9	0	7	0	0	17	24	
-HD=10	0	11	0	0	13	24	
-HD=11	0	8	0	0	7	15	
-HD=12	0	1	0	0	5	6	
-HD=13	0	0	0	0	4	4	
-sum	51	51	28	28	79	237	
-
-Absolute delta Hamming distances within the tag
-	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
-diff=1	5	0	0	1	5	0	11	
-diff=2	4	0	0	0	0	0	4	
-diff=3	1	0	2	1	1	0	5	
-diff=4	1	0	1	0	2	1	5	
-diff=5	2	0	0	0	4	6	12	
-diff=6	1	0	0	1	1	7	10	
-diff=7	2	0	1	0	0	0	3	
-diff=8	0	0	1	0	1	3	5	
-diff=9	6	0	0	1	3	4	14	
-diff=10	4	0	0	0	3	2	9	
-diff=11	0	0	0	0	0	1	1	
-sum	26	0	5	4	20	24	79	
-
-Chimera analysis: relative delta Hamming distances
-	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
-diff=0.1	1	0	0	1	1	0	3	
-diff=0.3	3	0	2	0	0	0	5	
-diff=0.4	1	0	0	1	3	0	5	
-diff=0.5	0	0	1	0	0	1	2	
-diff=0.6	1	0	0	0	3	7	11	
-diff=0.7	1	0	0	0	1	5	7	
-diff=0.8	10	0	0	0	2	9	21	
-diff=1.0	9	0	2	2	10	2	25	
-sum	26	0	5	4	20	24	79	
-
-Chimeras:
-All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.
-So the hamming distance of the non-identical half is compared.
-Hamming distances of non-zero half
-	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
-HD=1	4	0	0	0	4	0	8	
-HD=2	2	0	0	0	0	0	2	
-HD=6	0	0	0	1	0	2	3	
-HD=7	1	0	1	0	0	0	2	
-HD=8	0	0	1	0	1	0	2	
-HD=9	1	0	0	1	2	0	4	
-HD=10	1	0	0	0	3	0	4	
-sum	9	0	2	2	10	2	25	
-
-