changeset 0:239c4448a163 draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 6055f8c5c052f528ff85fb5e0d43b4500830637a
author mheinzl
date Thu, 10 May 2018 07:30:27 -0400
parents
children 7414792e1cb8
files hd.py hd.xml
diffstat 2 files changed, 707 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hd.py	Thu May 10 07:30:27 2018 -0400
@@ -0,0 +1,624 @@
+#!/usr/bin/env python
+
+# Hamming distance analysis of SSCSs
+#
+# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes at least one TABULAR file with tags before the alignment to the SSCS, and optionally a second TABULAR file, as input.
+# The program produces a plot showing a histogram of Hamming distances separated by family size,
+# and a family size distribution separated by Hamming distance, either for all SSCSs (sample_size=0), for a given sample of SSCSs, or only for SSCSs that form a DCS.
+# In addition, the tool produces HD and FSD plots for the difference between the HDs of both halves of the tags and for the chimeric reads,
+# and finally a CSV file with the data of the plots.
+# The HD analysis can also be performed on tags shortened to a given length.
+# The tool can run on a user-defined number of processors.
+
+# USAGE: python hd.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
+#        --only_DCS --minFS int --maxFS int --subset_tag int --nproc int --output_csv outputfile_name_csv --output_pdf outputfile_name_pdf --output_csv2 outputfile_name_csv2 --output_pdf2 outputfile_name_pdf2
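+#
+# Example invocation (illustrative only; file names are placeholders):
+#   python hd.py --inputFile tags1.tabular --inputName1 tags1.tabular --inputFile2 tags2.tabular --inputName2 tags2.tabular --sample_size 1000 --sep "," /
+#          --nproc 4 --output_csv hd1.csv --output_pdf hd1.pdf --output_csv2 hd2.csv --output_pdf2 hd2.pdf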
+
+import numpy
+import itertools
+import operator
+import matplotlib.pyplot as plt
+import os.path
+import cPickle as pickle
+from multiprocessing.pool import Pool
+from functools import partial
+from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD
+from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2
+from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2
+from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag
+from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2
+import argparse
+import sys
+import os
+from matplotlib.backends.backend_pdf import PdfPages
+
+def hamming(array1, array2):
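+    # For each tag in array1, compute the minimum non-zero Hamming distance (HD) to any tag in array2.
+    # Illustrative example: hamming(["AAAA"], ["AAAT", "CCCC"]) returns a one-element array containing 1.0,
+    # because "AAAA" differs from "AAAT" at exactly one position.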
+    res = 99 * numpy.ones(len(array1))
+    i = 0
+    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
+    for a in array1:
+        dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2])  # fastest
+        res[i] = numpy.amin(dist[dist > 0])  # pick min distance greater than zero
+        #print(i)
+        i += 1
+    return res
+
+def hamming_difference(array1, array2, mate_b):
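+    # Split every tag into two halves (a = first half, b = second half). For each tag in array1, find the tag(s) in array2
+    # with the minimum HD in one half and then compute the HD of the other half to those tags; mate_b switches which half
+    # is searched first. Tags where one half has HD = 0 are collected separately for the chimeric read analysis.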
+    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
+    array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1
+    array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2
+
+    array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1
+    array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2])  # mate2 part2
+
+    diff11 = []
+    relativeDiffList = []
+    ham1 = []
+    ham2 = []
+    min_valueList = []
+    min_tagsList = []
+    diff11_zeros = []
+    min_tagsList_zeros = []
+    i = 0 # counter, only used to see how many HDs of tags were already calculated
+    if mate_b is False: # HD calculation for all a's
+        half1_mate1 = array1_half
+        half2_mate1 = array1_half2
+        half1_mate2 = array2_half
+        half2_mate2 = array2_half2
+    elif mate_b is True: # HD calculation for all b's
+        half1_mate1 = array1_half2
+        half2_mate1 = array1_half
+        half1_mate2 = array2_half2
+        half2_mate2 = array2_half
+
+    for a, b, tag in zip(half1_mate1, half2_mate1, array1):
+        ## exclude identical tag from array2, to prevent comparison to itself
+        sameTag = numpy.where(array2 == tag)
+        indexArray2 = numpy.arange(0, len(array2), 1)
+        index_withoutSame = numpy.delete(indexArray2, sameTag)  # delete identical tag from the data
+
+        # all tags without identical tag
+        array2_half_withoutSame = half1_mate2[index_withoutSame]
+        array2_half2_withoutSame = half2_mate2[index_withoutSame]
+        #array2_withoutSame = array2[index_withoutSame] # whole tag (= not split into 2 halves)
+
+        dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
+                            array2_half_withoutSame])  # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
+        min_index = numpy.where(dist == dist.min()) # get index of min HD
+        min_value = dist[min_index]  # get minimum HDs
+        min_tag_half2 = array2_half2_withoutSame[min_index]  # get all "b's" of the tag or all "a's" of the tag with minimum HD
+        #min_tag = array2_withoutSame[min_index] # get whole tag with min HD
+
+        dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
+                             min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
+        for d_1, d_2 in zip(min_value, dist2):
+            if mate_b is True:  # half 2: swap the values so that d is always the HD of the first half and d2 the HD of the second half
+                d = d_2
+                d2 = d_1
+            else:  # half 1: d and d2 already correspond to the first and second half
+                d = d_1
+                d2 = d_2
+            min_valueList.append(d + d2)
+            min_tagsList.append(tag)
+            ham1.append(d)
+            ham2.append(d2)
+            difference1 = abs(d - d2)
+            diff11.append(difference1)
+            rel_difference = round(float(difference1) / (d + d2), 1)
+            relativeDiffList.append(rel_difference)
+
+            #### tags which have identical parts:
+            if d == 0 or d2 == 0:
+                min_tagsList_zeros.append(tag)
+                difference1_zeros = abs(d - d2)
+                diff11_zeros.append(difference1_zeros)
+        #print(i)
+        i += 1
+    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros])
+
+def readFileReferenceFree(file):
+    with open(file, 'r') as dest_f:
+        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
+        integers = numpy.array(data_array[:, 0]).astype(int)
+        return(integers, data_array)
+
+def hammingDistanceWithFS(quant, ham):
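+    # Bin the Hamming distances by family size (FS = 1, 2, 3, 4, 5-10, >10) for the stacked histogram plots.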
+    quant = numpy.asarray(quant)
+    maximum = max(ham)
+    minimum = min(ham)
+    ham = numpy.asarray(ham)
+
+    singletons = numpy.where(quant == 1)[0]
+    data = ham[singletons]
+
+    hd2 = numpy.where(quant == 2)[0]
+    data2 = ham[hd2]
+
+    hd3 = numpy.where(quant == 3)[0]
+    data3 = ham[hd3]
+
+    hd4 = numpy.where(quant == 4)[0]
+    data4 = ham[hd4]
+
+    hd5 = numpy.where((quant >= 5) & (quant <= 10))[0]
+    data5 = ham[hd5]
+
+    hd6 = numpy.where(quant > 10)[0]
+    data6 = ham[hd6]
+
+    list1 = [data, data2, data3, data4, data5, data6]
+    return(list1, maximum, minimum)
+
+def familySizeDistributionWithHD(fs, ham, diff=False, rel = True):
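+    # Bin the family sizes by Hamming distance (HD = 1, 2, 3, 4, 5-8, >8; or 0.1, 0.2, 0.3, 0.4, 0.5-0.8, >0.8 when rel is True);
+    # family sizes above 19 are capped at 20, and diff=True adds an extra bin for HD/difference = 0.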
+    hammingDistances = numpy.unique(ham)
+    fs = numpy.asarray(fs)
+
+    ham = numpy.asarray(ham)
+    bigFamilies2 = numpy.where(fs > 19)[0]
+    if len(bigFamilies2) != 0:
+        fs[bigFamilies2] = 20
+    maximum = max(fs)
+    minimum = min(fs)
+    if diff is True:
+        hd0 = numpy.where(ham == 0)[0]
+        data0 = fs[hd0]
+
+    if rel is True:
+        hd1 = numpy.where(ham == 0.1)[0]
+    else:
+        hd1 = numpy.where(ham == 1)[0]
+    data = fs[hd1]
+
+    if rel is True:
+        hd2 = numpy.where(ham == 0.2)[0]
+    else:
+        hd2 = numpy.where(ham == 2)[0]
+    data2 = fs[hd2]
+
+    if rel is True:
+        hd3 = numpy.where(ham == 0.3)[0]
+    else:
+        hd3 = numpy.where(ham == 3)[0]
+    data3 = fs[hd3]
+
+    if rel is True:
+        hd4 = numpy.where(ham == 0.4)[0]
+    else:
+        hd4 = numpy.where(ham == 4)[0]
+    data4 = fs[hd4]
+
+    if rel is True:
+        hd5 = numpy.where((ham >= 0.5) & (ham <= 0.8))[0]
+    else:
+        hd5 = numpy.where((ham >= 5) & (ham <= 8))[0]
+    data5 = fs[hd5]
+
+    if rel is True:
+        hd6 = numpy.where(ham > 0.8)[0]
+    else:
+        hd6 = numpy.where(ham > 8)[0]
+    data6 = fs[hd6]
+
+    if diff is True:
+        list1 = [data0,data, data2, data3, data4, data5, data6]
+    else:
+        list1 = [data, data2, data3, data4, data5, data6]
+
+    return(list1, hammingDistances, maximum, minimum)
+
+def make_argparser():
+    parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
+    parser.add_argument('--inputFile',
+                        help='Tabular file with three columns: family size, tag and the direction of the strand (ab or ba).')
+    parser.add_argument('--inputName1')
+    parser.add_argument('--inputFile2',default=None,
+                        help='Tabular file with three columns: family size, tag and the direction of the strand (ab or ba).')
+    parser.add_argument('--inputName2')
+    parser.add_argument('--sample_size', default=1000,type=int,
+                        help='Sample size of Hamming distance analysis.')
+    parser.add_argument('--sep', default=",",
+                        help='Separator in the csv file.')
+    parser.add_argument('--subset_tag', default=0,type=int,
+                        help='The tag is shortened to the given number.')
+    parser.add_argument('--nproc', default=4,type=int,
+                        help='The tool runs with the given number of processors.')
+    parser.add_argument('--only_DCS', action="store_false",  # default=False, type=bool,
+                        help='Only tags of the DCSs are included in the HD analysis')
+
+    parser.add_argument('--minFS', default=1, type=int,
+                        help='Only tags with a family size greater than or equal to the specified value are included in the HD analysis')
+    parser.add_argument('--maxFS', default=0, type=int,
+                        help='Only tags with a family size smaller than or equal to the specified value are included in the HD analysis')
+
+    parser.add_argument('--output_csv', default="data.csv", type=str,
+                        help='Name of the csv file.')
+    parser.add_argument('--output_pdf', default="data.pdf", type=str,
+                        help='Name of the pdf file.')
+    parser.add_argument('--output_pdf2', default="data2.pdf", type=str,
+                        help='Name of the pdf file.')
+    parser.add_argument('--output_csv2', default="data2.csv", type=str,
+                        help='Name of the csv file.')
+
+    return parser
+
+def Hamming_Distance_Analysis(argv):
+    parser = make_argparser()
+    args = parser.parse_args(argv[1:])
+
+    file1 = args.inputFile
+    name1 = args.inputName1
+
+    file2 = args.inputFile2
+    name2 = args.inputName2
+
+    index_size = args.sample_size
+    title_savedFile_pdf = args.output_pdf
+    title_savedFile_pdf2 = args.output_pdf2
+
+    title_savedFile_csv = args.output_csv
+    title_savedFile_csv2 = args.output_csv2
+
+    sep = args.sep
+    onlyDuplicates = args.only_DCS
+    minFS = args.minFS
+    maxFS = args.maxFS
+
+    subset = args.subset_tag
+    nproc = args.nproc
+
+    ### input checks
+    if index_size < 0:
+        print("index_size is a negative integer.")
+        exit(2)
+
+    if nproc <= 0:
+        print("nproc is smaller or equal zero")
+        exit(3)
+
+    if type(sep) is not str or len(sep)>1:
+        print("sep must be a single character.")
+        exit(4)
+
+    if subset < 0:
+        print("subset_tag is smaller or equal zero.")
+        exit(5)
+
+    ### PLOT ###
+    plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
+    plt.rcParams['xtick.labelsize'] = 12
+    plt.rcParams['ytick.labelsize'] = 12
+    plt.rcParams['patch.edgecolor'] = "#000000"
+    plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
+
+    if file2 != str(None):
+        files = [file1, file2]
+        name1 = name1.split(".tabular")[0]
+        name2 = name2.split(".tabular")[0]
+        names = [name1, name2]
+        pdf_files = [title_savedFile_pdf, title_savedFile_pdf2]
+        csv_files = [title_savedFile_csv, title_savedFile_csv2]
+    else:
+        files = [file1]
+        name1 = name1.split(".tabular")[0]
+        names = [name1]
+        pdf_files = [title_savedFile_pdf]
+        csv_files = [title_savedFile_csv]
+
+    print(type(onlyDuplicates))
+    print(onlyDuplicates)
+
+    for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files):
+        with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf:
+            print("dataset: ", name_file)
+            integers, data_array = readFileReferenceFree(f)
+            data_array = numpy.array(data_array)
+            int_f = numpy.array(data_array[:, 0]).astype(int)
+            data_array = data_array[numpy.where(int_f >= minFS)]
+            integers = integers[integers >= minFS]
+
+            # select family size for tags
+            if maxFS > 0:
+                int_f2 = numpy.array(data_array[:, 0]).astype(int)
+                data_array = data_array[numpy.where(int_f2 <= maxFS)]
+                integers = integers[integers <= maxFS]
+
+            print("min FS", min(integers))
+            print("max FS", max(integers))
+
+            tags = data_array[:, 2]
+            seq = data_array[:, 1]
+
+            if onlyDuplicates is True:
+                # find all unique tags and get the indices for ALL tags, but only once
+                u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+                d = u[c > 1]
+
+                # get family sizes, tag for duplicates
+                duplTags_double = integers[numpy.in1d(seq, d)]
+                duplTags = duplTags_double[0::2]  # ab of DCS
+                duplTagsBA = duplTags_double[1::2]  # ba of DCS
+
+                duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
+                duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags
+
+                data_array = numpy.column_stack((duplTags, duplTags_seq))
+                data_array = numpy.column_stack((data_array, duplTags_tag))
+                integers = numpy.array(data_array[:, 0]).astype(int)
+                print("DCS in whole dataset", len(data_array))
+
+            ## HD analysis for a subset of the tag
+            if subset > 0:
+                tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
+                tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])
+
+                flanking_region_float = float((len(tag1[0]) - subset)) / 2
+                flanking_region = int(flanking_region_float)
+                if flanking_region_float % 2 == 0:
+                    tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
+                    tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
+                else:
+                    flanking_region_rounded = int(round(flanking_region, 1))
+                    flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
+                    tag1_shorten = numpy.array(
+                        [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
+                    tag2_shorten = numpy.array(
+                        [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])
+
+                data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
+                data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))
+
+            print("length of tag= ", len(data_array[0, 1]))
+            # select sample: if no size given --> all vs. all comparison
+            if index_size == 0:
+                result = numpy.arange(0, len(data_array), 1)
+            else:
+                result = numpy.random.choice(len(integers), size=index_size,
+                                             replace=False)  # random indices into the dataset, sample of size index_size
+
+           # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
+            #    pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
+
+            # comparison random tags to whole dataset
+            result1 = data_array[result, 1]  # random tags
+            result2 = data_array[:, 1]  # all tags
+            print("size of the whole dataset= ", len(result2))
+            print("sample size= ", len(result1))
+
+            # HD analysis of whole tag
+            proc_pool = Pool(nproc)
+            chunks_sample = numpy.array_split(result1, nproc)
+            ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
+            proc_pool.close()
+            proc_pool.join()
+            ham = numpy.concatenate(ham).astype(int)
+          #  with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
+           #     for h, tag in zip(ham, result1):
+            #        output_file1.write("{}\t{}\n".format(tag, h))
+
+            # HD analysis for chimeric reads
+            proc_pool_b = Pool(nproc)
+            diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
+            diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
+            proc_pool_b.close()
+            proc_pool_b.join()
+            diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]),
+                                      numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int)
+            HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
+                                         numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
+            HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
+                                         numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
+            minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
+                                        numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
+            minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]),
+                                            numpy.concatenate([item_b[4] for item_b in diff_list_b])))
+            rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]),
+                                          numpy.concatenate([item_b[5] for item_b in diff_list_b])))
+            diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
+                                            numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
+            minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
+                                                  numpy.concatenate([item_b[7] for item_b in diff_list_b])))
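+            # combined results of both halves: diff = |HD(half1) - HD(half2)|, HDhalf1/HDhalf2 = HDs of the two halves,
+            # minHDs = HD(half1) + HD(half2), rel_Diff = diff / (HD sum), *_zeros = subset where one half has HD = 0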
+
+         #   with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2:
+          #      for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags):
+           #         output_file2.write(
+            #            "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d))
+
+            lenTags = len(data_array)
+
+            quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
+            seq = numpy.array(data_array[result, 1])  # tags of sample
+            ham = numpy.asarray(ham)  # HD for sample of tags
+
+            if onlyDuplicates is True:  # ab and ba strands of DCSs
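+                # each DCS contributes both its ab and its ba family size, so the sampled tags and HDs are duplicated to match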
+                quant = numpy.concatenate((quant, duplTagsBA[result]))
+                seq = numpy.tile(seq, 2)
+                ham = numpy.tile(ham, 2)
+
+            # prepare data for different kinds of plots
+            list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS
+            # distribution of FSs separated after HD
+            familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham,
+                                                                                                     rel=False)
+
+            ## get the family sizes of all tags with minimum HD from the chimeric read analysis
+            # there can be more tags than the sample size in the plot, because one tag can have multiple minima
+            seqDic = dict(zip(seq, quant))
+            lst_minHD_tags = []
+            for i in minHD_tags:
+                lst_minHD_tags.append(seqDic.get(i))
+
+            # histogram with absolute and relative difference between HDs of both parts of the tag
+            listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
+            listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
+                                                                                                     rel_Diff)
+
+            # family size distribution separated after the difference between HDs of both parts of the tag
+            familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
+                lst_minHD_tags, diff, diff=True, rel=False)
+            familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
+                lst_minHD_tags, rel_Diff, diff=True, rel=True)
+
+            # chimeric read analysis: tags which have HD=0 in one of the halves
+            if len(minHD_tags_zeros) != 0:
+                lst_minHD_tags_zeros = []
+                for i in minHD_tags_zeros:
+                    lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
+
+                # histogram with HD of non-identical half
+                listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
+                    lst_minHD_tags_zeros, diff_zeros)
+                # family size distribution of non-identical half
+                familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(
+                    lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)
+            
+            ##########################       Plot HD within tags          ########################################################
+            ######################################################################################################################
+            plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)
+
+            #####################################################################################################################
+            ##################         plot Hamming Distance with Family size distribution         ##############################
+            #####################################################################################################################
+            plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
+                          subtitle="Overall hamming distance with separation after family size", title_file1=name_file,
+                          lenTags=lenTags,xlabel="Hamming distance")
+
+            ##########################       Plot FSD with separation after HD       ###############################################
+            ########################################################################################################################
+            plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
+                           quant=quant, subtitle="Family size distribution with separation after hamming distance",
+                           pdf=pdf,relative=False, title_file1=name_file, diff=False)
+
+            ##########################       Plot difference between HD's separated after FSD       ##########################################
+            ########################################################################################################################
+            plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
+                          subtitle="Delta Hamming distances within tags with separation after family size",
+                          title_file1=name_file, lenTags=lenTags,
+                          xlabel="absolute delta Hamming distance", relative=False)
+
+            plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
+                          subtitle="Relative delta Hamming distances within tags with separation after family size",
+                          title_file1=name_file, lenTags=lenTags,
+                          xlabel="relative delta Hamming distance", relative=True)
+
+            ####################       Plot FSD separated after difference between HD's        #####################################
+            ########################################################################################################################
+            plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff,
+                           subtitle="Family size distribution with separation after delta Hamming distances within the tags",
+                           pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant)
+
+            plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf,
+                           subtitle="Family size distribution with separation after delta Hamming distances within the tags",
+                           relative=True, diff=True, title_file1=name_file)
+
+           
+            # plots for chimeric reads
+            if len(minHD_tags_zeros) != 0:
+                ## HD
+                plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
+                              subtitle="Hamming Distance of the non-identical half with separation after family size"
+                                       "\n(at least one half is identical with the half of the min. tag)\n",
+                              title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False)
+
+                ## FSD
+                plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros,
+                               quant=quant, pdf=pdf,
+                               subtitle="Family size distribution with separation after hamming distances from the non-identical half\n"
+                                        "(at least one half is identical with the half of the min. tag)\n",
+                               relative=False, diff=False, title_file1=name_file)
+
+            ### print all data to a CSV file
+            #### HD ####
+            summary, sumCol = createTableHD(list1, "HD=")
+            overallSum = sum(sumCol)  # sum of columns in table
+
+            #### FSD ####
+            summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
+            overallSum5 = sum(sumCol5)
+
+            ### HD of both parts of the tag ####
+            summary9, sumCol9 = createTableHDwithTags([numpy.array(minHDs), HDhalf1, HDhalf2])
+            overallSum9 = sum(sumCol9)
+
+            ## HD
+            # absolute difference
+            summary11, sumCol11 = createTableHD(listDifference1, "diff=")
+            overallSum11 = sum(sumCol11)
+            # relative difference and all tags
+            summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
+            overallSum13 = sum(sumCol13)
+
+            ## FSD
+            # absolute difference
+            summary19, sumCol19 = createTableFSD2(familySizeList1_diff)
+            overallSum19 = sum(sumCol19)
+            # relative difference
+            summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff)
+            overallSum21 = sum(sumCol21)
+
+            # chimeric reads
+            if len(minHD_tags_zeros) != 0:
+                # absolute difference and tags where at least one half has HD=0
+                summary15, sumCol15 = createTableHD(listDifference1_zeros, "diff=")
+                overallSum15 = sum(sumCol15)
+                # absolute difference and tags where at least one half has HD=0
+                summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False)
+                overallSum23 = sum(sumCol23)
+
+            output_file.write("{}\n".format(f))
+            output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
+                numpy.concatenate(list1)), lenTags, lenTags))
+
+            ### HD ###
+            createFileHD(summary, sumCol, overallSum, output_file,
+                         "Hamming distance with separation after family size: file1", sep)
+            ### FSD ###
+            createFileFSD2(summary5, sumCol5, overallSum5, output_file,
+                           "Family size distribution with separation after hamming distances: file1", sep,
+                           diff=False)
+
+            count = numpy.bincount(quant)
+            output_file.write("{}{}\n".format(sep, f))
+            output_file.write("max. family size:{}{}\n".format(sep, max(quant)))
+            output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
+            output_file.write(
+                "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
+
+            ### HD within tags ###
+            output_file.write(
+                "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
+                "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
+            output_file.write(
+                "file 1: actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
+                    len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
+            output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
+
+            createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
+                                  "Hamming distance of each half in the tag: file1", sep)
+            createFileHD(summary11, sumCol11, overallSum11, output_file,
+                         "Absolute delta Hamming distances within the tag: file1", sep)
+            createFileHD(summary13, sumCol13, overallSum13, output_file,
+                         "Relative delta Hamming distances within the tag: file1", sep)
+
+            createFileFSD2(summary19, sumCol19, overallSum19, output_file,
+                           "Family size distribution with separation after absolute delta Hamming distances: file1",
+                           sep)
+            createFileFSD2(summary21, sumCol21, overallSum21, output_file,
+                           "Family size distribution with separation after relative delta Hamming distances: file1",
+                           sep, rel=True)
+
+            if len(minHD_tags_zeros) != 0:
+                output_file.write(
+                    "All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
+                createFileHD(summary15, sumCol15, overallSum15, output_file,
+                             "Hamming distances of non-zero half: file1", sep)
+                createFileFSD2(summary23, sumCol23, overallSum23, output_file,
+                               "Family size distribution with separation after Hamming distances of non-zero half: file1",
+                               sep, diff=False)
+            output_file.write("\n")
+
+
+
+if __name__ == '__main__':
+    sys.exit(Hamming_Distance_Analysis(sys.argv))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hd.xml	Thu May 10 07:30:27 2018 -0400
@@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool id="hd" name="Duplex Sequencing Analysis:" version="0.0.1">
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="1.4">matplotlib</requirement>
+    </requirements>
+    <description>Hamming distance (HD) analysis of tags</description>
+    <command>
+        python2 $__tool_directory__/hd.py --inputFile "$inputFile" --inputName1 "$inputFile.name" --inputFile2 "$inputFile2" --inputName2 "$inputFile2.name" --sample_size $sampleSize --sep $separator --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS --output_csv $output_csv --output_pdf $output_pdf 
+        #if $inputFile2:
+        --output_pdf2 $output_pdf2 --output_csv2 $output_csv2
+        #end if
+    </command>
+    <inputs>
+        <param name="inputFile" type="data" format="tabular" label="Dataset 1: input tags" optional="false"/>
+        <param name="inputFile2" type="data" format="tabular" label="Dataset 2: input tags" optional="true" help="Input in tabular format with the family size, tags and the direction of the strand ('ab' or 'ba') for each family."/>
+        <param name="sampleSize" type="integer" label="number of tags in the sample" value="1000" min="0" help="specifies the number of tags in one analysis. If sample size is 0, all tags of the dataset are compared against all tags."/>
+        <param name="minFS" type="integer" label="minimum family size of the tags" min="1" value="1" help="filters the tags after their family size: Families with smaller size are skipped. Default: min. family size = 1."/>
+        <param name="maxFS" type="integer" label="max family size of the tags" min="0" value="0" help="filters the tags after their family size: Families with larger size are skipped. If max. family size is 0, no upper bound is defined and the maximum family size in the analysis will be the maximum family size of the whole dataset. Default: max. family size = 0."/>
+        <param name="separator" type="text" label="Separator of the CSV file." help="can be a single character" value=","/>
+        <param name="onlyDCS" type="boolean" label="only DCS in the analysis?" truevalue="" falsevalue="--only_DCS" checked="False" help="Only tags, which have a partner tag in the dataset, are included in the analysis."/>
+        <param name="subsetTag" type="integer" label="shorten tag in the analysis?" value="0" help="An analysis with shorter tag length, which is specified by this parameter, is simulated. If this parameter is 0 (by default), the tag with its original length is used in the analysis."/>
+        <param name="nproc" type="integer" label="number of processors" value="8" help="Number of processor used for computing."/>
+    </inputs>
+    <outputs>
+        <data name="output_csv" format="csv"/>
+        <data name="output_csv2" format="csv">
+            <filter>inputFile2</filter>
+        </data>
+        <data name="output_pdf" format="pdf" />
+        <data name="output_pdf2" format="pdf" >
+            <filter>inputFile2</filter>
+        </data>
+    </outputs>
+    <help> <![CDATA[
+**What it does**
+    
+    This tool compares each tag to all other tags in the dataset and records the minimum Hamming distance (HD). 
+    The Hamming distances are shown as a histogram separated by family size and as a family size distribution separated by Hamming distance. 
+    This similarity measure helps to distinguish whether similar tags truly stem from different molecules or arose from sequencing or PCR errors. 
+    In addition, tags of chimeric reads can be identified by calculating the Hamming distance separately for each half of the tag. 
+    The analysis can be performed on a sample of tags (default: sample size = 1000) or on the whole dataset (sample size = 0). 
+    It is also possible to restrict the analysis to tags that have a partner tag in the dataset (DCSs) or to filter the dataset by the tag's family size. 
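+    
+    For illustration (this example is not part of the tool's output): the tags AAAAAAAAAAAATGTTGGAATCTT and AAAAAAAAAAAATGTTGGAATCAA 
+    differ at their last two positions, so their Hamming distance is 2. Splitting each tag into two halves of 12 nucleotides gives 
+    HD = 0 for the first half and HD = 2 for the second half, which is the pattern the chimeric read analysis reports 
+    (at least one half identical, the other half different). 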
+    
+**Input**
+    
+    This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. 
+    The three columns contain the family size, the tag sequence and the strand orientation, as shown in the example below. 
+    
+    +-----+----------------------------+----+
+    | 1   | AAAAAAAAAAAATGTTGGAATCTT   | ba |
+    +-----+----------------------------+----+
+    | 10  | AAAAAAAAAAAGGCGGTCCACCCC   | ab |
+    +-----+----------------------------+----+
+    | 28  | AAAAAAAAAAATGGTATGGACCGA   | ab |
+    +-----+----------------------------+----+
+    
+    
+**Output**
+    
+    The output is one PDF file with the Hamming distance plots and one CSV file with the underlying data of the plots for each input dataset.
+    
+    
+**About Author**
+    
+    Author: Monika Heinzl
+    
+    Department: Institute of Bioinformatics, Johannes Kepler University Linz, Austria
+    
+    Contact: monika.heinzl@edumail.at
+    
+   ]]> 
+    
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{duplex,
+            author = {Heinzl, Monika},
+            year = {2018},
+            title = {Development of algorithms for the analysis of duplex sequencing data}
+         }
+        </citation>
+    </citations>
+</tool>