changeset 0:9736b9d04a0b draft
planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/fsd commit f674213e798956531c935e7b9eb7f444286d0a5e-dirty
author    mheinzl
date      Wed, 25 Apr 2018 08:59:17 -0400
parents
children  770a38352a51
files     fsd.py fsd.xml test-data/Test_data.tabular test-data/output_file.csv test-data/output_file.pdf
diffstat  5 files changed, 680 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd.py	Wed Apr 25 08:59:17 2018 -0400
@@ -0,0 +1,431 @@
+#!/usr/bin/env python
+
+# Family size distribution of SSCSs
+#
+# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes at least one TABULAR file with tags before the alignment to the SSCS as input; up to 4 files can be provided.
+# The program produces a plot which shows the distribution of family sizes of all SSCSs from the input files and
+# a CSV file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes.
+# If only one file is provided, a family size distribution separated into SSCSs without a partner and DCSs is produced.
+# If more than one file (up to 4) is given, a family size distribution with all datasets in one plot is produced.
+
+# USAGE: python fsd.py filename --inputFile2 filename2 --inputFile3 filename3 --inputFile4 filename4 /
+#        --output_csv outputFileName.csv --output_pdf outputFileName.pdf --sep "characterWhichSeparatesCSVFile"
+
+import numpy
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import argparse
+import sys
+import os
+import re
+from Cheetah.Template import Template
+
+def readFileReferenceFree(file):
+    with open(file, 'r') as dest_f:
+        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
+    return(data_array)
+
+def make_argparser():
+    parser = argparse.ArgumentParser(description='Family Size Distribution of duplex sequencing data')
+    parser.add_argument('inputFile',
+                        help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName1')
+    parser.add_argument('--inputFile2', default=None,
+                        help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName2')
+    parser.add_argument('--inputFile3', default=None,
+                        help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName3')
+    parser.add_argument('--inputFile4', default=None,
+                        help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName4')
+    parser.add_argument('--sep', default=",",
+                        help='Separator in the csv file.')
+    parser.add_argument('--output_csv', default="data.csv", type=str,
+                        help='Name of the csv file.')
+    parser.add_argument('--output_pdf', default="data.pdf", type=str,
+                        help='Name of the pdf file.')
+    return parser
+
+def compare_read_families(argv):
+    parser = make_argparser()
+    args = parser.parse_args(argv[1:])
+
+    firstFile = args.inputFile
+    name1 = args.inputName1
+    secondFile = args.inputFile2
+    name2 = args.inputName2
+    thirdFile = args.inputFile3
+    name3 = args.inputName3
+    fourthFile = args.inputFile4
+    name4 = args.inputName4
+
+    title_file = args.output_csv
+    title_file2 = args.output_pdf
+    sep = args.sep
+
+    if type(sep) is not str or len(sep) > 1:
+        print("Error: --sep must be a single character.")
+        exit(4)
+
+    plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
+    plt.rcParams['patch.edgecolor'] = "black"
+    plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
+    plt.rcParams['xtick.labelsize'] = 12
+    plt.rcParams['ytick.labelsize'] = 12
+
+    list_to_plot = []
+    label = []
+    data_array_list = []
+
+    with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
+        fig = plt.figure()
+        plt.subplots_adjust(bottom=0.25)
+        if firstFile != str(None):
+            file1 = readFileReferenceFree(firstFile)
+            integers = numpy.array(file1[:, 0]).astype(int)  ## keep original family sizes
+
+            # for plot: replace all big family sizes by 22
+            data1 = numpy.array(file1[:, 0]).astype(int)
+            bigFamilies = numpy.where(data1 > 20)[0]
+            data1[bigFamilies] = 22
+
+            name1 = name1.split(".tabular")[0]
+            list_to_plot.append(data1)
+            label.append(name1)
+            data_array_list.append(file1)
+
+            legend = "\n\n\n{}".format(name1)
+            plt.text(0.1, 0.11, legend, size=12, transform=plt.gcf().transFigure)
+            legend1 = "singletons:\nabsolute nr.\n{:,}".format(numpy.bincount(data1)[1])
+            plt.text(0.4, 0.11, legend1, size=12, transform=plt.gcf().transFigure)
+
+            legend3 = "rel. freq\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1))
+            plt.text(0.5, 0.11, legend3, size=12, transform=plt.gcf().transFigure)
+
+            legend4 = "family size > 20:\nabsolute nr.\n{:,}".format(
+                numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int))
+            plt.text(0.6, 0.11, legend4, size=12, transform=plt.gcf().transFigure)
+
+            legend5 = "rel. freq\n{:.3f}".format(float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1))
+            plt.text(0.7, 0.11, legend5, size=12, transform=plt.gcf().transFigure)
+
+            legend6 = "total length\n{:,}".format(len(data1))
+            plt.text(0.8, 0.11, legend6, size=12, transform=plt.gcf().transFigure)
+
+        if secondFile != str(None):
+            file2 = readFileReferenceFree(secondFile)
+            data2 = numpy.asarray(file2[:, 0]).astype(int)
+            bigFamilies2 = numpy.where(data2 > 20)[0]
+            data2[bigFamilies2] = 22
+
+            list_to_plot.append(data2)
+            name2 = name2.split(".tabular")[0]
+            label.append(name2)
+            data_array_list.append(file2)
+
+            plt.text(0.1, 0.09, name2, size=12, transform=plt.gcf().transFigure)
+
+            legend1 = "{:,}".format(numpy.bincount(data2)[1])
+            plt.text(0.4, 0.09, legend1, size=12, transform=plt.gcf().transFigure)
+
+            legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2))
+            plt.text(0.5, 0.09, legend3, size=12, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,}".format(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int))
+            plt.text(0.6, 0.09, legend4, size=12, transform=plt.gcf().transFigure)
+
+            legend5 = "{:.3f}".format(float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2))
+            plt.text(0.7, 0.09, legend5, size=12, transform=plt.gcf().transFigure)
+
+            legend6 = "{:,}".format(len(data2))
+            plt.text(0.8, 0.09, legend6, size=12, transform=plt.gcf().transFigure)
+
+        if thirdFile != str(None):
+            file3 = readFileReferenceFree(thirdFile)
+
+            data3 = numpy.asarray(file3[:, 0]).astype(int)
+            bigFamilies3 = numpy.where(data3 > 20)[0]
+            data3[bigFamilies3] = 22
+
+            list_to_plot.append(data3)
+            name3 = name3.split(".tabular")[0]
+            label.append(name3)
+            data_array_list.append(file3)
+
+            plt.text(0.1, 0.07, name3, size=12, transform=plt.gcf().transFigure)
+
+            legend1 = "{:,}".format(numpy.bincount(data3)[1])
+            plt.text(0.4, 0.07, legend1, size=12, transform=plt.gcf().transFigure)
+
+            legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3))
+            plt.text(0.5, 0.07, legend3, size=12, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,}".format(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int))
+            plt.text(0.6, 0.07, legend4, size=12, transform=plt.gcf().transFigure)
+
+            legend5 = "{:.3f}".format(float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3))
+            plt.text(0.7, 0.07, legend5, size=12, transform=plt.gcf().transFigure)
+
+            legend6 = "{:,}".format(len(data3))
+            plt.text(0.8, 0.07, legend6, size=12, transform=plt.gcf().transFigure)
+
+        if fourthFile != str(None):
+            file4 = readFileReferenceFree(fourthFile)
+
+            data4 = numpy.asarray(file4[:, 0]).astype(int)
+            bigFamilies4 = numpy.where(data4 > 20)[0]
+            data4[bigFamilies4] = 22
+
+            list_to_plot.append(data4)
+            name4 = name4.split(".tabular")[0]
+            label.append(name4)
+            data_array_list.append(file4)
+
+            plt.text(0.1, 0.05, name4, size=12, transform=plt.gcf().transFigure)
+
+            legend1 = "{:,}".format(numpy.bincount(data4)[1])
+            plt.text(0.4, 0.05, legend1, size=12, transform=plt.gcf().transFigure)
+
+            legend4 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4))
+            plt.text(0.5, 0.05, legend4, size=12, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,}".format(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int))
+            plt.text(0.6, 0.05, legend4, size=12, transform=plt.gcf().transFigure)
+
+            legend5 = "{:.3f}".format(float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4))
+            plt.text(0.7, 0.05, legend5, size=12, transform=plt.gcf().transFigure)
+
+            legend6 = "{:,}".format(len(data4))
+            plt.text(0.8, 0.05, legend6, size=12, transform=plt.gcf().transFigure)
+
+        maximumX = numpy.amax(numpy.concatenate(list_to_plot))
+        minimumX = numpy.amin(numpy.concatenate(list_to_plot))
+
+        counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black",
+                          linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
+
+        ticks = numpy.arange(minimumX - 1, maximumX, 1)
+        ticks1 = map(str, ticks)
+        ticks1[len(ticks1) - 1] = ">20"
+        plt.xticks(numpy.array(ticks), ticks1)
+
+        plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+        plt.title("Family Size Distribution", fontsize=14)
+        plt.xlabel("No. of Family Members", fontsize=14)
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        plt.margins(0.01, None)
+        plt.grid(b=True, which="major", color="#424242", linestyle=":")
+        pdf.savefig(fig)
+        plt.close()
+
+        # write data to CSV file
+        output_file.write("Values from family size distribution with all datasets\n")
+        output_file.write("\nFamily size")
+        for i in label:
+            output_file.write("{}{}".format(sep, i))
+        output_file.write("{}sum".format(sep))
+        output_file.write("\n")
+        j = 0
+        for fs in counts[1][0:len(counts[1]) - 1]:
+            if fs == 21:
+                fs = ">20"
+            else:
+                fs = "={}".format(fs)
+            output_file.write("FS{}{}".format(fs, sep))
+            values_of_fs = []
+            if len(label) == 1:
+                output_file.write("{}{}".format(int(counts[0][j]), sep))
+                values_of_fs.append(int(counts[0][j]))
+            else:
+                for n in range(len(label)):
+                    output_file.write("{}{}".format(int(counts[0][n][j]), sep))
+                    values_of_fs.append(int(counts[0][n][j]))
+            output_file.write("{}\n".format(sum(values_of_fs)))
+            j += 1
+        output_file.write("sum{}".format(sep))
+        values_for_sum = []
+        if len(label) == 1:
+            output_file.write("{}{}".format(int(sum(counts[0])), sep))
+            values_for_sum.append(int(sum(counts[0])))
+        else:
+            for i in counts[0]:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+                values_for_sum.append(int(sum(i)))
+
+        output_file.write("{}\n".format(sum(values_for_sum)))
+
+### Family size distribution after DCS and SSCS
+        for dataset, data, name_file in zip(list_to_plot, data_array_list, label):
+            maximumX = numpy.amax(dataset)
+            minimumX = numpy.amin(dataset)
+
+            tags = numpy.array(data[:, 2])
+            seq = numpy.array(data[:, 1])
+            data = numpy.array(dataset)
+
+            # find all unique tags and get the indices for ALL tags, but only once
+            u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+            d = u[c > 1]
+
+            # get family sizes, tag for duplicates
+            duplTags_double = data[numpy.in1d(seq, d)]
+            duplTags = duplTags_double[0::2]  # ab of DCS
+            duplTagsBA = duplTags_double[1::2]  # ba of DCS
+
+            duplTags_double_tag = tags[numpy.in1d(seq, d)]
+            duplTags_double_seq = seq[numpy.in1d(seq, d)]
+
+            # get family sizes for SSCS with no partner
+            ab = numpy.where(tags == "ab")[0]
+            abSeq = seq[ab]
+            ab = data[ab]
+            ba = numpy.where(tags == "ba")[0]
+            baSeq = seq[ba]
+            ba = data[ba]
+
+            dataAB = ab[numpy.in1d(abSeq, d, invert=True)]
+            dataBA = ba[numpy.in1d(baSeq, d, invert=True)]
+
+            # write DCS tags to file
+            # with open("DCS information_{}.txt".format(firstFile), "w") as file:
+            #     for t, s, f in zip(duplTags_double_tag, duplTags_double_seq, duplTags_double):
+            #         file.write("{}\t{}\t{}\n".format(t, s, f))
+
+            list1 = [duplTags_double, dataAB, dataBA]  # list for plotting
+
+            ## information for family size >= 3
+            dataAB_FS3 = dataAB[dataAB >= 3]
+            dataBA_FS3 = dataBA[dataBA >= 3]
+            ab_FS3 = ab[ab >= 3]
+            ba_FS3 = ba[ba >= 3]
+
+            duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)]  # ab+ba with FS>=3
+            duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)]  # ba+ab with FS>=3
+            duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA)  # both ab and ba strands with FS>=3
+
+            fig = plt.figure()
+
+            plt.subplots_adjust(bottom=0.3)
+            counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True,
+                              label=["duplex", "ab", "ba"], edgecolor="black", linewidth=1,
+                              align="left", color=["#FF0000", "#5FB404", "#FFBF00"])
+            # tick labels of x axis
+            ticks = numpy.arange(minimumX - 1, maximumX, 1)
+            ticks1 = map(str, ticks)
+            ticks1[len(ticks1) - 1] = ">20"
+            plt.xticks(numpy.array(ticks), ticks1)
+            singl = counts[0][2][0]  # singletons
+            last = counts[0][2][len(counts[0][0]) - 1]  # large families
+
+            plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
+            plt.title(name_file, fontsize=14)
+            plt.xlabel("No. of Family Members", fontsize=14)
+            plt.ylabel("Absolute Frequency", fontsize=14)
+            plt.margins(0.01, None)
+            plt.grid(b=True, which="major", color="#424242", linestyle=":")
+
+            ## extra information beneath the plot
+            legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \nlength of dataset="
+            plt.text(0.1, 0.09, legend, size=12, transform=plt.gcf().transFigure)
+
+            legend = "absolute numbers\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,}" \
+                .format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double),
+                        (len(dataAB) + len(dataBA) + len(duplTags)))
+            plt.text(0.35, 0.09, legend, size=12, transform=plt.gcf().transFigure)
+
+            legend = "relative frequencies\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}" \
+                .format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                        float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                        float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                        (len(dataAB) + len(dataBA) + len(duplTags)))
+            plt.text(0.54, 0.09, legend, size=12, transform=plt.gcf().transFigure)
+
+            legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}" \
+                .format(float(len(dataAB)) / (len(ab) + len(ba)), float(len(dataBA)) / (len(ab) + len(ba)),
+                        float(len(duplTags)) / (len(ab) + len(ba)),
+                        float(len(duplTags_double)) / (len(ab) + len(ba)), (len(ab) + len(ba)))
+            plt.text(0.64, 0.09, legend, size=12, transform=plt.gcf().transFigure)
+
+            legend1 = "\nsingletons:\nfamily size > 20:"
+            plt.text(0.1, 0.03, legend1, size=12, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,}\n{:,}".format(singl.astype(int), last.astype(int))
+            plt.text(0.35, 0.03, legend4, size=12, transform=plt.gcf().transFigure)
+
+            legend3 = "{:.3f}\n{:.3f}".format(singl / len(data), last / len(data))
+            plt.text(0.54, 0.03, legend3, size=12, transform=plt.gcf().transFigure)
+
+            pdf.savefig(fig)
+            plt.close()
+
+            # write same information to a csv file
+            count = numpy.bincount(integers)  # original counts of family sizes
+            output_file.write("\nDataset:{}{}\n".format(sep, name_file))
+            output_file.write("max. family size:{}{}\n".format(sep, max(integers)))
+            output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
+            output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
+
+            output_file.write("{}singletons:{}{}family size > 20:\n".format(sep, sep, sep))
+            output_file.write(
+                "{}absolute nr.{}rel. freq{}absolute nr.{}rel. freq{}total length\n".format(sep, sep, sep, sep, sep))
+            output_file.write("{}{}{}{}{:.3f}{}{}{}{:.3f}{}{}\n\n".format(name_file, sep, singl.astype(int), sep,
+                                                                          singl / len(data), sep, last.astype(int), sep,
+                                                                          last / len(data), sep, len(data)))
+
+            ## information for FS >= 1
+            output_file.write(
+                "The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n" \
+                "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n")
+            output_file.write("FS >= 1{}{}unique:{}total:\n".format(sep, sep, sep))
+            output_file.write("nr./rel. freq of ab={}{}{}{:.3f}{}{:.3f}\n".format(
+                sep, len(dataAB), sep,
+                float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+                float(len(dataAB)) / (len(ab) + len(ba))))
+            output_file.write("nr./rel. freq of ba={}{}{}{:.3f}{}{:.3f}\n".format(
+                sep, len(dataBA), sep,
+                float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)), sep,
+                float(len(dataBA)) / (len(ba) + len(ba))))
+            output_file.write("nr./rel. freq of DCS (total)={}{} ({}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+                sep, len(duplTags), len(duplTags_double), sep,
+                float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                sep, float(len(duplTags)) / (len(ab) + len(ba)),
+                float(len(duplTags_double)) / (len(ab) + len(ba))))
+            output_file.write("length of dataset={}{}{}{}{}{}\n".format(
+                sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+                (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba))))
+            ## information for FS >= 3
+            output_file.write("FS >= 3{}{}unique:{}total:\n".format(sep, sep, sep))
+            output_file.write("nr./rel. freq of ab={}{}{}{:.3f}{}{:.3f}\n".format(
+                sep, len(dataAB_FS3), sep,
+                float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)),
+                sep, float(len(dataAB_FS3)) / (len(ab_FS3) + len(ba_FS3))))
+            output_file.write("nr./rel. freq of ba={}{}{}{:.3f}{}{:.3f}\n".format(
+                sep, len(dataBA_FS3), sep,
+                float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)),
+                sep, float(len(dataBA_FS3)) / (len(ba_FS3) + len(ba_FS3))))
+            output_file.write("nr./rel. freq of DCS (total)={}{} ({}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+                sep, len(duplTags_FS3), duplTags_double_FS3,
+                sep, float(len(duplTags_FS3)) / (len(dataBA_FS3) + len(duplTags_FS3)),
+                sep, float(len(duplTags_FS3)) / (len(ab_FS3) + len(ba_FS3)),
+                float(duplTags_double_FS3) / (len(ab_FS3) + len(ba_FS3))))
+            output_file.write("length of dataset={}{}{}{}{}{}\n".format(
+                sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                (len(ab_FS3) + len(ba_FS3))))
+
+            output_file.write("\nValues from family size distribution\n")
+            output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
+            for dx, ab, ba, fs in zip(counts[0][0], counts[0][1], counts[0][2], counts[1]):
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                ab1 = ab - dx
+                ba1 = ba - ab
+                output_file.write(
+                    "FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(dx), sep, int(ab1), sep, int(ba1), sep, int(ba)))
+
+    print("Files successfully created!")
+
+if __name__ == '__main__':
+    sys.exit(compare_read_families(sys.argv))
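A quick orientation note (not part of the changeset): the headline numbers that fsd.py writes for a dataset, i.e. total number of families, maximum family size and the singleton count, can be checked with a few lines of plain numpy. This is a minimal sketch assuming the test file committed below; the path and variable names are illustrative, and it is written for Python 3 whereas fsd.py itself is Python 2 code.

    import numpy

    # Family size is the first column of the tabular input (family size <tab> tag <tab> ab/ba).
    fs = numpy.genfromtxt("test-data/Test_data.tabular", delimiter="\t", usecols=(0,), dtype=int)

    print("total number of families: {}".format(len(fs)))    # 112 for the committed test file
    print("max. family size: {}".format(int(fs.max())))      # 13
    singletons = int((fs == 1).sum())
    print("singletons: {} ({:.3f})".format(singletons, float(singletons) / len(fs)))  # 63 (0.562)

These values correspond to the "total length", "max. family size" and "singletons" entries in test-data/output_file.csv further down.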
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd.xml	Wed Apr 25 08:59:17 2018 -0400
@@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool id="fsd" name="Duplex Sequencing Analysis:" version="0.0.1">
+    <description>Family size distribution of tags</description>
+    <requirements>
+        <requirement type="package" version="1.4">matplotlib</requirement>
+    </requirements>
+
+    <command>
+        python fsd.py $file1 --inputName1 $file1.name --inputFile2 $file2 --inputName2 $file2.name --inputFile3 $file3 --inputName3 $file3.name --inputFile4 $file4 --inputName4 $file4.name --sep $separator --output_csv $output_csv --output_pdf $output_pdf
+    </command>
+
+    <inputs>
+        <param name="file1" type="data" format="tabular" label="Dataset 1: input tags" optional="false"/>
+        <param name="file2" type="data" format="tabular" label="Dataset 2: input tags" optional="true"/>
+        <param name="file3" type="data" format="tabular" label="Dataset 3: input tags" optional="true"/>
+        <param name="file4" type="data" format="tabular" label="Dataset 4: input tags" optional="true" help="Input in tabular format with the family size, tag and the direction of the strand ('ab' or 'ba') for each family. File names can have at most 34 characters; blanks are not allowed!"/>
+        <param name="separator" type="text" label="Separator of the CSV file." help="Must be a single character." value=","/>
+    </inputs>
+
+    <outputs>
+        <data name="output_pdf" format="pdf"/>
+        <data name="output_csv" format="csv"/>
+    </outputs>
+
+    <!--
+    <tests>
+        <test>
+            <param name="file1" value="Test_data.tabular"/>
+            <param name="file2" value="None"/>
+            <param name="file3" value="None"/>
+            <param name="file4" value="None"/>
+            <output name="output_pdf" file="output_file.pdf"/>
+            <output name="output_csv" file="output_file.csv"/>
+        </test>
+    </tests>
+    -->
+
+    <help> <![CDATA[
+
+**What it does**
+
+    This tool creates a family size distribution of the tags. The distribution is separated into families that have only the forward (ab) strand, only the reverse (ba) strand, or both strands of the DCS (ab+ba); in addition, a family size distribution without this separation is created. If multiple files are provided as input, the family size distribution without separation contains all datasets in one plot, and a separate distribution (ab only, ba only, DCS) is produced for each dataset.
+
+**Input**
+
+    This tool expects a tabular file with the tags of all families, their sizes and information about the forward (ab) and reverse (ba) strands.
+
+    **!!! File names can have at most 34 characters; blanks are not allowed !!!**
+
+    +-----+----------------------------+----+
+    | 1   | AAAAAAAAAAAATGTTGGAATCTT   | ba |
+    +-----+----------------------------+----+
+    | 10  | AAAAAAAAAAAGGCGGTCCACCCC   | ab |
+    +-----+----------------------------+----+
+    | 28  | AAAAAAAAAAATGGTATGGACCGA   | ab |
+    +-----+----------------------------+----+
+
+**Output**
+
+    The output is a PDF file with the plots and a CSV file with the data of the plots.
+
+**About Author**
+
+    Author: Monika Heinzl
+
+    Department: Institute of Bioinformatics, Johannes Kepler University Linz, Austria
+
+    Contact: monika.heinzl@edumail.at
+
+    ]]>
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{duplex,
+            author = {Heinzl, Monika},
+            year = {2018},
+            title = {Duplex analysis}
+            }
+        </citation>
+    </citations>
+</tool>
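The ab/ba/DCS separation that the help text above describes is implemented in fsd.py by collecting tag sequences that occur twice in the input (once as ab and once as ba). Below is a toy sketch of that classification with made-up sequences and illustrative variable names; fsd.py itself applies numpy.unique and numpy.in1d to the full dataset.

    import numpy

    # Toy input in the same spirit as the tabular format: family size, tag sequence, strand.
    sizes = numpy.array([1, 10, 28, 3])
    seqs = numpy.array(["AAAATGTTGG", "AAAGGCGGTC", "AAAATGTTGG", "AAATGGTATG"])
    strands = numpy.array(["ba", "ab", "ab", "ab"])

    # A tag sequence that occurs twice has both strands, i.e. it forms a DCS.
    uniq, counts = numpy.unique(seqs, return_counts=True)
    is_dcs = numpy.in1d(seqs, uniq[counts > 1])

    ab_only = sizes[(strands == "ab") & ~is_dcs]   # SSCS ab without a partner
    ba_only = sizes[(strands == "ba") & ~is_dcs]   # SSCS ba without a partner
    dcs = sizes[is_dcs]                            # both strands present
    print(len(ab_only), len(ba_only), len(dcs))    # -> 2 0 2

The family sizes in these three groups are what the per-dataset plot stacks as "ab", "ba" and "duplex".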
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data.tabular	Wed Apr 25 08:59:17 2018 -0400
@@ -0,0 +1,112 @@
+1	AAAAAAAAAAAAAACCAAAACTTC	ba
+1	AAAAAAAAAAAAACCAGGCGTCGA	ba
+1	AAAAAAAAAAAAAGCTCCACGTTG	ba
+1	AAAAAAAAAAAAATCGTGGTTTGT	ba
+1	AAAAAAAAAAAAATTCACCCTTGT	ba
+7	AAAAAAAAAAAACACACTTAACTT	ba
+1	AAAAAAAAAAAACAGTGTTGAGAC	ba
+4	AAAAAAAAAAAACCGCTCCTCACA	ba
+1	AAAAAAAAAAAAGGCAACACAGAA	ab
+2	AAAAAAAAAAAATCTTTCTTTGAG	ab
+1	AAAAAAAAAAAATTGGGTTCCTTA	ab
+1	AAAAAAAAAAAGAGTCGCACCCAG	ba
+4	AAAAAAAAAAAGATCGTGGTTTGT	ba
+1	AAAAAAAAAAAGCGCAACACAGAA	ab
+3	AAAAAAAAAAAGGGCAACACAGAA	ab
+1	AAAAAAAAAAAGTAGCCCTAAACG	ab
+1	AAAAAAAAAAAGTCTTTCTTTGAG	ab
+1	AAAAAAAAAAATATCATAGACTCT	ab
+6	AAAAAAAAAAATATTCACCCTTGT	ba
+1	AAAAAAAAAAATATTCGAAAGTTA	ba
+3	AAAAAAAAAAATCACACTTAACTT	ba
+1	AAAAAAAAAAATCCGCTCCTCACA	ba
+1	AAAAAAAAAAATTAACTAAACTTA	ab
+1	AAAAAAAAAACAAATTCTATTATT	ab
+1	AAAAAAAAAACTCCCAGATTTTTT	ab
+1	AAAAAAAAAACTTCTGCTTGGCGG	ba
+11	AAAAAAAAAAGAATCGTGGTTTGT	ba
+5	AAAAAAAAAAGATAGCCCTAAACG	ab
+1	AAAAAAAAAAGCAATAATGCCAGT	ab
+2	AAAAAAAAAAGTACCGCACTCTCA	ba
+1	AAAAAAAAAAGTTCTTTCTTTGAG	ab
+1	AAAAAAAAAATAACTTCAATAATG	ba
+2	AAAAAAAAAATAATCATAGACTCT	ab
+1	AAAAAAAAAATAGTCTCACATTTA	ab
+1	AAAAAAAAAATATAACCTTTGGCG	ab
+3	AAAAAAAAACAAAATTCTATTATT	ab
+1	AAAAAAAAACAAGTACGCGGCATT	ab
+1	AAAAAAAAACAAGTACGCGGTATT	ab
+1	AAAAAAAAACAATATCGAATTAAC	ab
+3	AAAAAAAAACACGGTGAGACAAGG	ba
+1	AAAAAAAAACACGTTTCTCCCCTT	ba
+1	AAAAAAAAACATATCGTCCCGAGC	ba
+1	AAAAAAAAACCTACCTGAGGCCCC	ab
+3	AAAAAAAAACCTTATTACAGCGGA	ab
+1	AAAAAAAAACGATTCTCTGTATCT	ba
+1	AAAAAAAAACGTACCGCACTCTCA	ba
+4	AAAAAAAAACTACCCAGATTTTTT	ba
+1	AAAAAAAAACTAGATGAGACGACC	ba
+4	AAAAAAAAACTGTCTGCTTGGCGG	ba
+1	AAAAAAAAAGAAGTTTAATTTTAA	ab
+1	AAAAAAAAAGAATGCCTAAGACGA	ba
+6	AAAAAAAAAGACCGGCCTTAGACA	ba
+1	AAAAAAAAAGATATCGTGGTTTGT	ba
+1	AAAAAAAAAGCAATACTCAAGCTG	ba
+6	AAAAAAAAAGCAATGTCTAAGCCT	ba
+1	AAAAAAAAAGCACTGTCTAAGCCT	ab
+2	AAAAAAAAAGCTAATAATGCCAGT	ab
+1	AAAAAAAAAGTTTCGTGAAGGTCC	ba
+1	AAAAAAAAATAAAGGTCCGAATCT	ab
+1	AAAAAAAAATAAATGAGAGTGTAA	ba
+8	AAAAAAAAATAAGTCTCACATTTA	ab
+1	AAAAAAAAATAATAACCTCTGGCG	ab
+10	AAAAAAAAATAATAACCTTTGGCG	ab
+1	AAAAAAAAATAATCCCCTTTGTCG	ab
+6	AAAAAAAAATACGCAAACGCTGAG	ab
+4	AAAAAAAAATAGATCATAGACTCT	ab
+10	AAAAAAAAATAGATCATAGACTCT	ba
+10	AAAAAAAAATAGTAGGATTTCATG	ba
+7	AAAAAAAAATATGAATACCCTCGT	ba
+1	AAAAAAAAATATGCCACTTGATCC	ba
+1	AAAAAAAAATATTCTGCCACTTGA	ba
+3	AAAAAAAAATCAAACCAAGAGGAC	ba
+1	AAAAAAAAATCAGTACCCCTAAAC	ab
+12	AAAAAAAAATCCTAGTTAATGAAG	ba
+1	AAAAAAAAATCGATTCTTTATGCG	ab
+1	AAAAAAAAATGTCTGAAAATATCT	ab
+4	AAAAAAAAATGTCTGAAAATATCT	ba
+1	AAAAAAAAATTTCCGCAGACCGTT	ba
+8	AAAAAAAAATTTGGGCTACTACAA	ba
+1	AAAAAAAACAAAATTAGAACCCTT	ab
+1	AAAAAAAACAAACCGCTCCTCACA	ba
+5	AAAAAAAACAACGTACGCGGTATT	ab
+4	AAAAAAAACAATATCGTTGATATG	ba
+4	AAAAAAAACAATCACGTTAATAGG	ab
+1	AAAAAAAACAGAATCGTGGTTTGT	ba
+1	AAAAAAAACCAAATCGTTGATATG	ba
+9	AAAAAAAACCAAGTCCAGGCATCT	ba
+2	AAAAAAAACCACGGTGAGACAAGG	ba
+1	AAAAAAAACCGCCCAACTGCCGGT	ab
+5	AAAAAAAACCTCTCAACCCCAAAT	ba
+7	AAAAAAAACCTCTTGCGATGTTGT	ab
+1	AAAAAAAACCTCTTGCGCTGTTGT	ab
+1	AAAAAAAACCTCTTGTGATGTTGT	ab
+12	AAAAAAAACCTGAGCAATGGTTCC	ab
+3	AAAAAAAACCTTGACCCTCACATG	ba
+6	AAAAAAAACCTTGCACTCGTCCTA	ba
+9	AAAAAAAACGAAATAAAAAAACCT	ba
+1	AAAAAAAACGACCGGCCTTAGACA	ba
+4	AAAAAAAACGCCACCACCCCCTTT	ab
+12	AAAAAAAACGCCACGGGCACTATT	ba
+13	AAAAAAAACGTATCAGTAGATCCT	ab
+1	AAAAAAAACTAGTAGGATTTCATG	ba
+3	AAAAAAAACTATAGAAAATCCATT	ba
+1	AAAAAAAACTATTCTATTTCCGAT	ba
+13	AAAAAAAACTGATCTGCTTGGCGG	ba
+8	AAAAAAAACTTGCGAATAGCATCG	ba
+4	AAAAAAAACTTGTTATCAAAACGT	ab
+1	AAAAAAAAGAAAAGTTCAACACGC	ba
+1	AAAAAAAAGAAGTTCGCCCTCCGA	ab
+13	AAAAAAAAGAGAGTTTAGTCATGG	ab
+1	AAAAAAAAGAGAGTTTAGTCATGG	ba
+1	AAAAAAAAGAGAGTTTAGTCCTGG	ab
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_file.csv	Wed Apr 25 08:59:17 2018 -0400
@@ -0,0 +1,54 @@
+Values from family size distribution with all datasets
+
+Family size,Test_data,sum
+FS=1,63,63
+FS=2,5,5
+FS=3,8,8
+FS=4,10,10
+FS=5,3,3
+FS=6,5,5
+FS=7,3,3
+FS=8,3,3
+FS=9,2,2
+FS=10,3,3
+FS=11,1,1
+FS=12,6,6
+sum,112,112
+
+Dataset:,Test_data
+max. family size:,13
+absolute frequency:,3
+relative frequency:,0.027
+
+,singletons:,,family size > 20:
+,absolute nr.,rel. freq,absolute nr.,rel. freq,total length
+Test_data,63,0.562,6,0.054,112
+
+The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)
+Whereas the total frequencies were calculated from the whole dataset (=including the DCS).
+
+FS >= 1,,unique:,total:
+nr./rel. freq of ab=,47,0.431,0.420
+nr./rel. freq of ba=,59,0.488,0.476
+nr./rel. freq of DCS (total)=,3 (6),0.028,0.027 (0.054)
+length of dataset=,109,109,112
+FS >= 3,,unique:,total:
+nr./rel. freq of ab=,14,0.341,0.318
+nr./rel. freq of ba=,26,0.491,0.464
+nr./rel. freq of DCS (total)=,1 (2),0.037,0.023 (0.045)
+length of dataset=,41,41,44
+
+Values from family size distribution
+,duplex,ab,ba,sum
+FS=1,2,30,31,63
+FS=2,0,3,2,5
+FS=3,0,3,5,8
+FS=4,2,3,5,10
+FS=5,0,2,1,3
+FS=6,0,1,4,5
+FS=7,0,1,2,3
+FS=8,0,1,2,3
+FS=9,0,0,2,2
+FS=10,1,1,1,3
+FS=11,0,0,1,1
+FS=12,1,2,3,6
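A note on reading the last table of output_file.csv: fsd.py calls plt.hist with stacked=True, so the per-class counts are recovered by subtraction (ab = stacked duplex+ab minus duplex, ba = stacked duplex+ab+ba minus duplex+ab, and the sum column is the full stacked height). For FS=1, for example, the stacked bin heights 2, 32 and 63 yield the row 2, 30, 31 with sum 63.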