# HG changeset patch # User mheinzl # Date 1554196209 14400 # Node ID 2e517a54eedc7d9dcd7a1be5dbe76502a79e12f0 # Parent 6bd9ef49d013fc965ae0754f0b87d1a175b7a89e planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty diff -r 6bd9ef49d013 -r 2e517a54eedc fsd.py --- a/fsd.py Mon Oct 08 05:50:18 2018 -0400 +++ b/fsd.py Tue Apr 02 05:10:09 2019 -0400 @@ -45,19 +45,17 @@ def compare_read_families(argv): + parser = make_argparser() args = parser.parse_args(argv[1:]) - firstFile = args.inputFile1 name1 = args.inputName1 - secondFile = args.inputFile2 name2 = args.inputName2 thirdFile = args.inputFile3 name3 = args.inputName3 fourthFile = args.inputFile4 name4 = args.inputName4 - title_file = args.output_tabular title_file2 = args.output_pdf @@ -90,24 +88,32 @@ data_array_list.append(file1) legend = "\n\n\n{}".format(name1) - plt.text(0.1, 0.11, legend, size=12, transform=plt.gcf().transFigure) - legend1 = "singletons:\nabsolute nr.\n{:,}".format(numpy.bincount(data1)[1]) - plt.text(0.4, 0.11, legend1, size=12, transform=plt.gcf().transFigure) + plt.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) + legend1 = "singletons:\nnr. of tags\n{:,}".format(numpy.bincount(data1)[1]) + plt.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) - legend3 = "rel. freq\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1)) - plt.text(0.5, 0.11, legend3, size=12, transform=plt.gcf().transFigure) + legend3 = "freq. of tags\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1)) + plt.text(0.41, 0.11, legend3, size=10, transform=plt.gcf().transFigure) + + legend3b = "PE reads\n{:.3f}".format(float(numpy.bincount(data1)[1]) / sum(integers)) + plt.text(0.5, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) - legend4 = "family size > 20:\nabsolute nr.\n{:,}".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int)) - plt.text(0.6, 0.11, legend4, size=12, transform=plt.gcf().transFigure) + legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) + plt.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) + + legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) + plt.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) - legend5 = "rel. freq\n{:.3f}".format(float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) - plt.text(0.7, 0.11, legend5, size=12, transform=plt.gcf().transFigure) + legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) + plt.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) - legend6 = "total length\n{:,}".format(len(data1)) - plt.text(0.8, 0.11, legend6, size=12, transform=plt.gcf().transFigure) + legend6b = "PE reads\n{:,}".format(sum(integers)) + plt.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) if secondFile != str(None): file2 = readFileReferenceFree(secondFile) + integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes + data2 = numpy.asarray(file2[:, 0]).astype(int) bigFamilies2 = numpy.where(data2 > 20)[0] data2[bigFamilies2] = 22 @@ -117,25 +123,34 @@ label.append(name2) data_array_list.append(file2) - plt.text(0.1, 0.09, name2, size=12, transform=plt.gcf().transFigure) + plt.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) legend1 = "{:,}".format(numpy.bincount(data2)[1]) - plt.text(0.4, 0.09, legend1, size=12, transform=plt.gcf().transFigure) + plt.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2)) - plt.text(0.5, 0.09, legend3, size=12, transform=plt.gcf().transFigure) + plt.text(0.41, 0.09, legend3, size=10, transform=plt.gcf().transFigure) + + legend3b = "{:.3f}".format(float(numpy.bincount(data2)[1]) / sum(integers2)) + plt.text(0.5, 0.09, legend3b, size=10, transform=plt.gcf().transFigure) - legend4 = "{:,}".format(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int)) - plt.text(0.6, 0.09, legend4, size=12, transform=plt.gcf().transFigure) + legend4 = "{:,} ({:.3f})".format( + numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int), + float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) + plt.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) - legend5 = "{:.3f}".format(float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) - plt.text(0.7, 0.09, legend5, size=12, transform=plt.gcf().transFigure) + legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) + plt.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data2)) - plt.text(0.8, 0.09, legend6, size=12, transform=plt.gcf().transFigure) + plt.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) + + legend6b = "{:,}".format(sum(integers2)) + plt.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) if thirdFile != str(None): file3 = readFileReferenceFree(thirdFile) + integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes data3 = numpy.asarray(file3[:, 0]).astype(int) bigFamilies3 = numpy.where(data3 > 20)[0] @@ -146,25 +161,35 @@ label.append(name3) data_array_list.append(file3) - plt.text(0.1, 0.07, name3, size=12, transform=plt.gcf().transFigure) + plt.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) legend1 = "{:,}".format(numpy.bincount(data3)[1]) - plt.text(0.4, 0.07, legend1, size=12, transform=plt.gcf().transFigure) + plt.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3)) - plt.text(0.5, 0.07, legend3, size=12, transform=plt.gcf().transFigure) + plt.text(0.41, 0.07, legend3, size=10, transform=plt.gcf().transFigure) + + legend3b = "{:.3f}".format(float(numpy.bincount(data3)[1]) / sum(integers3)) + plt.text(0.5, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) - legend4 = "{:,}".format(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int)) - plt.text(0.6, 0.07, legend4, size=12, transform=plt.gcf().transFigure) + legend4 = "{:,} ({:.3f})".format( + numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int), + float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) + plt.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) - legend5 = "{:.3f}".format(float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) - plt.text(0.7, 0.07, legend5, size=12, transform=plt.gcf().transFigure) + legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), + float(sum(integers3[integers3 > 20])) / sum(integers3)) + plt.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data3)) - plt.text(0.8, 0.07, legend6, size=12, transform=plt.gcf().transFigure) + plt.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) + + legend6b = "{:,}".format(sum(integers3)) + plt.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) if fourthFile != str(None): file4 = readFileReferenceFree(fourthFile) + integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes data4 = numpy.asarray(file4[:, 0]).astype(int) @@ -176,28 +201,37 @@ label.append(name4) data_array_list.append(file4) - plt.text(0.1, 0.05, name4, size=12, transform=plt.gcf().transFigure) + plt.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) legend1 = "{:,}".format(numpy.bincount(data4)[1]) - plt.text(0.4, 0.05, legend1, size=12, transform=plt.gcf().transFigure) + plt.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) - legend4 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4)) - plt.text(0.5, 0.05, legend4, size=12, transform=plt.gcf().transFigure) + legend3 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4)) + plt.text(0.41, 0.05, legend3, size=10, transform=plt.gcf().transFigure) + + legend3b = "{:.3f}".format(float(numpy.bincount(data4)[1]) / sum(integers4)) + plt.text(0.5, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) - legend4 = "{:,}".format(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int)) - plt.text(0.6, 0.05, legend4, size=12, transform=plt.gcf().transFigure) + legend4 = "{:,} ({:.3f})".format( + numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int), + float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) + plt.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) - legend5 = "{:.3f}".format(float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) - plt.text(0.7, 0.05, legend5, size=12, transform=plt.gcf().transFigure) + legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), + float(sum(integers4[integers4 > 20])) / sum(integers4)) + plt.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data4)) - plt.text(0.8, 0.05, legend6, size=12, transform=plt.gcf().transFigure) + plt.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) + + legend6b = "{:,}".format(sum(integers4)) + plt.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) maximumX = numpy.amax(numpy.concatenate(list_to_plot)) minimumX = numpy.amin(numpy.concatenate(list_to_plot)) counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black", - linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8) + linewidth=1, label=label, align="left", rwidth=0.8, alpha=0.7) ticks = numpy.arange(minimumX - 1, maximumX, 1) ticks1 = map(str, ticks) @@ -242,53 +276,71 @@ output_file.write("{}{}".format(int(sum(i)), sep)) # Family size distribution after DCS and SSCS - for dataset, data, name_file in zip(list_to_plot, data_array_list, label): + for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): maximumX = numpy.amax(dataset) minimumX = numpy.amin(dataset) - tags = numpy.array(data[:, 2]) - seq = numpy.array(data[:, 1]) + tags = numpy.array(data_o[:, 2]) + seq = numpy.array(data_o[:, 1]) data = numpy.array(dataset) - + data_o = numpy.array(data_o[:, 0]).astype(int) # find all unique tags and get the indices for ALL tags, but only once u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) d = u[c > 1] # get family sizes, tag for duplicates duplTags_double = data[numpy.in1d(seq, d)] - duplTags = duplTags_double[0::2] # ab of DCS - duplTagsBA = duplTags_double[1::2] # ba of DCS + duplTags_double_o = data_o[numpy.in1d(seq, d)] - # duplTags_double_tag = tags[numpy.in1d(seq, d)] - # duplTags_double_seq = seq[numpy.in1d(seq, d)] + duplTags = duplTags_double[0::2] # ab of DCS + duplTags_o = duplTags_double_o[0::2] # ab of DCS + + duplTagsBA = duplTags_double[1::2] # ba of DCS + duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS # get family sizes for SSCS with no partner ab = numpy.where(tags == "ab")[0] abSeq = seq[ab] + ab_o = data_o[ab] ab = data[ab] + ba = numpy.where(tags == "ba")[0] baSeq = seq[ba] + ba_o = data_o[ba] ba = data[ba] dataAB = ab[numpy.in1d(abSeq, d, invert=True)] + dataAB_o = ab_o[numpy.in1d(abSeq, d, invert=True)] + dataBA = ba[numpy.in1d(baSeq, d, invert=True)] + dataBA_o = ba_o[numpy.in1d(baSeq, d, invert=True)] list1 = [duplTags_double, dataAB, dataBA] # list for plotting # information for family size >= 3 dataAB_FS3 = dataAB[dataAB >= 3] + dataAB_FS3_o = dataAB_o[dataAB_o >= 3] dataBA_FS3 = dataBA[dataBA >= 3] + dataBA_FS3_o = dataBA_o[dataBA_o >= 3] ab_FS3 = ab[ab >= 3] ba_FS3 = ba[ba >= 3] + ab_FS3_o = ab_o[ab_o >= 3] + ba_FS3_o = ba_o[ba_o >= 3] duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA) # both ab and ba strands with FS>=3 - fig = plt.figure() + # original FS + duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ab+ba with FS>=3 + duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 + duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 + fig = plt.figure() plt.subplots_adjust(bottom=0.3) - counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"], edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"]) + counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"], + edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], + rwidth=0.8) # tick labels of x axis ticks = numpy.arange(minimumX - 1, maximumX, 1) ticks1 = map(str, ticks) @@ -298,33 +350,56 @@ last = counts[0][2][len(counts[0][0]) - 1] # large families plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) - # plt.title(name1, fontsize=14) + plt.title(name_file, fontsize=14) plt.xlabel("Family size", fontsize=14) plt.ylabel("Absolute Frequency", fontsize=14) plt.margins(0.01, None) plt.grid(b=True, which="major", color="#424242", linestyle=":") # extra information beneath the plot - legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \nlength of dataset=" - plt.text(0.1, 0.09, legend, size=12, transform=plt.gcf().transFigure) + legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags=" + plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,}".format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags))) + plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure) - legend = "absolute numbers\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,}".format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags))) - plt.text(0.35, 0.09, legend, size=12, transform=plt.gcf().transFigure) + legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,}".format(sum(dataAB_o), sum(dataBA_o), sum(duplTags_o), sum(duplTags_double_o), (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o))) + plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure) - legend = "relative frequencies\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), (len(dataAB) + len(dataBA) + len(duplTags))) - plt.text(0.54, 0.09, legend, size=12, transform=plt.gcf().transFigure) + legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), (len(dataAB) + len(dataBA) + len(duplTags))) + plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure) legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)), float(len(dataBA)) / (len(ab) + len(ba)), float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), (len(ab) + len(ba))) - plt.text(0.64, 0.09, legend, size=12, transform=plt.gcf().transFigure) + plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure) legend1 = "\nsingletons:\nfamily size > 20:" - plt.text(0.1, 0.03, legend1, size=12, transform=plt.gcf().transFigure) + plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure) legend4 = "{:,}\n{:,}".format(singl.astype(int), last.astype(int)) - plt.text(0.35, 0.03, legend4, size=12, transform=plt.gcf().transFigure) + plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure) legend3 = "{:.3f}\n{:.3f}".format(singl / len(data), last / len(data)) - plt.text(0.54, 0.03, legend3, size=12, transform=plt.gcf().transFigure) + plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure) + + legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20])) + plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure) + + legend3 = "{:.3f}\n{:.3f}".format(float(singl)/sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o)) + plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure) + + legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( + float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o))) + plt.text(0.74, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format( + float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)), (sum(ab_o) + sum(ba_o))) + plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure) pdf.savefig(fig) plt.close() @@ -336,23 +411,62 @@ output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) - output_file.write("{}singletons:{}{}family size > 20:\n".format(sep, sep, sep)) - output_file.write("{}absolute nr.{}rel. freq{}absolute nr.{}rel. freq{}total length\n".format(sep, sep, sep, sep, sep)) - output_file.write("{}{}{}{}{:.3f}{}{}{}{:.3f}{}{}\n\n".format(name_file, sep, singl.astype(int), sep, singl / len(data), sep, last.astype(int), sep, last / len(data), sep, len(data))) + output_file.write("{}singletons:{}{}{}family size > 20:\n".format(sep, sep, sep, sep)) + output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) + output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( + name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep, + last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o))) # information for FS >= 1 - output_file.write("The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\nWhereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n") - output_file.write("FS >= 1{}{}unique:{}total:\n".format(sep, sep, sep)) - output_file.write("nr./rel. freq of ab={}{}{}{:.3f}{}{:.3f}\n".format(sep, len(dataAB), sep, float(len(dataAB)) / (len(dataAB) + len(dataBA) + len( duplTags)), sep, float(len(dataAB)) / (len(ab) + len(ba)))) - output_file.write("nr./rel. freq of ba={}{}{}{:.3f}{}{:.3f}\n".format(sep, len(dataBA), sep, float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)), sep, float(len(dataBA)) / (len(ba) + len(ba)))) - output_file.write("nr./rel. freq of DCS (total)={}{} ({}){}{:.3f}{}{:.3f} ({:.3f})\n".format(sep, len(duplTags), len(duplTags_double), sep, float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep, float(len(duplTags)) / ( len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)))) - output_file.write("length of dataset={}{}{}{}{}{}\n".format(sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba)))) + output_file.write("The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n" + "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n") + output_file.write("FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep)) + output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep)) + output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( + sep, len(dataAB), sep, sum(dataAB_o), sep, float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), + sep, float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, + float(len(dataAB)) / (len(ab) + len(ba)), sep, float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)))) + output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( + sep, len(dataBA), sep, sum(dataBA_o), sep, float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)), + sep, float(sum(dataBA_o)) / (sum(dataBA_o) + sum(dataBA_o) + sum(duplTags_o)), sep, float(len(dataBA)) / (len(ba) + len(ba)), + sep, float(sum(dataBA_o)) / (sum(ba_o) + sum(ba_o)))) + output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format( + sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep, + float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep, + float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep, + float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, + float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)))) + output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format( + sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep, (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, + (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba)), sep, + (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, (sum(ab_o) + sum(ba_o)))) # information for FS >= 3 - output_file.write("FS >= 3{}{}unique:{}total:\n".format(sep, sep, sep)) - output_file.write("nr./rel. freq of ab={}{}{}{:.3f}{}{:.3f}\n".format(sep, len(dataAB_FS3), sep, float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, float(len(dataAB_FS3)) / (len(ab_FS3) + len(ba_FS3)))) - output_file.write("nr./rel. freq of ba={}{}{}{:.3f}{}{:.3f}\n".format(sep, len(dataBA_FS3), sep, float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, float(len(dataBA_FS3)) / (len(ba_FS3) + len(ba_FS3)))) - output_file.write("nr./rel. freq of DCS (total)={}{} ({}){}{:.3f}{}{:.3f} ({:.3f})\n".format(sep, len(duplTags_FS3), duplTags_double_FS3, sep, float(len( duplTags_FS3)) / (len(dataBA_FS3) + len(duplTags_FS3)), sep, float(len(duplTags_FS3)) / (len(ab_FS3) + len(ba_FS3)), float(duplTags_double_FS3) / (len(ab_FS3) + len(ba_FS3)))) - output_file.write("length of dataset={}{}{}{}{}{}\n".format(sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (len(ab_FS3) + len(ba_FS3)))) + output_file.write("\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep)) + output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep)) + output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( + sep, len(dataAB_FS3), sep, sum(dataAB_FS3_o), sep, + float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, + float(len(dataAB_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3), + sep, float(sum(dataAB_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), + sep, float(sum(dataAB_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) + output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( + sep, len(dataBA_FS3), sep, sum(dataBA_FS3_o), sep, + float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), + sep, float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3), + sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), + sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) + output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format( + sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep, + float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, + float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), + float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), + sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep, + float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o), + float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) + output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format( + sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), + sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), + sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) output_file.write("\nValues from family size distribution\n") output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep)) diff -r 6bd9ef49d013 -r 2e517a54eedc fsd.xml --- a/fsd.xml Mon Oct 08 05:50:18 2018 -0400 +++ b/fsd.xml Tue Apr 02 05:10:09 2019 -0400 @@ -24,19 +24,19 @@ - - - - - - + + + + + + - - - - - + + + + + 20 1 1 1 1 +sum 112 112 112 112 +Dataset: fsd_data1.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + + singletons: family size > 20: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data1.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.339 0.420 0.325 +SSCS ba 59 222 0.488 0.481 0.476 0.468 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 + +Dataset: fsd_data2.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + + singletons: family size > 20: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data2.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.339 0.420 0.325 +SSCS ba 59 222 0.488 0.481 0.476 0.468 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 + +Dataset: fsd_data3.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + + singletons: family size > 20: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data3.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.339 0.420 0.325 +SSCS ba 59 222 0.488 0.481 0.476 0.468 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 + +Dataset: fsd_data4.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + + singletons: family size > 20: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data4.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.339 0.420 0.325 +SSCS ba 59 222 0.488 0.481 0.476 0.468 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 diff -r 6bd9ef49d013 -r 2e517a54eedc test-data/fsd_output2.pdf Binary file test-data/fsd_output2.pdf has changed diff -r 6bd9ef49d013 -r 2e517a54eedc test-data/fsd_output2.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_output2.tab Tue Apr 02 05:10:09 2019 -0400 @@ -0,0 +1,174 @@ +Values from family size distribution with all datasets + +Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab +FS=1 63 63 63 +FS=2 5 5 5 +FS=3 8 8 8 +FS=4 9 9 9 +FS=5 3 3 3 +FS=6 5 5 5 +FS=7 3 3 3 +FS=8 3 3 3 +FS=9 2 2 2 +FS=10 3 3 3 +FS=11 1 1 1 +FS=12 3 3 3 +FS=13 3 3 3 +FS=14 0 0 0 +FS=15 0 0 0 +FS=16 0 0 0 +FS=17 0 0 0 +FS=18 0 0 0 +FS=19 0 0 0 +FS=20 0 0 0 +FS>20 1 1 1 +sum 112 112 112 +Dataset: fsd_data1.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + + singletons: family size > 20: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data1.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.339 0.420 0.325 +SSCS ba 59 222 0.488 0.481 0.476 0.468 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 + +Dataset: fsd_data2.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + + singletons: family size > 20: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data2.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.339 0.420 0.325 +SSCS ba 59 222 0.488 0.481 0.476 0.468 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 + +Dataset: fsd_data3.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + + singletons: family size > 20: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data3.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.339 0.420 0.325 +SSCS ba 59 222 0.488 0.481 0.476 0.468 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 diff -r 6bd9ef49d013 -r 2e517a54eedc test-data/output_file.pdf Binary file test-data/output_file.pdf has changed diff -r 6bd9ef49d013 -r 2e517a54eedc test-data/output_file.tabular --- a/test-data/output_file.tabular Mon Oct 08 05:50:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,167 +0,0 @@ -Values from family size distribution with all datasets - -Family size Test_data Test_data2 Test_data3 Test_data4 -FS=1 63 63 63 63 -FS=2 5 5 5 5 -FS=3 8 8 8 8 -FS=4 10 10 10 10 -FS=5 3 3 3 3 -FS=6 5 5 5 5 -FS=7 3 3 3 3 -FS=8 3 3 3 3 -FS=9 2 2 2 2 -FS=10 3 3 3 3 -FS=11 1 1 1 1 -FS=12 6 6 6 6 -sum 112 112 112 112 -Dataset: Test_data -max. family size: 13 -absolute frequency: 3 -relative frequency: 0.027 - - singletons: family size > 20: - absolute nr. rel. freq absolute nr. rel. freq total length -Test_data 63 0.562 6 0.054 112 - -The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) -Whereas the total frequencies were calculated from the whole dataset (=including the DCS). - -FS >= 1 unique: total: -nr./rel. freq of ab= 47 0.431 0.420 -nr./rel. freq of ba= 59 0.488 0.476 -nr./rel. freq of DCS (total)= 3 (6) 0.028 0.027 (0.054) -length of dataset= 109 109 112 -FS >= 3 unique: total: -nr./rel. freq of ab= 14 0.341 0.318 -nr./rel. freq of ba= 26 0.491 0.464 -nr./rel. freq of DCS (total)= 1 (2) 0.037 0.023 (0.045) -length of dataset= 41 41 44 - -Values from family size distribution - duplex ab ba sum -FS=1 2 30 31 63 -FS=2 0 3 2 5 -FS=3 0 3 5 8 -FS=4 2 3 5 10 -FS=5 0 2 1 3 -FS=6 0 1 4 5 -FS=7 0 1 2 3 -FS=8 0 1 2 3 -FS=9 0 0 2 2 -FS=10 1 1 1 3 -FS=11 0 0 1 1 -FS=12 1 2 3 6 - -Dataset: Test_data2 -max. family size: 13 -absolute frequency: 3 -relative frequency: 0.027 - - singletons: family size > 20: - absolute nr. rel. freq absolute nr. rel. freq total length -Test_data2 63 0.562 6 0.054 112 - -The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) -Whereas the total frequencies were calculated from the whole dataset (=including the DCS). - -FS >= 1 unique: total: -nr./rel. freq of ab= 47 0.431 0.420 -nr./rel. freq of ba= 59 0.488 0.476 -nr./rel. freq of DCS (total)= 3 (6) 0.028 0.027 (0.054) -length of dataset= 109 109 112 -FS >= 3 unique: total: -nr./rel. freq of ab= 14 0.341 0.318 -nr./rel. freq of ba= 26 0.491 0.464 -nr./rel. freq of DCS (total)= 1 (2) 0.037 0.023 (0.045) -length of dataset= 41 41 44 - -Values from family size distribution - duplex ab ba sum -FS=1 2 30 31 63 -FS=2 0 3 2 5 -FS=3 0 3 5 8 -FS=4 2 3 5 10 -FS=5 0 2 1 3 -FS=6 0 1 4 5 -FS=7 0 1 2 3 -FS=8 0 1 2 3 -FS=9 0 0 2 2 -FS=10 1 1 1 3 -FS=11 0 0 1 1 -FS=12 1 2 3 6 - -Dataset: Test_data3 -max. family size: 13 -absolute frequency: 3 -relative frequency: 0.027 - - singletons: family size > 20: - absolute nr. rel. freq absolute nr. rel. freq total length -Test_data3 63 0.562 6 0.054 112 - -The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) -Whereas the total frequencies were calculated from the whole dataset (=including the DCS). - -FS >= 1 unique: total: -nr./rel. freq of ab= 47 0.431 0.420 -nr./rel. freq of ba= 59 0.488 0.476 -nr./rel. freq of DCS (total)= 3 (6) 0.028 0.027 (0.054) -length of dataset= 109 109 112 -FS >= 3 unique: total: -nr./rel. freq of ab= 14 0.341 0.318 -nr./rel. freq of ba= 26 0.491 0.464 -nr./rel. freq of DCS (total)= 1 (2) 0.037 0.023 (0.045) -length of dataset= 41 41 44 - -Values from family size distribution - duplex ab ba sum -FS=1 2 30 31 63 -FS=2 0 3 2 5 -FS=3 0 3 5 8 -FS=4 2 3 5 10 -FS=5 0 2 1 3 -FS=6 0 1 4 5 -FS=7 0 1 2 3 -FS=8 0 1 2 3 -FS=9 0 0 2 2 -FS=10 1 1 1 3 -FS=11 0 0 1 1 -FS=12 1 2 3 6 - -Dataset: Test_data4 -max. family size: 13 -absolute frequency: 3 -relative frequency: 0.027 - - singletons: family size > 20: - absolute nr. rel. freq absolute nr. rel. freq total length -Test_data4 63 0.562 6 0.054 112 - -The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) -Whereas the total frequencies were calculated from the whole dataset (=including the DCS). - -FS >= 1 unique: total: -nr./rel. freq of ab= 47 0.431 0.420 -nr./rel. freq of ba= 59 0.488 0.476 -nr./rel. freq of DCS (total)= 3 (6) 0.028 0.027 (0.054) -length of dataset= 109 109 112 -FS >= 3 unique: total: -nr./rel. freq of ab= 14 0.341 0.318 -nr./rel. freq of ba= 26 0.491 0.464 -nr./rel. freq of DCS (total)= 1 (2) 0.037 0.023 (0.045) -length of dataset= 41 41 44 - -Values from family size distribution - duplex ab ba sum -FS=1 2 30 31 63 -FS=2 0 3 2 5 -FS=3 0 3 5 8 -FS=4 2 3 5 10 -FS=5 0 2 1 3 -FS=6 0 1 4 5 -FS=7 0 1 2 3 -FS=8 0 1 2 3 -FS=9 0 0 2 2 -FS=10 1 1 1 3 -FS=11 0 0 1 1 -FS=12 1 2 3 6 diff -r 6bd9ef49d013 -r 2e517a54eedc test-data/output_file2.pdf Binary file test-data/output_file2.pdf has changed diff -r 6bd9ef49d013 -r 2e517a54eedc test-data/output_file2.tabular --- a/test-data/output_file2.tabular Mon Oct 08 05:50:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,129 +0,0 @@ -Values from family size distribution with all datasets - -Family size Test_data Test_data2 Test_data3 -FS=1 63 63 63 -FS=2 5 5 5 -FS=3 8 8 8 -FS=4 10 10 10 -FS=5 3 3 3 -FS=6 5 5 5 -FS=7 3 3 3 -FS=8 3 3 3 -FS=9 2 2 2 -FS=10 3 3 3 -FS=11 1 1 1 -FS=12 6 6 6 -sum 112 112 112 -Dataset: Test_data -max. family size: 13 -absolute frequency: 3 -relative frequency: 0.027 - - singletons: family size > 20: - absolute nr. rel. freq absolute nr. rel. freq total length -Test_data 63 0.562 6 0.054 112 - -The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) -Whereas the total frequencies were calculated from the whole dataset (=including the DCS). - -FS >= 1 unique: total: -nr./rel. freq of ab= 47 0.431 0.420 -nr./rel. freq of ba= 59 0.488 0.476 -nr./rel. freq of DCS (total)= 3 (6) 0.028 0.027 (0.054) -length of dataset= 109 109 112 -FS >= 3 unique: total: -nr./rel. freq of ab= 14 0.341 0.318 -nr./rel. freq of ba= 26 0.491 0.464 -nr./rel. freq of DCS (total)= 1 (2) 0.037 0.023 (0.045) -length of dataset= 41 41 44 - -Values from family size distribution - duplex ab ba sum -FS=1 2 30 31 63 -FS=2 0 3 2 5 -FS=3 0 3 5 8 -FS=4 2 3 5 10 -FS=5 0 2 1 3 -FS=6 0 1 4 5 -FS=7 0 1 2 3 -FS=8 0 1 2 3 -FS=9 0 0 2 2 -FS=10 1 1 1 3 -FS=11 0 0 1 1 -FS=12 1 2 3 6 - -Dataset: Test_data2 -max. family size: 13 -absolute frequency: 3 -relative frequency: 0.027 - - singletons: family size > 20: - absolute nr. rel. freq absolute nr. rel. freq total length -Test_data2 63 0.562 6 0.054 112 - -The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) -Whereas the total frequencies were calculated from the whole dataset (=including the DCS). - -FS >= 1 unique: total: -nr./rel. freq of ab= 47 0.431 0.420 -nr./rel. freq of ba= 59 0.488 0.476 -nr./rel. freq of DCS (total)= 3 (6) 0.028 0.027 (0.054) -length of dataset= 109 109 112 -FS >= 3 unique: total: -nr./rel. freq of ab= 14 0.341 0.318 -nr./rel. freq of ba= 26 0.491 0.464 -nr./rel. freq of DCS (total)= 1 (2) 0.037 0.023 (0.045) -length of dataset= 41 41 44 - -Values from family size distribution - duplex ab ba sum -FS=1 2 30 31 63 -FS=2 0 3 2 5 -FS=3 0 3 5 8 -FS=4 2 3 5 10 -FS=5 0 2 1 3 -FS=6 0 1 4 5 -FS=7 0 1 2 3 -FS=8 0 1 2 3 -FS=9 0 0 2 2 -FS=10 1 1 1 3 -FS=11 0 0 1 1 -FS=12 1 2 3 6 - -Dataset: Test_data3 -max. family size: 13 -absolute frequency: 3 -relative frequency: 0.027 - - singletons: family size > 20: - absolute nr. rel. freq absolute nr. rel. freq total length -Test_data3 63 0.562 6 0.054 112 - -The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) -Whereas the total frequencies were calculated from the whole dataset (=including the DCS). - -FS >= 1 unique: total: -nr./rel. freq of ab= 47 0.431 0.420 -nr./rel. freq of ba= 59 0.488 0.476 -nr./rel. freq of DCS (total)= 3 (6) 0.028 0.027 (0.054) -length of dataset= 109 109 112 -FS >= 3 unique: total: -nr./rel. freq of ab= 14 0.341 0.318 -nr./rel. freq of ba= 26 0.491 0.464 -nr./rel. freq of DCS (total)= 1 (2) 0.037 0.023 (0.045) -length of dataset= 41 41 44 - -Values from family size distribution - duplex ab ba sum -FS=1 2 30 31 63 -FS=2 0 3 2 5 -FS=3 0 3 5 8 -FS=4 2 3 5 10 -FS=5 0 2 1 3 -FS=6 0 1 4 5 -FS=7 0 1 2 3 -FS=8 0 1 2 3 -FS=9 0 0 2 2 -FS=10 1 1 1 3 -FS=11 0 0 1 1 -FS=12 1 2 3 6