comparison fsd.py @ 45:6651e76baca1 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit 033dd7b750f68e8aa68f327d7d72bd311ddbee4e-dirty
author mheinzl
date Tue, 27 Aug 2019 07:36:53 -0400
parents a76af7fd9fca
children 901827154779
comparison of 44:a76af7fd9fca with 45:6651e76baca1
@@ -39,10 +39,11 @@
 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
 parser.add_argument('--inputName3')
 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
 parser.add_argument('--inputName4')
 parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.')
+parser.add_argument('--rel_freq', action="store_false", help='If False, the relative frequencies are displayed.')
 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.')
 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.')
 return parser
 
 
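
Note on the new option: --rel_freq is declared with action="store_false", the same pattern already used for --log_axis, so the parsed attribute defaults to True and is switched off when the flag is passed. A minimal, self-contained sketch of that argparse behaviour (illustration only, not code from fsd.py):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--rel_freq', action="store_false",
                        help='If False, the relative frequencies are displayed.')
    print(parser.parse_args([]).rel_freq)              # True: flag absent
    print(parser.parse_args(['--rel_freq']).rel_freq)  # False: flag given
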
@@ -59,10 +60,11 @@
 thirdFile = args.inputFile3
 name3 = args.inputName3
 fourthFile = args.inputFile4
 name4 = args.inputName4
 log_axis = args.log_axis
+rel_freq = args.rel_freq
 
 title_file = args.output_tabular
 title_file2 = args.output_pdf
 
 sep = "\t"
@@ -76,11 +78,11 @@
 list_to_plot = []
 label = []
 data_array_list = []
 list_to_plot_original = []
 colors = []
 bins = numpy.arange(1, 22)
 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
     fig = plt.figure()
     fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
     fig2 = plt.figure()
     fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
@@ -96,33 +98,38 @@
 # data1 = numpy.array(file1[:, 0]).astype(int)
 # bigFamilies = numpy.where(data1 > 20)[0]
 # data1[bigFamilies] = 22
 data1 = numpy.clip(integers, bins[0], bins[-1])
 name1 = name1.split(".tabular")[0]
+if len(name1) > 40:
+    name1 = name1[:40]
 list_to_plot.append(data1)
 label.append(name1)
 data_array_list.append(file1)
 
 legend = "\n\n\n{}".format(name1)
 fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
 
-legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1))
+legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1],
+                                                           float(numpy.bincount(data1)[1]) / len(data1))
 fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
 
-legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers))
+legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1],
+                                            float(numpy.bincount(data1)[1]) / sum(integers))
 fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
 
 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(len(integers[integers > 20]),
-                                                                 float(sum(integers[integers > 20]))
-                                                                 / sum(integers))
+                                                                 float(len(integers[integers > 20]))
+                                                                 / len(integers))
 fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
 
-legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers))
+legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]),
+                                           float(sum(integers[integers > 20])) / sum(integers))
 fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
 
 legend6 = "total nr. of\ntags\n{:,}".format(len(data1))
 fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
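
Note on the changed ratio above: the "family size > 20" legend now divides the number of large families by the number of tags (len/len) instead of dividing their read counts (sum/sum); the read-based fraction is still reported separately as legend5. A small sketch with made-up family sizes to show the difference between the two ratios (illustration only):

    import numpy

    integers = numpy.array([1, 1, 2, 5, 25, 40])  # hypothetical family size per tag
    big = integers[integers > 20]
    print(float(len(big)) / len(integers))  # ~0.333, fraction of tags with FS > 20
    print(float(sum(big)) / sum(integers))  # ~0.878, fraction of PE reads in those tags
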
@@ -143,10 +150,12 @@
 # data2[bigFamilies2] = 22
 
 data2 = numpy.clip(integers2, bins[0], bins[-1])
 list_to_plot.append(data2)
 name2 = name2.split(".tabular")[0]
+if len(name2) > 40:
+    name2 = name2[:40]
 label.append(name2)
 data_array_list.append(file2)
 
 fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
@@ -157,15 +166,17 @@
 
 legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2))
 fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
 
-legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2))
+legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]),
+                                 float(len(integers2[integers2 > 20])) / len(integers2))
 fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
 
-legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2))
+legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]),
+                                 float(sum(integers2[integers2 > 20])) / sum(integers2))
 fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
 
 legend6 = "{:,}".format(len(data2))
 fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
@@ -186,25 +197,29 @@
 # data3[bigFamilies3] = 22
 
 data3 = numpy.clip(integers3, bins[0], bins[-1])
 list_to_plot.append(data3)
 name3 = name3.split(".tabular")[0]
+if len(name3) > 40:
+    name3 = name3[:40]
 label.append(name3)
 data_array_list.append(file3)
 
 fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
 
 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3))
 fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
 
-legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3))
+legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1],
+                                  float(numpy.bincount(data3)[1]) / sum(integers3))
 fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
 
-legend4 = "{:,} ({:.3f})".format(len(integers3[integers3 > 20]), float(sum(integers3[integers3 > 20])) / sum(integers3))
+legend4 = "{:,} ({:.3f})".format(len(integers3[integers3 > 20]),
+                                 float(len(integers3[integers3 > 20])) / len(integers3))
 fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
 
 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]),
                                  float(sum(integers3[integers3 > 20])) / sum(integers3))
@@ -229,25 +244,29 @@
 # bigFamilies4 = numpy.where(data4 > 20)[0]
 # data4[bigFamilies4] = 22
 data4 = numpy.clip(integers4, bins[0], bins[-1])
 list_to_plot.append(data4)
 name4 = name4.split(".tabular")[0]
+if len(name4) > 40:
+    name4 = name4[:40]
 label.append(name4)
 data_array_list.append(file4)
 
 fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
 
 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4))
 fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
 
-legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4))
+legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1],
+                                  float(numpy.bincount(data4)[1]) / sum(integers4))
 fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
 
-legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]), float(sum(integers4[integers4 > 20])) / sum(integers4))
+legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]),
+                                 float(len(integers4[integers4 > 20])) / len(integers4))
 fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
 
 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]),
                                  float(sum(integers4[integers4 > 20])) / sum(integers4))
@@ -263,141 +282,133 @@
 fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
 
 maximumX = numpy.amax(numpy.concatenate(list_to_plot))
 minimumX = numpy.amin(numpy.concatenate(list_to_plot))
 list_to_plot2 = list_to_plot
-to_plot = ["Absolute frequencies", "Relative frequencies"]
-plt.xticks([], [])
-plt.yticks([], [])
-fig.suptitle('Family Size Distribution (tags)', fontsize=14)
-
-for l in range(len(to_plot)):
-    ax = fig.add_subplot(2, 1, l+1)
-    ticks = numpy.arange(1, 22, 1)
-    ticks1 = map(str, ticks)
-    ticks1[len(ticks1) - 1] = ">20"
-
-    if to_plot[l] == "Relative frequencies":
-        w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2]
-        counts_rel = ax.hist(list_to_plot2, weights=w,
-                             bins=numpy.arange(1, 23), stacked=False, edgecolor="black",
-                             linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
-        ax.set_ylim(0, 1.07)
-    else:
-        counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8, color=colors)
-        ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
-
-    ax.set_xticks(numpy.array(ticks))
-    ax.set_xticklabels(ticks1)
-
-    ax.set_ylabel(to_plot[l], fontsize=14)
-    ax.set_xlabel("Family size", fontsize=14)
-    if log_axis:
-        ax.set_yscale('log')
-    ax.grid(b=True, which="major", color="#424242", linestyle=":")
-    ax.margins(0.01, None)
-pdf.savefig(fig)
-plt.close()
-
-fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14)
-for l in range(len(to_plot)):
-    ax = fig2.add_subplot(2, 1, l + 1)
-    ticks = numpy.arange(minimumX, maximumX + 1)
-    ticks = numpy.arange(1, 22)
-    ticks1 = map(str, ticks)
-    ticks1[len(ticks1) - 1] = ">20"
-    reads = []
-    reads_rel = []
-
-    barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot)+1)
-
-    for i in range(len(list_to_plot2)):
-        x = list(numpy.arange(1, 22).astype(float))
-        unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
-        y = unique * c
-        if sum(list_to_plot_original[i] > 20) > 0:
-            y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
-        y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
-        reads.append(y)
-        reads_rel.append(list(numpy.float_(y)) / sum(y))
-        #x = [xi + barWidth for xi in x]
-
-        if len(list_to_plot2) == 1:
-            x = [xi * 0.5 for xi in x]
-            w = 0.4
-        else:
-            x = [xi + barWidth for xi in x]
-            w = 1./(len(list_to_plot) + 1)
-        if to_plot[l] == "Relative frequencies":
-            counts2_rel = ax.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w,
-                                 edgecolor="black", label=label[i],linewidth=1, alpha=0.7, color=colors[i])
-            ax.set_ylim(0, 1.07)
-        else:
-            #y = list(y.reshape((len(y))))
-            counts2 = ax.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1,
-                             alpha=0.7, color=colors[i])
-        if i == len(list_to_plot2)-1:
-            barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
-        else:
-            barWidth += 1. / (len(list_to_plot) + 1)
-
-    if to_plot[l] == "Absolute frequencies":
-        ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
-    else:
-        ax.set_xlabel("Family size", fontsize=14)
-
-    if len(list_to_plot2) == 1:
-        ax.set_xticks(numpy.array([xi + 0.2 for xi in x]))
-    else:
-        ax.set_xticks(numpy.array(ticks))
-    ax.set_xticklabels(ticks1)
-    ax.set_ylabel(to_plot[l], fontsize=14)
-    if log_axis:
-        ax.set_yscale('log')
-    ax.grid(b=True, which="major", color="#424242", linestyle=":")
-    ax.margins(0.01, None)
+
+if rel_freq:
+    ylab = "Relative Frequency"
+else:
+    ylab = "Absolute Frequency"
+
+# PLOT FSD based on tags
+fig.suptitle('Family Size Distribution (FSD) based on families', fontsize=14)
+ax = fig.add_subplot(1, 1, 1)
+ticks = numpy.arange(1, 22, 1)
+ticks1 = map(str, ticks)
+ticks1[len(ticks1) - 1] = ">20"
+ax.set_xticks([], [])
+if rel_freq:
+    w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2]
+    counts = ax.hist(list_to_plot2, weights=w,
+                     bins=numpy.arange(1, 23), stacked=False, edgecolor="black", color=colors,
+                     linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
+    ax.set_ylim(0, 1.07)
+else:
+    counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1,
+                     label=label, align="left", alpha=0.7, rwidth=0.8, color=colors)
+ax.set_xticks(numpy.array(ticks))
+ax.set_xticklabels(ticks1)
+ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+ax.set_ylabel(ylab, fontsize=14)
+ax.set_xlabel("Family size", fontsize=14)
+if log_axis:
+    ax.set_yscale('log')
+ax.grid(b=True, which="major", color="#424242", linestyle=":")
+ax.margins(0.01, None)
+pdf.savefig(fig)
+#plt.close()
+
+# PLOT FSD based on PE reads
+fig2.suptitle('Family Size Distribution (FSD) based on PE reads', fontsize=14)
+ax2 = fig2.add_subplot(1, 1, 1)
+ticks = numpy.arange(1, 22)
+ticks1 = map(str, ticks)
+ticks1[len(ticks1) - 1] = ">20"
+reads = []
+reads_rel = []
+
+barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1)
+ax2.set_xticks([], [])
+
+for i in range(len(list_to_plot2)):
+    x = list(numpy.arange(1, 22).astype(float))
+    unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
+    y = unique * c
+    if sum(list_to_plot_original[i] > 20) > 0:
+        y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
+    y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
+    reads.append(y)
+    reads_rel.append(list(numpy.float_(y)) / sum(y))
+
+    if len(list_to_plot2) == 1:
+        x = [xi * 0.5 for xi in x]
+        w = 0.4
+    else:
+        x = [xi + barWidth for xi in x]
+        w = 1. / (len(list_to_plot) + 1)
+    if rel_freq:
+        counts2_rel = ax2.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w,
+                              edgecolor="black", label=label[i], linewidth=1, alpha=0.7, color=colors[i])
+        ax2.set_ylim(0, 1.07)
+    else:
+        counts2 = ax2.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1,
+                          alpha=0.7, color=colors[i])
+
+    if i == len(list_to_plot2) - 1:
+        barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
+    else:
+        barWidth += 1. / (len(list_to_plot) + 1)
+
+ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+if len(list_to_plot2) == 1:
+    ax2.set_xticks(numpy.array([xi + 0.2 for xi in x]))
+else:
+    ax2.set_xticks(numpy.array(ticks))
+ax2.set_xticklabels(ticks1)
+ax2.set_xlabel("Family size", fontsize=14)
+ax2.set_ylabel(ylab, fontsize=14)
+if log_axis:
+    ax2.set_yscale('log')
+ax2.grid(b=True, which="major", color="#424242", linestyle=":")
+ax2.margins(0.01, None)
 
 pdf.savefig(fig2)
 plt.close()
 
 # write data to CSV file tags
-output_file.write("Values from family size distribution with all datasets (tags)\n")
+counts = [numpy.bincount(d, minlength=22)[1:] for d in list_to_plot2]  # original counts of family sizes
+output_file.write("Values from family size distribution with all datasets based on families\n")
 output_file.write("\nFamily size")
 for i in label:
     output_file.write("{}{}".format(sep, i))
-# output_file.write("{}sum".format(sep))
 output_file.write("\n")
 j = 0
-for fs in counts[1][0:len(counts[1]) - 1]:
+for fs in bins:
     if fs == 21:
         fs = ">20"
    else:
         fs = "={}".format(fs)
     output_file.write("FS{}{}".format(fs, sep))
-    if len(label) == 1:
-        output_file.write("{}{}".format(int(counts[0][j]), sep))
-    else:
-        for n in range(len(label)):
-            output_file.write("{}{}".format(int(counts[0][n][j]), sep))
+    for n in range(len(label)):
+        output_file.write("{}{}".format(int(counts[n][j]), sep))
     output_file.write("\n")
     j += 1
 output_file.write("sum{}".format(sep))
-if len(label) == 1:
-    output_file.write("{}{}".format(int(sum(counts[0])), sep))
-else:
-    for i in counts[0]:
-        output_file.write("{}{}".format(int(sum(i)), sep))
+for i in counts:
    output_file.write("{}{}".format(int(sum(i)), sep))
 
 # write data to CSV file PE reads
-output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n")
+output_file.write("\n\nValues from family size distribution with all datasets based on PE reads\n")
 output_file.write("\nFamily size")
 for i in label:
     output_file.write("{}{}".format(sep, i))
-# output_file.write("{}sum".format(sep))
 output_file.write("\n")
 j = 0
 
 for fs in bins:
     if fs == 21:
         fs = ">20"
     else:
         fs = "={}".format(fs)
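
Note on the rel_freq branch above: the histogram is converted to relative frequencies by weighting every observation of a dataset with 1/len(dataset), so each dataset's bars sum to 1 regardless of its size. A minimal, self-contained matplotlib sketch of the same weighting technique (illustrative data and file name, not the tool's):

    import numpy
    import matplotlib.pyplot as plt

    datasets = [numpy.random.randint(1, 22, size=1000),  # stand-ins for clipped family sizes
                numpy.random.randint(1, 22, size=500)]
    weights = [numpy.zeros_like(d, dtype=float) + 1. / len(d) for d in datasets]
    plt.hist(datasets, bins=numpy.arange(1, 23), weights=weights,
             align="left", rwidth=0.8, label=["dataset 1", "dataset 2"])
    plt.ylim(0, 1.07)  # each dataset's bars now sum to 1
    plt.legend()
    plt.savefig("relative_fsd_sketch.pdf")
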
@@ -408,11 +419,11 @@
 for n in range(len(label)):
     output_file.write("{}{}".format(int(reads[n][j]), sep))
 output_file.write("\n")
 j += 1
 output_file.write("sum{}".format(sep))
 if len(label) == 1:
     output_file.write("{}{}".format(int(sum(numpy.concatenate(reads))), sep))
 else:
     for i in reads:
         output_file.write("{}{}".format(int(sum(i)), sep))
 output_file.write("\n")
@@ -459,10 +470,11 @@
 
 dataBA = ba[numpy.in1d(baSeq, d, invert=True)]
 dataBA_o = ba_o[numpy.in1d(baSeq, d, invert=True)]
 
 list1 = [duplTags_double, dataAB, dataBA]  # list for plotting
+list1_o = [duplTags_double_o, dataAB_o, dataBA_o]  # list for plotting
 
 # information for family size >= 3
 dataAB_FS3 = dataAB[dataAB >= 3]
 dataAB_FS3_o = dataAB_o[dataAB_o >= 3]
 dataBA_FS3 = dataBA[dataBA >= 3]
@@ -481,58 +493,84 @@
 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)]  # ba+ab with FS>=3
 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o)  # both ab and ba strands with FS>=3
 
 fig = plt.figure()
 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
-counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"],
-                  edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
-                  rwidth=0.8)
+
+if rel_freq:
+    w = [numpy.zeros_like(d) + 1. / len(numpy.concatenate(list1)) for d in list1]
+    counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], weights=w,
+                      edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
+                      rwidth=0.8)
+    plt.ylim(0, 1.07)
+else:
+    counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"],
+                      edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
+                      rwidth=0.8)
+
 # tick labels of x axis
 ticks = numpy.arange(1, 22, 1)
 ticks1 = map(str, ticks)
 ticks1[len(ticks1) - 1] = ">20"
 plt.xticks(numpy.array(ticks), ticks1)
-singl = counts[0][2][0]  # singletons
-last = counts[0][2][len(counts[0][0]) - 1]  # large families
+# singl = counts[0][2][0]  # singletons
+singl = len(data_o[data_o == 1])
+last = len(data_o[data_o > 20])  # large families
 if log_axis:
     plt.yscale('log')
 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
-plt.title(name_file, fontsize=14)
+plt.title("{}: FSD based on families".format(name_file), fontsize=14)
 plt.xlabel("Family size", fontsize=14)
 plt.ylabel("Absolute Frequency", fontsize=14)
 plt.margins(0.01, None)
 plt.grid(b=True, which="major", color="#424242", linestyle=":")
 
 # extra information beneath the plot
 legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags="
 plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
-legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags)), (len(ab) + len(ba)))
+legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA),
+                                                                      len(duplTags), len(duplTags_double), (
+                                                                          len(dataAB) + len(
+                                                                      dataBA) + len(duplTags)),
+                                                                      (len(ab) + len(ba)))
 plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
-legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o), sum(duplTags_o), sum(duplTags_double_o), (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), (sum(ab_o) + sum(ba_o)))
+legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o),
+                                                                    sum(duplTags_o), sum(duplTags_double_o),
+                                                                    (sum(dataAB_o) + sum(dataBA_o) + sum(
+                                                                        duplTags_o)),
+                                                                    (sum(ab_o) + sum(ba_o)))
 plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
 
-legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), (len(dataAB) + len(dataBA) + len(duplTags)))
+legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+    float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    (len(dataAB) + len(dataBA) + len(duplTags)))
 plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
-legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)), float(len(dataBA)) / (len(ab) + len(ba)), float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), (len(ab) + len(ba)))
+legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)),
+                                                               float(len(dataBA)) / (len(ab) + len(ba)),
+                                                               float(len(duplTags)) / (len(ab) + len(ba)),
+                                                               float(len(duplTags_double)) / (
+                                                                   len(ab) + len(ba)),
+                                                               (len(ab) + len(ba)))
 plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
 legend1 = "\nsingletons:\nfamily size > 20:"
 plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure)
 
-legend4 = "{:,}\n{:,}".format(singl.astype(int), last.astype(int))
+legend4 = "{:,}\n{:,}".format(singl, last)
 plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure)
-
-legend3 = "{:.3f}\n{:.3f}".format(singl / len(data), last / len(data))
+legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data))
 plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
 
 legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20]))
 plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
 
-legend3 = "{:.3f}\n{:.3f}".format(float(singl)/sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
+legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
 plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
 
 legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
     float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
     float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
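
Note on the singleton/large-family counts above: they are now taken directly from the uncapped family sizes (data_o) by boolean indexing instead of being read back out of the histogram's return value. A tiny sketch with made-up numbers (illustration only):

    import numpy

    data_o = numpy.array([1, 1, 3, 21, 50])  # hypothetical uncapped family sizes
    singl = len(data_o[data_o == 1])         # 2 tags are singletons
    last = len(data_o[data_o > 20])          # 2 tags have family size > 20
    print(singl, last)
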
@@ -548,51 +586,201 @@
 plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
 pdf.savefig(fig)
 plt.close()
 
+# PLOT FSD based on PE reads
+fig3 = plt.figure()
+plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
+
+fig3.suptitle("{}: FSD based on PE reads".format(name_file), fontsize=14)
+ax2 = fig3.add_subplot(1, 1, 1)
+ticks = numpy.arange(1, 22)
+ticks1 = map(str, ticks)
+ticks1[len(ticks1) - 1] = ">20"
+reads = []
+reads_rel = []
+
+#barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1)
+ax2.set_xticks([], [])
+
+list_y = []
+label = ["duplex", "ab", "ba"]
+col = ["#FF0000", "#5FB404", "#FFBF00"]
+for i in range(len(list1)):
+    x = list(numpy.arange(1, 22).astype(float))
+    unique, c = numpy.unique(list1[i], return_counts=True)
+    y = unique * c
+    if sum(list1_o[i] > 20) > 0:
+        y[len(y) - 1] = sum(list1_o[i][list1_o[i] > 20])
+    y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
+    reads.append(y)
+    reads_rel.append(list(numpy.float_(y)) / sum(numpy.concatenate(list1_o)))
+
+    if rel_freq:
+        y = list(numpy.float_(y)) / sum(numpy.concatenate(list1_o))
+        ax2.set_ylim(0, 1.07)
+    else:
+        y = y
+
+    list_y.append(y)
+    if i == 0:
+        counts2 = ax2.bar(x, y, align="edge", width=0.8,
+                          edgecolor="black", label=label[0],
+                          linewidth=1, alpha=1, color=col[0])
+    elif i == 1:
+        counts2 = ax2.bar(x, y, bottom=list_y[i-1], align="edge", width=0.8,
+                          edgecolor="black", label=label[1],
+                          linewidth=1, alpha=1, color=col[1])
+    elif i == 2:
+        bars = numpy.add(list_y[0], list_y[1]).tolist()
+
+        counts2 = ax2.bar(x, y, bottom=bars, align="edge", width=0.8,
+                          edgecolor="black", label=label[2],
+                          linewidth=1, alpha=1, color=col[2])
+
+ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+singl = len(data_o[data_o == 1])
+last = len(data_o[data_o > 20])  # large families
+
+ax2.set_xticks(numpy.array(ticks))
+ax2.set_xticklabels(ticks1)
+ax2.set_xlabel("Family size", fontsize=14)
+ax2.set_ylabel(ylab, fontsize=14)
+if log_axis:
+    ax2.set_yscale('log')
+ax2.grid(b=True, which="major", color="#424242", linestyle=":")
+ax2.margins(0.01, None)
+
+# extra information beneath the plot
+legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags="
+plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA),
+                                                                      len(duplTags), len(duplTags_double), (
+                                                                          len(dataAB) + len(
+                                                                      dataBA) + len(duplTags)),
+                                                                      (len(ab) + len(ba)))
+plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o),
+                                                                    sum(duplTags_o), sum(duplTags_double_o),
+                                                                    (sum(dataAB_o) + sum(dataBA_o) + sum(
+                                                                        duplTags_o)),
+                                                                    (sum(ab_o) + sum(ba_o)))
+plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+
+legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+    float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    (len(dataAB) + len(dataBA) + len(duplTags)))
+plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)),
+                                                               float(len(dataBA)) / (len(ab) + len(ba)),
+                                                               float(len(duplTags)) / (len(ab) + len(ba)),
+                                                               float(len(duplTags_double)) / (
+                                                                   len(ab) + len(ba)),
+                                                               (len(ab) + len(ba)))
+plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend1 = "\nsingletons:\nfamily size > 20:"
+plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure)
+
+legend4 = "{:,}\n{:,}".format(singl, last)
+plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure)
+legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data))
+plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20]))
+plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
+plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+    float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+    float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+    float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+    (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)))
+plt.text(0.74, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(
+    float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)),
+    float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)),
+    float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)),
+    float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)), (sum(ab_o) + sum(ba_o)))
+plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+pdf.savefig(fig3)
+plt.close()
+
 # write same information to a csv file
 count = numpy.bincount(data_o)  # original counts of family sizes
+
 output_file.write("\nDataset:{}{}\n".format(sep, name_file))
 output_file.write("max. family size:{}{}\n".format(sep, max(data_o)))
 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
 
 output_file.write("median family size:{}{}\n".format(sep, numpy.median(numpy.array(data_o))))
 output_file.write("mean family size:{}{}\n\n".format(sep, numpy.mean(numpy.array(data_o))))
 
-output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
-output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
+output_file.write(
+    "{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep,
+                                                                              sep, sep))
+output_file.write(
+    "{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(
+        sep, sep, sep, sep, sep, sep, sep, sep, sep))
 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format(
-    name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep,
-    last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o)))
+    name_file, sep, singl, sep, float(singl) / len(data), sep, float(singl) / sum(data_o), sep,
+    last, sep, float(last) / len(data), sep, sum(data_o[data_o > 20]), sep,
+    float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data),
+    sep, sum(data_o)))
 
 # information for FS >= 1
-output_file.write("The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n"
-                  "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n")
-output_file.write("FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep))
+output_file.write(
+    "The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n"
+    "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n")
+output_file.write(
+    "FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep,
+                                                                                                  sep, sep,
+                                                                                                  sep))
 output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep))
 output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
-    sep, len(dataAB), sep, sum(dataAB_o), sep, float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    sep, len(dataAB), sep, sum(dataAB_o), sep,
+    float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
     sep, float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
     float(len(dataAB)) / (len(ab) + len(ba)), sep, float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o))))
 output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
-    sep, len(dataBA), sep, sum(dataBA_o), sep, float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)),
-    sep, float(sum(dataBA_o)) / (sum(dataBA_o) + sum(dataBA_o) + sum(duplTags_o)), sep, float(len(dataBA)) / (len(ba) + len(ba)),
+    sep, len(dataBA), sep, sum(dataBA_o), sep,
+    float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)),
+    sep, float(sum(dataBA_o)) / (sum(dataBA_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+    float(len(dataBA)) / (len(ba) + len(ba)),
     sep, float(sum(dataBA_o)) / (sum(ba_o) + sum(ba_o))))
-output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
-    sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep,
-    float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep,
-    float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep,
-    float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
-    float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o))))
+output_file.write(
+    "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+        sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep,
+        float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+        float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep,
+        float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+        float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)),
+        float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o))))
 output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format(
-    sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep, (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+    sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+    (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
     (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba)), sep,
     (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, (sum(ab_o) + sum(ba_o))))
 # information for FS >= 3
-output_file.write("\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep))
+output_file.write(
+    "\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep,
+                                                                                                    sep,
+                                                                                                    sep,
+                                                                                                    sep,
+                                                                                                    sep))
 output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep))
 output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
     sep, len(dataAB_FS3), sep, sum(dataAB_FS3_o), sep,
     float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
     float(len(dataAB_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
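
Note on the new per-dataset PE-read plot above: the duplex/ab/ba series are stacked by hand, drawing the second series with bottom= set to the first and the third with bottom= set to the elementwise sum of the first two. A minimal sketch of that stacking pattern (toy values, not the tool's reads):

    import numpy
    import matplotlib.pyplot as plt

    x = numpy.arange(1, 6)
    duplex = [5, 4, 3, 2, 1]
    ab = [2, 2, 2, 2, 2]
    ba = [1, 0, 1, 0, 1]
    plt.bar(x, duplex, width=0.8, label="duplex")
    plt.bar(x, ab, bottom=duplex, width=0.8, label="ab")
    plt.bar(x, ba, bottom=numpy.add(duplex, ab).tolist(), width=0.8, label="ba")
    plt.legend()
    plt.savefig("stacked_bars_sketch.pdf")
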
@@ -602,33 +790,65 @@
     sep, len(dataBA_FS3), sep, sum(dataBA_FS3_o), sep,
     float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)),
     sep, float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
     sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
     sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
-output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
-    sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep,
-    float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
-    float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
-    float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
-    sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep,
-    float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o),
-    float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+output_file.write(
+    "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+        sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep,
+        float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+        float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+        float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+        sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+        sep,
+        float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o),
+        float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
 output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format(
-    sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
-    sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
-    sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
-
-output_file.write("\nValues from family size distribution\n")
+    sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+    (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+    sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+    (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+    sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep,
+    (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+
+counts = [numpy.bincount(d, minlength=22)[1:] for d in list1]  # original counts of family sizes
+output_file.write("\nValues from family size distribution based on families\n")
 output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
-for dx, ab, ba, fs in zip(counts[0][0], counts[0][1], counts[0][2], counts[1]):
+
+j = 0
+for fs in bins:
     if fs == 21:
         fs = ">20"
     else:
         fs = "={}".format(fs)
-    ab1 = ab - dx
-    ba1 = ba - ab
-    output_file.write("FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(dx), sep, int(ab1), sep, int(ba1), sep, int(ba)))
+    output_file.write("FS{}{}".format(fs, sep))
+    for n in range(3):
+        output_file.write("{}{}".format(int(counts[n][j]), sep))
+    output_file.write("{}\n".format(counts[0][j] + counts[1][j] + counts[2][j]))
+    j += 1
+output_file.write("sum{}".format(sep))
+for i in counts:
+    output_file.write("{}{}".format(int(sum(i)), sep))
+output_file.write("{}\n".format(sum(counts[0] + counts[1] + counts[2])))
+
+output_file.write("\nValues from family size distribution based on PE reads\n")
+output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
+j = 0
+for fs in bins:
+    if fs == 21:
+        fs = ">20"
+    else:
+        fs = "={}".format(fs)
+    output_file.write("FS{}{}".format(fs, sep))
+    for n in range(3):
+        output_file.write("{}{}".format(int(reads[n][j]), sep))
+    output_file.write("{}\n".format(reads[0][j] + reads[1][j] + reads[2][j]))
+    j += 1
+output_file.write("sum{}".format(sep))
+for i in reads:
+    output_file.write("{}{}".format(int(sum(i)), sep))
+output_file.write("{}\n".format(sum(reads[0] + reads[1] + reads[2])))
 
 print("Files successfully created!")
 
 
 if __name__ == '__main__':