hd: hd.py comparison

comparison hd.py @ 29:6b15b3b6405c draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 5b3ab8c6467fe3a52e89f5a7d175bd8a0189018a-dirty

author	mheinzl
date	Wed, 24 Jul 2019 05:58:15 -0400
parents	1fa7342a140d
children	46bfbec0f9e6

comparison

equal deleted inserted replaced

-:1fa7342a140d
+:6b15b3b6405c
 # Hamming distance analysis of SSCSs
 #
 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
 # Contact: monika.heinzl@edumail.at
 #
-# Takes at least one TABULAR file with tags before the alignment to the SSCS and optionally a second TABULAR file as input.
+# Takes at least one TABULAR file with tags before the alignment to the SSCS and
-# The program produces a plot which shows a histogram of Hamming distances separated after family sizes,
+# optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances
-# a family size distribution separated after Hamming distances for all (sample_size=0) or a given sample of SSCSs or SSCSs, which form a DCS.
+# separated by family size, a family size distribution separated by Hamming distance for all (sample_size=0)
-# In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads
+# or a given sample of SSCSs or SSCSs, which form a DCS. In addition, the tool produces HD and FSD plots for the
-# and finally a CSV file with the data of the plots.
+# difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the
-# It is also possible to perform the HD analysis with shortened tags with given sizes as input.
+# data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input.
 # The tool can run on a certain number of processors, which can be defined by the user.
 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int /
-#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular
+#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int
+#        --nr_above_bars True/False --output_tabular outputfile_name_tabular
 import argparse
 import itertools
 import operator
 import sys
 plt.switch_backend('agg')
 def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
-title_file1, subtitle, pdf, relative=False, diff=True):
+subtitle, pdf, relative=False, diff=True, rel_freq=False):
 if diff is False:
 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"]
 else:
 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
 labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]
 fig = plt.figure(figsize=(6, 7))
 ax = fig.add_subplot(111)
 plt.subplots_adjust(bottom=0.1)
-p1 = numpy.bincount(numpy.concatenate((familySizeList1)))
+p1 = numpy.bincount(numpy.concatenate(familySizeList1))
 maximumY = numpy.amax(p1)
 if len(range(minimumXFS, maximumXFS)) == 0:
 range1 = range(minimumXFS - 1, minimumXFS + 2)
 else:
 range1 = range(0, maximumXFS + 2)
-counts = plt.hist(familySizeList1, label=labels,
-color=colors, stacked=True,
+if rel_freq:
-rwidth=0.8, alpha=1, align="left",
+w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1]
-edgecolor="None", bins=range1)
+counts = plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True,
+rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
+plt.ylabel("Relative Frequency", fontsize=14)
+plt.ylim((0, (float(maximumY) / sum(p1)) * 1.1))
+else:
+counts = plt.hist(familySizeList1, label=labels, color=colors, stacked=True,
+rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
+if len(numpy.concatenate(familySizeList1)) != 0:
+plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
+plt.ylabel("Absolute Frequency", fontsize=14)
+plt.ylim((0, maximumY * 1.2))
 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
-# plt.title(title_file1, fontsize=12)
 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
 plt.xlabel("Family size", fontsize=14)
-plt.ylabel("Absolute Frequency", fontsize=14)
 ticks = numpy.arange(0, maximumXFS + 1, 1)
 ticks1 = map(str, ticks)
 if maximumXFS >= 20:
 ticks1[len(ticks1) - 1] = ">=20"
 plt.xticks(numpy.array(ticks), ticks1)
 [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
 plt.xlim((0, maximumXFS + 1))
-if len(numpy.concatenate(familySizeList1)) != 0:
-plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
-plt.ylim((0, maximumY * 1.2))
 legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: "
 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)
 count = numpy.bincount(originalCounts)  # original counts
 if max(originalCounts) >= 20:
 max_count = ">= 20"
 else:
 max_count = max(originalCounts)
-legend1 = "{}\n{}\n{:.5f}".format(max_count, count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
+legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1))
 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
-legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), float(counts[0][len(counts[0]) - 1][1]) / sum(counts[0][len(counts[0]) - 1]))
+legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1))
 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
 plt.grid(b=True, which='major', color='#424242', linestyle=':')
 pdf.savefig(fig, bbox_inches="tight")
 plt.close("all")
-def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True, nr_unique_chimeras=0, len_sample=0):
+def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
+nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
 if relative is True:
 step = 0.1
 else:
 step = 1
 fig = plt.figure(figsize=(6, 8))
 plt.subplots_adjust(bottom=0.1)
-con_list1 = numpy.concatenate(list1)
+p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
-p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())])
 maximumY = numpy.amax(p1)
-maximumX = int(maximumX)
-print("max X", maximumX )
 if relative is True:  # relative difference
 bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
 else:
 bin1 = maximumX + 1
-counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
+if rel_freq:
-label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10",
+w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
-"FS>10"], rwidth=0.8,
+counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
-color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
+label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
-stacked=True, alpha=1,
+color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
-align="left",
+stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
-range=(0, maximumX + 1))
+plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
+plt.ylabel("Relative Frequency", fontsize=14)
+bins = counts[1]  # width of bins
+counts = numpy.array(map(float, counts[0][5]))
+else:
+counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
+label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
+color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
+stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+maximumY = numpy.amax(p1)
+plt.ylim((0, maximumY * 1.2))
+plt.ylabel("Absolute Frequency", fontsize=14)
+bins = counts[1]  # width of bins
+counts = numpy.array(map(int, counts[0][5]))
 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
-bins = counts[1]  # width of bins
-counts = numpy.array(map(int, counts[0][5]))
 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
-# plt.title(title_file1, fontsize=12)
 plt.xlabel(xlabel, fontsize=14)
-plt.ylabel("Absolute Frequency", fontsize=14)
 plt.grid(b=True, which='major', color='#424242', linestyle=':')
-plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
+plt.xlim((minimumX - step, maximumX + step))
+# plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
 plt.xticks(numpy.arange(0, maximumX + step, step))
-plt.ylim((0, maximumY * 1.2))
+if nr_above_bars:
-if nr_above_bars is True:
 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
 for x_label, label in zip(counts, bin_centers):  # labels for values
 if x_label == 0:
 continue
 else:
-plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1),
+if rel_freq:
-xy=(label, x_label + len(con_list1) * 0.01),
+plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
-xycoords="data", color="#000066", fontsize=10)
+float(x_label)),
+xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
-legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, sum(counts))
+xycoords="data", color="#000066", fontsize=10)
-plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
+else:
+plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
-# if nr_unique_chimeras != 0 and len_sample != 0:
+xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
-#     if relative == True:
+xycoords="data", color="#000066", fontsize=10)
-#         legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras,
-#                                                                      int(nr_unique_chimeras) / float(len_sample))
+if nr_unique_chimeras != 0:
-#     else:
+if (relative and ((counts[len(counts)-1] / nr_unique_chimeras) == 2)) or \
-#         legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample))
+(sum(counts) / nr_unique_chimeras) == 2:
-#     plt.text(0.14, -0.09, legend, size=12, transform=plt.gcf().transFigure)
+legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
+.format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
+else:
+legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
+lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
+else:
+legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
+lenTags, len_sample, len(numpy.concatenate(list1)))
+plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
 pdf.savefig(fig, bbox_inches="tight")
 plt.close("all")
 plt.clf()
-def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf, len_sample):
+def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
+nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
+step = 1
 fig = plt.figure(figsize=(6, 8))
 plt.subplots_adjust(bottom=0.1)
+p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
+maximumY = numpy.amax(p1)
+bin1 = maximumX + 1
+if rel_freq:
+w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
+counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
+label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
+stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
+plt.ylabel("Relative Frequency", fontsize=14)
+bins = counts[1]  # width of bins
+counts = numpy.array(map(float, counts[0][2]))
+else:
+counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
+label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
+stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
+plt.ylim((0, maximumY * 1.2))
+plt.ylabel("Absolute Frequency", fontsize=14)
+bins = counts[1]  # width of bins
+counts = numpy.array(map(int, counts[0][2]))
+plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
+plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
+plt.xlabel(xlabel, fontsize=14)
+plt.grid(b=True, which='major', color='#424242', linestyle=':')
+plt.xlim((minimumX - step, maximumX + step))
+# plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
+plt.xticks(numpy.arange(0, maximumX + step, step))
+if nr_above_bars:
+bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
+for x_label, label in zip(counts, bin_centers):  # labels for values
+if x_label == 0:
+continue
+else:
+if rel_freq:
+plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
+float(x_label)),
+xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
+xycoords="data", color="#000066", fontsize=10)
+else:
+plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
+xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
+xycoords="data", color="#000066", fontsize=10)
+if nr_unique_chimeras != 0:
+if (sum(counts) / nr_unique_chimeras) == 2:
+legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
+format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
+else:
+legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
+lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
+else:
+legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
+lenTags, len_sample, len(numpy.concatenate(list1)))
+plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
+legend2 = "SSCS ab = {:,}\nSSCS ba = {:,}\nDCS = {:,}".format(len(list1[1]), len(list1[2]), len(list1[0]))
+plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)
+pdf.savefig(fig, bbox_inches="tight")
+plt.close("all")
+plt.clf()
+def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
+fig = plt.figure(figsize=(6, 8))
+plt.subplots_adjust(bottom=0.1)
 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # new hd within tags
 maximumX = numpy.amax(numpy.concatenate(ham_partial))
 minimumX = numpy.amin(numpy.concatenate(ham_partial))
 maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial))))
 if len(range(minimumX, maximumX)) == 0:
 range1 = minimumX
 else:
 range1 = range(minimumX, maximumX + 2)
-plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1)
+if rel_freq:
+w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
+plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
+label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
+bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
+edgecolor='black', linewidth=1)
+plt.ylabel("Relative Frequency", fontsize=14)
+# plt.ylim(-0.1, (float(maximumY) / len(numpy.concatenate(ham_partial))) * 1.2)
+else:
+plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
+label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
+bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
+edgecolor='black', linewidth=1)
+plt.ylabel("Absolute Frequency", fontsize=14)
 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
 plt.suptitle('Hamming distances within tags', fontsize=14)
-# plt.title(title_file1, fontsize=12)
 plt.xlabel("HD", fontsize=14)
-plt.ylabel("Absolute Frequency", fontsize=14)
 plt.grid(b=True, which='major', color='#424242', linestyle=':')
+plt.xlim((minimumX - 1, maximumX + 1))
-plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
+# plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
 plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
-# plt.ylim(0, maximumY * 1.2)
+legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
-legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, len(numpy.concatenate(ham_partial)))
+lenTags, len_sample, len(numpy.concatenate(ham_partial)))
-# legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags)
 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
 pdf.savefig(fig, bbox_inches="tight")
 plt.close("all")
 plt.clf()
 nr = numpy.arange(0, len(uniqueFS), 1)
 if diff is False:
 count = numpy.zeros((len(uniqueFS), 6))
 else:
 count = numpy.zeros((len(uniqueFS), 7))
 state = 1
 for i in list1:
 counts = list(Counter(i).items())
 hd = [item[0] for item in counts]
 c = [item[1] for item in counts]
 if len(table) == 0:
 state = state + 1
 continue
 else:
 if state == 1:
-for i, l in zip(uniqueFS, nr):
+for k, l in zip(uniqueFS, nr):
 for j in table:
 if j[0] == uniqueFS[l]:
 count[l, 0] = j[1]
 if state == 2:
-for i, l in zip(uniqueFS, nr):
+for k, l in zip(uniqueFS, nr):
 for j in table:
 if j[0] == uniqueFS[l]:
 count[l, 1] = j[1]
 if state == 3:
-for i, l in zip(uniqueFS, nr):
+for k, l in zip(uniqueFS, nr):
 for j in table:
 if j[0] == uniqueFS[l]:
 count[l, 2] = j[1]
 if state == 4:
-for i, l in zip(uniqueFS, nr):
+for k, l in zip(uniqueFS, nr):
 for j in table:
 if j[0] == uniqueFS[l]:
 count[l, 3] = j[1]
 if state == 5:
-for i, l in zip(uniqueFS, nr):
+for k, l in zip(uniqueFS, nr):
 for j in table:
 if j[0] == uniqueFS[l]:
 count[l, 4] = j[1]
 if state == 6:
-for i, l in zip(uniqueFS, nr):
+for k, l in zip(uniqueFS, nr):
 for j in table:
 if j[0] == uniqueFS[l]:
 count[l, 5] = j[1]
 if state == 7:
-for i, l in zip(uniqueFS, nr):
+for k, l in zip(uniqueFS, nr):
 for j in table:
 if j[0] == uniqueFS[l]:
 count[l, 6] = j[1]
 state = state + 1
 sumRow = count.sum(axis=1)
 sumCol = count.sum(axis=0)
 uniqueFS = uniqueFS.astype(str)
 if uniqueFS[len(uniqueFS) - 1] == "20":
 uniqueFS[len(uniqueFS) - 1] = ">20"
 first = ["FS={}".format(i) for i in uniqueFS]
 final = numpy.column_stack((first, count, sumRow))
 return (final, sumCol)
 def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True):
 output_file.write(name)
 output_file.write("\n")
 if diff is False:
-output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
+output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(
+sep, sep, sep, sep, sep, sep, sep, sep))
 else:
 if rel is False:
-output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
+output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(
+sep, sep, sep, sep, sep, sep, sep, sep, sep))
 else:
-output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
+output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".
+format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
 for item in summary:
 for nr in item:
 if "FS" not in nr and "diff" not in nr:
 nr = nr.astype(float)
 nr = nr.astype(int)
 if len(table) == 0:
 state = state + 1
 continue
 else:
 if state == 1:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 0] = j[1]
 if state == 2:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 1] = j[1]
 if state == 3:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 2] = j[1]
 if state == 4:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 3] = j[1]
 if state == 5:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 4] = j[1]
 if state == 6:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 5] = j[1]
 state = state + 1
 sumRow = count.sum(axis=1)
 sumCol = count.sum(axis=0)
 first = ["{}{}".format(row_label, i) for i in uniqueHD]
 final = numpy.column_stack((first, count, sumRow))
 return (final, sumCol)
 def createTableHDwithTags(list1):
 selfAB = numpy.concatenate(list1)
 uniqueHD = numpy.unique(selfAB)
 nr = numpy.arange(0, len(uniqueHD), 1)
 count = numpy.zeros((len(uniqueHD), 5))
 state = 1
 for i in list1:
 counts = list(Counter(i).items())
 hd = [item[0] for item in counts]
 c = [item[1] for item in counts]
 if len(table) == 0:
 state = state + 1
 continue
 else:
 if state == 1:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 0] = j[1]
 if state == 2:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 1] = j[1]
 if state == 3:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 2] = j[1]
 if state == 4:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 3] = j[1]
 if state == 5:
-for i, l in zip(uniqueHD, nr):
+for k, l in zip(uniqueHD, nr):
 for j in table:
 if j[0] == uniqueHD[l]:
 count[l, 4] = j[1]
 state = state + 1
 sumRow = count.sum(axis=1)
 sumCol = count.sum(axis=0)
 first = ["HD={}".format(i) for i in uniqueHD]
 final = numpy.column_stack((first, count, sumRow))
+return (final, sumCol)
+def createTableHDwithDCS(list1):
+selfAB = numpy.concatenate(list1)
+uniqueHD = numpy.unique(selfAB)
+nr = numpy.arange(0, len(uniqueHD), 1)
+count = numpy.zeros((len(uniqueHD), len(list1)))
+state = 1
+for i in list1:
+counts = list(Counter(i).items())
+hd = [item[0] for item in counts]
+c = [item[1] for item in counts]
+table = numpy.column_stack((hd, c))
+if len(table) == 0:
+state = state + 1
+continue
+else:
+if state == 1:
+for k, l in zip(uniqueHD, nr):
+for j in table:
+if j[0] == uniqueHD[l]:
+count[l, 0] = j[1]
+if state == 2:
+for k, l in zip(uniqueHD, nr):
+for j in table:
+if j[0] == uniqueHD[l]:
+count[l, 1] = j[1]
+if state == 3:
+for k, l in zip(uniqueHD, nr):
+for j in table:
+if j[0] == uniqueHD[l]:
+count[l, 2] = j[1]
+state = state + 1
+sumRow = count.sum(axis=1)
+sumCol = count.sum(axis=0)
+first = ["HD={}".format(i) for i in uniqueHD]
+final = numpy.column_stack((first, count, sumRow))
 return (final, sumCol)
 def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
 output_file.write(name)
 output_file.write("\n")
-output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
+output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(
+sep, sep, sep, sep, sep, sep, sep, sep))
 for item in summary:
 for nr in item:
 if "HD" not in nr and "diff" not in nr:
 nr = nr.astype(float)
 nr = nr.astype(int)
 output_file.write("{}{}".format(el, sep))
 output_file.write("{}{}".format(overallSum.astype(int), sep))
 output_file.write("\n\n")
-def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
+def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep):
 output_file.write(name)
 output_file.write("\n")
-output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
+output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep))
 for item in summary:
 for nr in item:
 if "HD" not in nr:
 nr = nr.astype(float)
 nr = nr.astype(int)
 output_file.write("{}{}".format(el, sep))
 output_file.write("{}{}".format(overallSum.astype(int), sep))
 output_file.write("\n\n")
+def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
+output_file.write(name)
+output_file.write("\n")
+output_file.write("{}HD DCS{}HD b'{}HD b{}HD a'{}HD a+b', a'+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
+for item in summary:
+for nr in item:
+if "HD" not in nr:
+nr = nr.astype(float)
+nr = nr.astype(int)
+output_file.write("{}{}".format(nr, sep))
+output_file.write("\n")
+output_file.write("sum{}".format(sep))
+sumCol = map(int, sumCol)
+for el in sumCol:
+output_file.write("{}{}".format(el, sep))
+output_file.write("{}{}".format(overallSum.astype(int), sep))
+output_file.write("\n\n")
 def hamming(array1, array2):
 res = 99 * numpy.ones(len(array1))
 i = 0
 array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
 for a in array1:
 dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2])  # fastest
 res[i] = numpy.amin(dist[dist > 0])  # pick min distance greater than zero
-# print(i)
 i += 1
 return res
 def hamming_difference(array1, array2, mate_b):
 array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1])  # mate1 part1
 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1])  # mate1 part 2
 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2])  # mate2 part1
 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2])  # mate2 part2
 # diff11 = 999 * numpy.ones(len(array2))
 # relativeDiffList = 999 * numpy.ones(len(array2))
 elif mate_b is True:  # HD calculation for all b's
 half1_mate1 = array1_half2
 half2_mate1 = array1_half
 half1_mate2 = array2_half2
 half2_mate2 = array2_half
 # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True)
 # print(len(half1_mate1))
 # half2_mate1 = half2_mate1[index_halves]
 # array1 = array1[index_halves]
 # all tags without identical tag
 array2_half_withoutSame = half1_mate2[index_withoutSame]
 array2_half2_withoutSame = half2_mate2[index_withoutSame]
 array2_withoutSame = array2[index_withoutSame]  # whole tag (=not splitted into 2 halfs)
+# calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
-array2_half_withoutSame])  # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
+array2_half_withoutSame])
 min_index = numpy.where(dist == dist.min())[0]  # get index of min HD
 min_value = dist.min()
 # min_value = dist[min_index]  # get minimum HDs
-min_tag_half2 = array2_half2_withoutSame[min_index]  # get all "b's" of the tag or all "a's" of the tag with minimum HD
+# get all "b's" of the tag or all "a's" of the tag with minimum HD
+min_tag_half2 = array2_half2_withoutSame[min_index]
 min_tag_array2 = array2_withoutSame[min_index]  # get whole tag with min HD
-dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
+dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
+min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
 max_value = dist_second_half.max()
 max_index = numpy.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
 max_tag = min_tag_array2[max_index]
 # for d, d2 in zip(min_value, max_value):
 # tags which have identical parts:
 if min_value == 0 or max_value == 0:
 min_tagsList_zeros.append(numpy.array(tag))
 difference1_zeros = abs(min_value - max_value)  # hd of non-identical part
 diff11_zeros.append(difference1_zeros)
-max_tag_list.append(max_tag)
+max_tag_list.append(numpy.array(max_tag))
 else:
 min_tagsList_zeros.append(None)
 diff11_zeros.append(None)
-max_tag_list.append(numpy.array(["None"]))
+max_tag_list.append(None)
-# max_tag_list.append(numpy.array(max_tag))
 i += 1
 # print(i)
 # diff11 = [st for st in diff11 if st != 999]
 # ham1 = [st for st in ham1 if st != 999]
 # min_valueList = [st for st in min_valueList if st != 999]
 # min_tagsList = [st for st in min_tagsList if st != 999]
 # relativeDiffList = [st for st in relativeDiffList if st != 999]
 # diff11_zeros = [st for st in diff11_zeros if st != 999]
 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
-return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min, max_tag_list])
+return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros,
+min_tagsList_zeros, ham1min, ham2min, max_tag_list])
 def readFileReferenceFree(file):
 with open(file, 'r') as dest_f:
 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
 def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
 hammingDistances = numpy.unique(ham)
 fs = numpy.asarray(fs)
 ham = numpy.asarray(ham)
 bigFamilies2 = numpy.where(fs > 19)[0]
 if len(bigFamilies2) != 0:
 fs[bigFamilies2] = 20
 maximum = max(fs)
 list1 = [data0, data, data2, data3, data4, data5, data6]
 else:
 list1 = [data, data2, data3, data4, data5, data6]
 return(list1, hammingDistances, maximum, minimum)
+def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array):
+diff_zeros = numpy.array(diff_zeros)
+maximum = numpy.amax(diff_zeros)
+minimum = numpy.amin(diff_zeros)
+minHD_tags_zeros = numpy.array(minHD_tags_zeros)
+idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros])
+subset_data = data_array[idx, :]
+seq = numpy.array(subset_data[:, 1])
+# find all unique tags and get the indices for ALL tags, but only once
+u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+DCS_tags = u[c == 2]
+rest_tags = u[c == 1]
+dcs = numpy.repeat("DCS", len(DCS_tags))
+idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags])
+sscs = subset_data[idx_sscs, 2]
+all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])),
+numpy.concatenate((dcs, sscs))))
+hd_DCS = []
+ab_SSCS = []
+ba_SSCS = []
+for i in range(len(all_tags)):
+tag = all_tags[i, :]
+hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]]
+if tag[1] == "DCS":
+hd_DCS.append(hd)
+elif tag[1] == "ab":
+ab_SSCS.append(hd)
+elif tag[1] == "ba":
+ba_SSCS.append(hd)
+if len(hd_DCS) != 0:
+hd_DCS = numpy.concatenate(hd_DCS)
+if len(ab_SSCS) != 0:
+ab_SSCS = numpy.concatenate(ab_SSCS)
+if len(ba_SSCS) != 0:
+ba_SSCS = numpy.concatenate(ba_SSCS)
+list1 = [hd_DCS, ab_SSCS, ba_SSCS]  # list for plotting
+return(list1, maximum, minimum)
 def make_argparser():
 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
 parser.add_argument('--inputFile',
 help='The tool runs with the given number of processors.')
 parser.add_argument('--only_DCS', action="store_false",
 help='Only tags of the DCSs are included in the HD analysis')
 parser.add_argument('--minFS', default=1, type=int,
-help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis')
+help='Only tags, which have a family size greater or equal than specified, '
+'are included in the HD analysis')
 parser.add_argument('--maxFS', default=0, type=int,
-help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis')
+help='Only tags, which have a family size smaller or equal than specified, '
+'are included in the HD analysis')
 parser.add_argument('--nr_above_bars', action="store_true",
-help='If no, values above bars in the histograms are removed')
+help='If False, values above bars in the histograms are removed')
+parser.add_argument('--rel_freq', action="store_false",
+help='If True, the relative frequencies are displayed.')
 parser.add_argument('--output_tabular', default="data.tabular", type=str,
 help='Name of the tabular file.')
 parser.add_argument('--output_pdf', default="data.pdf", type=str,
 help='Name of the pdf file.')
 return parser
 def Hamming_Distance_Analysis(argv):
+# def Hamming_Distance_Analysis(file1, name1, index_size, title_savedFile_pdf, title_savedFile_csv,
+#                               output_chimeras_tabular, onlyDuplicates, minFS=1, maxFS=0, nr_above_bars=True,
+#                               subset=False, nproc=12, rel_freq=False):
 parser = make_argparser()
 args = parser.parse_args(argv[1:])
 file1 = args.inputFile
 name1 = args.inputName1
 index_size = args.sample_size
 title_savedFile_pdf = args.output_pdf
 title_savedFile_csv = args.output_tabular
 output_chimeras_tabular = args.output_chimeras_tabular
-sep = "\t"
 onlyDuplicates = args.only_DCS
+rel_freq = args.rel_freq
 minFS = args.minFS
 maxFS = args.maxFS
 nr_above_bars = args.nr_above_bars
 subset = args.subset_tag
 nproc = args.nproc
+sep = "\t"
 # input checks
 if index_size < 0:
 print("index_size is a negative integer.")
 exit(2)
 if nproc <= 0:
 print("nproc is smaller or equal zero")
 exit(3)
 if subset < 0:
 print("subset_tag is smaller or equal zero.")
 exit(5)
 # PLOT
 plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
 plt.rcParams['xtick.labelsize'] = 14
 plt.rcParams['ytick.labelsize'] = 14
 plt.rcParams['patch.edgecolor'] = "#000000"
 plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
 name1 = name1.split(".tabular")[0]
 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
 print("dataset: ", name1)
 integers, data_array = readFileReferenceFree(file1)
 data_array = numpy.array(data_array)
-print("total nr of tags with Ns:", len(data_array))
+print("total nr of tags:", len(data_array))
-n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x]
-if len(n) != 0:  # delete tags with N in the tag from data
+# filter out tags which contain any character other than A, T, C or G
-print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array))
+valid_bases = ["A", "T", "G", "C"]
+tagsToDelete = []
+for idx, t in enumerate(data_array[:, 1]):
+for char in t:
+if char not in valid_bases:
+tagsToDelete.append(idx)
+break
+if len(tagsToDelete) != 0:  # delete tags containing any character other than A, T, C, G from data
+print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete),
+float(len(tagsToDelete)) / len(data_array))
 index_whole_array = numpy.arange(0, len(data_array), 1)
-index_withoutN_inTag = numpy.delete(index_whole_array, n)
+index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete)
 data_array = data_array[index_withoutN_inTag, :]
 integers = integers[index_withoutN_inTag]
-print("total nr of tags without Ns:", len(data_array))
+print("total nr of filtered tags:", len(data_array))
 int_f = numpy.array(data_array[:, 0]).astype(int)
 data_array = data_array[numpy.where(int_f >= minFS)]
 integers = integers[integers >= minFS]
 tags = data_array[:, 2]
 seq = data_array[:, 1]
 # find all unique tags and get the indices for ALL tags, but only once
 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
-d = u[c > 1]
+d = u[c == 2]
 # get family sizes, tag for duplicates
 duplTags_double = integers[numpy.in1d(seq, d)]
 duplTags = duplTags_double[0::2]  # ab of DCS
 duplTagsBA = duplTags_double[1::2]  # ba of DCS
 duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
 duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags
 if minFS > 1:
-duplTags_tag = duplTags_tag[(duplTags >= 3) & (duplTagsBA >= 3)]
+duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)]
-duplTags_seq = duplTags_seq[(duplTags >= 3) & (duplTagsBA >= 3)]
+duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)]
-duplTags = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)]  # ab+ba with FS>=3
+duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)]  # ab+ba with FS>=minFS
 data_array = numpy.column_stack((duplTags, duplTags_seq))
 data_array = numpy.column_stack((data_array, duplTags_tag))
 integers = numpy.array(data_array[:, 0]).astype(int)
 print("DCS in whole dataset", len(data_array))
 if index_size == 0:
 result = numpy.arange(0, len(data_array), 1)
 else:
 numpy.random.shuffle(data_array)
 unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True)  # get only unique tags
-result = numpy.random.choice(unique_indices, size=index_size, replace=False)  # array of random sequences of size=index.size
+result = numpy.random.choice(unique_indices, size=index_size,
+replace=False)  # array of random sequences of size=index_size
 # result = numpy.random.choice(len(integers), size=index_size,
 #                             replace=False)  # array of random sequences of size=index.size
 # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0]
-# with open("index_result1_{}.pkl".format(app_f), "wb") as o:
+# with open("index_result.pkl", "wb") as o:
 #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
+# save counts
+# with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f:
+#     pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
+# with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1:
+#     pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL)
+#
+# with open(data_folder + "index_sampleTags100.pkl", "rb") as f:
+#     result = pickle.load(f)
+#
+# with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1:
+#     data_array = pickle.load(f1)
+# with open(data_folder + "index_result.txt", "w") as t:
+#     for text in result:
+#         t.write("{}\n".format(text))
 # comparison random tags to whole dataset
 result1 = data_array[result, 1]  # random tags
 result2 = data_array[:, 1]  # all tags
 print("sample size= ", len(result1))
 diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a])
 diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b])
 minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
 minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
 minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])
-chim_tags = [item[10] for item in diff_list_a]
-chim_tags2 = [item[10] for item in diff_list_b]
+chimera_tags1 = sum([item[10] for item in diff_list_a], [])
-chimera_tags1 = [ii if isinstance(i, list) else i for i in chim_tags for ii in i]
+chimera_tags2 = numpy.concatenate([item[10] for item in diff_list_b])
-chimera_tags2 = [ii if isinstance(i, list) else i for i in chim_tags2 for ii in i]
 rel_Diff = []
 diff_zeros = []
 minHD_tags_zeros = []
 diff = []
 chimera_tags = []
 for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
-zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, chimera_tags1, chimera_tags2):
+zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2,
+chimera_tags1, chimera_tags2):
 rel_Diff.append(max(rel1, rel2))
 diff.append(max(d1, d2))
 if all(i is not None for i in [zeros1, zeros2]):
 diff_zeros.append(max(zeros1, zeros2))
 diff_zeros.append(zeros2)
 minHD_tags_zeros.append(str(tag2))
 chimera_tags.append(ctag2)
 chimera_tags_new = chimera_tags
-#data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
+data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
 # chimeras_dic = defaultdict(list)
 #
 # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
 #     if len(t2) >1 and type(t2) is not numpy.ndarray:
 #         t2 = numpy.concatenate(t2)
 #     chimeras_dic[t1].append(t2)
+checked_tags = []
+stat_maxTags = []
 with open(output_chimeras_tabular, "w") as output_file1:
-output_file1.write("chimera tag\tsimilar tag with HD=0\n")
+output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with HD=0\n")
-for i in range(len(minHD_tags_zeros)):
+for i in range(len(data_chimeraAnalysis)):
-tag1 = minHD_tags_zeros[i]
+tag1 = data_chimeraAnalysis[i, 0]
+info_tag1 = data_array[data_array[:, 1] == tag1, :]
+fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1]
+if tag1 in checked_tags:  # skip tag if already written to file
+continue
 sample_half_a = tag1[0:(len(tag1)) / 2]
 sample_half_b = tag1[len(tag1) / 2:len(tag1)]
-max_tags = chimera_tags_new[i]
+max_tags = data_chimeraAnalysis[i, 1]
-if isinstance(max_tags, list) and len(max_tags) > 1:
+if len(max_tags) > 1 and type(max_tags) is not numpy.ndarray:
 max_tags = numpy.concatenate(max_tags)
-#if isinstance(max_tags, list): #and type(max_tags) is not numpy.ndarray:
-#    print(max_tags)
-#    max_tags = numpy.concatenate(max_tags)
 max_tags = numpy.unique(max_tags)
+stat_maxTags.append(len(max_tags))
-chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags])  # mate1 part1
-chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags])  # mate1 part 2
+info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags]
+chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags])  # mate1 part1
+chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags])  # mate1 part 2
 new_format = []
 for j in range(len(max_tags)):
+fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]]
 if sample_half_a == chimera_half_a[j]:
-max_tag = "*{}* {}".format(chimera_half_a[j], chimera_half_b[j])
+max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
 new_format.append(max_tag)
 elif sample_half_b == chimera_half_b[j]:
-max_tag = "{} *{}*".format(chimera_half_a[j], chimera_half_b[j])
+max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
 new_format.append(max_tag)
+checked_tags.append(max_tags[j])
-sample_tag = "{} {}".format(sample_half_a, sample_half_b)
+sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1))
 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
+checked_tags.append(tag1)
 output_file1.write(
-"This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
+"This file contains all tags that were identified as chimeras as the first column and the "
+"corresponding tags which returned a Hamming distance of zero in either the first or the second "
+"half of the sample tag as the second column.\n"
 "The tags were separated by an empty space into their halves and the * marks the identical half.")
+output_file1.write("\n\nStatistics of nr. of tags that returned max. HD (2nd column)\n")
-# unique_chimeras = numpy.array(minHD_tags_zeros)
+output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags))))
-#
+output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags))))
-# sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras])  # mate1 part1
+output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags))))
-# sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras])  # mate1 part 2
+output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags))))
-#
+output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags))))
-# output_file1.write("sample tag\tsimilar tag\n")
-# for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b):
-#     max_tags = numpy.concatenate(chimeras_dic.get(tag1))
-#     max_tags = numpy.unique(max_tags)
-#
-#     chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags])  # mate1 part1
-#     chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags])  # mate1 part 2
-#
-#     new_format = []
-#     for i in range(len(max_tags)):
-#         if a == chimera_half_a[i]:
-#             max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i])
-#             new_format.append(max_tag)
-#
-#         elif b == chimera_half_b[i]:
-#             max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i])
-#             new_format.append(max_tag)
-#
-#     sample_tag = "{} {}".format(a, b)
-#     output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
-# output_file1.write(
-#     "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
-#     "The tags were separated by an empty space into their halves and the * marks the identical half.")
-nr_chimeric_tags = len(minHD_tags_zeros)
-print("nr of unique chimeras", nr_chimeric_tags)
 lenTags = len(data_array)
 len_sample = len(result1)
 quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
 seq = numpy.tile(seq, 2)
 ham = numpy.tile(ham, 2)
 diff = numpy.tile(diff, 2)
 rel_Diff = numpy.tile(rel_Diff, 2)
 diff_zeros = numpy.tile(diff_zeros, 2)
+nr_chimeric_tags = len(data_chimeraAnalysis)
+print("nr of chimeras", nr_chimeric_tags)
 # prepare data for different kinds of plots
 # distribution of FSs separated after HD
 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS
 lst_minHD_tags = []
 for i in minHD_tags:
 lst_minHD_tags.append(seqDic.get(i))
 if onlyDuplicates:
-lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], [item_b[1] for item_b in lst_minHD_tags])).astype(int)
+lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
+[item_b[1] for item_b in lst_minHD_tags])).astype(int)
 # histogram with absolute and relative difference between HDs of both parts of the tag
 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
 rel_Diff)
 # chimeric read analysis: tags which have HD=0 in one of the halves
 if len(minHD_tags_zeros) != 0:
 lst_minHD_tags_zeros = []
 for i in minHD_tags_zeros:
 lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
 if onlyDuplicates:
-lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
+lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
+[item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
 # histogram with HD of non-identical half
-listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros)
+listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
+lst_minHD_tags_zeros, diff_zeros)
+if onlyDuplicates is False:
+listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros,
+diff_zeros, data_array)
 # plot Hamming Distance with Family size distribution
-plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
+plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq,
-subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags,
+subtitle="Hamming distance separated by family size", lenTags=lenTags,
 xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample)
 # Plot FSD separated by HD
-plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
+plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq,
 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
-pdf=pdf, relative=False, title_file1=name1, diff=False)
+pdf=pdf, relative=False, diff=False)
 # Plot HD within tags
-plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
+plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
-title_file1=name1, len_sample=len_sample)
+rel_freq=rel_freq, len_sample=len_sample)
 # Plot difference between the HDs of both tag halves, separated by family size
 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
-subtitle="Delta Hamming distance within tags",
+subtitle="Delta Hamming distance within tags", lenTags=lenTags, rel_freq=rel_freq,
-title_file1=name1, lenTags=lenTags,
 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)
 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
-subtitle="Chimera Analysis: relative delta Hamming distances",
+subtitle="Chimera Analysis: relative delta Hamming distance", lenTags=lenTags, rel_freq=rel_freq,
-title_file1=name1, lenTags=lenTags,
+xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars,
-xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
+nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
 # plots for chimeric reads
 if len(minHD_tags_zeros) != 0:
 # HD
-plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, subtitle="Hamming distance of chimeras",
+plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
-title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False,
+subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq,
+lenTags=lenTags, xlabel="HD", relative=False,
 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
+if onlyDuplicates is False:
+plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf,
+subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq,
+lenTags=lenTags, xlabel="HD", relative=False,
+nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
 # print all data to a CSV file
 # HD
 summary, sumCol = createTableHD(list1, "HD=")
 overallSum = sum(sumCol)  # sum of columns in table
 if len(minHD_tags_zeros) != 0:
 # absolute difference and tags where at least one half has HD=0
 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")
 overallSum15 = sum(sumCol15)
+if onlyDuplicates is False:
+summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros)
+overallSum16 = sum(sumCol16)
 output_file.write("{}\n".format(name1))
-output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
+output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample))
-numpy.concatenate(list1)), lenTags, lenTags))
 # HD
 createFileHD(summary, sumCol, overallSum, output_file,
 "Hamming distance separated by family size", sep)
 # FSD
 output_file.write(
 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))
 # HD within tags
 output_file.write(
-"The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).\n"
+"The Hamming distances were calculated by comparing the first halve against all halves and selected the "
-"For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').\n"
+"minimum value (HD a).\nFor the second half of the tag, we compared them against all tags which resulted "
-"Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n"
+"in the minimum HD of the previous step and selected the maximum value (HD b').\nFinally, it was possible "
-"These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD  For simplicity we used the maximum value between the delta values in the end.\n"
+"to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n"
-"When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.\n")
+"These calculations were repeated, but starting with the second half in the first step to find all "
+"possible chimeras in the data (HD b and HD  For simplicity we used the maximum value between the delta "
-output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
+"values in the end.\nWhen only tags that can form DCS were allowed in the analysis, family sizes for the "
+"forward and reverse (ab and ba) will be included in the plots.\n")
+output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2))
 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
 "Hamming distance of each half in the tag", sep)
 createFileHD(summary11, sumCol11, overallSum11, output_file,
-"Absolute delta Hamming distances within the tag", sep)
+"Absolute delta Hamming distance within the tag", sep)
 createFileHD(summary13, sumCol13, overallSum13, output_file,
-"Chimera analysis: relative delta Hamming distances", sep)
+"Chimera analysis: relative delta Hamming distance", sep)
 if len(minHD_tags_zeros) != 0:
 output_file.write(
-"Chimeras:\nAll tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances of the chimeric tags are shown.\n")
+"All tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, "
+"had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances "
+"of the chimeric tags are shown.\n")
 createFileHD(summary15, sumCol15, overallSum15, output_file,
-"Hamming distances of chimeras", sep)
+"Hamming distance of chimeric families separated after FS", sep)
+if onlyDuplicates is False:
+createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
+"Hamming distance of chimeric families separated after DCS and single SSCS", sep)
 output_file.write("\n")
 if __name__ == '__main__':

Mercurial > repos > mheinzl > hd

comparison hd.py @ 29:6b15b3b6405c draft