Mercurial > repos > mheinzl > td
changeset 0:3e56058d9552 draft default tip
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 9bae9043a53f1e07b502acd1082450adcb6d9e31-dirty
author | mheinzl |
---|---|
date | Wed, 16 Oct 2019 04:17:59 -0400 |
parents | |
children | |
files | td.py td.xml test-data/td_chimeras_output.tab test-data/td_data.tab test-data/td_output.pdf test-data/td_output.tab |
diffstat | 6 files changed, 1646 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/td.py Wed Oct 16 04:17:59 2019 -0400 @@ -0,0 +1,1364 @@ +#!/usr/bin/env python + +# Tag distance analysis of SSCSs +# +# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) +# Contact: monika.heinzl@edumail.at +# +# Takes at least one TABULAR file with tags before the alignment to the SSCS and +# optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances +# separated after family sizes, a family size distribution separated after Hamming distances for all (sample_size=0) +# or a given sample of SSCSs or SSCSs, which form a DCS. In additon, the tool produces HD and FSD plots for the +# difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the +# data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input. +# The tool can run on a certain number of processors, which can be defined by the user. 
# USAGE: python td.py --inputFile filename --inputName1 filename --sample_size int /
# --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int
# --nr_above_bars True/False --output_tabular outputfile_name_tabular

import argparse
import itertools
import operator
import sys
from collections import Counter, defaultdict
from functools import partial
from multiprocessing.pool import Pool
import random
import os

import matplotlib.pyplot as plt
import numpy
from matplotlib.backends.backend_pdf import PdfPages

plt.switch_backend('agg')  # headless rendering, no display required


def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
                   subtitle, pdf, relative=False, diff=True, rel_freq=False):
    """Plot a stacked family-size distribution (FSD), one stack colour per tag-distance class.

    familySizeList1 -- list of integer arrays of family sizes, one array per TD class
                       (6 arrays when diff is False, 7 when diff is True).
    maximumXFS/minimumXFS -- x-axis range of family sizes.
    originalCounts -- uncapped family sizes, used for the ">= 20" legend entry.
    subtitle -- figure title; pdf -- open PdfPages object the figure is appended to.
    relative -- labels stacks with relative differences (d=0.1, ...) instead of absolute.
    diff -- stacks represent TD differences (7 classes) rather than plain TDs (6 classes).
    rel_freq -- plot relative instead of absolute frequencies.
    """
    if diff is False:
        colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
        labels = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8"]
    else:
        colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
        if relative is True:
            labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
        else:
            labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]

    fig = plt.figure(figsize=(6, 7))
    ax = fig.add_subplot(111)
    plt.subplots_adjust(bottom=0.1)
    p1 = numpy.bincount(numpy.concatenate(familySizeList1))
    maximumY = numpy.amax(p1)

    # degenerate x-range (all values equal): centre a tiny range on the single value
    if len(range(minimumXFS, maximumXFS)) == 0:
        range1 = range(minimumXFS - 1, minimumXFS + 2)
    else:
        range1 = range(0, maximumXFS + 2)

    if rel_freq:
        total = len(numpy.concatenate(familySizeList1))
        w = [numpy.zeros_like(data) + 1. / total for data in familySizeList1]
        plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True,
                 rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim((0, 1.07))
    else:
        plt.hist(familySizeList1, label=labels, color=colors, stacked=True,
                 rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        plt.ylabel("Absolute Frequency", fontsize=14)
        plt.ylim((0, maximumY * 1.2))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel("Family size", fontsize=14)
    ticks = numpy.arange(0, maximumXFS + 1, 1)
    # py3 fix: map() is lazy and not subscriptable -- materialize before item assignment
    ticks1 = [str(t) for t in ticks]
    if maximumXFS >= 20:
        ticks1[-1] = ">=20"  # family sizes are capped at 20 upstream
    plt.xticks(numpy.array(ticks), ticks1)
    [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
    plt.xlim((0, maximumXFS + 1))
    legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: "
    plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)

    if max(originalCounts) >= 20:
        max_count = ">= 20"
    else:
        max_count = max(originalCounts)
    legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[-1], float(p1[-1]) / sum(p1))
    plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
    # p1[1] = number of singletons; assumes at least one family of size 1 exists upstream
    legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1))
    plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
    # matplotlib >= 3.5 removed the b= keyword of grid(); pass the flag positionally
    plt.grid(True, which='major', color='#424242', linestyle=':')
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a stacked tag-distance (TD) histogram, one stack colour per family-size class.

    list1 -- six arrays of distances, one per FS class (1, 2, 3, 4, 5-10, >10).
    maximumX/minimumX -- x-axis range of distances.
    lenTags/len_sample -- totals shown in the text legend below the plot.
    relative -- x-axis holds relative differences in 0.1 steps instead of integer TDs.
    nr_above_bars -- annotate each bar with its absolute count and fraction.
    nr_unique_chimeras -- number of chimeric families (added to the legend if non-zero).
    rel_freq -- plot relative instead of absolute frequencies.
    """
    if relative is True:
        step = 0.1
    else:
        step = 1

    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # py3 fix: Counter.iteritems() no longer exists -> items()
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).items())])
    maximumY = numpy.amax(p1)
    if relative is True:  # relative differences: one bin per 0.1 step
        bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
    else:
        bin1 = maximumX + 1

    if rel_freq:
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        # counts[0][5] = top of the stack = total per bin; py3 fix: materialize map()
        counts = numpy.array([float(x) for x in counts[0][5]])
    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        counts = numpy.array([int(x) for x in counts[0][5]])

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    # matplotlib >= 3.5 removed the b= keyword of grid(); pass the flag positionally
    plt.grid(True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            if rel_freq:
                plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                   float(x_label)),
                             xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                             xycoords="data", color="#000066", fontsize=10)
            else:
                plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                             xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                             xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        # NOTE(review): under py3 this is true division (py2 floor-divided ints);
        # the "== 2" test now means exactly double -- presumably the original intent
        if (relative and ((counts[-1] / nr_unique_chimeras) == 2)) or \
                (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
                .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))

    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a stacked TD histogram for chimeric tags, split into DCS / SSCS-ab / SSCS-ba.

    list1 -- three arrays of distances: [DCS, ab, ba].
    nr_unique_chimeras -- number of unique chimeric families; also the denominator of
    the per-class fractions in the lower-right legend.  That legend is skipped when
    the count is 0 (the original divided unconditionally and crashed with the
    default nr_unique_chimeras=0).
    Other parameters as in plotHDwithFSD.
    """
    step = 1
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # py3 fix: Counter.iteritems() no longer exists -> items()
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).items())])
    maximumY = numpy.amax(p1)
    bin1 = maximumX + 1
    if rel_freq:
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        # counts[0][2] = top of the stack = total per bin; py3 fix: materialize map()
        counts = numpy.array([float(x) for x in counts[0][2]])
    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        counts = numpy.array([int(x) for x in counts[0][2]])

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    # matplotlib >= 3.5 removed the b= keyword of grid(); pass the flag positionally
    plt.grid(True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            if rel_freq:
                plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                   float(x_label)),
                             xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                             xycoords="data", color="#000066", fontsize=10)
            else:
                plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                             xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                             xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        if (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
                format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))
    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)

    # bugfix: only build the per-class fraction legend when the denominator is non-zero
    if nr_unique_chimeras != 0:
        legend2 = "SSCS ab = {:,} ({:.5f})\nSSCS ba = {:,} ({:.5f})\nDCS = {:,} ({:.5f})".format(
            len(list1[1]), len(list1[1]) / float(nr_unique_chimeras),
            len(list1[2]), len(list1[2]) / float(nr_unique_chimeras),
            len(list1[0]), len(list1[0]) / float(nr_unique_chimeras))
        plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)

    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
    """Plot the TDs between the two halves within tags as five side-by-side histograms.

    sum1/sum1min/sum2/sum2min -- per-tag min/max half-distances of the a and b parts.
    min_value -- per-tag sums (TD a.min + b.max, TD a.max + b.min).
    lenTags/len_sample -- totals shown in the text legend below the plot.
    rel_freq -- plot relative instead of absolute frequencies.
    """
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)

    ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # TDs within tags
    maximumX = numpy.amax(numpy.concatenate(ham_partial))
    minimumX = numpy.amin(numpy.concatenate(ham_partial))
    # NOTE: the original also computed a maximum bin count here via
    # numpy.concatenate(map(numpy.bincount, ...)), which raises TypeError on py3
    # and whose result was never used -- dropped.

    # a single bin when all values are equal, else one bin per integer TD
    if len(range(minimumX, maximumX)) == 0:
        range1 = minimumX
    else:
        range1 = range(minimumX, maximumX + 2)

    if rel_freq:
        w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim(0, 1.07)
    else:
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Absolute Frequency", fontsize=14)

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.6, 1))
    plt.suptitle('Tag distances within tags', fontsize=14)
    plt.xlabel("TD", fontsize=14)
    # matplotlib >= 3.5 removed the b= keyword of grid(); pass the flag positionally
    plt.grid(True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - 1, maximumX + 1))
    plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
        lenTags, len_sample, len(numpy.concatenate(ham_partial)))
    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
def createTableFSD2(list1, diff=True):
    """Build the FSD summary table: one row per unique family size, one column per TD class.

    list1 -- family-size arrays, one per TD class (6 when diff is False, 7 when True);
             extra arrays beyond the expected count are ignored, as in the original.
    Returns (final, sumCol): final is a string matrix of ["FS=<n>", counts..., row sum];
    sumCol holds the per-class column sums.
    """
    all_fs = numpy.concatenate(list1)
    uniqueFS = numpy.unique(all_fs)
    n_cols = 7 if diff else 6
    count = numpy.zeros((len(uniqueFS), n_cols))
    # The original repeated the same fill loop once per "state" (column); a direct
    # Counter lookup per column is equivalent and avoids the O(rows x entries) scan.
    for col, fam_sizes in enumerate(list1[:n_cols]):
        for value, freq in Counter(fam_sizes).items():
            row = numpy.where(uniqueFS == value)[0][0]
            count[row, col] = freq
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    uniqueFS = uniqueFS.astype(str)
    if uniqueFS[-1] == "20":
        uniqueFS[-1] = ">20"  # sizes are capped at 20 upstream
    first = ["FS={}".format(i) for i in uniqueFS]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True):
    """Write one FSD summary table (as built by createTableFSD2) to the open tabular file.

    summary -- rows of strings: label, per-class counts (as "<n>.0"), row sum.
    sumCol/overallSum -- column sums and grand total, appended as a final "sum" row.
    name -- section heading; sep -- column separator (tab).
    rel/diff -- select the matching column header line.
    """
    output_file.write(name)
    output_file.write("\n")
    if diff is False:
        output_file.write("{}TD=1{}TD=2{}TD=3{}TD=4{}TD=5-8{}TD>8{}sum{}\n".format(
            sep, sep, sep, sep, sep, sep, sep, sep))
    else:
        if rel is False:
            output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(
                sep, sep, sep, sep, sep, sep, sep, sep, sep))
        else:
            output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".
                              format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
    for item in summary:
        for nr in item:
            if "FS" not in nr and "diff" not in nr:
                nr = int(float(nr))  # counts are stored as "2.0" strings -> print as ints
            output_file.write("{}{}".format(nr, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for el in sumCol:
        output_file.write("{}{}".format(int(el), sep))
    output_file.write("{}{}".format(int(overallSum), sep))
    output_file.write("\n\n")


def createTableHD(list1, row_label):
    """Build a TD summary table: one row per unique distance, one column per FS class (6).

    list1 -- distance arrays, one per FS class; extra arrays beyond 6 are ignored.
    row_label -- prefix for the row labels (e.g. "TD=").
    Returns (final, sumCol) as in createTableFSD2.
    """
    uniqueHD = numpy.unique(numpy.concatenate(list1))
    count = numpy.zeros((len(uniqueHD), 6))
    # direct Counter lookup replaces the original's six duplicated state blocks
    for col, values in enumerate(list1[:6]):
        for value, freq in Counter(values).items():
            count[numpy.where(uniqueHD == value)[0][0], col] = freq
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["{}{}".format(row_label, i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)


def createTableHDwithTags(list1):
    """Build a TD summary table with 5 columns (the within-tag half-distance classes).

    list1 -- five distance arrays; extra arrays beyond 5 are ignored.
    Returns (final, sumCol) as in createTableFSD2, rows labelled "TD=<d>".
    """
    uniqueHD = numpy.unique(numpy.concatenate(list1))
    count = numpy.zeros((len(uniqueHD), 5))
    for col, values in enumerate(list1[:5]):
        for value, freq in Counter(values).items():
            count[numpy.where(uniqueHD == value)[0][0], col] = freq
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
def createTableHDwithDCS(list1):
    """Build a TD summary table with one column per input class (DCS / ab / ba).

    list1 -- distance arrays, one per class; only the first three columns are
    filled, matching the original's three state blocks.
    Returns (final, sumCol) as in createTableFSD2, rows labelled "TD=<d>".
    """
    uniqueHD = numpy.unique(numpy.concatenate(list1))
    count = numpy.zeros((len(uniqueHD), len(list1)))
    for col, values in enumerate(list1[:3]):
        for value, freq in Counter(values).items():
            count[numpy.where(uniqueHD == value)[0][0], col] = freq
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)


def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
    """Write a TD-by-FS table (from createTableHD) to the open tabular file."""
    output_file.write(name)
    output_file.write("\n")
    output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(
        sep, sep, sep, sep, sep, sep, sep, sep))
    for item in summary:
        for nr in item:
            if "TD" not in nr and "diff" not in nr:
                nr = int(float(nr))  # counts are stored as "2.0" strings -> print as ints
            output_file.write("{}{}".format(nr, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for el in sumCol:
        output_file.write("{}{}".format(int(el), sep))
    output_file.write("{}{}".format(int(overallSum), sep))
    output_file.write("\n\n")


def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep):
    """Write a TD-by-DCS/ab/ba table (from createTableHDwithDCS) to the tabular file."""
    output_file.write(name)
    output_file.write("\n")
    output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep))
    for item in summary:
        for nr in item:
            if "TD" not in nr:
                nr = int(float(nr))
            output_file.write("{}{}".format(nr, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for el in sumCol:
        output_file.write("{}{}".format(int(el), sep))
    output_file.write("{}{}".format(int(overallSum), sep))
    output_file.write("\n\n")


def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
    """Write the within-tag TD table (from createTableHDwithTags) to the tabular file."""
    output_file.write(name)
    output_file.write("\n")
    output_file.write("{}TD a.min{}TD b.max{}TD b.min{}TD a.max{}TD a.min + b.max, TD a.max + b.min{}sum{}\n".format(
        sep, sep, sep, sep, sep, sep, sep))
    for item in summary:
        for nr in item:
            if "TD" not in nr:
                nr = int(float(nr))
            output_file.write("{}{}".format(nr, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for el in sumCol:
        output_file.write("{}{}".format(int(el), sep))
    output_file.write("{}{}".format(int(overallSum), sep))
    output_file.write("\n\n")


def hamming(array1, array2):
    """Return, per tag in array1, its minimum non-zero Hamming distance to any tag
    in array2.  Distances of zero (the tag itself / exact duplicates) are skipped.
    """
    uniq = numpy.unique(array2)  # dedupe comparison set to cut running time
    res = 99 * numpy.ones(len(array1))
    for idx, a in enumerate(array1):
        # py3 fix: itertools.imap was removed; builtin map is lazy on py3
        dist = numpy.array([sum(map(operator.ne, a, b)) for b in uniq])
        res[idx] = numpy.amin(dist[dist > 0])  # min distance greater than zero
    return res
def hamming_difference(array1, array2, mate_b):
    """For each tag, find the closest partner on one half and score the other half.

    Each tag is split into half a (first) and half b (second).  For half a (or b
    when mate_b is True) the minimum TD to all other tags' matching halves is taken;
    among those closest partners, the maximum TD of the complementary half is taken.
    Tags with an identical half (min or max distance 0) are recorded as chimera
    candidates.

    array1 -- tags to evaluate; array2 -- comparison set (deduplicated here).
    mate_b -- False: anchor on half a; True: anchor on half b.
    Returns a list of 11 parallel lists:
    [abs diff, min TD of a's, min TD of b's, min+max sums, tags, relative diffs,
     diffs for zero cases, tags for zero cases, max TD (a-anchored), max TD
     (b-anchored), partner tags for zero cases]; zero-case slots hold None for
     tags without an identical half.
    """
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    # py3 fix: len(i)/2 is a float on py3 and cannot slice -- use floor division
    array1_half = numpy.array([i[0:len(i) // 2] for i in array1])        # mate1 part 1
    array1_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array1])  # mate1 part 2
    array2_half = numpy.array([i[0:len(i) // 2] for i in array2])        # mate2 part 1
    array2_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array2])  # mate2 part 2

    diff11 = []
    relativeDiffList = []
    ham1 = []
    ham2 = []
    ham1min = []
    ham2min = []
    min_valueList = []
    min_tagsList = []
    diff11_zeros = []
    min_tagsList_zeros = []
    max_tag_list = []

    if mate_b is False:  # TD calculation anchored on all a's
        half1_mate1 = array1_half
        half2_mate1 = array1_half2
        half1_mate2 = array2_half
        half2_mate2 = array2_half2
    elif mate_b is True:  # TD calculation anchored on all b's
        half1_mate1 = array1_half2
        half2_mate1 = array1_half
        half1_mate2 = array2_half2
        half2_mate2 = array2_half

    for a, b, tag in zip(half1_mate1, half2_mate1, array1):
        # exclude the identical tag from array2 to prevent comparison to itself
        sameTag = numpy.where(array2 == tag)[0]
        indexArray2 = numpy.arange(0, len(array2), 1)
        index_withoutSame = numpy.delete(indexArray2, sameTag)

        array2_half_withoutSame = half1_mate2[index_withoutSame]
        array2_half2_withoutSame = half2_mate2[index_withoutSame]
        array2_withoutSame = array2[index_withoutSame]  # whole tags (not split)

        # py3 fix: itertools.imap was removed; builtin map is lazy on py3
        dist = numpy.array([sum(map(operator.ne, a, c)) for c in array2_half_withoutSame])
        min_index = numpy.where(dist == dist.min())[0]  # indices of min TD partners
        min_value = dist.min()
        min_tag_half2 = array2_half2_withoutSame[min_index]  # their complementary halves
        min_tag_array2 = array2_withoutSame[min_index]       # their whole tags

        # TD of the complementary half against those closest partners
        dist_second_half = numpy.array([sum(map(operator.ne, b, e)) for e in min_tag_half2])
        max_value = dist_second_half.max()
        max_index = numpy.where(dist_second_half == dist_second_half.max())[0]
        max_tag = min_tag_array2[max_index]

        if mate_b is True:  # store under the half (a or b) the values belong to
            ham2.append(min_value)
            ham2min.append(max_value)
        else:
            ham1.append(min_value)
            ham1min.append(max_value)

        min_valueList.append(min_value + max_value)
        min_tagsList.append(tag)
        difference1 = abs(min_value - max_value)
        diff11.append(difference1)
        # min_value + max_value > 0 here: a tag equal in both halves would equal the
        # whole tag and was excluded above
        rel_difference = round(float(difference1) / (min_value + max_value), 1)
        relativeDiffList.append(rel_difference)

        if min_value == 0 or max_value == 0:  # tags which have identical parts
            min_tagsList_zeros.append(numpy.array(tag))
            diff11_zeros.append(abs(min_value - max_value))  # TD of the non-identical part
            max_tag_list.append(numpy.array(max_tag))
        else:
            min_tagsList_zeros.append(None)
            diff11_zeros.append(None)
            max_tag_list.append(None)

    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros,
             min_tagsList_zeros, ham1min, ham2min, max_tag_list])


def readFileReferenceFree(file):
    """Read the tab-separated SSCS table.

    Returns (integers, data_array): the first column (family sizes) as an int
    array, and the full table as a string array (columns: size, tag, ab/ba).
    """
    with open(file, 'r') as dest_f:
        # py3/numpy fix: dtype='string' is no longer a valid dtype name
        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype=str)
    integers = numpy.array(data_array[:, 0]).astype(int)
    return (integers, data_array)
def hammingDistanceWithFS(fs, ham):
    """Split tag distances into the six FS classes used by plotHDwithFSD.

    fs -- family size per tag; ham -- tag distance per tag (parallel arrays).
    Returns (list of 6 distance arrays for FS 1,2,3,4,5-10,>10, max TD, min TD).
    """
    fs = numpy.asarray(fs)
    ham = numpy.asarray(ham)
    maximum = max(ham)
    minimum = min(ham)
    masks = [fs == 1, fs == 2, fs == 3, fs == 4, (fs >= 5) & (fs <= 10), fs > 10]
    list1 = [ham[numpy.where(m)[0]] for m in masks]
    return (list1, maximum, minimum)


def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
    """Split family sizes into TD classes (the inverse view of hammingDistanceWithFS).

    fs -- family size per tag; ham -- tag distance (or difference) per tag.
    diff -- include a leading class for d=0; rel -- class edges are the relative
    0.1-step values instead of integer TDs.
    Returns (list of 6/7 FS arrays, unique distances, max FS, min FS).
    Bugfix: operates on a copy of fs -- the original capped FS>19 to 20 in place,
    silently mutating the caller's array when an ndarray was passed.
    """
    hammingDistances = numpy.unique(ham)
    fs = numpy.array(fs)  # copy, so capping below does not leak to the caller
    ham = numpy.asarray(ham)
    bigFamilies2 = numpy.where(fs > 19)[0]
    if len(bigFamilies2) != 0:
        fs[bigFamilies2] = 20  # everything >19 goes into the ">=20" bin
    maximum = max(fs)
    minimum = min(fs)

    if rel is True:
        masks = [ham == 0.1, ham == 0.2, ham == 0.3, ham == 0.4,
                 (ham >= 0.5) & (ham <= 0.8), ham > 0.8]
    else:
        masks = [ham == 1, ham == 2, ham == 3, ham == 4,
                 (ham >= 5) & (ham <= 8), ham > 8]
    list1 = [fs[numpy.where(m)[0]] for m in masks]
    if diff is True:
        list1 = [fs[numpy.where(ham == 0)[0]]] + list1

    return (list1, hammingDistances, maximum, minimum)


def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array):
    """Classify chimera-candidate tags as DCS / SSCS-ab / SSCS-ba and collect their TDs.

    minHD_tags_zeros -- tags that had an identical half in another tag.
    diff_zeros -- TD of the non-identical half, parallel to minHD_tags_zeros.
    data_array -- full dataset: columns (family size, tag sequence, ab/ba).
    Returns ([TDs of DCS, TDs of ab-only, TDs of ba-only], max TD, min TD).
    """
    diff_zeros = numpy.array(diff_zeros)
    maximum = numpy.amax(diff_zeros)
    minimum = numpy.amin(diff_zeros)
    minHD_tags_zeros = numpy.array(minHD_tags_zeros)

    idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros])
    subset_data = data_array[idx, :]
    seq = numpy.array(subset_data[:, 1])

    # a sequence occurring twice (once ab, once ba) forms a DCS; singletons stay SSCS
    u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
    DCS_tags = u[c == 2]
    rest_tags = u[c == 1]

    dcs = numpy.repeat("DCS", len(DCS_tags))
    idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags])
    sscs = subset_data[idx_sscs, 2]

    all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])),
                                   numpy.concatenate((dcs, sscs))))
    hd_DCS = []
    ab_SSCS = []
    ba_SSCS = []
    for i in range(len(all_tags)):
        tag = all_tags[i, :]
        hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]]
        if tag[1] == "DCS":
            hd_DCS.append(hd)
        elif tag[1] == "ab":
            ab_SSCS.append(hd)
        elif tag[1] == "ba":
            ba_SSCS.append(hd)

    if len(hd_DCS) != 0:
        hd_DCS = numpy.concatenate(hd_DCS)
    if len(ab_SSCS) != 0:
        ab_SSCS = numpy.concatenate(ab_SSCS)
    if len(ba_SSCS) != 0:
        ba_SSCS = numpy.concatenate(ba_SSCS)
    list1 = [hd_DCS, ab_SSCS, ba_SSCS]  # list for plotting
    return (list1, maximum, minimum)


def make_argparser():
    """Build the command-line parser for the tag distance (TD) analysis tool.

    NOTE(review): --only_DCS and --rel_freq use action="store_false" (flag present
    => False, default True) while --nr_above_bars uses store_true; this matches the
    Galaxy wrapper's flag conventions and is kept as-is.
    """
    parser = argparse.ArgumentParser(description='Tag distance analysis of duplex sequencing data')
    parser.add_argument('--inputFile',
                        help='Tabular File with three columns: ab or ba, tag and family size.')
    parser.add_argument('--inputName1')
    parser.add_argument('--sample_size', default=1000, type=int,
                        help='Sample size of Tag distance analysis.')
    parser.add_argument('--subset_tag', default=0, type=int,
                        help='The tag is shortened to the given number.')
    parser.add_argument('--nproc', default=4, type=int,
                        help='The tool runs with the given number of processors.')
    parser.add_argument('--only_DCS', action="store_false",
                        help='Only tags of the DCSs are included in the HD analysis')
    parser.add_argument('--minFS', default=1, type=int,
                        help='Only tags, which have a family size greater or equal than specified, '
                             'are included in the HD analysis')
    parser.add_argument('--maxFS', default=0, type=int,
                        help='Only tags, which have a family size smaller or equal than specified, '
                             'are included in the HD analysis')
    parser.add_argument('--nr_above_bars', action="store_true",
                        help='If False, values above bars in the histograms are removed')
    parser.add_argument('--rel_freq', action="store_false",
                        help='If True, the relative frequencies are displayed.')
    parser.add_argument('--output_tabular', default="data.tabular", type=str,
                        help='Name of the tabular file.')
    parser.add_argument('--output_pdf', default="data.pdf", type=str,
                        help='Name of the pdf file.')
    parser.add_argument('--output_chimeras_tabular', default="data.tabular", type=str,
                        help='Name of the tabular file with all chimeric tags.')
    return parser
included in the HD analysis') + parser.add_argument('--maxFS', default=0, type=int, + help='Only tags, which have a family size smaller or equal than specified, ' + 'are included in the HD analysis') + parser.add_argument('--nr_above_bars', action="store_true", + help='If False, values above bars in the histograms are removed') + parser.add_argument('--rel_freq', action="store_false", + help='If True, the relative frequencies are displayed.') + + parser.add_argument('--output_tabular', default="data.tabular", type=str, + help='Name of the tabular file.') + parser.add_argument('--output_pdf', default="data.pdf", type=str, + help='Name of the pdf file.') + parser.add_argument('--output_chimeras_tabular', default="data.tabular", type=str, + help='Name of the tabular file with all chimeric tags.') + + return parser + + +def Hamming_Distance_Analysis(argv): + parser = make_argparser() + args = parser.parse_args(argv[1:]) + file1 = args.inputFile + name1 = args.inputName1 + index_size = args.sample_size + title_savedFile_pdf = args.output_pdf + title_savedFile_csv = args.output_tabular + output_chimeras_tabular = args.output_chimeras_tabular + onlyDuplicates = args.only_DCS + rel_freq = args.rel_freq + minFS = args.minFS + maxFS = args.maxFS + nr_above_bars = args.nr_above_bars + subset = args.subset_tag + nproc = args.nproc + sep = "\t" + + # input checks + if index_size < 0: + print("index_size is a negative integer.") + exit(2) + if nproc <= 0: + print("nproc is smaller or equal zero") + exit(3) + if subset < 0: + print("subset_tag is smaller or equal zero.") + exit(5) + + # PLOT + plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color + plt.rcParams['xtick.labelsize'] = 14 + plt.rcParams['ytick.labelsize'] = 14 + plt.rcParams['patch.edgecolor'] = "#000000" + plt.rc('figure', figsize=(11.69, 8.27)) # A4 format + name1 = name1.split(".tabular")[0] + + with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: + print("dataset: 
", name1) + integers, data_array = readFileReferenceFree(file1) + data_array = numpy.array(data_array) + print("total nr of tags:", len(data_array)) + + # filter tags out which contain any other character than ATCG + valid_bases = ["A", "T", "G", "C"] + tagsToDelete = [] + for idx, t in enumerate(data_array[:, 1]): + for char in t: + if char not in valid_bases: + tagsToDelete.append(idx) + break + + if len(tagsToDelete) != 0: # delete tags with N in the tag from data + print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete), + float(len(tagsToDelete)) / len(data_array)) + index_whole_array = numpy.arange(0, len(data_array), 1) + index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete) + data_array = data_array[index_withoutN_inTag, :] + integers = integers[index_withoutN_inTag] + print("total nr of filtered tags:", len(data_array)) + + int_f = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f >= minFS)] + integers = integers[integers >= minFS] + + # select family size for tags + if maxFS > 0: + int_f2 = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f2 <= maxFS)] + integers = integers[integers <= maxFS] + + if onlyDuplicates is True: + tags = data_array[:, 2] + seq = data_array[:, 1] + + # find all unique tags and get the indices for ALL tags, but only once + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + d = u[c == 2] + + # get family sizes, tag for duplicates + duplTags_double = integers[numpy.in1d(seq, d)] + duplTags = duplTags_double[0::2] # ab of DCS + duplTagsBA = duplTags_double[1::2] # ba of DCS + + duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab + duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags + + if minFS > 1: + duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)] + duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)] + duplTags = duplTags[(duplTags 
>= minFS) & (duplTagsBA >= minFS)] # ab+ba with FS>=3 + + data_array = numpy.column_stack((duplTags, duplTags_seq)) + data_array = numpy.column_stack((data_array, duplTags_tag)) + integers = numpy.array(data_array[:, 0]).astype(int) + print("DCS in whole dataset", len(data_array)) + + print("min FS", min(integers)) + print("max FS", max(integers)) + + # HD analysis for a subset of the tag + if subset > 0: + tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) + tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) + + flanking_region_float = float((len(tag1[0]) - subset)) / 2 + flanking_region = int(flanking_region_float) + if flanking_region_float % 2 == 0: + tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1]) + tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2]) + else: + flanking_region_rounded = int(round(flanking_region, 1)) + flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded + tag1_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1]) + tag2_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) + + data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) + data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) + + print("length of tag= ", len(data_array[0, 1])) + # select sample: if no size given --> all vs. 
all comparison + if index_size == 0: + result = numpy.arange(0, len(data_array), 1) + else: + numpy.random.shuffle(data_array) + unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags + result = numpy.random.choice(unique_indices, size=index_size, + replace=False) # array of random sequences of size=index.size + + # result = numpy.random.choice(len(integers), size=index_size, + # replace=False) # array of random sequences of size=index.size + # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0] + + # with open("index_result.pkl", "wb") as o: + # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) + + # save counts + # with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f: + # pickle.dump(result, f, pickle.HIGHEST_PROTOCOL) + # with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1: + # pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL) + # + # with open(data_folder + "index_sampleTags100.pkl", "rb") as f: + # result = pickle.load(f) + # + # with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1: + # data_array = pickle.load(f1) + + # with open(data_folder + "index_result.txt", "w") as t: + # for text in result: + # t.write("{}\n".format(text)) + + # comparison random tags to whole dataset + result1 = data_array[result, 1] # random tags + result2 = data_array[:, 1] # all tags + print("sample size= ", len(result1)) + + # HD analysis of whole tag + proc_pool = Pool(nproc) + chunks_sample = numpy.array_split(result1, nproc) + ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) + proc_pool.close() + proc_pool.join() + ham = numpy.concatenate(ham).astype(int) + # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: + # for h, tag in zip(ham, result1): + # output_file1.write("{}\t{}\n".format(tag, h)) + + # # HD analysis for chimeric reads + # result2 = data_array_whole_dataset[:,1] + + proc_pool_b = 
Pool(nproc) + diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) + diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) + proc_pool_b.close() + proc_pool_b.join() + HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), + numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) + HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), + numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) + minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), + numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) + HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), + numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) + HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), + numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) + + rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a]) + rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b]) + diff1 = numpy.concatenate([item[0] for item in diff_list_a]) + diff2 = numpy.concatenate([item[0] for item in diff_list_b]) + + diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a]) + diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b]) + minHD_tags = numpy.concatenate([item[4] for item in diff_list_a]) + minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a]) + minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b]) + + chimera_tags1 = sum([item[10] for item in diff_list_a], []) + chimera_tags2 = sum([item[10] for item in diff_list_b], []) + + rel_Diff = [] + diff_zeros = [] + minHD_tags_zeros = [] + diff = [] + chimera_tags = [] + for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \ + zip(diff1, diff2, 
rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, + chimera_tags1, chimera_tags2): + relatives = numpy.array([rel1, rel2]) + absolutes = numpy.array([d1, d2]) + max_idx = numpy.argmax(relatives) + rel_Diff.append(relatives[max_idx]) + diff.append(absolutes[max_idx]) + + if all(i is not None for i in [zeros1, zeros2]): + diff_zeros.append(max(zeros1, zeros2)) + minHD_tags_zeros.append(str(tag1)) + tags = [ctag1, ctag2] + chimera_tags.append(tags) + elif zeros1 is not None and zeros2 is None: + diff_zeros.append(zeros1) + minHD_tags_zeros.append(str(tag1)) + chimera_tags.append(ctag1) + elif zeros1 is None and zeros2 is not None: + diff_zeros.append(zeros2) + minHD_tags_zeros.append(str(tag2)) + chimera_tags.append(ctag2) + + chimera_tags_new = chimera_tags + data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new)) + # chimeras_dic = defaultdict(list) + # + # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): + # if len(t2) >1 and type(t2) is not numpy.ndarray: + # t2 = numpy.concatenate(t2) + # chimeras_dic[t1].append(t2) + + checked_tags = [] + stat_maxTags = [] + + with open(output_chimeras_tabular, "w") as output_file1: + output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with TD=0\n") + for i in range(len(data_chimeraAnalysis)): + tag1 = data_chimeraAnalysis[i, 0] + + info_tag1 = data_array[data_array[:, 1] == tag1, :] + fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1] + + if tag1 in checked_tags: # skip tag if already written to file + continue + + sample_half_a = tag1[0:(len(tag1)) / 2] + sample_half_b = tag1[len(tag1) / 2:len(tag1)] + + max_tags = data_chimeraAnalysis[i, 1] + if len(max_tags) > 1 and len(max_tags) != len(data_array[0, 1]) and type(max_tags) is not numpy.ndarray: + max_tags = numpy.concatenate(max_tags) + max_tags = numpy.unique(max_tags) + stat_maxTags.append(len(max_tags)) + + info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags] 
+ + chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags]) # mate1 part1 + chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags]) # mate1 part 2 + + new_format = [] + for j in range(len(max_tags)): + fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]] + + if sample_half_a == chimera_half_a[j]: + max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) + new_format.append(max_tag) + + elif sample_half_b == chimera_half_b[j]: + max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) + new_format.append(max_tag) + checked_tags.append(max_tags[j]) + + sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1)) + output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) + + checked_tags.append(tag1) + + output_file1.write( + "This file contains all tags that were identified as chimeras as the first column and the " + "corresponding tags which returned a Hamming distance of zero in either the first or the second " + "half of the sample tag as the second column.\n" + "The tags were separated by an empty space into their halves and the * marks the identical half.") + output_file1.write("\n\nStatistics of nr. of tags that returned max. 
TD (2nd column)\n") + output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags)))) + output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags)))) + output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags)))) + output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags)))) + output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags)))) + + lenTags = len(data_array) + len_sample = len(result1) + + quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags + seq = numpy.array(data_array[result, 1]) # tags of sample + ham = numpy.asarray(ham) # HD for sample of tags + + if onlyDuplicates is True: # ab and ba strands of DCSs + quant = numpy.concatenate((quant, duplTagsBA[result])) + seq = numpy.tile(seq, 2) + ham = numpy.tile(ham, 2) + diff = numpy.tile(diff, 2) + rel_Diff = numpy.tile(rel_Diff, 2) + diff_zeros = numpy.tile(diff_zeros, 2) + + nr_chimeric_tags = len(data_chimeraAnalysis) + print("nr of chimeras", nr_chimeric_tags) + + # prepare data for different kinds of plots + # distribution of FSs separated after HD + familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) + list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS + + # get FS for all tags with min HD of analysis of chimeric reads + # there are more tags than sample size in the plot, because one tag can have multiple minimas + if onlyDuplicates: + seqDic = defaultdict(list) + for s, q in zip(seq, quant): + seqDic[s].append(q) + else: + seqDic = dict(zip(seq, quant)) + + lst_minHD_tags = [] + for i in minHD_tags: + lst_minHD_tags.append(seqDic.get(i)) + + if onlyDuplicates: + lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], + [item_b[1] for item_b in lst_minHD_tags])).astype(int) + # histogram with absolute and 
relative difference between HDs of both parts of the tag + listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) + listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, + rel_Diff) + # chimeric read analysis: tags which have TD=0 in one of the halfs + if len(minHD_tags_zeros) != 0: + lst_minHD_tags_zeros = [] + for i in minHD_tags_zeros: + lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads + if onlyDuplicates: + lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], + [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) + + # histogram with HD of non-identical half + listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( + lst_minHD_tags_zeros, diff_zeros) + + if onlyDuplicates is False: + listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros, + diff_zeros, data_array) + + # plot Hamming Distance with Family size distribution + plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq, + subtitle="Tag distance separated by family size", lenTags=lenTags, + xlabel="TD", nr_above_bars=nr_above_bars, len_sample=len_sample) + + # Plot FSD with separation after + plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq, + originalCounts=quant, subtitle="Family size distribution separated by Tag distance", + pdf=pdf, relative=False, diff=False) + + # Plot HD within tags + plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, + rel_freq=rel_freq, len_sample=len_sample) + + # Plot difference between HD's separated after FSD + plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, + subtitle="Delta Tag distance within tags", lenTags=lenTags, rel_freq=rel_freq, + xlabel="absolute delta TD", relative=False, 
nr_above_bars=nr_above_bars, len_sample=len_sample) + + plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, + subtitle="Chimera Analysis: relative delta Tag distance", lenTags=lenTags, rel_freq=rel_freq, + xlabel="relative delta TD", relative=True, nr_above_bars=nr_above_bars, + nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + + # plots for chimeric reads + if len(minHD_tags_zeros) != 0: + # HD + plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, + subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq, + lenTags=lenTags, xlabel="TD", relative=False, + nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + + if onlyDuplicates is False: + plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf, + subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq, + lenTags=lenTags, xlabel="TD", relative=False, + nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + + # print all data to a CSV file + # HD + summary, sumCol = createTableHD(list1, "TD=") + overallSum = sum(sumCol) # sum of columns in table + + # FSD + summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) + overallSum5 = sum(sumCol5) + + # HD of both parts of the tag + summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)]) + overallSum9 = sum(sumCol9) + + # HD + # absolute difference + summary11, sumCol11 = createTableHD(listDifference1, "diff=") + overallSum11 = sum(sumCol11) + # relative difference and all tags + summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") + overallSum13 = sum(sumCol13) + + # chimeric reads + if len(minHD_tags_zeros) != 0: + # absolute difference and tags where at least one half has HD=0 + summary15, sumCol15 = createTableHD(listDifference1_zeros, "TD=") + overallSum15 = sum(sumCol15) + + if 
onlyDuplicates is False: + summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros) + overallSum16 = sum(sumCol16) + + output_file.write("{}\n".format(name1)) + output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample)) + + # HD + createFileHD(summary, sumCol, overallSum, output_file, + "Tag distance separated by family size", sep) + # FSD + createFileFSD2(summary5, sumCol5, overallSum5, output_file, + "Family size distribution separated by Tag distance", sep, + diff=False) + + # output_file.write("{}{}\n".format(sep, name1)) + output_file.write("\n") + max_fs = numpy.bincount(integers[result]) + output_file.write("max. family size in sample:{}{}\n".format(sep, max(integers[result]))) + output_file.write("absolute frequency:{}{}\n".format(sep, max_fs[len(max_fs) - 1])) + output_file.write( + "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) + + # HD within tags + output_file.write( + "Chimera Analysis:\nThe tags are splitted into two halves (part a and b) for which the Tag distances (TD) are calculated seperately.\n" + "The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min).\n" + "In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset.\n" + "The maximum value represents then TD b.max. Finally, these process is repeated but starting with part b instead and TD b.min and TD a.max are calculated.\n" + "Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta HD).\n" + "These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively) which give the relative differences between the partial HDs (rel. 
delta HD).\n" + "For simplicity, we used the maximum value of the relative differences and the respective delta HD.\n" + "Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots.\n") + + output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2)) + + createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, + "Tag distance of each half in the tag", sep) + createFileHD(summary11, sumCol11, overallSum11, output_file, + "Absolute delta Tag distance within the tag", sep) + + createFileHD(summary13, sumCol13, overallSum13, output_file, + "Chimera analysis: relative delta Tag distance", sep) + + if len(minHD_tags_zeros) != 0: + output_file.write( + "All tags are filtered and only those tags where one half is identical (TD=0) and therefore, have a relative delta TD of 1, are kept.\n" + "These tags are considered as chimeras.\n") + createFileHD(summary15, sumCol15, overallSum15, output_file, + "Tag distance of chimeric families separated after FS", sep) + + if onlyDuplicates is False: + createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file, + "Tag distance of chimeric families separated after DCS and single SSCS (ab, ba)", sep) + + output_file.write("\n") + + +if __name__ == '__main__': + sys.exit(Hamming_Distance_Analysis(sys.argv))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/td.xml Wed Oct 16 04:17:59 2019 -0400 @@ -0,0 +1,148 @@ +<?xml version="1.0" encoding="UTF-8"?> +<tool id="td" name="TD:" version="1.0.5"> + <description>Tag distance analysis of duplex tags</description> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.4.0">matplotlib</requirement> + </requirements> + <command> + python2 '$__tool_directory__/td.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS $rel_freq --minFS $minFS --maxFS $maxFS + $nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular --output_chimeras_tabular $output_chimeras_tabular + </command> + <inputs> + <param name="inputFile" type="data" format="tabular" label="Dataset 1: input tags" optional="false" help="Input in tabular format with the family size, tag and the direction of the strand ('ab' or 'ba') for each family."/> + <param name="sampleSize" type="integer" label="number of tags in the sample" value="1000" min="0" help="specifies the number of tags in one analysis. If sample size is 0, all tags of the dataset are compared against all tags."/> + <param name="minFS" type="integer" label="minimum family size of the tags" min="1" value="1" help="filters the tags after their family size: Families with a smaller size are skipped. Default: min. family size = 1."/> + <param name="maxFS" type="integer" label="max family size of the tags" min="0" value="0" help="filters the tags after their family size: Families with a larger size are skipped. If max. family size is 0, no upper bound is defined and the maximum family size in the analysis will be the maximum family size of the whole dataset. Default: max. family size = 0."/> + <param name="onlyDCS" type="boolean" label="only DCS in the analysis?" 
truevalue="" falsevalue="--only_DCS" checked="False" help="Only tags, which have a partner tag (ab and ba) in the dataset, are included in the analysis."/> + <param name="rel_freq" type="boolean" label="relative frequency?" truevalue="" falsevalue="--rel_freq" checked="False" help="If True, the relative frequencies instead of the absolute values are displayed in the plots."/> + <param name="subsetTag" type="integer" label="shorten tag in the analysis?" value="0" help="By this parameter an analysis with shorter tag length is simulated. If this parameter is 0 (by default), the tags with its original length are used in the analysis."/> + <param name="nproc" type="integer" label="number of processors" value="8" help="Number of processor used for computing."/> + <param name="nr_above_bars" type="boolean" label="include numbers above bars?" truevalue="--nr_above_bars" falsevalue="" checked="True" help="The absolute and relative values of the data can be included or removed from the plots. "/> + + </inputs> + <outputs> + <data name="output_pdf" format="pdf" /> + <data name="output_tabular" format="tabular"/> + <data name="output_chimeras_tabular" format="tabular"/> + + </outputs> + <tests> + <test> + <param name="inputFile" value="td_data.tab"/> + <param name="sampleSize" value="0"/> + <output name="output_pdf" file="td_output.pdf" lines_diff="6"/> + <output name="output_tabular" file="td_output.tab"/> + <output name="output_chimeras_tabular" file="td_chimeras_output.tab"/> + </test> + </tests> + <help> <![CDATA[ +**What it does** + +Tags used in Duplex Sequencing (DS) are randomized barcodes, e.g 12 base pairs long. Since each DNA fragment is labeled by two tags at each end there are theoretically 4 to the power of (12+12) unique combinations. However, the input DNA in a typical DS experiment contains only ~1,000,000 molecules creating a large tag-to-input excess (4^24 +≫ 1,000,000). 
Because of such excess it is highly unlikely to tag distinct input DNA molecules with highly similar barcodes. + +This tool calculates the number of nucleotide differences among tags, also known as `Hamming distance <https://en.wikipedia.org/wiki/Hamming_distance>`_. In this context the Hamming distance is simply the number of differences between two tags. The tool compares in a randomly selected subset of tags (default n=1000), the difference between each tag of the subset with the tags of the complete dataset. Each tag will differ by a certain number of nucleotides from the other tags; yet the tool uses the smallest difference observed with any other tag. + +**Input** + +This tool expects a tabular file with the tags of all families, the family sizes and information about forward (ab) and reverse (ba) strands:: + + 1 2 3 + ----------------------------- + 1 AAAAAAAAAAAAAAAAATGGTATG ba + 3 AAAAAAAAAAAAAATGGTATGGAC ab + +.. class:: infomark + +**How to generate the input** + +The first step of the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_ is the **Make Families** tool or the **Correct Barcodes** tool that produces output in this form:: + + 1 2 3 4 + ------------------------------------------------------ + AAAAAAAAAAAAAAATAGCTCGAT ab read1 CGCTACGTGACTGGGTCATG + AAAAAAAAAAAAAATAGCTCGAT ab read2 CGCTACGTGACTGGGTCATG + AAAAAAAAAAAAAATAGCTCGAT ab read3 CGCTACGTGACTGGGTCATG + AAAAAAAAAAAAAAAAATGGTATG ba read3 CGCTACGTGACTAAAACATG + +We only need columns 1 and 2. 
These two columns can be extracted from this dataset using the **Cut** tool:: + + 1 2 + --------------------------- + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAAAATGGTATG ba + +Next, the tags are sorted in ascending or descending order using the **Sort** tool:: + + 1 2 + --------------------------- + AAAAAAAAAAAAAAAAATGGTATG ba + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAATAGCTCGAT ab + +Finally, unique occurrences of each tag are counted. This is done using the **Unique lines** tool that adds an additional column with the counts that also represent the family size (column 1):: + + 1 2 3 + ----------------------------- + 1 AAAAAAAAAAAAAAAAATGGTATG ba + 3 AAAAAAAAAAAAAATGGTATGGAC ab + +These data can now be used in this tool. + +**Output** + +The output is one PDF file with various plots of the Tag distance, a tabular file with the summarized data of the plots and a tabular file with the chimeras. The PDF file contains several pages: + + 1. The first page contains a graph representing the minimum tag distance (smallest number of differences) categorized after the family sizes. + + 2. The second page contains the same information as the first page, but plots the family size categorized by the minimum tag distance. + + 3. The third page contains the **first step** of the **chimera analysis**, which examines the differences between the tags at both ends of a read (a/b). Chimeras can be distinguished by carrying the same tag at one end combined with multiple different tags at the other end of a read. Here, we describe the calculation of the TDs for only one tag in detail, but the process is repeated for each tag in the sample (default n=1000). First, the tool splits the tag into its upstream and downstream part (named a and b) and compares it with all other a parts of the families in the dataset. 
Next, the tool estimates the sequence differences (TD) among the a parts and extracts those tags with the smallest difference (TD a.min) and calculates the TD of the b part. The tags with the largest differences are extracted to estimate the maximum TD (TD b.max). The process is repeated starting with the b part instead and estimates TD a.max and TD b.min. Next, we calculate the sum of TD a.min and TD b.max. + + 4. The fourth page contains the **second step** of the **chimera analysis**: the absolute difference (=delta TD) between the partial TDs (TD a.min & TD b.max and TD b.min & TD a.max). The partial TDs of chimeric tags are normally very different which means that multiple combinations of the same a part with different b parts are likely. But it is possible that small delta TDs occur due to a half of a tag that is identical to other halves in the data. For this purpose, the relative difference between the partial TDs is estimated in the next step. + + 5. The fifth page contains the **third step** of the **chimera analysis**: the relative differences of the partial TDs (=relative delta TD). These are calculated as the absolute difference between TD a.min and TD b.max, which equals the delta TD. Since it is not known whether the absolute difference originates due to a low and a very large TD within a tag or an identical half (TD=0), the tool estimates the relative TD delta as the ratio of the difference to the sum of the partial TDs. In a chimera, it is expected that only one end of the tag contributes the TD of the whole tag. In other words, if the same a part is observed in combination with several different b parts, then one end will have a TD = 0. Thus, the TD difference between the parts (TD a.min - TD b.max) is the same as the sum of the parts (TD a.min + TD b.max) or the ratio of the difference to the sum (relative delta TD = (TD a.min - TD b.max) / (TD a.min + TD b.max)) will equal 1 in chimeric families. 
The plot can be interpreted as the following: + + - A low relative difference indicates that the total TD is equally distributed in the two partial TDs. This case would be expected if all tags originate from different molecules. + - A relative delta TD of 1 means that one part of the tags is identical. Since it is very unlikely that by chance two different tags have a TD of 0, the TDs in the other half are probably artificially introduced and represent chimeric families. + + 6. The sixth page is an analysis only of **chimeric tags** (relative delta TD =1) from step 5. + + 7. The last page is only generated when the parameter "only DCS in the analysis?" is set to **False (NO)**. The graph represents the **TD of the chimeric tags** that form a DCS (complementary ab and ba). + + .. class:: infomark + +**Note:** +Chimeras can be identical in the first or second part of the tag and can have an identical TD with multiple tags. Therefore, the second column of the output file can have multiple tag entries. The file also contains the family sizes and the direction of the read (ab, ba). The asterisks mark the identical part of the tag:: + + 1 2 + -------------------------------------------------------------------------------------------------- + GAAAGGGAGG GCGCTTCACG 1 ba GCAATCGACG *GCGCTTCACG* 1 ba + CCCTCCCTGA GGTTCGTTAT 1 ba CGTCCTTTTC *GGTTCGTTAT* 1 ba, GCACCTCCTT *GGTTCGTTAT* 1 ba + ATGCTGATCT CGAATGCATA 55 ba, 59 ab AGGTGCCGCC *CGAATGCATA* 27 ba, *ATGCTGATCT* GAATGTTTAC 1 ba + +**About Author** + +Author: Monika Heinzl + +Department: Institute of Biophysics, Johannes Kepler University Linz, Austria + +Contact: monika.heinzl@edumail.at + + ]]> + + </help> + <citations> + <citation type="bibtex"> + @misc{duplex, + author = {}, + year = {}, + title = {} + } + </citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/td_chimeras_output.tab Wed Oct 16 04:17:59 2019 -0400 @@ -0,0 +1,22 @@ +chimera tag family size, read direction similar tag with TD=0 +AAAAAAAAAAAA AACCAAAACTTC 1 ba *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab +AAAAAAAAAAAA ACCAGGCGTCGA 1 ba *AAAAAAAAAAAA* AACCAAAACTTC 1 ba, *AAAAAAAAAAAA* AGCTCCACGTTG 1 ba, *AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab, *AAAAAAAAAAAA* TTGGGTTCCTTA 1 ab +AAAAAAAAAAAA ATCGTGGTTTGT 1 ba *AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, AAAAAAAAAAAG *ATCGTGGTTTGT* 4 ba +AAAAAAAAAAAA ATTCACCCTTGT 1 ba *AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, AAAAAAAAAAAT *ATTCACCCTTGT* 6 ba +AAAAAAAAAAAA CACACTTAACTT 7 ba *AAAAAAAAAAAA* ATTCACCCTTGT 1 ba, *AAAAAAAAAAAA* CCGCTCCTCACA 4 ba, *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab +AAAAAAAAAAAA GGCAACACAGAA 1 ab *AAAAAAAAAAAA* ATCGTGGTTTGT 1 ba, AAAAAAAAAAAG *GGCAACACAGAA* 3 ab +AAAAAAAAAAAG AGTCGCACCCAG 1 ba *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba +AAAAAAAAAAAG CGCAACACAGAA 1 ab *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba +AAAAAAAAAAAG TAGCCCTAAACG 1 ab *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba +AAAAAAAAAAAG TCTTTCTTTGAG 1 ab AAAAAAAAAAAA *TCTTTCTTTGAG* 2 ab, *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba, *AAAAAAAAAAAG* CGCAACACAGAA 1 ab, *AAAAAAAAAAAG* GGCAACACAGAA 3 ab +AAAAAAAAAAAT ATCATAGACTCT 1 ab *AAAAAAAAAAAT* ATTCACCCTTGT 6 ba +AAAAAAAAAAAT ATTCGAAAGTTA 1 ba *AAAAAAAAAAAT* ATCATAGACTCT 1 ab, *AAAAAAAAAAAT* ATTCACCCTTGT 6 ba +This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column. +The tags were separated by an empty space into their halves and the * marks the identical half. + +Statistics of nr. of tags that returned max. TD (2nd column) +minimum 1 tag(s) +mean 2.08333333333 tag(s) +median 2.0 tag(s) +maximum 5 tag(s) +sum 25 tag(s)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/td_data.tab Wed Oct 16 04:17:59 2019 -0400 @@ -0,0 +1,20 @@ +1 AAAAAAAAAAAAAACCAAAACTTC ba +1 AAAAAAAAAAAAACCAGGCGTCGA ba +1 AAAAAAAAAAAAAGCTCCACGTTG ba +1 AAAAAAAAAAAAATCGTGGTTTGT ba +1 AAAAAAAAAAAAATTCACCCTTGT ba +7 AAAAAAAAAAAACACACTTAACTT ba +1 AAAAAAAAAAAACAGTGTTGAGAC ba +4 AAAAAAAAAAAACCGCTCCTCACA ba +1 AAAAAAAAAAAAGGCAACACAGAA ab +2 AAAAAAAAAAAATCTTTCTTTGAG ab +1 AAAAAAAAAAAATTGGGTTCCTTA ab +1 AAAAAAAAAAAGAGTCGCACCCAG ba +4 AAAAAAAAAAAGATCGTGGTTTGT ba +1 AAAAAAAAAAAGCGCAACACAGAA ab +3 AAAAAAAAAAAGGGCAACACAGAA ab +1 AAAAAAAAAAAGTAGCCCTAAACG ab +1 AAAAAAAAAAAGTCTTTCTTTGAG ab +1 AAAAAAAAAAATATCATAGACTCT ab +6 AAAAAAAAAAATATTCACCCTTGT ba +1 AAAAAAAAAAATATTCGAAAGTTA ba \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/td_output.tab Wed Oct 16 04:17:59 2019 -0400 @@ -0,0 +1,92 @@ +td_data.tab +nr of tags 20 +sample size 20 + +Tag distance separated by family size + FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum +TD=1 5 1 1 1 1 0 9 +TD=6 3 0 0 0 0 0 3 +TD=7 4 0 0 0 1 0 5 +TD=8 2 0 0 1 0 0 3 +sum 14 1 1 2 2 0 20 + +Family size distribution separated by Tag distance + TD=1 TD=2 TD=3 TD=4 TD=5-8 TD>8 sum +FS=1 5 0 0 0 9 0 14 +FS=2 1 0 0 0 0 0 1 +FS=3 1 0 0 0 0 0 1 +FS=4 1 0 0 0 1 0 2 +FS=6 1 0 0 0 0 0 1 +FS=7 0 0 0 0 1 0 1 +sum 9 0 0 0 11 0 20 + + +max. family size in sample: 7 +absolute frequency: 1 +relative frequency: 0.05 + +Chimera Analysis: +The tags are splitted into two halves (part a and b) for which the Tag distances (TD) are calculated seperately. +The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min). +In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset. +The maximum value represents then TD b.max. Finally, these process is repeated but starting with part b instead and TD b.min and TD a.max are calculated. +Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta HD). +These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively) which give the relative differences between the partial HDs (rel. delta HD). +For simplicity, we used the maximum value of the relative differences and the respective delta HD. +Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots. 
+ +length of one half of the tag 12 + +Tag distance of each half in the tag + TD a.min TD b.max TD b.min TD a.max TD a.min + b.max, TD a.max + b.min sum +TD=0 20 0 8 1 0 29 +TD=1 0 0 1 19 8 28 +TD=2 0 0 0 0 1 1 +TD=5 0 0 3 0 0 3 +TD=6 0 0 2 0 3 5 +TD=7 0 1 6 0 4 11 +TD=8 0 2 0 0 7 9 +TD=9 0 1 0 0 1 2 +TD=10 0 2 0 0 2 4 +TD=11 0 7 0 0 7 14 +TD=12 0 7 0 0 7 14 +sum 20 20 20 20 40 120 + +Absolute delta Tag distance within the tag + FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum +diff=7 1 0 0 0 0 0 1 +diff=8 1 0 0 0 1 0 2 +diff=9 1 0 0 0 0 0 1 +diff=10 2 0 0 0 0 0 2 +diff=11 4 0 1 1 1 0 7 +diff=12 5 1 0 1 0 0 7 +sum 14 1 1 2 2 0 20 + +Chimera analysis: relative delta Tag distance + FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum +diff=1.0 14 1 1 2 2 0 20 +sum 14 1 1 2 2 0 20 + +All tags are filtered and only those tags where one half is identical (TD=0) and therefore, have a relative delta TD of 1, are kept. +These tags are considered as chimeras. +Tag distance of chimeric families separated after FS + FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum +TD=7 1 0 0 0 0 0 1 +TD=8 1 0 0 0 1 0 2 +TD=9 1 0 0 0 0 0 1 +TD=10 2 0 0 0 0 0 2 +TD=11 4 0 1 1 1 0 7 +TD=12 5 1 0 1 0 0 7 +sum 14 1 1 2 2 0 20 + +Tag distance of chimeric families separated after DCS and single SSCS (ab, ba) + DCS SSCS ab SSCS ba sum +TD=7.0 0 0 1 1 +TD=8.0 0 1 1 2 +TD=9.0 0 1 0 1 +TD=10.0 0 1 1 2 +TD=11.0 0 3 4 7 +TD=12.0 0 2 5 7 +sum 0 8 12 20 + +