diff td.py @ 0:3e56058d9552 draft default tip
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 9bae9043a53f1e07b502acd1082450adcb6d9e31-dirty
author:   mheinzl
date:     Wed, 16 Oct 2019 04:17:59 -0400
parents:  (none)
children: (none)
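For orientation: td.py consumes a headerless, tab-separated table whose columns are family size, tag sequence, and read direction (ab or ba), parsed by readFileReferenceFree() in the diff below. A minimal sketch of that parsing step (Python 3 for readability; the file name and the toy tags are hypothetical placeholders, not files shipped with the repository):

import numpy

def read_tags(path):
    # Mirrors readFileReferenceFree() in td.py: headerless TSV with
    # column 0 = family size, column 1 = tag, column 2 = ab/ba.
    with open(path, "r") as dest_f:
        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter="\t",
                                      comments="#", dtype=str)
    integers = data_array[:, 0].astype(int)  # family sizes as integers
    return integers, data_array

with open("toy_tags.tabular", "w") as f:  # hypothetical toy input
    f.write("7\tACGTACGTACGTTTTT\tab\n")
    f.write("3\tACGTACGTACGTTTTT\tba\n")
    f.write("1\tTTGCACGTACGTAAAA\tab\n")

fs, data = read_tags("toy_tags.tabular")
print(fs)          # [7 3 1]
print(data[0, 1])  # ACGTACGTACGTTTTT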
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/td.py	Wed Oct 16 04:17:59 2019 -0400
@@ -0,0 +1,1364 @@
+#!/usr/bin/env python
+
+# Tag distance analysis of SSCSs
+#
+# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes at least one TABULAR file with tags before the alignment to the SSCS and
+# optionally a second TABULAR file as input. The program produces a plot showing a histogram of Hamming distances
+# separated by family size, and a family size distribution separated by Hamming distance, either for all SSCSs
+# (sample_size=0), for a given sample of SSCSs, or for SSCSs that form a DCS. In addition, the tool produces HD
+# and FSD plots for the difference between the HDs of both halves of the tags and for the chimeric reads, and
+# finally a CSV file with the data of the plots. The HD analysis can also be performed on tags shortened to a
+# given length. The tool can run on a certain number of processors, which can be defined by the user.
+
+# USAGE: python td.py --inputFile filename --inputName1 filename --sample_size int
+# --only_DCS --subset_tag int --nproc int --minFS int --maxFS int
+# --nr_above_bars --rel_freq --output_tabular outputfile_name_tabular
+
+import argparse
+import itertools
+import operator
+import sys
+from collections import Counter, defaultdict
+from functools import partial
+from multiprocessing.pool import Pool
+import random
+import os
+
+import matplotlib.pyplot as plt
+import numpy
+from matplotlib.backends.backend_pdf import PdfPages
+
+plt.switch_backend('agg')
+
+
+def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
+                   subtitle, pdf, relative=False, diff=True, rel_freq=False):
+    if diff is False:
+        colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
+        labels = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8"]
+    else:
+        colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
+        if relative is True:
+            labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
+        else:
+            labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]
+
+    fig = plt.figure(figsize=(6, 7))
+    ax = fig.add_subplot(111)
+    plt.subplots_adjust(bottom=0.1)
+    p1 = numpy.bincount(numpy.concatenate(familySizeList1))
+    maximumY = numpy.amax(p1)
+
+    if len(range(minimumXFS, maximumXFS)) == 0:
+        range1 = range(minimumXFS - 1, minimumXFS + 2)
+    else:
+        range1 = range(0, maximumXFS + 2)
+
+    if rel_freq:
+        w = [numpy.zeros_like(data) + 1.
/ len(numpy.concatenate(familySizeList1)) for data in familySizeList1] + counts = plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True, + rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1) + plt.ylabel("Relative Frequency", fontsize=14) + plt.ylim((0, 1.07)) + else: + counts = plt.hist(familySizeList1, label=labels, color=colors, stacked=True, + rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1) + if len(numpy.concatenate(familySizeList1)) != 0: + plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1)) + plt.ylabel("Absolute Frequency", fontsize=14) + plt.ylim((0, maximumY * 1.2)) + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) + plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) + plt.xlabel("Family size", fontsize=14) + ticks = numpy.arange(0, maximumXFS + 1, 1) + ticks1 = map(str, ticks) + if maximumXFS >= 20: + ticks1[len(ticks1) - 1] = ">=20" + plt.xticks(numpy.array(ticks), ticks1) + [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0] + plt.xlim((0, maximumXFS + 1)) + legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: " + plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) + + count = numpy.bincount(originalCounts) # original counts + if max(originalCounts) >= 20: + max_count = ">= 20" + else: + max_count = max(originalCounts) + legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1)) + plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) + legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1)) + plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) + plt.grid(b=True, which='major', color='#424242', linestyle=':') + pdf.savefig(fig, bbox_inches="tight") + plt.close("all") + + +def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False, + nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False): + if relative is True: + step = 0.1 + else: + step = 1 + + fig = plt.figure(figsize=(6, 8)) + plt.subplots_adjust(bottom=0.1) + p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())]) + maximumY = numpy.amax(p1) + if relative is True: # relative difference + bin1 = numpy.arange(-1, maximumX + 0.2, 0.1) + else: + bin1 = maximumX + 1 + + if rel_freq: + w = [numpy.zeros_like(data) + 1. 
/ len(numpy.concatenate(list1)) for data in list1] + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w, + label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8, + color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + plt.ylim((0, 1.07)) + plt.ylabel("Relative Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(float, counts[0][5])) + + else: + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, + label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8, + color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + maximumY = numpy.amax(p1) + plt.ylim((0, maximumY * 1.2)) + plt.ylabel("Absolute Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(int, counts[0][5])) + + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) + plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) + plt.xlabel(xlabel, fontsize=14) + plt.grid(b=True, which='major', color='#424242', linestyle=':') + plt.xlim((minimumX - step, maximumX + step)) + # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) + plt.xticks(numpy.arange(0, maximumX + step, step)) + + if nr_above_bars: + bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] + for x_label, label in zip(counts, bin_centers): # labels for values + if x_label == 0: + continue + else: + if rel_freq: + plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))), + float(x_label)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001), + xycoords="data", color="#000066", fontsize=10) + else: + plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01), + xycoords="data", color="#000066", fontsize=10) + + if nr_unique_chimeras != 0: + if (relative and ((counts[len(counts)-1] / nr_unique_chimeras) == 2)) or \ + (sum(counts) / nr_unique_chimeras) == 2: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\ + .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1))) + + plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure) + pdf.savefig(fig, bbox_inches="tight") + plt.close("all") + plt.clf() + + +def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False, + nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False): + step = 1 + fig = plt.figure(figsize=(6, 8)) + plt.subplots_adjust(bottom=0.1) + p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())]) + maximumY = numpy.amax(p1) + bin1 = maximumX + 1 + if rel_freq: + w = [numpy.zeros_like(data) + 1. 
/ len(numpy.concatenate(list1)) for data in list1] + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w, + label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + plt.ylim((0, 1.07)) + plt.ylabel("Relative Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(float, counts[0][2])) + + else: + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, + label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + plt.ylim((0, maximumY * 1.2)) + plt.ylabel("Absolute Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(int, counts[0][2])) + + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) + plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) + plt.xlabel(xlabel, fontsize=14) + plt.grid(b=True, which='major', color='#424242', linestyle=':') + plt.xlim((minimumX - step, maximumX + step)) + # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) + plt.xticks(numpy.arange(0, maximumX + step, step)) + + if nr_above_bars: + bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] + for x_label, label in zip(counts, bin_centers): # labels for values + if x_label == 0: + continue + else: + if rel_freq: + plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))), + float(x_label)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001), + xycoords="data", color="#000066", fontsize=10) + else: + plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01), + xycoords="data", color="#000066", fontsize=10) + + if nr_unique_chimeras != 0: + if (sum(counts) / nr_unique_chimeras) == 2: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\ + format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. 
of data points = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1))) + plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure) + + legend2 = "SSCS ab = {:,} ({:.5f})\nSSCS ba = {:,} ({:.5f})\nDCS = {:,} ({:.5f})".format( + len(list1[1]), len(list1[1]) / float(nr_unique_chimeras), + len(list1[2]), len(list1[2]) / float(nr_unique_chimeras), + len(list1[0]), len(list1[0]) / float(nr_unique_chimeras)) + plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure) + + pdf.savefig(fig, bbox_inches="tight") + plt.close("all") + plt.clf() + + +def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False): + fig = plt.figure(figsize=(6, 8)) + plt.subplots_adjust(bottom=0.1) + + ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags + maximumX = numpy.amax(numpy.concatenate(ham_partial)) + minimumX = numpy.amin(numpy.concatenate(ham_partial)) + maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial)))) + + if len(range(minimumX, maximumX)) == 0: + range1 = minimumX + else: + range1 = range(minimumX, maximumX + 2) + + if rel_freq: + w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial] + plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w, + label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"], + bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], + edgecolor='black', linewidth=1) + plt.ylabel("Relative Frequency", fontsize=14) + plt.ylim(0, 1.07) + else: + plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, + label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"], + bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], + edgecolor='black', linewidth=1) + plt.ylabel("Absolute Frequency", fontsize=14) + + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.6, 1)) + plt.suptitle('Tag distances within tags', fontsize=14) + plt.xlabel("TD", fontsize=14) + plt.grid(b=True, which='major', color='#424242', linestyle=':') + plt.xlim((minimumX - 1, maximumX + 1)) + # plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) + plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. 
of data points = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(ham_partial))) + plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) + pdf.savefig(fig, bbox_inches="tight") + plt.close("all") + plt.clf() + + +def createTableFSD2(list1, diff=True): + selfAB = numpy.concatenate(list1) + uniqueFS = numpy.unique(selfAB) + nr = numpy.arange(0, len(uniqueFS), 1) + if diff is False: + count = numpy.zeros((len(uniqueFS), 6)) + else: + count = numpy.zeros((len(uniqueFS), 7)) + state = 1 + for i in list1: + counts = list(Counter(i).items()) + hd = [item[0] for item in counts] + c = [item[1] for item in counts] + table = numpy.column_stack((hd, c)) + if len(table) == 0: + state = state + 1 + continue + else: + if state == 1: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 0] = j[1] + if state == 2: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 1] = j[1] + if state == 3: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 2] = j[1] + if state == 4: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 3] = j[1] + if state == 5: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 4] = j[1] + if state == 6: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 5] = j[1] + if state == 7: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 6] = j[1] + state = state + 1 + sumRow = count.sum(axis=1) + sumCol = count.sum(axis=0) + uniqueFS = uniqueFS.astype(str) + if uniqueFS[len(uniqueFS) - 1] == "20": + uniqueFS[len(uniqueFS) - 1] = ">20" + first = ["FS={}".format(i) for i in uniqueFS] + final = numpy.column_stack((first, count, sumRow)) + return (final, sumCol) + + +def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True): + output_file.write(name) + output_file.write("\n") + if diff is False: + output_file.write("{}TD=1{}TD=2{}TD=3{}TD=4{}TD=5-8{}TD>8{}sum{}\n".format( + sep, sep, sep, sep, sep, sep, sep, sep)) + else: + if rel is False: + output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format( + sep, sep, sep, sep, sep, sep, sep, sep, sep)) + else: + output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n". 
+ format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) + for item in summary: + for nr in item: + if "FS" not in nr and "diff" not in nr: + nr = nr.astype(float) + nr = nr.astype(int) + output_file.write("{}{}".format(nr, sep)) + output_file.write("\n") + output_file.write("sum{}".format(sep)) + sumCol = map(int, sumCol) + for el in sumCol: + output_file.write("{}{}".format(el, sep)) + output_file.write("{}{}".format(overallSum.astype(int), sep)) + output_file.write("\n\n") + + +def createTableHD(list1, row_label): + selfAB = numpy.concatenate(list1) + uniqueHD = numpy.unique(selfAB) + nr = numpy.arange(0, len(uniqueHD), 1) + count = numpy.zeros((len(uniqueHD), 6)) + state = 1 + for i in list1: + counts = list(Counter(i).items()) + hd = [item[0] for item in counts] + c = [item[1] for item in counts] + table = numpy.column_stack((hd, c)) + if len(table) == 0: + state = state + 1 + continue + else: + if state == 1: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 0] = j[1] + if state == 2: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 1] = j[1] + if state == 3: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 2] = j[1] + if state == 4: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 3] = j[1] + if state == 5: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 4] = j[1] + if state == 6: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 5] = j[1] + state = state + 1 + sumRow = count.sum(axis=1) + sumCol = count.sum(axis=0) + first = ["{}{}".format(row_label, i) for i in uniqueHD] + final = numpy.column_stack((first, count, sumRow)) + return (final, sumCol) + + +def createTableHDwithTags(list1): + selfAB = numpy.concatenate(list1) + uniqueHD = numpy.unique(selfAB) + nr = numpy.arange(0, len(uniqueHD), 1) + count = numpy.zeros((len(uniqueHD), 5)) + state = 1 + for i in list1: + counts = list(Counter(i).items()) + hd = [item[0] for item in counts] + c = [item[1] for item in counts] + table = numpy.column_stack((hd, c)) + if len(table) == 0: + state = state + 1 + continue + else: + if state == 1: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 0] = j[1] + if state == 2: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 1] = j[1] + if state == 3: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 2] = j[1] + if state == 4: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 3] = j[1] + if state == 5: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 4] = j[1] + state = state + 1 + sumRow = count.sum(axis=1) + sumCol = count.sum(axis=0) + first = ["TD={}".format(i) for i in uniqueHD] + final = numpy.column_stack((first, count, sumRow)) + return (final, sumCol) + + +def createTableHDwithDCS(list1): + selfAB = numpy.concatenate(list1) + uniqueHD = numpy.unique(selfAB) + nr = numpy.arange(0, len(uniqueHD), 1) + count = numpy.zeros((len(uniqueHD), len(list1))) + state = 1 + for i in list1: + counts = list(Counter(i).items()) + hd = [item[0] for item in counts] + c = [item[1] for item in counts] + table = numpy.column_stack((hd, c)) + if len(table) == 0: + state = state + 1 + continue + else: + if state == 1: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + 
count[l, 0] = j[1] + if state == 2: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 1] = j[1] + if state == 3: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 2] = j[1] + state = state + 1 + sumRow = count.sum(axis=1) + sumCol = count.sum(axis=0) + first = ["TD={}".format(i) for i in uniqueHD] + final = numpy.column_stack((first, count, sumRow)) + return (final, sumCol) + + +def createFileHD(summary, sumCol, overallSum, output_file, name, sep): + output_file.write(name) + output_file.write("\n") + output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format( + sep, sep, sep, sep, sep, sep, sep, sep)) + for item in summary: + for nr in item: + if "TD" not in nr and "diff" not in nr: + nr = nr.astype(float) + nr = nr.astype(int) + output_file.write("{}{}".format(nr, sep)) + output_file.write("\n") + output_file.write("sum{}".format(sep)) + sumCol = map(int, sumCol) + for el in sumCol: + output_file.write("{}{}".format(el, sep)) + output_file.write("{}{}".format(overallSum.astype(int), sep)) + output_file.write("\n\n") + + +def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep): + output_file.write(name) + output_file.write("\n") + output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep)) + for item in summary: + for nr in item: + if "TD" not in nr: + nr = nr.astype(float) + nr = nr.astype(int) + output_file.write("{}{}".format(nr, sep)) + output_file.write("\n") + output_file.write("sum{}".format(sep)) + sumCol = map(int, sumCol) + for el in sumCol: + output_file.write("{}{}".format(el, sep)) + output_file.write("{}{}".format(overallSum.astype(int), sep)) + output_file.write("\n\n") + + +def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep): + output_file.write(name) + output_file.write("\n") + output_file.write("{}TD a.min{}TD b.max{}TD b.min{}TD a.max{}TD a.min + b.max, TD a.max + b.min{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep)) + for item in summary: + for nr in item: + if "TD" not in nr: + nr = nr.astype(float) + nr = nr.astype(int) + output_file.write("{}{}".format(nr, sep)) + output_file.write("\n") + output_file.write("sum{}".format(sep)) + sumCol = map(int, sumCol) + for el in sumCol: + output_file.write("{}{}".format(el, sep)) + output_file.write("{}{}".format(overallSum.astype(int), sep)) + output_file.write("\n\n") + + +def hamming(array1, array2): + res = 99 * numpy.ones(len(array1)) + i = 0 + array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time + for a in array1: + dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest + res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero + i += 1 + return res + + +def hamming_difference(array1, array2, mate_b): + array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time + array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 + array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 + array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 + array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 + + # diff11 = 999 * numpy.ones(len(array2)) + # relativeDiffList = 999 * numpy.ones(len(array2)) + # ham1 = 999 * numpy.ones(len(array2)) + # ham2 = 999 * numpy.ones(len(array2)) + # min_valueList = 999 * numpy.ones(len(array2)) + # min_tagsList = 999 * 
numpy.ones(len(array2)) + # diff11_zeros = 999 * numpy.ones(len(array2)) + # min_tagsList_zeros = 999 * numpy.ones(len(array2)) + + diff11 = [] + relativeDiffList = [] + ham1 = [] + ham2 = [] + ham1min = [] + ham2min = [] + min_valueList = [] + min_tagsList = [] + diff11_zeros = [] + min_tagsList_zeros = [] + max_tag_list = [] + i = 0 # counter, only used to see how many HDs of tags were already calculated + if mate_b is False: # HD calculation for all a's + half1_mate1 = array1_half + half2_mate1 = array1_half2 + half1_mate2 = array2_half + half2_mate2 = array2_half2 + elif mate_b is True: # HD calculation for all b's + half1_mate1 = array1_half2 + half2_mate1 = array1_half + half1_mate2 = array2_half2 + half2_mate2 = array2_half + + # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True) + # print(len(half1_mate1)) + # half2_mate1 = half2_mate1[index_halves] + # array1 = array1[index_halves] + + for a, b, tag in zip(half1_mate1, half2_mate1, array1): + # exclude identical tag from array2, to prevent comparison to itself + sameTag = numpy.where(array2 == tag)[0] + indexArray2 = numpy.arange(0, len(array2), 1) + index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data + + # all tags without identical tag + array2_half_withoutSame = half1_mate2[index_withoutSame] + array2_half2_withoutSame = half2_mate2[index_withoutSame] + array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) + # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" + dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in + array2_half_withoutSame]) + min_index = numpy.where(dist == dist.min())[0] # get index of min HD + min_value = dist.min() + # min_value = dist[min_index] # get minimum HDs + # get all "b's" of the tag or all "a's" of the tag with minimum HD + min_tag_half2 = array2_half2_withoutSame[min_index] + min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD + + dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in + min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" + max_value = dist_second_half.max() + max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD + max_tag = min_tag_array2[max_index] + + # for d, d2 in zip(min_value, max_value): + if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b + ham2.append(min_value) + ham2min.append(max_value) + else: # half1, corrects the variable of the HD from both halfs if it is a or b + ham1.append(min_value) + ham1min.append(max_value) + + min_valueList.append(min_value + max_value) + min_tagsList.append(tag) + difference1 = abs(min_value - max_value) + diff11.append(difference1) + rel_difference = round(float(difference1) / (min_value + max_value), 1) + relativeDiffList.append(rel_difference) + + # tags which have identical parts: + if min_value == 0 or max_value == 0: + min_tagsList_zeros.append(numpy.array(tag)) + difference1_zeros = abs(min_value - max_value) # td of non-identical part + diff11_zeros.append(difference1_zeros) + max_tag_list.append(numpy.array(max_tag)) + else: + min_tagsList_zeros.append(None) + diff11_zeros.append(None) + max_tag_list.append(None) + i += 1 + + # print(i) + # diff11 = [st for st in diff11 if st != 999] + # ham1 = [st for st in ham1 if st != 999] + # ham2 = [st for st in ham2 if st != 999] + # min_valueList = [st for st in min_valueList if st != 999] + # min_tagsList = [st 
for st in min_tagsList if st != 999] + # relativeDiffList = [st for st in relativeDiffList if st != 999] + # diff11_zeros = [st for st in diff11_zeros if st != 999] + # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] + return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, + min_tagsList_zeros, ham1min, ham2min, max_tag_list]) + + +def readFileReferenceFree(file): + with open(file, 'r') as dest_f: + data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') + integers = numpy.array(data_array[:, 0]).astype(int) + return(integers, data_array) + + +def hammingDistanceWithFS(fs, ham): + fs = numpy.asarray(fs) + maximum = max(ham) + minimum = min(ham) + ham = numpy.asarray(ham) + + singletons = numpy.where(fs == 1)[0] + data = ham[singletons] + + hd2 = numpy.where(fs == 2)[0] + data2 = ham[hd2] + + hd3 = numpy.where(fs == 3)[0] + data3 = ham[hd3] + + hd4 = numpy.where(fs == 4)[0] + data4 = ham[hd4] + + hd5 = numpy.where((fs >= 5) & (fs <= 10))[0] + data5 = ham[hd5] + + hd6 = numpy.where(fs > 10)[0] + data6 = ham[hd6] + + list1 = [data, data2, data3, data4, data5, data6] + return(list1, maximum, minimum) + + +def familySizeDistributionWithHD(fs, ham, diff=False, rel=True): + hammingDistances = numpy.unique(ham) + fs = numpy.asarray(fs) + ham = numpy.asarray(ham) + bigFamilies2 = numpy.where(fs > 19)[0] + if len(bigFamilies2) != 0: + fs[bigFamilies2] = 20 + maximum = max(fs) + minimum = min(fs) + if diff is True: + hd0 = numpy.where(ham == 0)[0] + data0 = fs[hd0] + + if rel is True: + hd1 = numpy.where(ham == 0.1)[0] + else: + hd1 = numpy.where(ham == 1)[0] + data = fs[hd1] + + if rel is True: + hd2 = numpy.where(ham == 0.2)[0] + else: + hd2 = numpy.where(ham == 2)[0] + data2 = fs[hd2] + + if rel is True: + hd3 = numpy.where(ham == 0.3)[0] + else: + hd3 = numpy.where(ham == 3)[0] + data3 = fs[hd3] + + if rel is True: + hd4 = numpy.where(ham == 0.4)[0] + else: + hd4 = numpy.where(ham == 4)[0] + data4 = fs[hd4] + + if rel is True: + hd5 = numpy.where((ham >= 0.5) & (ham <= 0.8))[0] + else: + hd5 = numpy.where((ham >= 5) & (ham <= 8))[0] + data5 = fs[hd5] + + if rel is True: + hd6 = numpy.where(ham > 0.8)[0] + else: + hd6 = numpy.where(ham > 8)[0] + data6 = fs[hd6] + + if diff is True: + list1 = [data0, data, data2, data3, data4, data5, data6] + else: + list1 = [data, data2, data3, data4, data5, data6] + + return(list1, hammingDistances, maximum, minimum) + + +def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array): + diff_zeros = numpy.array(diff_zeros) + maximum = numpy.amax(diff_zeros) + minimum = numpy.amin(diff_zeros) + minHD_tags_zeros = numpy.array(minHD_tags_zeros) + + idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros]) + subset_data = data_array[idx, :] + + seq = numpy.array(subset_data[:, 1]) + + # find all unique tags and get the indices for ALL tags, but only once + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + DCS_tags = u[c == 2] + rest_tags = u[c == 1] + + dcs = numpy.repeat("DCS", len(DCS_tags)) + idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags]) + sscs = subset_data[idx_sscs, 2] + + all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])), + numpy.concatenate((dcs, sscs)))) + hd_DCS = [] + ab_SSCS = [] + ba_SSCS = [] + + for i in range(len(all_tags)): + tag = all_tags[i, :] + hd = diff_zeros[numpy.where(minHD_tags_zeros == 
tag[0])[0]] + + if tag[1] == "DCS": + hd_DCS.append(hd) + elif tag[1] == "ab": + ab_SSCS.append(hd) + elif tag[1] == "ba": + ba_SSCS.append(hd) + + if len(hd_DCS) != 0: + hd_DCS = numpy.concatenate(hd_DCS) + if len(ab_SSCS) != 0: + ab_SSCS = numpy.concatenate(ab_SSCS) + if len(ba_SSCS) != 0: + ba_SSCS = numpy.concatenate(ba_SSCS) + list1 = [hd_DCS, ab_SSCS, ba_SSCS] # list for plotting + return(list1, maximum, minimum) + + +def make_argparser(): + parser = argparse.ArgumentParser(description='Tag distance analysis of duplex sequencing data') + parser.add_argument('--inputFile', + help='Tabular File with three columns: ab or ba, tag and family size.') + parser.add_argument('--inputName1') + parser.add_argument('--sample_size', default=1000, type=int, + help='Sample size of Tag distance analysis.') + parser.add_argument('--subset_tag', default=0, type=int, + help='The tag is shortened to the given number.') + parser.add_argument('--nproc', default=4, type=int, + help='The tool runs with the given number of processors.') + parser.add_argument('--only_DCS', action="store_false", + help='Only tags of the DCSs are included in the HD analysis') + + parser.add_argument('--minFS', default=1, type=int, + help='Only tags, which have a family size greater or equal than specified, ' + 'are included in the HD analysis') + parser.add_argument('--maxFS', default=0, type=int, + help='Only tags, which have a family size smaller or equal than specified, ' + 'are included in the HD analysis') + parser.add_argument('--nr_above_bars', action="store_true", + help='If False, values above bars in the histograms are removed') + parser.add_argument('--rel_freq', action="store_false", + help='If True, the relative frequencies are displayed.') + + parser.add_argument('--output_tabular', default="data.tabular", type=str, + help='Name of the tabular file.') + parser.add_argument('--output_pdf', default="data.pdf", type=str, + help='Name of the pdf file.') + parser.add_argument('--output_chimeras_tabular', default="data.tabular", type=str, + help='Name of the tabular file with all chimeric tags.') + + return parser + + +def Hamming_Distance_Analysis(argv): + parser = make_argparser() + args = parser.parse_args(argv[1:]) + file1 = args.inputFile + name1 = args.inputName1 + index_size = args.sample_size + title_savedFile_pdf = args.output_pdf + title_savedFile_csv = args.output_tabular + output_chimeras_tabular = args.output_chimeras_tabular + onlyDuplicates = args.only_DCS + rel_freq = args.rel_freq + minFS = args.minFS + maxFS = args.maxFS + nr_above_bars = args.nr_above_bars + subset = args.subset_tag + nproc = args.nproc + sep = "\t" + + # input checks + if index_size < 0: + print("index_size is a negative integer.") + exit(2) + if nproc <= 0: + print("nproc is smaller or equal zero") + exit(3) + if subset < 0: + print("subset_tag is smaller or equal zero.") + exit(5) + + # PLOT + plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color + plt.rcParams['xtick.labelsize'] = 14 + plt.rcParams['ytick.labelsize'] = 14 + plt.rcParams['patch.edgecolor'] = "#000000" + plt.rc('figure', figsize=(11.69, 8.27)) # A4 format + name1 = name1.split(".tabular")[0] + + with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: + print("dataset: ", name1) + integers, data_array = readFileReferenceFree(file1) + data_array = numpy.array(data_array) + print("total nr of tags:", len(data_array)) + + # filter tags out which contain any other character than ATCG + valid_bases = ["A", "T", "G", "C"] + 
tagsToDelete = [] + for idx, t in enumerate(data_array[:, 1]): + for char in t: + if char not in valid_bases: + tagsToDelete.append(idx) + break + + if len(tagsToDelete) != 0: # delete tags with N in the tag from data + print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete), + float(len(tagsToDelete)) / len(data_array)) + index_whole_array = numpy.arange(0, len(data_array), 1) + index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete) + data_array = data_array[index_withoutN_inTag, :] + integers = integers[index_withoutN_inTag] + print("total nr of filtered tags:", len(data_array)) + + int_f = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f >= minFS)] + integers = integers[integers >= minFS] + + # select family size for tags + if maxFS > 0: + int_f2 = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f2 <= maxFS)] + integers = integers[integers <= maxFS] + + if onlyDuplicates is True: + tags = data_array[:, 2] + seq = data_array[:, 1] + + # find all unique tags and get the indices for ALL tags, but only once + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + d = u[c == 2] + + # get family sizes, tag for duplicates + duplTags_double = integers[numpy.in1d(seq, d)] + duplTags = duplTags_double[0::2] # ab of DCS + duplTagsBA = duplTags_double[1::2] # ba of DCS + + duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab + duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags + + if minFS > 1: + duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)] + duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)] + duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)] # ab+ba with FS>=3 + + data_array = numpy.column_stack((duplTags, duplTags_seq)) + data_array = numpy.column_stack((data_array, duplTags_tag)) + integers = numpy.array(data_array[:, 0]).astype(int) + print("DCS in whole dataset", len(data_array)) + + print("min FS", min(integers)) + print("max FS", max(integers)) + + # HD analysis for a subset of the tag + if subset > 0: + tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) + tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) + + flanking_region_float = float((len(tag1[0]) - subset)) / 2 + flanking_region = int(flanking_region_float) + if flanking_region_float % 2 == 0: + tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1]) + tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2]) + else: + flanking_region_rounded = int(round(flanking_region, 1)) + flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded + tag1_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1]) + tag2_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) + + data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) + data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) + + print("length of tag= ", len(data_array[0, 1])) + # select sample: if no size given --> all vs. 
all comparison + if index_size == 0: + result = numpy.arange(0, len(data_array), 1) + else: + numpy.random.shuffle(data_array) + unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags + result = numpy.random.choice(unique_indices, size=index_size, + replace=False) # array of random sequences of size=index.size + + # result = numpy.random.choice(len(integers), size=index_size, + # replace=False) # array of random sequences of size=index.size + # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0] + + # with open("index_result.pkl", "wb") as o: + # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) + + # save counts + # with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f: + # pickle.dump(result, f, pickle.HIGHEST_PROTOCOL) + # with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1: + # pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL) + # + # with open(data_folder + "index_sampleTags100.pkl", "rb") as f: + # result = pickle.load(f) + # + # with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1: + # data_array = pickle.load(f1) + + # with open(data_folder + "index_result.txt", "w") as t: + # for text in result: + # t.write("{}\n".format(text)) + + # comparison random tags to whole dataset + result1 = data_array[result, 1] # random tags + result2 = data_array[:, 1] # all tags + print("sample size= ", len(result1)) + + # HD analysis of whole tag + proc_pool = Pool(nproc) + chunks_sample = numpy.array_split(result1, nproc) + ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) + proc_pool.close() + proc_pool.join() + ham = numpy.concatenate(ham).astype(int) + # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: + # for h, tag in zip(ham, result1): + # output_file1.write("{}\t{}\n".format(tag, h)) + + # # HD analysis for chimeric reads + # result2 = data_array_whole_dataset[:,1] + + proc_pool_b = Pool(nproc) + diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) + diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) + proc_pool_b.close() + proc_pool_b.join() + HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), + numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) + HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), + numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) + minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), + numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) + HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), + numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) + HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), + numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) + + rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a]) + rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b]) + diff1 = numpy.concatenate([item[0] for item in diff_list_a]) + diff2 = numpy.concatenate([item[0] for item in diff_list_b]) + + diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a]) + diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b]) + minHD_tags = numpy.concatenate([item[4] for item in diff_list_a]) + minHD_tags_zeros1 = 
numpy.concatenate([item[7] for item in diff_list_a]) + minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b]) + + chimera_tags1 = sum([item[10] for item in diff_list_a], []) + chimera_tags2 = sum([item[10] for item in diff_list_b], []) + + rel_Diff = [] + diff_zeros = [] + minHD_tags_zeros = [] + diff = [] + chimera_tags = [] + for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \ + zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, + chimera_tags1, chimera_tags2): + relatives = numpy.array([rel1, rel2]) + absolutes = numpy.array([d1, d2]) + max_idx = numpy.argmax(relatives) + rel_Diff.append(relatives[max_idx]) + diff.append(absolutes[max_idx]) + + if all(i is not None for i in [zeros1, zeros2]): + diff_zeros.append(max(zeros1, zeros2)) + minHD_tags_zeros.append(str(tag1)) + tags = [ctag1, ctag2] + chimera_tags.append(tags) + elif zeros1 is not None and zeros2 is None: + diff_zeros.append(zeros1) + minHD_tags_zeros.append(str(tag1)) + chimera_tags.append(ctag1) + elif zeros1 is None and zeros2 is not None: + diff_zeros.append(zeros2) + minHD_tags_zeros.append(str(tag2)) + chimera_tags.append(ctag2) + + chimera_tags_new = chimera_tags + data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new)) + # chimeras_dic = defaultdict(list) + # + # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): + # if len(t2) >1 and type(t2) is not numpy.ndarray: + # t2 = numpy.concatenate(t2) + # chimeras_dic[t1].append(t2) + + checked_tags = [] + stat_maxTags = [] + + with open(output_chimeras_tabular, "w") as output_file1: + output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with TD=0\n") + for i in range(len(data_chimeraAnalysis)): + tag1 = data_chimeraAnalysis[i, 0] + + info_tag1 = data_array[data_array[:, 1] == tag1, :] + fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1] + + if tag1 in checked_tags: # skip tag if already written to file + continue + + sample_half_a = tag1[0:(len(tag1)) / 2] + sample_half_b = tag1[len(tag1) / 2:len(tag1)] + + max_tags = data_chimeraAnalysis[i, 1] + if len(max_tags) > 1 and len(max_tags) != len(data_array[0, 1]) and type(max_tags) is not numpy.ndarray: + max_tags = numpy.concatenate(max_tags) + max_tags = numpy.unique(max_tags) + stat_maxTags.append(len(max_tags)) + + info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags] + + chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags]) # mate1 part1 + chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags]) # mate1 part 2 + + new_format = [] + for j in range(len(max_tags)): + fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]] + + if sample_half_a == chimera_half_a[j]: + max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) + new_format.append(max_tag) + + elif sample_half_b == chimera_half_b[j]: + max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) + new_format.append(max_tag) + checked_tags.append(max_tags[j]) + + sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1)) + output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) + + checked_tags.append(tag1) + + output_file1.write( + "This file contains all tags that were identified as chimeras as the first column and the " + "corresponding tags which returned a Hamming distance of zero in either the first or the second " + "half of the sample tag as the second 
column.\n" + "The tags were separated by an empty space into their halves and the * marks the identical half.") + output_file1.write("\n\nStatistics of nr. of tags that returned max. TD (2nd column)\n") + output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags)))) + output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags)))) + output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags)))) + output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags)))) + output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags)))) + + lenTags = len(data_array) + len_sample = len(result1) + + quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags + seq = numpy.array(data_array[result, 1]) # tags of sample + ham = numpy.asarray(ham) # HD for sample of tags + + if onlyDuplicates is True: # ab and ba strands of DCSs + quant = numpy.concatenate((quant, duplTagsBA[result])) + seq = numpy.tile(seq, 2) + ham = numpy.tile(ham, 2) + diff = numpy.tile(diff, 2) + rel_Diff = numpy.tile(rel_Diff, 2) + diff_zeros = numpy.tile(diff_zeros, 2) + + nr_chimeric_tags = len(data_chimeraAnalysis) + print("nr of chimeras", nr_chimeric_tags) + + # prepare data for different kinds of plots + # distribution of FSs separated after HD + familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) + list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS + + # get FS for all tags with min HD of analysis of chimeric reads + # there are more tags than sample size in the plot, because one tag can have multiple minimas + if onlyDuplicates: + seqDic = defaultdict(list) + for s, q in zip(seq, quant): + seqDic[s].append(q) + else: + seqDic = dict(zip(seq, quant)) + + lst_minHD_tags = [] + for i in minHD_tags: + lst_minHD_tags.append(seqDic.get(i)) + + if onlyDuplicates: + lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], + [item_b[1] for item_b in lst_minHD_tags])).astype(int) + # histogram with absolute and relative difference between HDs of both parts of the tag + listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) + listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, + rel_Diff) + # chimeric read analysis: tags which have TD=0 in one of the halfs + if len(minHD_tags_zeros) != 0: + lst_minHD_tags_zeros = [] + for i in minHD_tags_zeros: + lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads + if onlyDuplicates: + lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], + [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) + + # histogram with HD of non-identical half + listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( + lst_minHD_tags_zeros, diff_zeros) + + if onlyDuplicates is False: + listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros, + diff_zeros, data_array) + + # plot Hamming Distance with Family size distribution + plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq, + subtitle="Tag distance separated by family size", lenTags=lenTags, + xlabel="TD", nr_above_bars=nr_above_bars, len_sample=len_sample) + + # Plot FSD with separation after + 
plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq, + originalCounts=quant, subtitle="Family size distribution separated by Tag distance", + pdf=pdf, relative=False, diff=False) + + # Plot HD within tags + plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, + rel_freq=rel_freq, len_sample=len_sample) + + # Plot difference between HD's separated after FSD + plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, + subtitle="Delta Tag distance within tags", lenTags=lenTags, rel_freq=rel_freq, + xlabel="absolute delta TD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample) + + plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, + subtitle="Chimera Analysis: relative delta Tag distance", lenTags=lenTags, rel_freq=rel_freq, + xlabel="relative delta TD", relative=True, nr_above_bars=nr_above_bars, + nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + + # plots for chimeric reads + if len(minHD_tags_zeros) != 0: + # HD + plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, + subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq, + lenTags=lenTags, xlabel="TD", relative=False, + nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + + if onlyDuplicates is False: + plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf, + subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq, + lenTags=lenTags, xlabel="TD", relative=False, + nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + + # print all data to a CSV file + # HD + summary, sumCol = createTableHD(list1, "TD=") + overallSum = sum(sumCol) # sum of columns in table + + # FSD + summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) + overallSum5 = sum(sumCol5) + + # HD of both parts of the tag + summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)]) + overallSum9 = sum(sumCol9) + + # HD + # absolute difference + summary11, sumCol11 = createTableHD(listDifference1, "diff=") + overallSum11 = sum(sumCol11) + # relative difference and all tags + summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") + overallSum13 = sum(sumCol13) + + # chimeric reads + if len(minHD_tags_zeros) != 0: + # absolute difference and tags where at least one half has HD=0 + summary15, sumCol15 = createTableHD(listDifference1_zeros, "TD=") + overallSum15 = sum(sumCol15) + + if onlyDuplicates is False: + summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros) + overallSum16 = sum(sumCol16) + + output_file.write("{}\n".format(name1)) + output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample)) + + # HD + createFileHD(summary, sumCol, overallSum, output_file, + "Tag distance separated by family size", sep) + # FSD + createFileFSD2(summary5, sumCol5, overallSum5, output_file, + "Family size distribution separated by Tag distance", sep, + diff=False) + + # output_file.write("{}{}\n".format(sep, name1)) + output_file.write("\n") + max_fs = numpy.bincount(integers[result]) + output_file.write("max. 
family size in sample:{}{}\n".format(sep, max(integers[result])))
+        output_file.write("absolute frequency:{}{}\n".format(sep, max_fs[len(max_fs) - 1]))
+        output_file.write(
+            "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))
+
+        # HD within tags
+        output_file.write(
+            "Chimera Analysis:\nThe tags are split into two halves (part a and b) for which the Tag distances (TD) are calculated separately.\n"
+            "The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min).\n"
+            "In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset.\n"
+            "The maximum value then represents TD b.max. Finally, this process is repeated starting with part b instead, and TD b.min and TD a.max are calculated.\n"
+            "Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta TD).\n"
+            "These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively), which gives the relative differences between the partial TDs (rel. delta TD).\n"
+            "For simplicity, we use the maximum value of the relative differences and the respective delta TD.\n"
+            "Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots.\n")
+
+        output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2))
+
+        createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
+                              "Tag distance of each half in the tag", sep)
+        createFileHD(summary11, sumCol11, overallSum11, output_file,
+                     "Absolute delta Tag distance within the tag", sep)
+
+        createFileHD(summary13, sumCol13, overallSum13, output_file,
+                     "Chimera analysis: relative delta Tag distance", sep)
+
+        if len(minHD_tags_zeros) != 0:
+            output_file.write(
+                "All tags are filtered and only those tags where one half is identical (TD=0), and which therefore have a relative delta TD of 1, are kept.\n"
+                "These tags are considered as chimeras.\n")
+            createFileHD(summary15, sumCol15, overallSum15, output_file,
+                         "Tag distance of chimeric families separated by FS", sep)
+
+            if onlyDuplicates is False:
+                createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
+                                    "Tag distance of chimeric families separated by DCS and single SSCS (ab, ba)", sep)
+
+        output_file.write("\n")
+
+
+if __name__ == '__main__':
+    sys.exit(Hamming_Distance_Analysis(sys.argv))
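The core of the HD analysis above is the hamming() helper: each sampled tag is compared against every unique tag in the dataset, and the smallest non-zero Hamming distance is kept (a distance of zero would be the tag matched against itself). A minimal self-contained sketch of that step (Python 3; the original uses itertools.imap/operator.ne, and the 4-mer tags here are hypothetical placeholders, much shorter than real tags):

import numpy

def min_nonzero_hamming(sample_tags, all_tags):
    # For each sampled tag, compute Hamming distances to all unique tags
    # and keep the minimum distance greater than zero, as hamming() does.
    all_tags = numpy.unique(all_tags)  # deduplicate to save comparisons
    res = []
    for a in sample_tags:
        dist = numpy.array([sum(x != y for x, y in zip(a, b)) for b in all_tags])
        res.append(numpy.amin(dist[dist > 0]))
    return numpy.array(res)

tags = ["ACGT", "ACGA", "TTTT"]             # placeholder 4-mer tags
print(min_nonzero_hamming(["ACGT"], tags))  # [1] -- nearest other tag is ACGA

hamming_difference() applies the same idea to each tag half separately; a chimeric family then shows up as a tag with TD=0 on one half but a large TD on the other, which is exactly the filter described in the output text above.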