Mercurial > repos > mheinzl > td
comparison td.py @ 0:3e56058d9552 draft default tip
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 9bae9043a53f1e07b502acd1082450adcb6d9e31-dirty
| author | mheinzl |
|---|---|
| date | Wed, 16 Oct 2019 04:17:59 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3e56058d9552 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # Tag distance analysis of SSCSs | |
| 4 # | |
| 5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) | |
| 6 # Contact: monika.heinzl@edumail.at | |
| 7 # | |
| 8 # Takes at least one TABULAR file with tags before the alignment to the SSCS and | |
| 9 # optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances | |
| 10 # separated after family sizes, a family size distribution separated after Hamming distances for all (sample_size=0) | |
| 11 # or a given sample of SSCSs or SSCSs, which form a DCS. In addition, the tool produces HD and FSD plots for the | |
| 12 # difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the | |
| 13 # data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input. | |
| 14 # The tool can run on a certain number of processors, which can be defined by the user. | |
| 15 | |
| 16 # USAGE: python td.py --inputFile filename --inputName1 filename --sample_size int / | |
| 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int | |
| 18 # --nr_above_bars True/False --output_tabular outputfile_name_tabular | |
| 19 | |
| 20 import argparse | |
| 21 import itertools | |
| 22 import operator | |
| 23 import sys | |
| 24 from collections import Counter, defaultdict | |
| 25 from functools import partial | |
| 26 from multiprocessing.pool import Pool | |
| 27 import random | |
| 28 import os | |
| 29 | |
| 30 import matplotlib.pyplot as plt | |
| 31 import numpy | |
| 32 from matplotlib.backends.backend_pdf import PdfPages | |
| 33 | |
| 34 plt.switch_backend('agg') | |
| 35 | |
| 36 | |
def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
                   subtitle, pdf, relative=False, diff=True, rel_freq=False):
    """Plot the family size distribution (FSD) stacked by tag-distance class.

    familySizeList1 -- list of arrays of family sizes, one array per TD class.
    maximumXFS/minimumXFS -- largest/smallest family size on the x axis.
    originalCounts -- unclipped family sizes, used for the text legend.
    subtitle -- figure title; pdf -- open PdfPages object the figure is saved to.
    relative -- labels are relative differences (d=0.1, ...) instead of absolute.
    diff -- True: 7 difference classes; False: 6 TD classes.
    rel_freq -- plot relative instead of absolute frequencies.
    """
    if diff is False:
        colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
        labels = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8"]
    else:
        colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
        if relative is True:
            labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
        else:
            labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]

    fig = plt.figure(figsize=(6, 7))
    ax = fig.add_subplot(111)
    plt.subplots_adjust(bottom=0.1)
    p1 = numpy.bincount(numpy.concatenate(familySizeList1))
    maximumY = numpy.amax(p1)

    # degenerate x range (all values identical): show a small window around it
    if len(range(minimumXFS, maximumXFS)) == 0:
        range1 = range(minimumXFS - 1, minimumXFS + 2)
    else:
        range1 = range(0, maximumXFS + 2)

    if rel_freq:
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1]
        counts = plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True,
                          rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim((0, 1.07))
    else:
        counts = plt.hist(familySizeList1, label=labels, color=colors, stacked=True,
                          rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        if len(numpy.concatenate(familySizeList1)) != 0:
            plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
        plt.ylabel("Absolute Frequency", fontsize=14)
        plt.ylim((0, maximumY * 1.2))
    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel("Family size", fontsize=14)
    ticks = numpy.arange(0, maximumXFS + 1, 1)
    # BUGFIX (py3 compatibility): map() returns an iterator, which supports
    # neither len() nor item assignment below -- materialize it as a list.
    ticks1 = list(map(str, ticks))
    if maximumXFS >= 20:
        ticks1[len(ticks1) - 1] = ">=20"  # sizes >= 20 are clipped into one bin upstream
    plt.xticks(numpy.array(ticks), ticks1)
    # only show every 5th tick label to keep the axis readable
    for i, lab in enumerate(ax.get_xticklabels()):
        if i % 5 != 0:
            lab.set_visible(False)
    plt.xlim((0, maximumXFS + 1))
    legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: "
    plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)

    if max(originalCounts) >= 20:
        max_count = ">= 20"
    else:
        max_count = max(originalCounts)
    legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1))
    plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
    legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1))
    plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
| 98 | |
| 99 | |
def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a tag-distance (TD) histogram stacked by family-size (FS) class.

    list1 -- one array of TDs per FS class (FS=1, 2, 3, 4, 5-10, >10).
    maximumX/minimumX -- x-axis limits; lenTags -- total nr of tags for the legend.
    subtitle -- figure title; pdf -- open PdfPages object; xlabel -- x-axis caption.
    relative -- TDs are relative differences (bin width 0.1 instead of 1).
    nr_above_bars -- annotate each bar with its absolute and relative count.
    nr_unique_chimeras -- nr of chimeric families; 0 omits it from the legend.
    len_sample -- sample size printed in the legend; rel_freq -- relative y axis.
    """
    if relative is True:
        step = 0.1
    else:
        step = 1

    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # frequency of each distinct TD; .items() instead of py2-only .iteritems()
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).items())])
    maximumY = numpy.amax(p1)
    if relative is True:  # relative difference
        bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
    else:
        bin1 = maximumX + 1

    if rel_freq:
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # width of bins
        # list(map(...)): map() is lazy on python 3 and cannot feed numpy.array directly
        counts = numpy.array(list(map(float, counts[0][5])))
    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        maximumY = numpy.amax(p1)
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # width of bins
        counts = numpy.array(list(map(int, counts[0][5])))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            else:
                if rel_freq:
                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                       float(x_label)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                                 xycoords="data", color="#000066", fontsize=10)
                else:
                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                                 xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        # a factor of two means every chimeric family contributed both partners
        if (relative and ((counts[len(counts) - 1] / nr_unique_chimeras) == 2)) or \
                (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
                .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))

    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
| 178 | |
| 179 | |
def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a tag-distance (TD) histogram of chimeric reads stacked as DCS / SSCS ab / SSCS ba.

    list1 -- [DCS TDs, SSCS-ab TDs, SSCS-ba TDs].
    maximumX/minimumX -- x-axis limits; lenTags -- total nr of tags for the legend.
    subtitle -- figure title; pdf -- open PdfPages object; xlabel -- x-axis caption.
    relative -- kept for a uniform signature with plotHDwithFSD (step is always 1 here).
    nr_above_bars -- annotate each bar with its absolute and relative count.
    nr_unique_chimeras -- nr of chimeric families; 0 omits the CF lines from the legends.
    len_sample -- sample size printed in the legend; rel_freq -- relative y axis.
    """
    step = 1
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # frequency of each distinct TD; .items() instead of py2-only .iteritems()
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).items())])
    maximumY = numpy.amax(p1)
    bin1 = maximumX + 1
    if rel_freq:
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # width of bins
        # list(map(...)): map() is lazy on python 3
        counts = numpy.array(list(map(float, counts[0][2])))
    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # width of bins
        counts = numpy.array(list(map(int, counts[0][2])))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            else:
                if rel_freq:
                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                       float(x_label)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                                 xycoords="data", color="#000066", fontsize=10)
                else:
                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                                 xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        # a factor of two means every chimeric family contributed both partners
        if (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
                format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))
    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)

    # BUGFIX: the original divided by nr_unique_chimeras unconditionally, which
    # raised ZeroDivisionError when nr_unique_chimeras == 0; only print the
    # per-family ratios when the denominator is meaningful.
    if nr_unique_chimeras != 0:
        legend2 = "SSCS ab = {:,} ({:.5f})\nSSCS ba = {:,} ({:.5f})\nDCS = {:,} ({:.5f})".format(
            len(list1[1]), len(list1[1]) / float(nr_unique_chimeras),
            len(list1[2]), len(list1[2]) / float(nr_unique_chimeras),
            len(list1[0]), len(list1[0]) / float(nr_unique_chimeras))
        plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)

    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
| 252 | |
| 253 | |
def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
    """Plot the tag distances WITHIN tags (a vs. b halves) as five side-by-side histograms.

    sum1/sum1min -- min/max TDs of the a halves; sum2/sum2min -- of the b halves.
    min_value -- combined TD (min + max) per tag.
    lenTags -- total nr of tags; pdf -- open PdfPages object;
    len_sample -- sample size printed in the legend; rel_freq -- relative y axis.
    """
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)

    ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # new hd within tags
    maximumX = numpy.amax(numpy.concatenate(ham_partial))
    minimumX = numpy.amin(numpy.concatenate(ham_partial))
    # BUGFIX (py3 compatibility): numpy.concatenate cannot consume a lazy map()
    # object on python 3 -- use a list comprehension instead.
    maximumY = numpy.amax(numpy.concatenate([numpy.bincount(x) for x in ham_partial]))

    # degenerate x range (all values identical): hist gets a single bin count
    if len(range(minimumX, maximumX)) == 0:
        range1 = minimumX
    else:
        range1 = range(minimumX, maximumX + 2)

    if rel_freq:
        w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim(0, 1.07)
    else:
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Absolute Frequency", fontsize=14)

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.6, 1))
    plt.suptitle('Tag distances within tags', fontsize=14)
    plt.xlabel("TD", fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - 1, maximumX + 1))
    plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
        lenTags, len_sample, len(numpy.concatenate(ham_partial)))
    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
| 296 | |
| 297 | |
def createTableFSD2(list1, diff=True):
    """Tabulate family-size counts per TD (or difference) class.

    list1 -- one array of family sizes per class: 6 classes if diff is False,
    7 otherwise (extra entries beyond that are ignored, as in the original
    state machine).
    Returns (table, column_sums): table rows are labelled "FS=<size>" and hold
    one count column per class plus a trailing row sum.
    """
    selfAB = numpy.concatenate(list1)
    uniqueFS = numpy.unique(selfAB)
    n_cols = 6 if diff is False else 7
    count = numpy.zeros((len(uniqueFS), n_cols))
    # One column per input class. The original unrolled this into seven nearly
    # identical "state" blocks; a single loop over the class index is equivalent.
    for col, data in enumerate(list1[:n_cols]):
        freq = Counter(data)
        if len(freq) == 0:  # empty class: its column stays all zero
            continue
        for row, fs in enumerate(uniqueFS):
            if fs in freq:
                count[row, col] = freq[fs]
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    uniqueFS = uniqueFS.astype(str)
    if uniqueFS[len(uniqueFS) - 1] == "20":
        uniqueFS[len(uniqueFS) - 1] = ">20"  # sizes >= 20 were clipped upstream
    first = ["FS={}".format(i) for i in uniqueFS]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
| 360 | |
| 361 | |
def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True):
    """Write one FSD table (rows = family sizes, columns = TD/difference classes)
    to output_file, using sep as the column separator. Cells that are not row
    labels are written as integers; a "sum" row with sumCol and overallSum
    closes the table.
    """
    output_file.write(name + "\n")
    if diff is False:
        header_cols = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8", "sum"]
    else:
        if rel is False:
            header_cols = ["diff=0", "diff=1", "diff=2", "diff=3", "diff=4", "diff=5-8", "diff>8", "sum"]
        else:
            header_cols = ["diff=0", "diff=0.1", "diff=0.2", "diff=0.3", "diff=0.4", "diff=0.5-0.8", "diff>0.8", "sum"]
    output_file.write(sep + sep.join(header_cols) + sep + "\n")
    for row in summary:
        for cell in row:
            # numeric cells arrive as strings like "3.0"; round-trip them to int
            if "FS" not in cell and "diff" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum" + sep)
    for column_total in map(int, sumCol):
        output_file.write("{}{}".format(column_total, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
| 388 | |
| 389 | |
def createTableHD(list1, row_label):
    """Tabulate tag-distance counts per family-size class.

    list1 -- one array of TDs per FS class (6 classes; extras beyond 6 are
    ignored, as in the original state machine).
    row_label -- prefix for the row labels (e.g. "TD=").
    Returns (table, column_sums): table rows hold one count column per class
    plus a trailing row sum.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), 6))
    # One column per FS class. The original unrolled this into six identical
    # "state" blocks; a single loop over the class index is equivalent.
    for col, data in enumerate(list1[:6]):
        freq = Counter(data)
        if len(freq) == 0:  # empty class: its column stays all zero
            continue
        for row, hd in enumerate(uniqueHD):
            if hd in freq:
                count[row, col] = freq[hd]
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["{}{}".format(row_label, i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
| 441 | |
| 442 | |
def createTableHDwithTags(list1):
    """Tabulate tag-distance counts for the within-tag analysis (5 columns:
    TD a.min, TD b.max, TD b.min, TD a.max, combined TD).

    list1 -- one array of TDs per column (extras beyond 5 are ignored, as in
    the original state machine).
    Returns (table, column_sums): table rows are labelled "TD=<d>" and hold
    one count column per class plus a trailing row sum.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), 5))
    # One column per input list; the original's five "state" blocks collapsed
    # into a single loop over the column index.
    for col, data in enumerate(list1[:5]):
        freq = Counter(data)
        if len(freq) == 0:  # empty class: its column stays all zero
            continue
        for row, hd in enumerate(uniqueHD):
            if hd in freq:
                count[row, col] = freq[hd]
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
| 489 | |
| 490 | |
def createTableHDwithDCS(list1):
    """Tabulate tag-distance counts for chimeric reads (columns: DCS, SSCS ab,
    SSCS ba).

    list1 -- one array of TDs per column; the table gets len(list1) columns
    but, as in the original state machine, only the first 3 are ever filled.
    Returns (table, column_sums): table rows are labelled "TD=<d>" and hold
    the count columns plus a trailing row sum.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), len(list1)))
    # One column per input list; the original's three "state" blocks collapsed
    # into a single loop over the column index.
    for col, data in enumerate(list1[:3]):
        freq = Counter(data)
        if len(freq) == 0:  # empty class: its column stays all zero
            continue
        for row, hd in enumerate(uniqueHD):
            if hd in freq:
                count[row, col] = freq[hd]
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
| 527 | |
| 528 | |
def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
    """Write one TD table (rows = tag distances, columns = family-size
    classes) to output_file, using sep as the column separator. Non-label
    cells are written as integers; a "sum" row with sumCol and overallSum
    closes the table.
    """
    header_cols = ["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10", "sum"]
    output_file.write(name + "\n")
    output_file.write(sep + sep.join(header_cols) + sep + "\n")
    for row in summary:
        for cell in row:
            # numeric cells arrive as strings like "3.0"; round-trip them to int
            if "TD" not in cell and "diff" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum" + sep)
    for column_total in map(int, sumCol):
        output_file.write("{}{}".format(column_total, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
| 547 | |
| 548 | |
def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep):
    """Write one TD table of chimeric reads (columns: DCS, SSCS ab, SSCS ba)
    to output_file, using sep as the column separator. Non-label cells are
    written as integers; a "sum" row with sumCol and overallSum closes the
    table.
    """
    header_cols = ["DCS", "SSCS ab", "SSCS ba", "sum"]
    output_file.write(name + "\n")
    output_file.write(sep + sep.join(header_cols) + sep + "\n")
    for row in summary:
        for cell in row:
            # numeric cells arrive as strings like "3.0"; round-trip them to int
            if "TD" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum" + sep)
    for column_total in map(int, sumCol):
        output_file.write("{}{}".format(column_total, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
| 566 | |
| 567 | |
def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
    """Write one within-tag TD table (columns: TD a.min, TD b.max, TD b.min,
    TD a.max, combined TD) to output_file, using sep as the column separator.
    Non-label cells are written as integers; a "sum" row with sumCol and
    overallSum closes the table.
    """
    header_cols = ["TD a.min", "TD b.max", "TD b.min", "TD a.max",
                   "TD a.min + b.max, TD a.max + b.min", "sum"]
    output_file.write(name + "\n")
    output_file.write(sep + sep.join(header_cols) + sep + "\n")
    for row in summary:
        for cell in row:
            # numeric cells arrive as strings like "3.0"; round-trip them to int
            if "TD" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum" + sep)
    for column_total in map(int, sumCol):
        output_file.write("{}{}".format(column_total, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
| 585 | |
| 586 | |
def hamming(array1, array2):
    """Return, for every tag in array1, the minimum NON-ZERO Hamming distance
    to the tags in array2 (a zero distance would be the tag itself).

    Raises ValueError (numpy.amin on an empty array) if a tag has distance 0
    to everything in array2 -- unchanged from the original behaviour.
    """
    res = 99 * numpy.ones(len(array1))
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    for i, a in enumerate(array1):
        # BUGFIX (py3 compatibility): itertools.imap no longer exists; a plain
        # generator over zip() computes the same character-wise mismatch count.
        dist = numpy.array([sum(c1 != c2 for c1, c2 in zip(a, b)) for b in array2])
        res[i] = numpy.amin(dist[dist > 0])  # pick min distance greater than zero
    return res
| 596 | |
| 597 | |
def hamming_difference(array1, array2, mate_b):
    """Chimera analysis: compare the two halves of each tag separately.

    For every tag in array1, split it into halves a and b. Find the tags in
    array2 whose corresponding first half has minimal Hamming distance to a,
    then among those the maximal distance of the second half to b.

    array1, array2 -- tag sequences (array2 is deduplicated internally).
    mate_b -- False: start from the a half; True: start from the b half.

    Returns a list of 11 parallel per-tag result lists:
    [abs differences, min TDs of a, min TDs of b, combined TDs, tags,
     relative differences, differences for tags with an identical half (else
     None), those tags (else None), max TDs of a, max TDs of b, partner tags
     with max TD (else None)].
    """
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    # BUGFIX (py3 compatibility): use floor division -- "len(i) / 2" is a float
    # on python 3 and cannot be used as a slice index.
    array1_half = numpy.array([i[0:len(i) // 2] for i in array1])  # mate1 part1
    array1_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array1])  # mate1 part2
    array2_half = numpy.array([i[0:len(i) // 2] for i in array2])  # mate2 part1
    array2_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array2])  # mate2 part2

    diff11 = []
    relativeDiffList = []
    ham1 = []
    ham2 = []
    ham1min = []
    ham2min = []
    min_valueList = []
    min_tagsList = []
    diff11_zeros = []
    min_tagsList_zeros = []
    max_tag_list = []

    if mate_b:  # TD calculation starts from the b halves
        half1_mate1 = array1_half2
        half2_mate1 = array1_half
        half1_mate2 = array2_half2
        half2_mate2 = array2_half
    else:  # TD calculation starts from the a halves
        half1_mate1 = array1_half
        half2_mate1 = array1_half2
        half1_mate2 = array2_half
        half2_mate2 = array2_half2

    for a, b, tag in zip(half1_mate1, half2_mate1, array1):
        # exclude the identical tag from array2 to prevent comparison with itself
        sameTag = numpy.where(array2 == tag)[0]
        index_withoutSame = numpy.delete(numpy.arange(0, len(array2), 1), sameTag)

        # all tags without the identical tag
        array2_half_withoutSame = half1_mate2[index_withoutSame]
        array2_half2_withoutSame = half2_mate2[index_withoutSame]
        array2_withoutSame = array2[index_withoutSame]  # whole tag (not split into halves)

        # HD of this half to the corresponding halves of all other tags
        # (itertools.imap replaced by a zip generator for python 3)
        dist = numpy.array([sum(c1 != c2 for c1, c2 in zip(a, c))
                            for c in array2_half_withoutSame])
        min_index = numpy.where(dist == dist.min())[0]  # index of min HD
        min_value = dist.min()
        # the partner halves / whole tags that realise the minimum HD
        min_tag_half2 = array2_half2_withoutSame[min_index]
        min_tag_array2 = array2_withoutSame[min_index]

        # HD of the complementary half, restricted to the min-HD partners
        dist_second_half = numpy.array([sum(c1 != c2 for c1, c2 in zip(b, e))
                                        for e in min_tag_half2])
        max_value = dist_second_half.max()
        max_index = numpy.where(dist_second_half == dist_second_half.max())[0]
        max_tag = min_tag_array2[max_index]

        # store the min/max under the half (a or b) the scan started from
        if mate_b:
            ham2.append(min_value)
            ham2min.append(max_value)
        else:
            ham1.append(min_value)
            ham1min.append(max_value)

        min_valueList.append(min_value + max_value)
        min_tagsList.append(tag)
        difference1 = abs(min_value - max_value)
        diff11.append(difference1)
        # NOTE: raises ZeroDivisionError if both halves have an identical
        # partner (min_value == max_value == 0) -- unchanged from the original.
        rel_difference = round(float(difference1) / (min_value + max_value), 1)
        relativeDiffList.append(rel_difference)

        # tags which have identical parts:
        if min_value == 0 or max_value == 0:
            min_tagsList_zeros.append(numpy.array(tag))
            diff11_zeros.append(abs(min_value - max_value))  # td of non-identical part
            max_tag_list.append(numpy.array(max_tag))
        else:
            min_tagsList_zeros.append(None)
            diff11_zeros.append(None)
            max_tag_list.append(None)

    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros,
             min_tagsList_zeros, ham1min, ham2min, max_tag_list])
| 706 | |
| 707 | |
def readFileReferenceFree(file):
    """Read a tabular SSCS file into numpy arrays.

    Parameters
    ----------
    file : str
        Path to a tab-separated file whose column 0 is an integer family
        size, column 1 the tag sequence and column 2 the read direction
        (ab/ba). Lines starting with '#' are skipped as comments.

    Returns
    -------
    (integers, data_array)
        integers: the family-size column cast to int;
        data_array: the whole table with every cell read as a string.
    """
    with open(file, 'r') as dest_f:
        # dtype=str instead of the removed numpy alias 'string': same effect
        # (read every cell as a string), but works on current numpy versions.
        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype=str)
    integers = numpy.array(data_array[:, 0]).astype(int)
    return (integers, data_array)
| 713 | |
| 714 | |
def hammingDistanceWithFS(fs, ham):
    """Partition Hamming distances by the family size of their tags.

    Returns a list with one HD array per family-size bin
    (FS = 1, 2, 3, 4, 5-10, >10) plus the overall maximum and
    minimum HD observed in `ham`.
    """
    maximum = max(ham)
    minimum = min(ham)
    fs = numpy.asarray(fs)
    ham = numpy.asarray(ham)

    # one boolean mask per family-size bin, in plotting order
    bin_masks = [
        fs == 1,
        fs == 2,
        fs == 3,
        fs == 4,
        (fs >= 5) & (fs <= 10),
        fs > 10,
    ]
    list1 = [ham[numpy.where(mask)[0]] for mask in bin_masks]
    return (list1, maximum, minimum)
| 741 | |
| 742 | |
def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
    """Partition family sizes by the Hamming distance (HD) of their tags.

    Parameters
    ----------
    fs : sequence of int
        Family sizes; values above 19 are clipped into a single ">=20" bin.
    ham : sequence of numbers
        One HD (or delta HD) per tag. Absolute values when rel is False,
        relative values (0.1, 0.2, ...) when rel is True.
    diff : bool
        If True, prepend an extra group for HD == 0 (identical halves).
    rel : bool
        Selects relative (0.1..0.4, 0.5-0.8, >0.8) or absolute
        (1..4, 5-8, >8) bin edges.

    Returns
    -------
    (list1, hammingDistances, maximum, minimum)
        list1: family sizes grouped per HD bin;
        hammingDistances: unique HD values found in `ham`;
        maximum / minimum: extreme (clipped) family sizes.
    """
    hammingDistances = numpy.unique(ham)
    # numpy.array (NOT asarray) forces a copy here: the clipping below must
    # not mutate the caller's array — callers re-use the original family
    # sizes after this function returns.
    fs = numpy.array(fs)
    ham = numpy.asarray(ham)
    bigFamilies2 = numpy.where(fs > 19)[0]
    if len(bigFamilies2) != 0:
        fs[bigFamilies2] = 20  # clip all large families into one bin
    maximum = max(fs)
    minimum = min(fs)

    if diff is True:
        data0 = fs[numpy.where(ham == 0)[0]]  # tags with HD == 0

    # Bin edges: HDs 1-4 individually, 5-8 pooled, >8 pooled; divided by ten
    # for relative differences. The float literals are kept verbatim so the
    # exact `==` comparisons match the values produced upstream
    # (e.g. 3 * 0.1 != 0.3 in floating point).
    if rel is True:
        single_bins = [0.1, 0.2, 0.3, 0.4]
        pooled = (ham >= 0.5) & (ham <= 0.8)
        tail = ham > 0.8
    else:
        single_bins = [1, 2, 3, 4]
        pooled = (ham >= 5) & (ham <= 8)
        tail = ham > 8

    list1 = [fs[numpy.where(ham == b)[0]] for b in single_bins]
    list1.append(fs[numpy.where(pooled)[0]])
    list1.append(fs[numpy.where(tail)[0]])
    if diff is True:
        list1 = [data0] + list1

    return (list1, hammingDistances, maximum, minimum)
| 798 | |
| 799 | |
def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array):
    """Split the TDs of chimeric tags by whether the tag forms a DCS or a single SSCS.

    Parameters:
        minHD_tags_zeros: tags for which one half had TD=0 (chimera candidates).
        diff_zeros: one TD (of the non-identical half) per entry of minHD_tags_zeros.
        data_array: full dataset (family size, tag, direction) as a string array.

    Returns ([hd_DCS, ab_SSCS, ba_SSCS], max TD, min TD) for plotting, where a tag
    counts as "DCS" if its sequence occurs twice in the dataset (ab and ba rows)
    and otherwise is classified by its single row's direction ("ab"/"ba").
    """
    diff_zeros = numpy.array(diff_zeros)
    maximum = numpy.amax(diff_zeros)
    minimum = numpy.amin(diff_zeros)
    minHD_tags_zeros = numpy.array(minHD_tags_zeros)

    # rows of the full dataset whose tag occurs among the chimeric tags
    # NOTE(review): numpy.concatenate raises on an empty list — callers guard
    # with len(minHD_tags_zeros) != 0 before calling; confirm that holds.
    idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros])
    subset_data = data_array[idx, :]

    seq = numpy.array(subset_data[:, 1])

    # find all unique tags and get the indices for ALL tags, but only once
    u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
    DCS_tags = u[c == 2]   # seen twice -> both ab and ba present -> forms a DCS
    rest_tags = u[c == 1]  # seen once -> single SSCS

    dcs = numpy.repeat("DCS", len(DCS_tags))
    # NOTE(review): same empty-list caveat if every chimeric tag forms a DCS.
    idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags])
    sscs = subset_data[idx_sscs, 2]  # read direction ("ab"/"ba") of the single SSCSs

    # pair each tag with its label ("DCS", "ab" or "ba")
    all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])),
                                   numpy.concatenate((dcs, sscs))))
    hd_DCS = []
    ab_SSCS = []
    ba_SSCS = []

    for i in range(len(all_tags)):
        tag = all_tags[i, :]
        # all TDs recorded for this tag (a tag can appear multiple times)
        hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]]

        if tag[1] == "DCS":
            hd_DCS.append(hd)
        elif tag[1] == "ab":
            ab_SSCS.append(hd)
        elif tag[1] == "ba":
            ba_SSCS.append(hd)

    # flatten the per-tag arrays; an empty category stays a plain empty list
    if len(hd_DCS) != 0:
        hd_DCS = numpy.concatenate(hd_DCS)
    if len(ab_SSCS) != 0:
        ab_SSCS = numpy.concatenate(ab_SSCS)
    if len(ba_SSCS) != 0:
        ba_SSCS = numpy.concatenate(ba_SSCS)
    list1 = [hd_DCS, ab_SSCS, ba_SSCS]  # list for plotting
    return(list1, maximum, minimum)
| 845 | |
| 846 | |
def make_argparser():
    """Assemble the command-line interface of the tag distance analysis tool."""
    parser = argparse.ArgumentParser(description='Tag distance analysis of duplex sequencing data')
    add = parser.add_argument

    # input files
    add('--inputFile',
        help='Tabular File with three columns: ab or ba, tag and family size.')
    add('--inputName1')

    # analysis options
    add('--sample_size', default=1000, type=int,
        help='Sample size of Tag distance analysis.')
    add('--subset_tag', default=0, type=int,
        help='The tag is shortened to the given number.')
    add('--nproc', default=4, type=int,
        help='The tool runs with the given number of processors.')
    add('--only_DCS', action="store_false",
        help='Only tags of the DCSs are included in the HD analysis')
    add('--minFS', default=1, type=int,
        help='Only tags, which have a family size greater or equal than specified, '
             'are included in the HD analysis')
    add('--maxFS', default=0, type=int,
        help='Only tags, which have a family size smaller or equal than specified, '
             'are included in the HD analysis')

    # plot options
    add('--nr_above_bars', action="store_true",
        help='If False, values above bars in the histograms are removed')
    add('--rel_freq', action="store_false",
        help='If True, the relative frequencies are displayed.')

    # output files
    add('--output_tabular', default="data.tabular", type=str,
        help='Name of the tabular file.')
    add('--output_pdf', default="data.pdf", type=str,
        help='Name of the pdf file.')
    add('--output_chimeras_tabular', default="data.tabular", type=str,
        help='Name of the tabular file with all chimeric tags.')

    return parser
| 880 | |
| 881 | |
def Hamming_Distance_Analysis(argv):
    """Run the complete tag distance (TD) analysis and write all outputs.

    Reads the SSCS tag file, optionally keeps only DCS-forming tags
    (--only_DCS) and/or shortens tags to a central subset (--subset_tag),
    draws a random sample of unique tags (--sample_size, 0 = all vs. all),
    computes Hamming distances of whole tags and of both tag halves against
    the whole dataset in parallel (--nproc), identifies chimeric tags (one
    identical half, i.e. TD=0), and writes the plots (PDF), a summary
    tabular file and a tabular file listing the chimeric tags.

    NOTE(review): this file targets Python 2 — the string slicing below uses
    `len(...) / 2` (relies on integer division; would be a float index on
    Python 3) and other parts of the file use `itertools.imap`.
    """
    # --- unpack command-line options --------------------------------------
    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    file1 = args.inputFile
    name1 = args.inputName1
    index_size = args.sample_size
    title_savedFile_pdf = args.output_pdf
    title_savedFile_csv = args.output_tabular
    output_chimeras_tabular = args.output_chimeras_tabular
    onlyDuplicates = args.only_DCS
    rel_freq = args.rel_freq
    minFS = args.minFS
    maxFS = args.maxFS
    nr_above_bars = args.nr_above_bars
    subset = args.subset_tag
    nproc = args.nproc
    sep = "\t"

    # input checks (each failure uses a distinct exit status)
    if index_size < 0:
        print("index_size is a negative integer.")
        exit(2)
    if nproc <= 0:
        print("nproc is smaller or equal zero")
        exit(3)
    if subset < 0:
        print("subset_tag is smaller or equal zero.")
        exit(5)

    # PLOT: global matplotlib styling shared by every figure in the PDF
    plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
    plt.rcParams['xtick.labelsize'] = 14
    plt.rcParams['ytick.labelsize'] = 14
    plt.rcParams['patch.edgecolor'] = "#000000"
    plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
    name1 = name1.split(".tabular")[0]

    with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
        print("dataset: ", name1)
        integers, data_array = readFileReferenceFree(file1)
        data_array = numpy.array(data_array)
        print("total nr of tags:", len(data_array))

        # filter tags out which contain any other character than ATCG
        valid_bases = ["A", "T", "G", "C"]
        tagsToDelete = []
        for idx, t in enumerate(data_array[:, 1]):
            for char in t:
                if char not in valid_bases:
                    tagsToDelete.append(idx)
                    break

        if len(tagsToDelete) != 0:  # delete tags with N in the tag from data
            print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete),
                  float(len(tagsToDelete)) / len(data_array))
            index_whole_array = numpy.arange(0, len(data_array), 1)
            index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete)
            data_array = data_array[index_withoutN_inTag, :]
            integers = integers[index_withoutN_inTag]
            print("total nr of filtered tags:", len(data_array))

        # --- family-size filtering (minFS / maxFS) ------------------------
        int_f = numpy.array(data_array[:, 0]).astype(int)
        data_array = data_array[numpy.where(int_f >= minFS)]
        integers = integers[integers >= minFS]

        # select family size for tags
        if maxFS > 0:
            int_f2 = numpy.array(data_array[:, 0]).astype(int)
            data_array = data_array[numpy.where(int_f2 <= maxFS)]
            integers = integers[integers <= maxFS]

        # --- optionally keep only tags that form a DCS (ab + ba pair) -----
        if onlyDuplicates is True:
            tags = data_array[:, 2]
            seq = data_array[:, 1]

            # find all unique tags and get the indices for ALL tags, but only once
            u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
            d = u[c == 2]  # a tag occurring twice has both an ab and a ba row

            # get family sizes, tag for duplicates
            duplTags_double = integers[numpy.in1d(seq, d)]
            duplTags = duplTags_double[0::2]  # ab of DCS
            duplTagsBA = duplTags_double[1::2]  # ba of DCS

            duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
            duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags

            if minFS > 1:
                # keep a DCS only if BOTH strands satisfy minFS
                # NOTE(review): duplTagsBA itself is not re-filtered here,
                # yet it is indexed with `result` further below — confirm
                # this is intended for minFS > 1.
                duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)]
                duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)]
                duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)]  # ab+ba with FS>=3

            data_array = numpy.column_stack((duplTags, duplTags_seq))
            data_array = numpy.column_stack((data_array, duplTags_tag))
            integers = numpy.array(data_array[:, 0]).astype(int)
            print("DCS in whole dataset", len(data_array))

        print("min FS", min(integers))
        print("max FS", max(integers))

        # HD analysis for a subset of the tag
        if subset > 0:
            # split each tag into its two mates, then trim the flanking
            # regions so that `subset` bases remain per half
            # NOTE(review): Py2 integer division in the slice indices.
            tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
            tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])

            flanking_region_float = float((len(tag1[0]) - subset)) / 2
            flanking_region = int(flanking_region_float)
            if flanking_region_float % 2 == 0:
                tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
                tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
            else:
                # odd leftover: trim one base more at the end than at the start
                flanking_region_rounded = int(round(flanking_region, 1))
                flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
                tag1_shorten = numpy.array(
                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
                tag2_shorten = numpy.array(
                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])

            data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
            data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))

        print("length of tag= ", len(data_array[0, 1]))
        # select sample: if no size given --> all vs. all comparison
        if index_size == 0:
            result = numpy.arange(0, len(data_array), 1)
        else:
            numpy.random.shuffle(data_array)
            unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True)  # get only unique tags
            result = numpy.random.choice(unique_indices, size=index_size,
                                         replace=False)  # array of random sequences of size=index.size

        # result = numpy.random.choice(len(integers), size=index_size,
        #                              replace=False)  # array of random sequences of size=index.size
        # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0]

        # with open("index_result.pkl", "wb") as o:
        #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)

        # save counts
        # with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f:
        #     pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
        # with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1:
        #     pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL)
        #
        # with open(data_folder + "index_sampleTags100.pkl", "rb") as f:
        #     result = pickle.load(f)
        #
        # with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1:
        #     data_array = pickle.load(f1)

        # with open(data_folder + "index_result.txt", "w") as t:
        #     for text in result:
        #         t.write("{}\n".format(text))

        # comparison random tags to whole dataset
        result1 = data_array[result, 1]  # random tags
        result2 = data_array[:, 1]  # all tags
        print("sample size= ", len(result1))

        # HD analysis of whole tag: `hamming` is defined earlier in this file;
        # the sample is split into nproc chunks processed in parallel
        proc_pool = Pool(nproc)
        chunks_sample = numpy.array_split(result1, nproc)
        ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
        proc_pool.close()
        proc_pool.join()
        ham = numpy.concatenate(ham).astype(int)
        # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
        #     for h, tag in zip(ham, result1):
        #         output_file1.write("{}\t{}\n".format(tag, h))

        # # HD analysis for chimeric reads
        # result2 = data_array_whole_dataset[:,1]

        # run `hamming_difference` (defined earlier) once per half of the tag
        proc_pool_b = Pool(nproc)
        diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
        diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
        proc_pool_b.close()
        proc_pool_b.join()
        # merge the per-chunk result lists; indices [0..10] match the
        # return order of hamming_difference
        HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
                                     numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
        HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
                                     numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
        minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
                                    numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
        HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
                                        numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
        HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
                                        numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)

        rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a])
        rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b])
        diff1 = numpy.concatenate([item[0] for item in diff_list_a])
        diff2 = numpy.concatenate([item[0] for item in diff_list_b])

        diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a])
        diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b])
        minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
        minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
        minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])

        chimera_tags1 = sum([item[10] for item in diff_list_a], [])
        chimera_tags2 = sum([item[10] for item in diff_list_b], [])

        # per tag, keep whichever direction (a-start or b-start) yields the
        # larger relative difference; a None entry means that direction found
        # no identical half (TD=0) for the tag
        rel_Diff = []
        diff_zeros = []
        minHD_tags_zeros = []
        diff = []
        chimera_tags = []
        for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
                zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2,
                    chimera_tags1, chimera_tags2):
            relatives = numpy.array([rel1, rel2])
            absolutes = numpy.array([d1, d2])
            max_idx = numpy.argmax(relatives)
            rel_Diff.append(relatives[max_idx])
            diff.append(absolutes[max_idx])

            if all(i is not None for i in [zeros1, zeros2]):
                diff_zeros.append(max(zeros1, zeros2))
                minHD_tags_zeros.append(str(tag1))
                tags = [ctag1, ctag2]
                chimera_tags.append(tags)
            elif zeros1 is not None and zeros2 is None:
                diff_zeros.append(zeros1)
                minHD_tags_zeros.append(str(tag1))
                chimera_tags.append(ctag1)
            elif zeros1 is None and zeros2 is not None:
                diff_zeros.append(zeros2)
                minHD_tags_zeros.append(str(tag2))
                chimera_tags.append(ctag2)

        chimera_tags_new = chimera_tags
        data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
        # chimeras_dic = defaultdict(list)
        #
        # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
        #     if len(t2) >1 and type(t2) is not numpy.ndarray:
        #         t2 = numpy.concatenate(t2)
        #     chimeras_dic[t1].append(t2)

        # --- write the chimera tabular file -------------------------------
        checked_tags = []
        stat_maxTags = []

        with open(output_chimeras_tabular, "w") as output_file1:
            output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with TD=0\n")
            for i in range(len(data_chimeraAnalysis)):
                tag1 = data_chimeraAnalysis[i, 0]

                # all dataset rows carrying this tag -> "FS direction" strings
                info_tag1 = data_array[data_array[:, 1] == tag1, :]
                fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1]

                if tag1 in checked_tags:  # skip tag if already written to file
                    continue

                # NOTE(review): Py2 integer division in the slice indices
                sample_half_a = tag1[0:(len(tag1)) / 2]
                sample_half_b = tag1[len(tag1) / 2:len(tag1)]

                max_tags = data_chimeraAnalysis[i, 1]
                if len(max_tags) > 1 and len(max_tags) != len(data_array[0, 1]) and type(max_tags) is not numpy.ndarray:
                    max_tags = numpy.concatenate(max_tags)
                max_tags = numpy.unique(max_tags)
                stat_maxTags.append(len(max_tags))

                info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags]

                chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags])  # mate1 part1
                chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags])  # mate1 part 2

                new_format = []
                for j in range(len(max_tags)):
                    fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]]

                    # mark the identical half of the partner tag with * ... *
                    if sample_half_a == chimera_half_a[j]:
                        max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                        new_format.append(max_tag)

                    elif sample_half_b == chimera_half_b[j]:
                        max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                        new_format.append(max_tag)
                    checked_tags.append(max_tags[j])

                sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1))
                output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))

                checked_tags.append(tag1)

            # footer: explanation plus summary statistics over the partner counts
            output_file1.write(
                "This file contains all tags that were identified as chimeras as the first column and the "
                "corresponding tags which returned a Hamming distance of zero in either the first or the second "
                "half of the sample tag as the second column.\n"
                "The tags were separated by an empty space into their halves and the * marks the identical half.")
            output_file1.write("\n\nStatistics of nr. of tags that returned max. TD (2nd column)\n")
            output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags))))
            output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags))))
            output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags))))
            output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags))))
            output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags))))

        # --- assemble data for the plots ----------------------------------
        lenTags = len(data_array)
        len_sample = len(result1)

        quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
        seq = numpy.array(data_array[result, 1])  # tags of sample
        ham = numpy.asarray(ham)  # HD for sample of tags

        if onlyDuplicates is True:  # ab and ba strands of DCSs
            quant = numpy.concatenate((quant, duplTagsBA[result]))
            seq = numpy.tile(seq, 2)
            ham = numpy.tile(ham, 2)
            diff = numpy.tile(diff, 2)
            rel_Diff = numpy.tile(rel_Diff, 2)
            diff_zeros = numpy.tile(diff_zeros, 2)

        nr_chimeric_tags = len(data_chimeraAnalysis)
        print("nr of chimeras", nr_chimeric_tags)

        # prepare data for different kinds of plots
        # distribution of FSs separated after HD
        familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
        list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS

        # get FS for all tags with min HD of analysis of chimeric reads
        # there are more tags than sample size in the plot, because one tag can have multiple minimas
        if onlyDuplicates:
            seqDic = defaultdict(list)
            for s, q in zip(seq, quant):
                seqDic[s].append(q)
        else:
            seqDic = dict(zip(seq, quant))

        lst_minHD_tags = []
        for i in minHD_tags:
            lst_minHD_tags.append(seqDic.get(i))

        if onlyDuplicates:
            # expand to (ab, ba) family-size pairs collected in the defaultdict
            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
                                                [item_b[1] for item_b in lst_minHD_tags])).astype(int)
        # histogram with absolute and relative difference between HDs of both parts of the tag
        listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
        listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
                                                                                                rel_Diff)
        # chimeric read analysis: tags which have TD=0 in one of the halfs
        if len(minHD_tags_zeros) != 0:
            lst_minHD_tags_zeros = []
            for i in minHD_tags_zeros:
                lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
            if onlyDuplicates:
                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
                                                          [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)

            # histogram with HD of non-identical half
            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
                lst_minHD_tags_zeros, diff_zeros)

            if onlyDuplicates is False:
                listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros,
                                                                                             diff_zeros, data_array)

        # --- render all plots into the PDF --------------------------------
        # plot Hamming Distance with Family size distribution
        plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq,
                      subtitle="Tag distance separated by family size", lenTags=lenTags,
                      xlabel="TD", nr_above_bars=nr_above_bars, len_sample=len_sample)

        # Plot FSD with separation after
        plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq,
                       originalCounts=quant, subtitle="Family size distribution separated by Tag distance",
                       pdf=pdf, relative=False, diff=False)

        # Plot HD within tags
        plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
                        rel_freq=rel_freq, len_sample=len_sample)

        # Plot difference between HD's separated after FSD
        plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
                      subtitle="Delta Tag distance within tags", lenTags=lenTags, rel_freq=rel_freq,
                      xlabel="absolute delta TD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)

        plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
                      subtitle="Chimera Analysis: relative delta Tag distance", lenTags=lenTags, rel_freq=rel_freq,
                      xlabel="relative delta TD", relative=True, nr_above_bars=nr_above_bars,
                      nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

        # plots for chimeric reads
        if len(minHD_tags_zeros) != 0:
            # HD
            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
                          subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq,
                          lenTags=lenTags, xlabel="TD", relative=False,
                          nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

            if onlyDuplicates is False:
                plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf,
                              subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq,
                              lenTags=lenTags, xlabel="TD", relative=False,
                              nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

        # --- build the summary tables -------------------------------------
        # print all data to a CSV file
        # HD
        summary, sumCol = createTableHD(list1, "TD=")
        overallSum = sum(sumCol)  # sum of columns in table

        # FSD
        summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
        overallSum5 = sum(sumCol5)

        # HD of both parts of the tag
        summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)])
        overallSum9 = sum(sumCol9)

        # HD
        # absolute difference
        summary11, sumCol11 = createTableHD(listDifference1, "diff=")
        overallSum11 = sum(sumCol11)
        # relative difference and all tags
        summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
        overallSum13 = sum(sumCol13)

        # chimeric reads
        if len(minHD_tags_zeros) != 0:
            # absolute difference and tags where at least one half has HD=0
            summary15, sumCol15 = createTableHD(listDifference1_zeros, "TD=")
            overallSum15 = sum(sumCol15)

            if onlyDuplicates is False:
                summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros)
                overallSum16 = sum(sumCol16)

        # --- write the summary tabular file -------------------------------
        output_file.write("{}\n".format(name1))
        output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample))

        # HD
        createFileHD(summary, sumCol, overallSum, output_file,
                     "Tag distance separated by family size", sep)
        # FSD
        createFileFSD2(summary5, sumCol5, overallSum5, output_file,
                       "Family size distribution separated by Tag distance", sep,
                       diff=False)

        # output_file.write("{}{}\n".format(sep, name1))
        output_file.write("\n")
        max_fs = numpy.bincount(integers[result])
        output_file.write("max. family size in sample:{}{}\n".format(sep, max(integers[result])))
        output_file.write("absolute frequency:{}{}\n".format(sep, max_fs[len(max_fs) - 1]))
        output_file.write(
            "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))

        # HD within tags
        output_file.write(
            "Chimera Analysis:\nThe tags are splitted into two halves (part a and b) for which the Tag distances (TD) are calculated seperately.\n"
            "The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min).\n"
            "In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset.\n"
            "The maximum value represents then TD b.max. Finally, these process is repeated but starting with part b instead and TD b.min and TD a.max are calculated.\n"
            "Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta HD).\n"
            "These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively) which give the relative differences between the partial HDs (rel. delta HD).\n"
            "For simplicity, we used the maximum value of the relative differences and the respective delta HD.\n"
            "Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots.\n")

        output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2))

        createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
                              "Tag distance of each half in the tag", sep)
        createFileHD(summary11, sumCol11, overallSum11, output_file,
                     "Absolute delta Tag distance within the tag", sep)

        createFileHD(summary13, sumCol13, overallSum13, output_file,
                     "Chimera analysis: relative delta Tag distance", sep)

        if len(minHD_tags_zeros) != 0:
            output_file.write(
                "All tags are filtered and only those tags where one half is identical (TD=0) and therefore, have a relative delta TD of 1, are kept.\n"
                "These tags are considered as chimeras.\n")
            createFileHD(summary15, sumCol15, overallSum15, output_file,
                         "Tag distance of chimeric families separated after FS", sep)

            if onlyDuplicates is False:
                createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
                                    "Tag distance of chimeric families separated after DCS and single SSCS (ab, ba)", sep)

        output_file.write("\n")
| 1362 | |
if __name__ == '__main__':
    # Script entry point. Hamming_Distance_Analysis has no return statement,
    # so it yields None on success and the process exits with status 0
    # (error paths call exit() explicitly inside the function).
    sys.exit(Hamming_Distance_Analysis(sys.argv))
