comparison td.py @ 0:3e56058d9552 draft default tip

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 9bae9043a53f1e07b502acd1082450adcb6d9e31-dirty
author mheinzl
date Wed, 16 Oct 2019 04:17:59 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3e56058d9552
1 #!/usr/bin/env python
2
3 # Tag distance analysis of SSCSs
4 #
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
6 # Contact: monika.heinzl@edumail.at
7 #
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS and
9 # optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances
10 # separated after family sizes, a family size distribution separated after Hamming distances for all (sample_size=0)
11 # or a given sample of SSCSs or SSCSs, which form a DCS. In addition, the tool produces HD and FSD plots for the
12 # difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the
13 # data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input.
14 # The tool can run on a certain number of processors, which can be defined by the user.
15
16 # USAGE: python td.py --inputFile filename --inputName1 filename --sample_size int /
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int
18 # --nr_above_bars True/False --output_tabular outputfile_name_tabular
19
20 import argparse
21 import itertools
22 import operator
23 import sys
24 from collections import Counter, defaultdict
25 from functools import partial
26 from multiprocessing.pool import Pool
27 import random
28 import os
29
30 import matplotlib.pyplot as plt
31 import numpy
32 from matplotlib.backends.backend_pdf import PdfPages
33
34 plt.switch_backend('agg')
35
36
def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
                   subtitle, pdf, relative=False, diff=True, rel_freq=False):
    """Plot the family size distribution (FSD), stacked by tag distance class.

    Args:
        familySizeList1: list of arrays of family sizes, one array per TD class
            (6 classes when diff is False, 7 otherwise).
        maximumXFS, minimumXFS: largest/smallest family size on the x axis.
        originalCounts: family sizes before pooling (used for the text legend).
        subtitle: figure title.
        pdf: open matplotlib PdfPages object the figure is appended to.
        relative: if True, classes are labelled with relative differences (d=0.1, ...).
        diff: if False, classes are labelled TD=1..TD>8; otherwise d=0..d>8.
        rel_freq: if True, plot relative instead of absolute frequencies.
    """
    if diff is False:
        colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
        labels = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8"]
    else:
        colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
        if relative is True:
            labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
        else:
            labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]

    fig = plt.figure(figsize=(6, 7))
    ax = fig.add_subplot(111)
    plt.subplots_adjust(bottom=0.1)
    p1 = numpy.bincount(numpy.concatenate(familySizeList1))
    maximumY = numpy.amax(p1)

    # fall back to a minimal 3-bin range when all family sizes are identical
    if len(range(minimumXFS, maximumXFS)) == 0:
        range1 = range(minimumXFS - 1, minimumXFS + 2)
    else:
        range1 = range(0, maximumXFS + 2)

    if rel_freq:
        # per-datapoint weights so the stacked bars sum to 1 overall
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1]
        plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True,
                 rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim((0, 1.07))
    else:
        plt.hist(familySizeList1, label=labels, color=colors, stacked=True,
                 rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        plt.ylabel("Absolute Frequency", fontsize=14)
        # the original set ylim twice here; only the final (0, max * 1.2) took effect
        plt.ylim((0, maximumY * 1.2))
    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel("Family size", fontsize=14)
    ticks = numpy.arange(0, maximumXFS + 1, 1)
    ticks1 = list(map(str, ticks))  # list() so item assignment also works under Python 3
    if maximumXFS >= 20:
        ticks1[len(ticks1) - 1] = ">=20"  # family sizes >= 20 are pooled in the last bin
    plt.xticks(numpy.array(ticks), ticks1)
    # show only every 5th tick label to keep the axis readable
    [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
    plt.xlim((0, maximumXFS + 1))
    legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: "
    plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)

    if max(originalCounts) >= 20:
        max_count = ">= 20"
    else:
        max_count = max(originalCounts)
    legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1))
    plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
    legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1))
    plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
98
99
def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a histogram of tag distances (TD), stacked by family size class.

    Args:
        list1: list of TD arrays, one per family size class (FS=1 .. FS>10).
        maximumX, minimumX: TD range shown on the x axis.
        subtitle: figure title.
        lenTags: total number of tags (shown in the text legend).
        pdf: open matplotlib PdfPages object the figure is appended to.
        xlabel: x axis label.
        relative: if True, data are relative differences plotted in 0.1 bins.
        nr_above_bars: if True, annotate each bar with its count/frequency.
        nr_unique_chimeras: number of chimeric families (text legend).
        len_sample: sample size (text legend).
        rel_freq: if True, plot relative instead of absolute frequencies.
    """
    if relative is True:
        step = 0.1
    else:
        step = 1

    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # .items() instead of the Python-2-only .iteritems()
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).items())])
    maximumY = numpy.amax(p1)
    if relative is True:  # relative difference
        bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
    else:
        bin1 = maximumX + 1

    if rel_freq:
        # per-datapoint weights so the stacked bars sum to 1 overall
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        # last stacked layer = total per bin; list() for Python 3's lazy map
        counts = numpy.array(list(map(float, counts[0][5])))
    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        counts = numpy.array(list(map(int, counts[0][5])))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # annotate non-empty bars
            if x_label == 0:
                continue
            if rel_freq:
                plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                   float(x_label)),
                             xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                             xycoords="data", color="#000066", fontsize=10)
            else:
                plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                             xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                             xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        # report the doubled count when each chimeric family contributed two tags
        if (relative and ((counts[len(counts) - 1] / nr_unique_chimeras) == 2)) or \
                (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
                .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))

    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
178
179
def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a histogram of the TDs of chimeric tags, stacked by consensus type.

    Args:
        list1: [TDs of DCS tags, TDs of ab SSCS tags, TDs of ba SSCS tags].
        maximumX, minimumX: TD range shown on the x axis.
        subtitle: figure title.
        lenTags: total number of tags (shown in the text legend).
        pdf: open matplotlib PdfPages object the figure is appended to.
        xlabel: x axis label.
        relative: kept for interface symmetry with plotHDwithFSD (unused here).
        nr_above_bars: if True, annotate each bar with its count/frequency.
        nr_unique_chimeras: number of chimeric families (text legend, ratios).
        len_sample: sample size (text legend).
        rel_freq: if True, plot relative instead of absolute frequencies.
    """
    step = 1
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # .items() instead of the Python-2-only .iteritems()
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).items())])
    maximumY = numpy.amax(p1)
    bin1 = maximumX + 1
    if rel_freq:
        # per-datapoint weights so the stacked bars sum to 1 overall
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        # last stacked layer = total per bin; list() for Python 3's lazy map
        counts = numpy.array(list(map(float, counts[0][2])))
    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        counts = numpy.array(list(map(int, counts[0][2])))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # annotate non-empty bars
            if x_label == 0:
                continue
            if rel_freq:
                plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                   float(x_label)),
                             xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                             xycoords="data", color="#000066", fontsize=10)
            else:
                plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                             xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                             xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        # report the doubled count when each chimeric family contributed two tags
        if (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
                format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))
    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)

    # guard against a ZeroDivisionError when no chimeric family was detected
    # (the original divided by nr_unique_chimeras unconditionally)
    nr_cf = float(nr_unique_chimeras) if nr_unique_chimeras != 0 else float("inf")
    legend2 = "SSCS ab = {:,} ({:.5f})\nSSCS ba = {:,} ({:.5f})\nDCS = {:,} ({:.5f})".format(
        len(list1[1]), len(list1[1]) / nr_cf,
        len(list1[2]), len(list1[2]) / nr_cf,
        len(list1[0]), len(list1[0]) / nr_cf)
    plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)

    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
252
253
def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
    """Plot the TDs measured within the tags (both halves plus their sum).

    Args:
        sum1, sum1min, sum2, sum2min: TD lists of the four half combinations.
        min_value: list of summed TDs (a + b) per tag.
        lenTags: total number of tags (shown in the text legend).
        pdf: open matplotlib PdfPages object the figure is appended to.
        len_sample: sample size (text legend).
        rel_freq: if True, plot relative instead of absolute frequencies.
    """
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)

    ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # TDs within tags
    maximumX = numpy.amax(numpy.concatenate(ham_partial))
    minimumX = numpy.amin(numpy.concatenate(ham_partial))

    # single bin when all values are identical, otherwise one bin per TD
    if len(range(minimumX, maximumX)) == 0:
        range1 = minimumX
    else:
        range1 = range(minimumX, maximumX + 2)

    if rel_freq:
        # per-datapoint weights so every data set sums to 1 individually
        w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim(0, 1.07)
    else:
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Absolute Frequency", fontsize=14)

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.6, 1))
    plt.suptitle('Tag distances within tags', fontsize=14)
    plt.xlabel("TD", fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - 1, maximumX + 1))
    plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
        lenTags, len_sample, len(numpy.concatenate(ham_partial)))
    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
296
297
def createTableFSD2(list1, diff=True):
    """Cross-tabulate family sizes against the TD classes.

    Args:
        list1: list of arrays of family sizes, one array per TD class
            (6 classes when diff is False, 7 otherwise).
        diff: selects the number of TD columns.

    Returns:
        (final, sumCol): 'final' is a string matrix with one row per unique
        family size ("FS=x" label, one count column per TD class, row sum);
        'sumCol' holds the per-class column sums.
    """
    selfAB = numpy.concatenate(list1)
    uniqueFS = numpy.unique(selfAB)
    nr = numpy.arange(0, len(uniqueFS), 1)
    nr_columns = 6 if diff is False else 7
    count = numpy.zeros((len(uniqueFS), nr_columns))
    # the k-th element of list1 fills column k-1; this replaces the original
    # seven copy-pasted 'if state == k' blocks
    state = 1
    for i in list1:
        counts = list(Counter(i).items())
        hd = [item[0] for item in counts]
        c = [item[1] for item in counts]
        table = numpy.column_stack((hd, c))
        if len(table) == 0:
            state = state + 1
            continue
        if state <= nr_columns:
            for l in nr:
                for j in table:
                    if j[0] == uniqueFS[l]:
                        count[l, state - 1] = j[1]
        state = state + 1
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    uniqueFS = uniqueFS.astype(str)
    if uniqueFS[len(uniqueFS) - 1] == "20":
        uniqueFS[len(uniqueFS) - 1] = ">20"  # family sizes >= 20 were pooled upstream
    first = ["FS={}".format(i) for i in uniqueFS]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
360
361
def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True):
    """Write one FSD summary table (rows from createTableFSD2) to the open
    tabular output file, followed by a column-sum line and a blank line."""
    output_file.write(name)
    output_file.write("\n")
    # header depends on whether TD classes or (relative) differences are tabulated
    if diff is False:
        output_file.write("{}TD=1{}TD=2{}TD=3{}TD=4{}TD=5-8{}TD>8{}sum{}\n".format(
            sep, sep, sep, sep, sep, sep, sep, sep))
    elif rel is False:
        output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(
            sep, sep, sep, sep, sep, sep, sep, sep, sep))
    else:
        output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".
                          format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
    for row in summary:
        for cell in row:
            # numeric cells are stored as strings like "2.0" -> print as int
            if "FS" not in cell and "diff" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for el in map(int, sumCol):
        output_file.write("{}{}".format(el, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
388
389
def createTableHD(list1, row_label):
    """Cross-tabulate tag distances against the six family size classes.

    Args:
        list1: list of arrays of TDs, one array per family size class.
        row_label: prefix for the row labels (e.g. "TD=").

    Returns:
        (final, sumCol): 'final' is a string matrix with one row per unique
        TD (label, one count column per class, row sum); 'sumCol' holds the
        per-class column sums.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    nr = numpy.arange(0, len(uniqueHD), 1)
    count = numpy.zeros((len(uniqueHD), 6))
    # the k-th element of list1 fills column k-1; this replaces the original
    # six copy-pasted 'if state == k' blocks
    state = 1
    for i in list1:
        counts = list(Counter(i).items())
        hd = [item[0] for item in counts]
        c = [item[1] for item in counts]
        table = numpy.column_stack((hd, c))
        if len(table) == 0:
            state = state + 1
            continue
        if state <= 6:
            for l in nr:
                for j in table:
                    if j[0] == uniqueHD[l]:
                        count[l, state - 1] = j[1]
        state = state + 1
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["{}{}".format(row_label, i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
441
442
def createTableHDwithTags(list1):
    """Cross-tabulate the TDs measured within tags against the five half
    combinations (a.min, b.max, b.min, a.max, a.min+b.max / a.max+b.min).

    Args:
        list1: list of five arrays of TDs, one per half combination.

    Returns:
        (final, sumCol): 'final' is a string matrix with one row per unique
        TD ("TD=x" label, one count column per combination, row sum);
        'sumCol' holds the per-combination column sums.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    nr = numpy.arange(0, len(uniqueHD), 1)
    count = numpy.zeros((len(uniqueHD), 5))
    # the k-th element of list1 fills column k-1; this replaces the original
    # five copy-pasted 'if state == k' blocks
    state = 1
    for i in list1:
        counts = list(Counter(i).items())
        hd = [item[0] for item in counts]
        c = [item[1] for item in counts]
        table = numpy.column_stack((hd, c))
        if len(table) == 0:
            state = state + 1
            continue
        if state <= 5:
            for l in nr:
                for j in table:
                    if j[0] == uniqueHD[l]:
                        count[l, state - 1] = j[1]
        state = state + 1
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
489
490
def createTableHDwithDCS(list1):
    """Cross-tabulate the TDs of chimeric tags against their consensus type.

    Args:
        list1: list of TD arrays, one per consensus type (typically
            [DCS, SSCS ab, SSCS ba]).

    Returns:
        (final, sumCol): 'final' is a string matrix with one row per unique
        TD ("TD=x" label, one count column per type, row sum); 'sumCol'
        holds the per-type column sums.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    nr = numpy.arange(0, len(uniqueHD), 1)
    count = numpy.zeros((len(uniqueHD), len(list1)))
    # the k-th element of list1 fills column k-1; the original hard-coded
    # three 'if state == k' blocks although count has len(list1) columns —
    # this loop fills every column
    state = 1
    for i in list1:
        counts = list(Counter(i).items())
        hd = [item[0] for item in counts]
        c = [item[1] for item in counts]
        table = numpy.column_stack((hd, c))
        if len(table) == 0:
            state = state + 1
            continue
        for l in nr:
            for j in table:
                if j[0] == uniqueHD[l]:
                    count[l, state - 1] = j[1]
        state = state + 1
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
527
528
def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
    """Write one TD-vs-family-size table (rows from createTableHD) to the
    open tabular output file, followed by a column-sum line and a blank line."""
    output_file.write(name)
    output_file.write("\n")
    output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(
        sep, sep, sep, sep, sep, sep, sep, sep))
    for row in summary:
        for cell in row:
            # numeric cells are stored as strings like "2.0" -> print as int
            if "TD" not in cell and "diff" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for el in map(int, sumCol):
        output_file.write("{}{}".format(el, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
547
548
def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep):
    """Write one TD-vs-consensus-type table (rows from createTableHDwithDCS)
    to the open tabular output file, followed by a column-sum line."""
    output_file.write(name)
    output_file.write("\n")
    output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep))
    for row in summary:
        for cell in row:
            # numeric cells are stored as strings like "2.0" -> print as int
            if "TD" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for el in map(int, sumCol):
        output_file.write("{}{}".format(el, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
566
567
def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
    """Write one within-tag TD table (rows from createTableHDwithTags) to the
    open tabular output file, followed by a column-sum line."""
    output_file.write(name)
    output_file.write("\n")
    output_file.write("{}TD a.min{}TD b.max{}TD b.min{}TD a.max{}TD a.min + b.max, TD a.max + b.min{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
    for row in summary:
        for cell in row:
            # numeric cells are stored as strings like "2.0" -> print as int
            if "TD" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for el in map(int, sumCol):
        output_file.write("{}{}".format(el, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
585
586
def hamming(array1, array2):
    """For every tag in array1, return the smallest non-zero Hamming distance
    to any tag of array2.

    Args:
        array1: iterable of equal-length tag strings to analyse.
        array2: iterable of tag strings compared against (deduplicated here).

    Returns:
        numpy array of minimum distances, aligned with array1.

    Note: raises ValueError if a tag of array1 has distance 0 to ALL tags of
    array2 (no non-zero distance exists) — unchanged from the original.
    """
    res = 99 * numpy.ones(len(array1))
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    for i, a in enumerate(array1):
        # builtin map works under Python 2 and 3 (the original used itertools.imap)
        dist = numpy.array([sum(map(operator.ne, a, b)) for b in array2])
        res[i] = numpy.amin(dist[dist > 0])  # pick min distance greater than zero
    return res
596
597
def hamming_difference(array1, array2, mate_b):
    """For each tag of array1, find the closest partners in array2 and report
    the tag distances of the two tag halves separately.

    Each tag consists of two halves (mate a + mate b). The half selected by
    'mate_b' is the anchor: its minimum TD over all other tags is determined,
    and among the partners reaching that minimum the maximum TD of the
    complementary half is taken. The tag itself is excluded from comparison.

    Args:
        array1: tags to analyse.
        array2: tags compared against (deduplicated internally).
        mate_b: False = first half anchors the search, True = second half.

    Returns:
        List of per-tag result lists: [absolute TD difference, min TDs of a,
        min TDs of b, summed TDs, analysed tags, relative TD difference,
        TD differences of tags with an identical half (None otherwise),
        those tags (None otherwise), max TDs of a, max TDs of b,
        partner tags with max TD (None otherwise)].
    """
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    # split each tag into its two halves; // keeps integer indices under Python 3
    array1_half = numpy.array([i[0:len(i) // 2] for i in array1])  # mate1 part1
    array1_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array1])  # mate1 part2
    array2_half = numpy.array([i[0:len(i) // 2] for i in array2])  # mate2 part1
    array2_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array2])  # mate2 part2

    diff11 = []
    relativeDiffList = []
    ham1 = []
    ham2 = []
    ham1min = []
    ham2min = []
    min_valueList = []
    min_tagsList = []
    diff11_zeros = []
    min_tagsList_zeros = []
    max_tag_list = []
    i = 0  # counter, only used to see how many HDs of tags were already calculated
    if mate_b is False:  # HD calculation for all a's
        half1_mate1 = array1_half
        half2_mate1 = array1_half2
        half1_mate2 = array2_half
        half2_mate2 = array2_half2
    else:  # HD calculation for all b's
        half1_mate1 = array1_half2
        half2_mate1 = array1_half
        half1_mate2 = array2_half2
        half2_mate2 = array2_half

    for a, b, tag in zip(half1_mate1, half2_mate1, array1):
        # exclude the identical tag from array2 to prevent comparison with itself
        sameTag = numpy.where(array2 == tag)[0]
        indexArray2 = numpy.arange(0, len(array2), 1)
        index_withoutSame = numpy.delete(indexArray2, sameTag)

        # all tags without the identical tag
        array2_half_withoutSame = half1_mate2[index_withoutSame]
        array2_half2_withoutSame = half2_mate2[index_withoutSame]
        array2_withoutSame = array2[index_withoutSame]  # whole tag (not split into halves)

        # TD of the anchor half to all partner halves (builtin map: Py2 + Py3)
        dist = numpy.array([sum(map(operator.ne, a, c)) for c in
                            array2_half_withoutSame])
        min_index = numpy.where(dist == dist.min())[0]  # indices of min TD
        min_value = dist.min()
        # partners reaching the minimum TD and their complementary halves
        min_tag_half2 = array2_half2_withoutSame[min_index]
        min_tag_array2 = array2_withoutSame[min_index]  # whole tags with min TD

        # TD of the complementary half to those partners' complementary halves
        dist_second_half = numpy.array([sum(map(operator.ne, b, e)) for e in
                                        min_tag_half2])
        max_value = dist_second_half.max()
        max_index = numpy.where(dist_second_half == dist_second_half.max())[0]
        max_tag = min_tag_array2[max_index]

        if mate_b is True:  # half 2: store under the b statistics
            ham2.append(min_value)
            ham2min.append(max_value)
        else:  # half 1: store under the a statistics
            ham1.append(min_value)
            ham1min.append(max_value)

        min_valueList.append(min_value + max_value)
        min_tagsList.append(tag)
        difference1 = abs(min_value - max_value)
        diff11.append(difference1)
        rel_difference = round(float(difference1) / (min_value + max_value), 1)
        relativeDiffList.append(rel_difference)

        # tags which have identical parts:
        if min_value == 0 or max_value == 0:
            min_tagsList_zeros.append(numpy.array(tag))
            difference1_zeros = abs(min_value - max_value)  # TD of the non-identical part
            diff11_zeros.append(difference1_zeros)
            max_tag_list.append(numpy.array(max_tag))
        else:
            min_tagsList_zeros.append(None)
            diff11_zeros.append(None)
            max_tag_list.append(None)
        i += 1

    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros,
             min_tagsList_zeros, ham1min, ham2min, max_tag_list])
706
707
def readFileReferenceFree(file):
    """Read a tab-separated tag file (family size, tag, ab/ba) without a reference.

    Args:
        file: path of the tabular input file; lines starting with '#' are skipped.

    Returns:
        (integers, data_array): the family sizes of column 0 as ints, and the
        raw string matrix of all columns.
    """
    with open(file, 'r') as dest_f:
        # dtype=str keeps all columns as text; the legacy name 'string' only
        # exists under Python 2's numpy
        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype=str)
        integers = numpy.array(data_array[:, 0]).astype(int)
        return(integers, data_array)
713
714
def hammingDistanceWithFS(fs, ham):
    """Group tag distances by the family size of their tags.

    Args:
        fs: family sizes, aligned with ham.
        ham: tag distances.

    Returns:
        (list1, maximum, minimum): the TDs split into the six family size
        classes FS=1, 2, 3, 4, 5-10 and >10, plus the overall TD extremes.
    """
    fs = numpy.asarray(fs)
    maximum = max(ham)
    minimum = min(ham)
    ham = numpy.asarray(ham)

    # one boolean mask per family size class
    masks = [fs == 1, fs == 2, fs == 3, fs == 4,
             (fs >= 5) & (fs <= 10), fs > 10]
    list1 = [ham[numpy.where(m)[0]] for m in masks]
    return(list1, maximum, minimum)
741
742
def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
    """Group family sizes by the tag distance (or TD difference) of their tags.

    Args:
        fs: family sizes, aligned with ham. NOTE: if fs is already a numpy
            array, values > 19 are capped to 20 IN PLACE (original behavior).
        ham: tag distances, or (relative) TD differences.
        diff: if True, prepend the class for distance 0.
        rel: if True, use the relative-difference thresholds (0.1 .. >0.8)
            instead of the absolute ones (1 .. >8).

    Returns:
        (list1, hammingDistances, maximum, minimum): family sizes per TD
        class, the unique TDs and the family size extremes (after capping).
    """
    hammingDistances = numpy.unique(ham)
    fs = numpy.asarray(fs)
    ham = numpy.asarray(ham)
    big = numpy.where(fs > 19)[0]
    if len(big) != 0:
        fs[big] = 20  # pool large families into one class
    maximum = max(fs)
    minimum = min(fs)

    # class boundaries: four exact levels, one closed band, one open tail
    if rel is True:
        levels, band, tail = [0.1, 0.2, 0.3, 0.4], (0.5, 0.8), 0.8
    else:
        levels, band, tail = [1, 2, 3, 4], (5, 8), 8

    list1 = []
    if diff is True:
        list1.append(fs[numpy.where(ham == 0)[0]])
    for level in levels:
        list1.append(fs[numpy.where(ham == level)[0]])
    list1.append(fs[numpy.where((ham >= band[0]) & (ham <= band[1]))[0]])
    list1.append(fs[numpy.where(ham > tail)[0]])

    return(list1, hammingDistances, maximum, minimum)
798
799
def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array):
    """Group the TDs of chimeric tags by their consensus type (DCS / ab / ba).

    Args:
        minHD_tags_zeros: tags that had an identical half (one entry per TD).
        diff_zeros: the TD of the non-identical half, aligned with the tags.
        data_array: full input table; column 1 holds the tag sequence,
            column 2 the ab/ba direction.

    Returns:
        (list1, maximum, minimum): [TDs of DCS tags, TDs of ab SSCS,
        TDs of ba SSCS] plus the TD extremes.
    """
    diff_zeros = numpy.array(diff_zeros)
    maximum = numpy.amax(diff_zeros)
    minimum = numpy.amin(diff_zeros)
    minHD_tags_zeros = numpy.array(minHD_tags_zeros)

    # rows of data_array whose tag occurs among the chimeric tags
    idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros])
    subset_data = data_array[idx, :]

    seq = numpy.array(subset_data[:, 1])

    # find all unique tags and get the indices for ALL tags, but only once
    u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
    # a tag present twice (once ab, once ba) forms a DCS; a single occurrence
    # stays an SSCS
    DCS_tags = u[c == 2]
    rest_tags = u[c == 1]

    dcs = numpy.repeat("DCS", len(DCS_tags))
    idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags])
    sscs = subset_data[idx_sscs, 2]  # ab/ba direction of the SSCS-only tags

    # tag -> label ("DCS", "ab" or "ba") pairs for all chimeric tags
    all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])),
                                   numpy.concatenate((dcs, sscs))))
    hd_DCS = []
    ab_SSCS = []
    ba_SSCS = []

    for i in range(len(all_tags)):
        tag = all_tags[i, :]
        # all TDs recorded for this tag
        hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]]

        if tag[1] == "DCS":
            hd_DCS.append(hd)
        elif tag[1] == "ab":
            ab_SSCS.append(hd)
        elif tag[1] == "ba":
            ba_SSCS.append(hd)

    # flatten the per-tag arrays; empty groups stay plain empty lists
    if len(hd_DCS) != 0:
        hd_DCS = numpy.concatenate(hd_DCS)
    if len(ab_SSCS) != 0:
        ab_SSCS = numpy.concatenate(ab_SSCS)
    if len(ba_SSCS) != 0:
        ba_SSCS = numpy.concatenate(ba_SSCS)
    list1 = [hd_DCS, ab_SSCS, ba_SSCS]  # list for plotting
    return(list1, maximum, minimum)
845
846
def make_argparser():
    """Build the command line parser for the tag distance analysis.

    Note that --only_DCS and --rel_freq use action="store_false": they default
    to True and supplying the flag switches them off, whereas --nr_above_bars
    (action="store_true") defaults to False.
    """
    p = argparse.ArgumentParser(description='Tag distance analysis of duplex sequencing data')

    # input files
    p.add_argument('--inputFile',
                   help='Tabular File with three columns: ab or ba, tag and family size.')
    p.add_argument('--inputName1')

    # analysis parameters
    p.add_argument('--sample_size', default=1000, type=int,
                   help='Sample size of Tag distance analysis.')
    p.add_argument('--subset_tag', default=0, type=int,
                   help='The tag is shortened to the given number.')
    p.add_argument('--nproc', default=4, type=int,
                   help='The tool runs with the given number of processors.')
    p.add_argument('--only_DCS', action="store_false",
                   help='Only tags of the DCSs are included in the HD analysis')
    p.add_argument('--minFS', default=1, type=int,
                   help='Only tags, which have a family size greater or equal than specified, '
                        'are included in the HD analysis')
    p.add_argument('--maxFS', default=0, type=int,
                   help='Only tags, which have a family size smaller or equal than specified, '
                        'are included in the HD analysis')

    # plot options
    p.add_argument('--nr_above_bars', action="store_true",
                   help='If False, values above bars in the histograms are removed')
    p.add_argument('--rel_freq', action="store_false",
                   help='If True, the relative frequencies are displayed.')

    # output files
    p.add_argument('--output_tabular', default="data.tabular", type=str,
                   help='Name of the tabular file.')
    p.add_argument('--output_pdf', default="data.pdf", type=str,
                   help='Name of the pdf file.')
    p.add_argument('--output_chimeras_tabular', default="data.tabular", type=str,
                   help='Name of the tabular file with all chimeric tags.')

    return p
880
881
def Hamming_Distance_Analysis(argv):
    """Main entry point of the tag distance (TD) analysis.

    Reads a tabular file of tags (family size, tag, ab/ba direction), optionally
    restricts the data to DCS-forming tags and/or a family size range, samples a
    subset of tags and computes Hamming distances of the whole tag and of both
    tag halves against the dataset (chimera analysis). Results are written to a
    PDF with several plots, a tabular summary file and a tabular file listing
    all chimeric tags.

    :param argv: command line argument list (sys.argv); parsed by make_argparser().
    """
    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    file1 = args.inputFile
    name1 = args.inputName1
    index_size = args.sample_size
    title_savedFile_pdf = args.output_pdf
    title_savedFile_csv = args.output_tabular
    output_chimeras_tabular = args.output_chimeras_tabular
    onlyDuplicates = args.only_DCS
    rel_freq = args.rel_freq
    minFS = args.minFS
    maxFS = args.maxFS
    nr_above_bars = args.nr_above_bars
    subset = args.subset_tag
    nproc = args.nproc
    sep = "\t"

    # input checks (use sys.exit so the status code is returned even under -S)
    if index_size < 0:
        print("index_size is a negative integer.")
        sys.exit(2)
    if nproc <= 0:
        print("nproc is smaller or equal zero")
        sys.exit(3)
    if subset < 0:
        # message fixed: the check only rejects negative values (0 = full tag)
        print("subset_tag is a negative integer.")
        sys.exit(5)

    # PLOT
    plt.rcParams['axes.facecolor'] = "#E0E0E0"  # grey background color
    plt.rcParams['xtick.labelsize'] = 14
    plt.rcParams['ytick.labelsize'] = 14
    plt.rcParams['patch.edgecolor'] = "#000000"
    plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
    name1 = name1.split(".tabular")[0]

    with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
        print("dataset: ", name1)
        integers, data_array = readFileReferenceFree(file1)
        data_array = numpy.array(data_array)
        print("total nr of tags:", len(data_array))

        # filter tags out which contain any other character than ATCG
        valid_bases = ["A", "T", "G", "C"]
        tagsToDelete = []
        for idx, t in enumerate(data_array[:, 1]):
            for char in t:
                if char not in valid_bases:
                    tagsToDelete.append(idx)
                    break

        if len(tagsToDelete) != 0:  # delete tags with N in the tag from data
            print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete),
                  float(len(tagsToDelete)) / len(data_array))
            index_whole_array = numpy.arange(0, len(data_array), 1)
            index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete)
            data_array = data_array[index_withoutN_inTag, :]
            integers = integers[index_withoutN_inTag]
            print("total nr of filtered tags:", len(data_array))

        # keep only tags with family size >= minFS
        int_f = numpy.array(data_array[:, 0]).astype(int)
        data_array = data_array[numpy.where(int_f >= minFS)]
        integers = integers[integers >= minFS]

        # select family size for tags
        if maxFS > 0:
            int_f2 = numpy.array(data_array[:, 0]).astype(int)
            data_array = data_array[numpy.where(int_f2 <= maxFS)]
            integers = integers[integers <= maxFS]

        if onlyDuplicates is True:
            tags = data_array[:, 2]
            seq = data_array[:, 1]

            # find all unique tags and get the indices for ALL tags, but only once
            u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
            d = u[c == 2]

            # get family sizes, tag for duplicates
            duplTags_double = integers[numpy.in1d(seq, d)]
            duplTags = duplTags_double[0::2]  # ab of DCS
            duplTagsBA = duplTags_double[1::2]  # ba of DCS

            duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
            duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags

            if minFS > 1:
                # all four parallel arrays must be filtered with the same mask;
                # duplTagsBA is indexed with the sample indices later on and
                # must stay aligned with the filtered data_array rows
                mask = (duplTags >= minFS) & (duplTagsBA >= minFS)
                duplTags_tag = duplTags_tag[mask]
                duplTags_seq = duplTags_seq[mask]
                duplTagsBA = duplTagsBA[mask]
                duplTags = duplTags[mask]  # ab+ba with FS>=3

            data_array = numpy.column_stack((duplTags, duplTags_seq))
            data_array = numpy.column_stack((data_array, duplTags_tag))
            integers = numpy.array(data_array[:, 0]).astype(int)
            print("DCS in whole dataset", len(data_array))

        print("min FS", min(integers))
        print("max FS", max(integers))

        # HD analysis for a subset of the tag
        if subset > 0:
            # split each tag into its two halves (// keeps integer indices on Py2 and Py3)
            tag1 = numpy.array([i[0:len(i) // 2] for i in data_array[:, 1]])
            tag2 = numpy.array([i[len(i) // 2:len(i)] for i in data_array[:, 1]])

            flanking_region_float = float((len(tag1[0]) - subset)) / 2
            flanking_region = int(flanking_region_float)
            # NOTE(review): this tests evenness of the float flanking length;
            # odd-but-integral values take the asymmetric branch, which still
            # produces a symmetric trim — confirm intent before changing
            if flanking_region_float % 2 == 0:
                tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
                tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
            else:
                flanking_region_rounded = int(round(flanking_region, 1))
                flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
                tag1_shorten = numpy.array(
                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
                tag2_shorten = numpy.array(
                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])

            data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
            data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))

        print("length of tag= ", len(data_array[0, 1]))
        # select sample: if no size given --> all vs. all comparison
        if index_size == 0:
            result = numpy.arange(0, len(data_array), 1)
        else:
            numpy.random.shuffle(data_array)
            unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True)  # get only unique tags
            result = numpy.random.choice(unique_indices, size=index_size,
                                         replace=False)  # array of random sequences of size=index.size

        # comparison random tags to whole dataset
        result1 = data_array[result, 1]  # random tags
        result2 = data_array[:, 1]  # all tags
        print("sample size= ", len(result1))

        # HD analysis of whole tag
        proc_pool = Pool(nproc)
        chunks_sample = numpy.array_split(result1, nproc)
        ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
        proc_pool.close()
        proc_pool.join()
        ham = numpy.concatenate(ham).astype(int)

        # HD analysis of both tag halves (chimera detection), starting once
        # from part a (mate_b=False) and once from part b (mate_b=True)
        proc_pool_b = Pool(nproc)
        diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
        diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
        proc_pool_b.close()
        proc_pool_b.join()
        HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
                                     numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
        HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
                                     numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
        minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
                                    numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
        HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
                                        numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
        HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
                                        numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)

        rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a])
        rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b])
        diff1 = numpy.concatenate([item[0] for item in diff_list_a])
        diff2 = numpy.concatenate([item[0] for item in diff_list_b])

        diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a])
        diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b])
        minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
        minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
        minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])

        chimera_tags1 = sum([item[10] for item in diff_list_a], [])
        chimera_tags2 = sum([item[10] for item in diff_list_b], [])

        # per tag, keep the direction (a-first or b-first) with the larger
        # relative delta TD; merge the TD=0 information of both directions
        rel_Diff = []
        diff_zeros = []
        minHD_tags_zeros = []
        diff = []
        chimera_tags = []
        for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
                zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2,
                    chimera_tags1, chimera_tags2):
            relatives = numpy.array([rel1, rel2])
            absolutes = numpy.array([d1, d2])
            max_idx = numpy.argmax(relatives)
            rel_Diff.append(relatives[max_idx])
            diff.append(absolutes[max_idx])

            if all(i is not None for i in [zeros1, zeros2]):
                diff_zeros.append(max(zeros1, zeros2))
                minHD_tags_zeros.append(str(tag1))
                tags = [ctag1, ctag2]
                chimera_tags.append(tags)
            elif zeros1 is not None and zeros2 is None:
                diff_zeros.append(zeros1)
                minHD_tags_zeros.append(str(tag1))
                chimera_tags.append(ctag1)
            elif zeros1 is None and zeros2 is not None:
                diff_zeros.append(zeros2)
                minHD_tags_zeros.append(str(tag2))
                chimera_tags.append(ctag2)

        chimera_tags_new = chimera_tags
        data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))

        checked_tags = []
        stat_maxTags = []

        # write one line per chimeric tag: the tag itself and all tags that
        # returned TD=0 in one half (the identical half is marked with *)
        with open(output_chimeras_tabular, "w") as output_file1:
            output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with TD=0\n")
            for i in range(len(data_chimeraAnalysis)):
                tag1 = data_chimeraAnalysis[i, 0]

                info_tag1 = data_array[data_array[:, 1] == tag1, :]
                fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1]

                if tag1 in checked_tags:  # skip tag if already written to file
                    continue

                sample_half_a = tag1[0:len(tag1) // 2]
                sample_half_b = tag1[len(tag1) // 2:len(tag1)]

                max_tags = data_chimeraAnalysis[i, 1]
                if len(max_tags) > 1 and len(max_tags) != len(data_array[0, 1]) and type(max_tags) is not numpy.ndarray:
                    max_tags = numpy.concatenate(max_tags)
                max_tags = numpy.unique(max_tags)
                stat_maxTags.append(len(max_tags))

                info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags]

                chimera_half_a = numpy.array([t[0:len(t) // 2] for t in max_tags])  # mate1 part1
                chimera_half_b = numpy.array([t[len(t) // 2:len(t)] for t in max_tags])  # mate1 part 2

                new_format = []
                for j in range(len(max_tags)):
                    fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]]

                    if sample_half_a == chimera_half_a[j]:
                        max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                        new_format.append(max_tag)

                    elif sample_half_b == chimera_half_b[j]:
                        max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                        new_format.append(max_tag)
                    checked_tags.append(max_tags[j])

                sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1))
                output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))

                checked_tags.append(tag1)

            output_file1.write(
                "This file contains all tags that were identified as chimeras as the first column and the "
                "corresponding tags which returned a Hamming distance of zero in either the first or the second "
                "half of the sample tag as the second column.\n"
                "The tags were separated by an empty space into their halves and the * marks the identical half.")
            output_file1.write("\n\nStatistics of nr. of tags that returned max. TD (2nd column)\n")
            output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags))))
            output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags))))
            output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags))))
            output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags))))
            output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags))))

        lenTags = len(data_array)
        len_sample = len(result1)

        quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
        seq = numpy.array(data_array[result, 1])  # tags of sample
        ham = numpy.asarray(ham)  # HD for sample of tags

        if onlyDuplicates is True:  # ab and ba strands of DCSs
            quant = numpy.concatenate((quant, duplTagsBA[result]))
            seq = numpy.tile(seq, 2)
            ham = numpy.tile(ham, 2)
            diff = numpy.tile(diff, 2)
            rel_Diff = numpy.tile(rel_Diff, 2)
            diff_zeros = numpy.tile(diff_zeros, 2)

        nr_chimeric_tags = len(data_chimeraAnalysis)
        print("nr of chimeras", nr_chimeric_tags)

        # prepare data for different kinds of plots
        # distribution of FSs separated after HD
        familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
        list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS

        # get FS for all tags with min HD of analysis of chimeric reads
        # there are more tags than sample size in the plot, because one tag can have multiple minimas
        if onlyDuplicates:
            seqDic = defaultdict(list)
            for s, q in zip(seq, quant):
                seqDic[s].append(q)
        else:
            seqDic = dict(zip(seq, quant))

        lst_minHD_tags = []
        for i in minHD_tags:
            lst_minHD_tags.append(seqDic.get(i))

        if onlyDuplicates:
            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
                                                [item_b[1] for item_b in lst_minHD_tags])).astype(int)
        # histogram with absolute and relative difference between HDs of both parts of the tag
        listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
        listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
                                                                                                rel_Diff)
        # chimeric read analysis: tags which have TD=0 in one of the halfs
        if len(minHD_tags_zeros) != 0:
            lst_minHD_tags_zeros = []
            for i in minHD_tags_zeros:
                lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
            if onlyDuplicates:
                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
                                                          [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)

            # histogram with HD of non-identical half
            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
                lst_minHD_tags_zeros, diff_zeros)

            if onlyDuplicates is False:
                listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros,
                                                                                             diff_zeros, data_array)

        # plot Hamming Distance with Family size distribution
        plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq,
                      subtitle="Tag distance separated by family size", lenTags=lenTags,
                      xlabel="TD", nr_above_bars=nr_above_bars, len_sample=len_sample)

        # Plot FSD with separation after
        plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq,
                       originalCounts=quant, subtitle="Family size distribution separated by Tag distance",
                       pdf=pdf, relative=False, diff=False)

        # Plot HD within tags
        plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
                        rel_freq=rel_freq, len_sample=len_sample)

        # Plot difference between HD's separated after FSD
        plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
                      subtitle="Delta Tag distance within tags", lenTags=lenTags, rel_freq=rel_freq,
                      xlabel="absolute delta TD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)

        plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
                      subtitle="Chimera Analysis: relative delta Tag distance", lenTags=lenTags, rel_freq=rel_freq,
                      xlabel="relative delta TD", relative=True, nr_above_bars=nr_above_bars,
                      nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

        # plots for chimeric reads
        if len(minHD_tags_zeros) != 0:
            # HD
            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
                          subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq,
                          lenTags=lenTags, xlabel="TD", relative=False,
                          nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

            if onlyDuplicates is False:
                plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf,
                              subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq,
                              lenTags=lenTags, xlabel="TD", relative=False,
                              nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

        # print all data to a CSV file
        # HD
        summary, sumCol = createTableHD(list1, "TD=")
        overallSum = sum(sumCol)  # sum of columns in table

        # FSD
        summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
        overallSum5 = sum(sumCol5)

        # HD of both parts of the tag
        summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)])
        overallSum9 = sum(sumCol9)

        # HD
        # absolute difference
        summary11, sumCol11 = createTableHD(listDifference1, "diff=")
        overallSum11 = sum(sumCol11)
        # relative difference and all tags
        summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
        overallSum13 = sum(sumCol13)

        # chimeric reads
        if len(minHD_tags_zeros) != 0:
            # absolute difference and tags where at least one half has HD=0
            summary15, sumCol15 = createTableHD(listDifference1_zeros, "TD=")
            overallSum15 = sum(sumCol15)

            if onlyDuplicates is False:
                summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros)
                overallSum16 = sum(sumCol16)

        output_file.write("{}\n".format(name1))
        output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample))

        # HD
        createFileHD(summary, sumCol, overallSum, output_file,
                     "Tag distance separated by family size", sep)
        # FSD
        createFileFSD2(summary5, sumCol5, overallSum5, output_file,
                       "Family size distribution separated by Tag distance", sep,
                       diff=False)

        output_file.write("\n")
        max_fs = numpy.bincount(integers[result])
        output_file.write("max. family size in sample:{}{}\n".format(sep, max(integers[result])))
        output_file.write("absolute frequency:{}{}\n".format(sep, max_fs[len(max_fs) - 1]))
        output_file.write(
            "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))

        # HD within tags
        output_file.write(
            "Chimera Analysis:\nThe tags are splitted into two halves (part a and b) for which the Tag distances (TD) are calculated seperately.\n"
            "The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min).\n"
            "In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset.\n"
            "The maximum value represents then TD b.max. Finally, these process is repeated but starting with part b instead and TD b.min and TD a.max are calculated.\n"
            "Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta HD).\n"
            "These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively) which give the relative differences between the partial HDs (rel. delta HD).\n"
            "For simplicity, we used the maximum value of the relative differences and the respective delta HD.\n"
            "Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots.\n")

        output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) // 2))

        createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
                              "Tag distance of each half in the tag", sep)
        createFileHD(summary11, sumCol11, overallSum11, output_file,
                     "Absolute delta Tag distance within the tag", sep)

        createFileHD(summary13, sumCol13, overallSum13, output_file,
                     "Chimera analysis: relative delta Tag distance", sep)

        if len(minHD_tags_zeros) != 0:
            output_file.write(
                "All tags are filtered and only those tags where one half is identical (TD=0) and therefore, have a relative delta TD of 1, are kept.\n"
                "These tags are considered as chimeras.\n")
            createFileHD(summary15, sumCol15, overallSum15, output_file,
                         "Tag distance of chimeric families separated after FS", sep)

            if onlyDuplicates is False:
                createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
                                    "Tag distance of chimeric families separated after DCS and single SSCS (ab, ba)", sep)

        output_file.write("\n")
1361
1362
# Script entry point: run the analysis and propagate its return value as the
# process exit status.
if __name__ == '__main__':
    sys.exit(Hamming_Distance_Analysis(sys.argv))