comparison hd.py @ 29:6b15b3b6405c draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 5b3ab8c6467fe3a52e89f5a7d175bd8a0189018a-dirty
author mheinzl
date Wed, 24 Jul 2019 05:58:15 -0400
parents 1fa7342a140d
children 46bfbec0f9e6
comparison
equal deleted inserted replaced
28:1fa7342a140d 29:6b15b3b6405c
3 # Hamming distance analysis of SSCSs 3 # Hamming distance analysis of SSCSs
4 # 4 #
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) 5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
6 # Contact: monika.heinzl@edumail.at 6 # Contact: monika.heinzl@edumail.at
7 # 7 #
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS and optionally a second TABULAR file as input. 8 # Takes at least one TABULAR file with tags before the alignment to the SSCS and
9 # The program produces a plot which shows a histogram of Hamming distances separated after family sizes, 9 # optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances
10 # a family size distribution separated after Hamming distances for all (sample_size=0) or a given sample of SSCSs or SSCSs, which form a DCS. 10 # separated after family sizes, a family size distribution separated after Hamming distances for all (sample_size=0)
11 # In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads 11 # or a given sample of SSCSs or SSCSs, which form a DCS. In additon, the tool produces HD and FSD plots for the
12 # and finally a CSV file with the data of the plots. 12 # difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. 13 # data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input.
14 # The tool can run on a certain number of processors, which can be defined by the user. 14 # The tool can run on a certain number of processors, which can be defined by the user.
15 15
16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int / 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int /
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int
18 # --nr_above_bars True/False --output_tabular outptufile_name_tabular
18 19
19 import argparse 20 import argparse
20 import itertools 21 import itertools
21 import operator 22 import operator
22 import sys 23 import sys
32 33
33 plt.switch_backend('agg') 34 plt.switch_backend('agg')
34 35
35 36
36 def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts, 37 def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
37 title_file1, subtitle, pdf, relative=False, diff=True): 38 subtitle, pdf, relative=False, diff=True, rel_freq=False):
38 if diff is False: 39 if diff is False:
39 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] 40 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
40 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"] 41 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"]
41 else: 42 else:
42 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"] 43 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
46 labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"] 47 labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]
47 48
48 fig = plt.figure(figsize=(6, 7)) 49 fig = plt.figure(figsize=(6, 7))
49 ax = fig.add_subplot(111) 50 ax = fig.add_subplot(111)
50 plt.subplots_adjust(bottom=0.1) 51 plt.subplots_adjust(bottom=0.1)
51 p1 = numpy.bincount(numpy.concatenate((familySizeList1))) 52 p1 = numpy.bincount(numpy.concatenate(familySizeList1))
52 maximumY = numpy.amax(p1) 53 maximumY = numpy.amax(p1)
53 54
54 if len(range(minimumXFS, maximumXFS)) == 0: 55 if len(range(minimumXFS, maximumXFS)) == 0:
55 range1 = range(minimumXFS - 1, minimumXFS + 2) 56 range1 = range(minimumXFS - 1, minimumXFS + 2)
56 else: 57 else:
57 range1 = range(0, maximumXFS + 2) 58 range1 = range(0, maximumXFS + 2)
58 counts = plt.hist(familySizeList1, label=labels, 59
59 color=colors, stacked=True, 60 if rel_freq:
60 rwidth=0.8, alpha=1, align="left", 61 w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1]
61 edgecolor="None", bins=range1) 62 counts = plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True,
63 rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
64 plt.ylabel("Relative Frequency", fontsize=14)
65 plt.ylim((0, (float(maximumY) / sum(p1)) * 1.1))
66 else:
67 counts = plt.hist(familySizeList1, label=labels, color=colors, stacked=True,
68 rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
69 if len(numpy.concatenate(familySizeList1)) != 0:
70 plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
71 plt.ylabel("Absolute Frequency", fontsize=14)
72 plt.ylim((0, maximumY * 1.2))
62 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) 73 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
63
64 # plt.title(title_file1, fontsize=12)
65 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) 74 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
66 plt.xlabel("Family size", fontsize=14) 75 plt.xlabel("Family size", fontsize=14)
67 plt.ylabel("Absolute Frequency", fontsize=14)
68
69 ticks = numpy.arange(0, maximumXFS + 1, 1) 76 ticks = numpy.arange(0, maximumXFS + 1, 1)
70 ticks1 = map(str, ticks) 77 ticks1 = map(str, ticks)
71 if maximumXFS >= 20: 78 if maximumXFS >= 20:
72 ticks1[len(ticks1) - 1] = ">=20" 79 ticks1[len(ticks1) - 1] = ">=20"
73 plt.xticks(numpy.array(ticks), ticks1) 80 plt.xticks(numpy.array(ticks), ticks1)
74 [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0] 81 [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
75
76 plt.xlim((0, maximumXFS + 1)) 82 plt.xlim((0, maximumXFS + 1))
77 if len(numpy.concatenate(familySizeList1)) != 0:
78 plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
79
80 plt.ylim((0, maximumY * 1.2))
81 legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: " 83 legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: "
82 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) 84 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)
83 85
84 count = numpy.bincount(originalCounts) # original counts 86 count = numpy.bincount(originalCounts) # original counts
85 if max(originalCounts) >= 20: 87 if max(originalCounts) >= 20:
86 max_count = ">= 20" 88 max_count = ">= 20"
87 else: 89 else:
88 max_count = max(originalCounts) 90 max_count = max(originalCounts)
89 legend1 = "{}\n{}\n{:.5f}".format(max_count, count[len(count) - 1], float(count[len(count) - 1]) / sum(count)) 91 legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1))
90 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) 92 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
91 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), float(counts[0][len(counts[0]) - 1][1]) / sum(counts[0][len(counts[0]) - 1])) 93 legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1))
92 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) 94 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
93 plt.grid(b=True, which='major', color='#424242', linestyle=':') 95 plt.grid(b=True, which='major', color='#424242', linestyle=':')
94
95 pdf.savefig(fig, bbox_inches="tight") 96 pdf.savefig(fig, bbox_inches="tight")
96 plt.close("all") 97 plt.close("all")
97 98
98 99
99 def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True, nr_unique_chimeras=0, len_sample=0): 100 def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
101 nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
100 if relative is True: 102 if relative is True:
101 step = 0.1 103 step = 0.1
102 else: 104 else:
103 step = 1 105 step = 1
104 106
105 fig = plt.figure(figsize=(6, 8)) 107 fig = plt.figure(figsize=(6, 8))
106 plt.subplots_adjust(bottom=0.1) 108 plt.subplots_adjust(bottom=0.1)
107 con_list1 = numpy.concatenate(list1) 109 p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
108 p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())])
109 maximumY = numpy.amax(p1) 110 maximumY = numpy.amax(p1)
110 maximumX = int(maximumX)
111 print("max X", maximumX )
112
113 if relative is True: # relative difference 111 if relative is True: # relative difference
114 bin1 = numpy.arange(-1, maximumX + 0.2, 0.1) 112 bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
115 else: 113 else:
116 bin1 = maximumX + 1 114 bin1 = maximumX + 1
117 115
118 counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, 116 if rel_freq:
119 label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", 117 w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
120 "FS>10"], rwidth=0.8, 118 counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
121 color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"], 119 label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
122 stacked=True, alpha=1, 120 color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
123 align="left", 121 stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
124 range=(0, maximumX + 1)) 122 plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
123 plt.ylabel("Relative Frequency", fontsize=14)
124 bins = counts[1] # width of bins
125 counts = numpy.array(map(float, counts[0][5]))
126
127 else:
128 counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
129 label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
130 color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
131 stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
132 maximumY = numpy.amax(p1)
133 plt.ylim((0, maximumY * 1.2))
134 plt.ylabel("Absolute Frequency", fontsize=14)
135 bins = counts[1] # width of bins
136 counts = numpy.array(map(int, counts[0][5]))
137
125 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) 138 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
126 bins = counts[1] # width of bins
127 counts = numpy.array(map(int, counts[0][5]))
128 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) 139 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
129 # plt.title(title_file1, fontsize=12)
130 plt.xlabel(xlabel, fontsize=14) 140 plt.xlabel(xlabel, fontsize=14)
131 plt.ylabel("Absolute Frequency", fontsize=14)
132
133 plt.grid(b=True, which='major', color='#424242', linestyle=':') 141 plt.grid(b=True, which='major', color='#424242', linestyle=':')
134 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) 142 plt.xlim((minimumX - step, maximumX + step))
143 # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
135 plt.xticks(numpy.arange(0, maximumX + step, step)) 144 plt.xticks(numpy.arange(0, maximumX + step, step))
136 145
137 plt.ylim((0, maximumY * 1.2)) 146 if nr_above_bars:
138
139 if nr_above_bars is True:
140 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] 147 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
141 for x_label, label in zip(counts, bin_centers): # labels for values 148 for x_label, label in zip(counts, bin_centers): # labels for values
142 if x_label == 0: 149 if x_label == 0:
143 continue 150 continue
144 else: 151 else:
145 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), 152 if rel_freq:
146 xy=(label, x_label + len(con_list1) * 0.01), 153 plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
147 xycoords="data", color="#000066", fontsize=10) 154 float(x_label)),
148 155 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
149 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, sum(counts)) 156 xycoords="data", color="#000066", fontsize=10)
150 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) 157 else:
151 158 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
152 # if nr_unique_chimeras != 0 and len_sample != 0: 159 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
153 # if relative == True: 160 xycoords="data", color="#000066", fontsize=10)
154 # legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras, 161
155 # int(nr_unique_chimeras) / float(len_sample)) 162 if nr_unique_chimeras != 0:
156 # else: 163 if (relative and ((counts[len(counts)-1] / nr_unique_chimeras) == 2)) or \
157 # legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample)) 164 (sum(counts) / nr_unique_chimeras) == 2:
158 # plt.text(0.14, -0.09, legend, size=12, transform=plt.gcf().transFigure) 165 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
159 166 .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
167 else:
168 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
169 lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
170 else:
171 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
172 lenTags, len_sample, len(numpy.concatenate(list1)))
173
174 plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
160 pdf.savefig(fig, bbox_inches="tight") 175 pdf.savefig(fig, bbox_inches="tight")
161 plt.close("all") 176 plt.close("all")
162 plt.clf() 177 plt.clf()
163 178
164 179
165 def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf, len_sample): 180 def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
181 nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
182 step = 1
166 fig = plt.figure(figsize=(6, 8)) 183 fig = plt.figure(figsize=(6, 8))
167 plt.subplots_adjust(bottom=0.1) 184 plt.subplots_adjust(bottom=0.1)
185 p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
186 maximumY = numpy.amax(p1)
187 bin1 = maximumX + 1
188 if rel_freq:
189 w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
190 counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
191 label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
192 stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
193 plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
194 plt.ylabel("Relative Frequency", fontsize=14)
195 bins = counts[1] # width of bins
196 counts = numpy.array(map(float, counts[0][2]))
197
198 else:
199 counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
200 label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
201 stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
202 plt.ylim((0, maximumY * 1.2))
203 plt.ylabel("Absolute Frequency", fontsize=14)
204 bins = counts[1] # width of bins
205 counts = numpy.array(map(int, counts[0][2]))
206
207 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
208 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
209 plt.xlabel(xlabel, fontsize=14)
210 plt.grid(b=True, which='major', color='#424242', linestyle=':')
211 plt.xlim((minimumX - step, maximumX + step))
212 # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
213 plt.xticks(numpy.arange(0, maximumX + step, step))
214
215 if nr_above_bars:
216 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
217 for x_label, label in zip(counts, bin_centers): # labels for values
218 if x_label == 0:
219 continue
220 else:
221 if rel_freq:
222 plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
223 float(x_label)),
224 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
225 xycoords="data", color="#000066", fontsize=10)
226 else:
227 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
228 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
229 xycoords="data", color="#000066", fontsize=10)
230
231 if nr_unique_chimeras != 0:
232 if (sum(counts) / nr_unique_chimeras) == 2:
233 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
234 format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
235 else:
236 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
237 lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
238 else:
239 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
240 lenTags, len_sample, len(numpy.concatenate(list1)))
241 plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
242
243 legend2 = "SSCS ab = {:,}\nSSCS ba = {:,}\nDCS = {:,}".format(len(list1[1]), len(list1[2]), len(list1[0]))
244 plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)
245
246 pdf.savefig(fig, bbox_inches="tight")
247 plt.close("all")
248 plt.clf()
249
250
251 def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
252 fig = plt.figure(figsize=(6, 8))
253 plt.subplots_adjust(bottom=0.1)
168 254
169 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags 255 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags
170
171 maximumX = numpy.amax(numpy.concatenate(ham_partial)) 256 maximumX = numpy.amax(numpy.concatenate(ham_partial))
172 minimumX = numpy.amin(numpy.concatenate(ham_partial)) 257 minimumX = numpy.amin(numpy.concatenate(ham_partial))
173 maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial)))) 258 maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial))))
174 259
175 if len(range(minimumX, maximumX)) == 0: 260 if len(range(minimumX, maximumX)) == 0:
176 range1 = minimumX 261 range1 = minimumX
177 else: 262 else:
178 range1 = range(minimumX, maximumX + 2) 263 range1 = range(minimumX, maximumX + 2)
179 264
180 plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1) 265 if rel_freq:
266 w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
267 plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
268 label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
269 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
270 edgecolor='black', linewidth=1)
271 plt.ylabel("Relative Frequency", fontsize=14)
272 # plt.ylim(-0.1, (float(maximumY) / len(numpy.concatenate(ham_partial))) * 1.2)
273 else:
274 plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
275 label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
276 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
277 edgecolor='black', linewidth=1)
278 plt.ylabel("Absolute Frequency", fontsize=14)
181 279
182 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) 280 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
183 plt.suptitle('Hamming distances within tags', fontsize=14) 281 plt.suptitle('Hamming distances within tags', fontsize=14)
184 # plt.title(title_file1, fontsize=12)
185 plt.xlabel("HD", fontsize=14) 282 plt.xlabel("HD", fontsize=14)
186 plt.ylabel("Absolute Frequency", fontsize=14)
187 plt.grid(b=True, which='major', color='#424242', linestyle=':') 283 plt.grid(b=True, which='major', color='#424242', linestyle=':')
188 284 plt.xlim((minimumX - 1, maximumX + 1))
189 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) 285 # plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
190 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) 286 plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
191 # plt.ylim(0, maximumY * 1.2) 287 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
192 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, len(numpy.concatenate(ham_partial))) 288 lenTags, len_sample, len(numpy.concatenate(ham_partial)))
193
194 # legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags)
195 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) 289 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
196 pdf.savefig(fig, bbox_inches="tight") 290 pdf.savefig(fig, bbox_inches="tight")
197 plt.close("all") 291 plt.close("all")
198 plt.clf() 292 plt.clf()
199 293
204 nr = numpy.arange(0, len(uniqueFS), 1) 298 nr = numpy.arange(0, len(uniqueFS), 1)
205 if diff is False: 299 if diff is False:
206 count = numpy.zeros((len(uniqueFS), 6)) 300 count = numpy.zeros((len(uniqueFS), 6))
207 else: 301 else:
208 count = numpy.zeros((len(uniqueFS), 7)) 302 count = numpy.zeros((len(uniqueFS), 7))
209
210 state = 1 303 state = 1
211 for i in list1: 304 for i in list1:
212 counts = list(Counter(i).items()) 305 counts = list(Counter(i).items())
213 hd = [item[0] for item in counts] 306 hd = [item[0] for item in counts]
214 c = [item[1] for item in counts] 307 c = [item[1] for item in counts]
216 if len(table) == 0: 309 if len(table) == 0:
217 state = state + 1 310 state = state + 1
218 continue 311 continue
219 else: 312 else:
220 if state == 1: 313 if state == 1:
221 for i, l in zip(uniqueFS, nr): 314 for k, l in zip(uniqueFS, nr):
222 for j in table: 315 for j in table:
223 if j[0] == uniqueFS[l]: 316 if j[0] == uniqueFS[l]:
224 count[l, 0] = j[1] 317 count[l, 0] = j[1]
225 if state == 2: 318 if state == 2:
226 for i, l in zip(uniqueFS, nr): 319 for k, l in zip(uniqueFS, nr):
227 for j in table: 320 for j in table:
228 if j[0] == uniqueFS[l]: 321 if j[0] == uniqueFS[l]:
229 count[l, 1] = j[1] 322 count[l, 1] = j[1]
230
231 if state == 3: 323 if state == 3:
232 for i, l in zip(uniqueFS, nr): 324 for k, l in zip(uniqueFS, nr):
233 for j in table: 325 for j in table:
234 if j[0] == uniqueFS[l]: 326 if j[0] == uniqueFS[l]:
235 count[l, 2] = j[1] 327 count[l, 2] = j[1]
236
237 if state == 4: 328 if state == 4:
238 for i, l in zip(uniqueFS, nr): 329 for k, l in zip(uniqueFS, nr):
239 for j in table: 330 for j in table:
240 if j[0] == uniqueFS[l]: 331 if j[0] == uniqueFS[l]:
241 count[l, 3] = j[1] 332 count[l, 3] = j[1]
242
243 if state == 5: 333 if state == 5:
244 for i, l in zip(uniqueFS, nr): 334 for k, l in zip(uniqueFS, nr):
245 for j in table: 335 for j in table:
246 if j[0] == uniqueFS[l]: 336 if j[0] == uniqueFS[l]:
247 count[l, 4] = j[1] 337 count[l, 4] = j[1]
248
249 if state == 6: 338 if state == 6:
250 for i, l in zip(uniqueFS, nr): 339 for k, l in zip(uniqueFS, nr):
251 for j in table: 340 for j in table:
252 if j[0] == uniqueFS[l]: 341 if j[0] == uniqueFS[l]:
253 count[l, 5] = j[1] 342 count[l, 5] = j[1]
254
255 if state == 7: 343 if state == 7:
256 for i, l in zip(uniqueFS, nr): 344 for k, l in zip(uniqueFS, nr):
257 for j in table: 345 for j in table:
258 if j[0] == uniqueFS[l]: 346 if j[0] == uniqueFS[l]:
259 count[l, 6] = j[1] 347 count[l, 6] = j[1]
260 state = state + 1 348 state = state + 1
261
262 sumRow = count.sum(axis=1) 349 sumRow = count.sum(axis=1)
263 sumCol = count.sum(axis=0) 350 sumCol = count.sum(axis=0)
264
265 uniqueFS = uniqueFS.astype(str) 351 uniqueFS = uniqueFS.astype(str)
266 if uniqueFS[len(uniqueFS) - 1] == "20": 352 if uniqueFS[len(uniqueFS) - 1] == "20":
267 uniqueFS[len(uniqueFS) - 1] = ">20" 353 uniqueFS[len(uniqueFS) - 1] = ">20"
268
269 first = ["FS={}".format(i) for i in uniqueFS] 354 first = ["FS={}".format(i) for i in uniqueFS]
270 final = numpy.column_stack((first, count, sumRow)) 355 final = numpy.column_stack((first, count, sumRow))
271
272 return (final, sumCol) 356 return (final, sumCol)
273 357
274 358
275 def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True): 359 def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True):
276 output_file.write(name) 360 output_file.write(name)
277 output_file.write("\n") 361 output_file.write("\n")
278 if diff is False: 362 if diff is False:
279 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) 363 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(
364 sep, sep, sep, sep, sep, sep, sep, sep))
280 else: 365 else:
281 if rel is False: 366 if rel is False:
282 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) 367 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(
368 sep, sep, sep, sep, sep, sep, sep, sep, sep))
283 else: 369 else:
284 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) 370 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".
285 371 format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
286 for item in summary: 372 for item in summary:
287 for nr in item: 373 for nr in item:
288 if "FS" not in nr and "diff" not in nr: 374 if "FS" not in nr and "diff" not in nr:
289 nr = nr.astype(float) 375 nr = nr.astype(float)
290 nr = nr.astype(int) 376 nr = nr.astype(int)
312 if len(table) == 0: 398 if len(table) == 0:
313 state = state + 1 399 state = state + 1
314 continue 400 continue
315 else: 401 else:
316 if state == 1: 402 if state == 1:
317 for i, l in zip(uniqueHD, nr): 403 for k, l in zip(uniqueHD, nr):
318 for j in table: 404 for j in table:
319 if j[0] == uniqueHD[l]: 405 if j[0] == uniqueHD[l]:
320 count[l, 0] = j[1] 406 count[l, 0] = j[1]
321 if state == 2: 407 if state == 2:
322 for i, l in zip(uniqueHD, nr): 408 for k, l in zip(uniqueHD, nr):
323 for j in table: 409 for j in table:
324 if j[0] == uniqueHD[l]: 410 if j[0] == uniqueHD[l]:
325 count[l, 1] = j[1] 411 count[l, 1] = j[1]
326
327 if state == 3: 412 if state == 3:
328 for i, l in zip(uniqueHD, nr): 413 for k, l in zip(uniqueHD, nr):
329 for j in table: 414 for j in table:
330 if j[0] == uniqueHD[l]: 415 if j[0] == uniqueHD[l]:
331 count[l, 2] = j[1] 416 count[l, 2] = j[1]
332
333 if state == 4: 417 if state == 4:
334 for i, l in zip(uniqueHD, nr): 418 for k, l in zip(uniqueHD, nr):
335 for j in table: 419 for j in table:
336 if j[0] == uniqueHD[l]: 420 if j[0] == uniqueHD[l]:
337 count[l, 3] = j[1] 421 count[l, 3] = j[1]
338
339 if state == 5: 422 if state == 5:
340 for i, l in zip(uniqueHD, nr): 423 for k, l in zip(uniqueHD, nr):
341 for j in table: 424 for j in table:
342 if j[0] == uniqueHD[l]: 425 if j[0] == uniqueHD[l]:
343 count[l, 4] = j[1] 426 count[l, 4] = j[1]
344
345 if state == 6: 427 if state == 6:
346 for i, l in zip(uniqueHD, nr): 428 for k, l in zip(uniqueHD, nr):
347 for j in table: 429 for j in table:
348 if j[0] == uniqueHD[l]: 430 if j[0] == uniqueHD[l]:
349 count[l, 5] = j[1] 431 count[l, 5] = j[1]
350 state = state + 1 432 state = state + 1
351
352 sumRow = count.sum(axis=1) 433 sumRow = count.sum(axis=1)
353 sumCol = count.sum(axis=0) 434 sumCol = count.sum(axis=0)
354 first = ["{}{}".format(row_label, i) for i in uniqueHD] 435 first = ["{}{}".format(row_label, i) for i in uniqueHD]
355 final = numpy.column_stack((first, count, sumRow)) 436 final = numpy.column_stack((first, count, sumRow))
356
357 return (final, sumCol) 437 return (final, sumCol)
358 438
359 439
360 def createTableHDwithTags(list1): 440 def createTableHDwithTags(list1):
361 selfAB = numpy.concatenate(list1) 441 selfAB = numpy.concatenate(list1)
362 uniqueHD = numpy.unique(selfAB) 442 uniqueHD = numpy.unique(selfAB)
363 nr = numpy.arange(0, len(uniqueHD), 1) 443 nr = numpy.arange(0, len(uniqueHD), 1)
364 count = numpy.zeros((len(uniqueHD), 5)) 444 count = numpy.zeros((len(uniqueHD), 5))
365
366 state = 1 445 state = 1
367 for i in list1: 446 for i in list1:
368 counts = list(Counter(i).items()) 447 counts = list(Counter(i).items())
369 hd = [item[0] for item in counts] 448 hd = [item[0] for item in counts]
370 c = [item[1] for item in counts] 449 c = [item[1] for item in counts]
372 if len(table) == 0: 451 if len(table) == 0:
373 state = state + 1 452 state = state + 1
374 continue 453 continue
375 else: 454 else:
376 if state == 1: 455 if state == 1:
377 for i, l in zip(uniqueHD, nr): 456 for k, l in zip(uniqueHD, nr):
378 for j in table: 457 for j in table:
379 if j[0] == uniqueHD[l]: 458 if j[0] == uniqueHD[l]:
380 count[l, 0] = j[1] 459 count[l, 0] = j[1]
381 if state == 2: 460 if state == 2:
382 for i, l in zip(uniqueHD, nr): 461 for k, l in zip(uniqueHD, nr):
383 for j in table: 462 for j in table:
384 if j[0] == uniqueHD[l]: 463 if j[0] == uniqueHD[l]:
385 count[l, 1] = j[1] 464 count[l, 1] = j[1]
386 if state == 3: 465 if state == 3:
387 for i, l in zip(uniqueHD, nr): 466 for k, l in zip(uniqueHD, nr):
388 for j in table: 467 for j in table:
389 if j[0] == uniqueHD[l]: 468 if j[0] == uniqueHD[l]:
390 count[l, 2] = j[1] 469 count[l, 2] = j[1]
391 if state == 4: 470 if state == 4:
392 for i, l in zip(uniqueHD, nr): 471 for k, l in zip(uniqueHD, nr):
393 for j in table: 472 for j in table:
394 if j[0] == uniqueHD[l]: 473 if j[0] == uniqueHD[l]:
395 count[l, 3] = j[1] 474 count[l, 3] = j[1]
396 if state == 5: 475 if state == 5:
397 for i, l in zip(uniqueHD, nr): 476 for k, l in zip(uniqueHD, nr):
398 for j in table: 477 for j in table:
399 if j[0] == uniqueHD[l]: 478 if j[0] == uniqueHD[l]:
400 count[l, 4] = j[1] 479 count[l, 4] = j[1]
401
402 state = state + 1 480 state = state + 1
403
404 sumRow = count.sum(axis=1) 481 sumRow = count.sum(axis=1)
405 sumCol = count.sum(axis=0) 482 sumCol = count.sum(axis=0)
406 first = ["HD={}".format(i) for i in uniqueHD] 483 first = ["HD={}".format(i) for i in uniqueHD]
407 final = numpy.column_stack((first, count, sumRow)) 484 final = numpy.column_stack((first, count, sumRow))
408 485 return (final, sumCol)
486
487
488 def createTableHDwithDCS(list1):
489 selfAB = numpy.concatenate(list1)
490 uniqueHD = numpy.unique(selfAB)
491 nr = numpy.arange(0, len(uniqueHD), 1)
492 count = numpy.zeros((len(uniqueHD), len(list1)))
493 state = 1
494 for i in list1:
495 counts = list(Counter(i).items())
496 hd = [item[0] for item in counts]
497 c = [item[1] for item in counts]
498 table = numpy.column_stack((hd, c))
499 if len(table) == 0:
500 state = state + 1
501 continue
502 else:
503 if state == 1:
504 for k, l in zip(uniqueHD, nr):
505 for j in table:
506 if j[0] == uniqueHD[l]:
507 count[l, 0] = j[1]
508 if state == 2:
509 for k, l in zip(uniqueHD, nr):
510 for j in table:
511 if j[0] == uniqueHD[l]:
512 count[l, 1] = j[1]
513 if state == 3:
514 for k, l in zip(uniqueHD, nr):
515 for j in table:
516 if j[0] == uniqueHD[l]:
517 count[l, 2] = j[1]
518 state = state + 1
519 sumRow = count.sum(axis=1)
520 sumCol = count.sum(axis=0)
521 first = ["HD={}".format(i) for i in uniqueHD]
522 final = numpy.column_stack((first, count, sumRow))
409 return (final, sumCol) 523 return (final, sumCol)
410 524
411 525
412 def createFileHD(summary, sumCol, overallSum, output_file, name, sep): 526 def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
413 output_file.write(name) 527 output_file.write(name)
414 output_file.write("\n") 528 output_file.write("\n")
415 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) 529 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(
530 sep, sep, sep, sep, sep, sep, sep, sep))
416 for item in summary: 531 for item in summary:
417 for nr in item: 532 for nr in item:
418 if "HD" not in nr and "diff" not in nr: 533 if "HD" not in nr and "diff" not in nr:
419 nr = nr.astype(float) 534 nr = nr.astype(float)
420 nr = nr.astype(int) 535 nr = nr.astype(int)
426 output_file.write("{}{}".format(el, sep)) 541 output_file.write("{}{}".format(el, sep))
427 output_file.write("{}{}".format(overallSum.astype(int), sep)) 542 output_file.write("{}{}".format(overallSum.astype(int), sep))
428 output_file.write("\n\n") 543 output_file.write("\n\n")
429 544
430 545
431 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep): 546 def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep):
432 output_file.write(name) 547 output_file.write(name)
433 output_file.write("\n") 548 output_file.write("\n")
434 output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep)) 549 output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep))
435 for item in summary: 550 for item in summary:
436 for nr in item: 551 for nr in item:
437 if "HD" not in nr: 552 if "HD" not in nr:
438 nr = nr.astype(float) 553 nr = nr.astype(float)
439 nr = nr.astype(int) 554 nr = nr.astype(int)
445 output_file.write("{}{}".format(el, sep)) 560 output_file.write("{}{}".format(el, sep))
446 output_file.write("{}{}".format(overallSum.astype(int), sep)) 561 output_file.write("{}{}".format(overallSum.astype(int), sep))
447 output_file.write("\n\n") 562 output_file.write("\n\n")
448 563
449 564
565 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
566 output_file.write(name)
567 output_file.write("\n")
568 output_file.write("{}HD DCS{}HD b'{}HD b{}HD a'{}HD a+b', a'+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
569 for item in summary:
570 for nr in item:
571 if "HD" not in nr:
572 nr = nr.astype(float)
573 nr = nr.astype(int)
574 output_file.write("{}{}".format(nr, sep))
575 output_file.write("\n")
576 output_file.write("sum{}".format(sep))
577 sumCol = map(int, sumCol)
578 for el in sumCol:
579 output_file.write("{}{}".format(el, sep))
580 output_file.write("{}{}".format(overallSum.astype(int), sep))
581 output_file.write("\n\n")
582
583
450 def hamming(array1, array2): 584 def hamming(array1, array2):
451 res = 99 * numpy.ones(len(array1)) 585 res = 99 * numpy.ones(len(array1))
452 i = 0 586 i = 0
453 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time 587 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time
454 for a in array1: 588 for a in array1:
455 dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest 589 dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest
456 res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero 590 res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero
457 # print(i)
458 i += 1 591 i += 1
459 return res 592 return res
460 593
461 594
462 def hamming_difference(array1, array2, mate_b): 595 def hamming_difference(array1, array2, mate_b):
463 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time 596 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time
464
465 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 597 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1
466 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 598 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2
467
468 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 599 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1
469 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 600 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2
470 601
471 # diff11 = 999 * numpy.ones(len(array2)) 602 # diff11 = 999 * numpy.ones(len(array2))
472 # relativeDiffList = 999 * numpy.ones(len(array2)) 603 # relativeDiffList = 999 * numpy.ones(len(array2))
497 elif mate_b is True: # HD calculation for all b's 628 elif mate_b is True: # HD calculation for all b's
498 half1_mate1 = array1_half2 629 half1_mate1 = array1_half2
499 half2_mate1 = array1_half 630 half2_mate1 = array1_half
500 half1_mate2 = array2_half2 631 half1_mate2 = array2_half2
501 half2_mate2 = array2_half 632 half2_mate2 = array2_half
633
502 # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True) 634 # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True)
503 # print(len(half1_mate1)) 635 # print(len(half1_mate1))
504 # half2_mate1 = half2_mate1[index_halves] 636 # half2_mate1 = half2_mate1[index_halves]
505 # array1 = array1[index_halves] 637 # array1 = array1[index_halves]
506 638
512 644
513 # all tags without identical tag 645 # all tags without identical tag
514 array2_half_withoutSame = half1_mate2[index_withoutSame] 646 array2_half_withoutSame = half1_mate2[index_withoutSame]
515 array2_half2_withoutSame = half2_mate2[index_withoutSame] 647 array2_half2_withoutSame = half2_mate2[index_withoutSame]
516 array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) 648 array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs)
517 649 # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
518 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in 650 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
519 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" 651 array2_half_withoutSame])
520 min_index = numpy.where(dist == dist.min())[0] # get index of min HD 652 min_index = numpy.where(dist == dist.min())[0] # get index of min HD
521 min_value = dist.min() 653 min_value = dist.min()
522 # min_value = dist[min_index] # get minimum HDs 654 # min_value = dist[min_index] # get minimum HDs
523 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD 655 # get all "b's" of the tag or all "a's" of the tag with minimum HD
656 min_tag_half2 = array2_half2_withoutSame[min_index]
524 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD 657 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD
525 658
526 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" 659 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
660 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's"
527 max_value = dist_second_half.max() 661 max_value = dist_second_half.max()
528 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD 662 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
529 max_tag = min_tag_array2[max_index] 663 max_tag = min_tag_array2[max_index]
530 664
531 # for d, d2 in zip(min_value, max_value): 665 # for d, d2 in zip(min_value, max_value):
546 # tags which have identical parts: 680 # tags which have identical parts:
547 if min_value == 0 or max_value == 0: 681 if min_value == 0 or max_value == 0:
548 min_tagsList_zeros.append(numpy.array(tag)) 682 min_tagsList_zeros.append(numpy.array(tag))
549 difference1_zeros = abs(min_value - max_value) # hd of non-identical part 683 difference1_zeros = abs(min_value - max_value) # hd of non-identical part
550 diff11_zeros.append(difference1_zeros) 684 diff11_zeros.append(difference1_zeros)
551 max_tag_list.append(max_tag) 685 max_tag_list.append(numpy.array(max_tag))
552 else: 686 else:
553 min_tagsList_zeros.append(None) 687 min_tagsList_zeros.append(None)
554 diff11_zeros.append(None) 688 diff11_zeros.append(None)
555 max_tag_list.append(numpy.array(["None"])) 689 max_tag_list.append(None)
556
557 # max_tag_list.append(numpy.array(max_tag))
558
559 i += 1 690 i += 1
560 691
561 # print(i) 692 # print(i)
562 # diff11 = [st for st in diff11 if st != 999] 693 # diff11 = [st for st in diff11 if st != 999]
563 # ham1 = [st for st in ham1 if st != 999] 694 # ham1 = [st for st in ham1 if st != 999]
565 # min_valueList = [st for st in min_valueList if st != 999] 696 # min_valueList = [st for st in min_valueList if st != 999]
566 # min_tagsList = [st for st in min_tagsList if st != 999] 697 # min_tagsList = [st for st in min_tagsList if st != 999]
567 # relativeDiffList = [st for st in relativeDiffList if st != 999] 698 # relativeDiffList = [st for st in relativeDiffList if st != 999]
568 # diff11_zeros = [st for st in diff11_zeros if st != 999] 699 # diff11_zeros = [st for st in diff11_zeros if st != 999]
569 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] 700 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
570 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min, max_tag_list]) 701 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros,
702 min_tagsList_zeros, ham1min, ham2min, max_tag_list])
571 703
572 704
573 def readFileReferenceFree(file): 705 def readFileReferenceFree(file):
574 with open(file, 'r') as dest_f: 706 with open(file, 'r') as dest_f:
575 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') 707 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
606 738
607 739
608 def familySizeDistributionWithHD(fs, ham, diff=False, rel=True): 740 def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
609 hammingDistances = numpy.unique(ham) 741 hammingDistances = numpy.unique(ham)
610 fs = numpy.asarray(fs) 742 fs = numpy.asarray(fs)
611
612 ham = numpy.asarray(ham) 743 ham = numpy.asarray(ham)
613 bigFamilies2 = numpy.where(fs > 19)[0] 744 bigFamilies2 = numpy.where(fs > 19)[0]
614 if len(bigFamilies2) != 0: 745 if len(bigFamilies2) != 0:
615 fs[bigFamilies2] = 20 746 fs[bigFamilies2] = 20
616 maximum = max(fs) 747 maximum = max(fs)
659 list1 = [data0, data, data2, data3, data4, data5, data6] 790 list1 = [data0, data, data2, data3, data4, data5, data6]
660 else: 791 else:
661 list1 = [data, data2, data3, data4, data5, data6] 792 list1 = [data, data2, data3, data4, data5, data6]
662 793
663 return(list1, hammingDistances, maximum, minimum) 794 return(list1, hammingDistances, maximum, minimum)
795
796
797 def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array):
798 diff_zeros = numpy.array(diff_zeros)
799 maximum = numpy.amax(diff_zeros)
800 minimum = numpy.amin(diff_zeros)
801 minHD_tags_zeros = numpy.array(minHD_tags_zeros)
802
803 idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros])
804 subset_data = data_array[idx, :]
805
806 seq = numpy.array(subset_data[:, 1])
807
808 # find all unique tags and get the indices for ALL tags, but only once
809 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
810 DCS_tags = u[c == 2]
811 rest_tags = u[c == 1]
812
813 dcs = numpy.repeat("DCS", len(DCS_tags))
814 idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags])
815 sscs = subset_data[idx_sscs, 2]
816
817 all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])),
818 numpy.concatenate((dcs, sscs))))
819 hd_DCS = []
820 ab_SSCS = []
821 ba_SSCS = []
822
823 for i in range(len(all_tags)):
824 tag = all_tags[i, :]
825 hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]]
826
827 if tag[1] == "DCS":
828 hd_DCS.append(hd)
829 elif tag[1] == "ab":
830 ab_SSCS.append(hd)
831 elif tag[1] == "ba":
832 ba_SSCS.append(hd)
833
834 if len(hd_DCS) != 0:
835 hd_DCS = numpy.concatenate(hd_DCS)
836 if len(ab_SSCS) != 0:
837 ab_SSCS = numpy.concatenate(ab_SSCS)
838 if len(ba_SSCS) != 0:
839 ba_SSCS = numpy.concatenate(ba_SSCS)
840 list1 = [hd_DCS, ab_SSCS, ba_SSCS] # list for plotting
841 return(list1, maximum, minimum)
664 842
665 843
666 def make_argparser(): 844 def make_argparser():
667 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') 845 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
668 parser.add_argument('--inputFile', 846 parser.add_argument('--inputFile',
676 help='The tool runs with the given number of processors.') 854 help='The tool runs with the given number of processors.')
677 parser.add_argument('--only_DCS', action="store_false", 855 parser.add_argument('--only_DCS', action="store_false",
678 help='Only tags of the DCSs are included in the HD analysis') 856 help='Only tags of the DCSs are included in the HD analysis')
679 857
680 parser.add_argument('--minFS', default=1, type=int, 858 parser.add_argument('--minFS', default=1, type=int,
681 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') 859 help='Only tags, which have a family size greater or equal than specified, '
860 'are included in the HD analysis')
682 parser.add_argument('--maxFS', default=0, type=int, 861 parser.add_argument('--maxFS', default=0, type=int,
683 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') 862 help='Only tags, which have a family size smaller or equal than specified, '
863 'are included in the HD analysis')
684 parser.add_argument('--nr_above_bars', action="store_true", 864 parser.add_argument('--nr_above_bars', action="store_true",
685 help='If no, values above bars in the histograms are removed') 865 help='If False, values above bars in the histograms are removed')
866 parser.add_argument('--rel_freq', action="store_false",
867 help='If True, the relative frequencies are displayed.')
686 868
687 parser.add_argument('--output_tabular', default="data.tabular", type=str, 869 parser.add_argument('--output_tabular', default="data.tabular", type=str,
688 help='Name of the tabular file.') 870 help='Name of the tabular file.')
689 parser.add_argument('--output_pdf', default="data.pdf", type=str, 871 parser.add_argument('--output_pdf', default="data.pdf", type=str,
690 help='Name of the pdf file.') 872 help='Name of the pdf file.')
693 875
694 return parser 876 return parser
695 877
696 878
697 def Hamming_Distance_Analysis(argv): 879 def Hamming_Distance_Analysis(argv):
698 880 # def Hamming_Distance_Analysis(file1, name1, index_size, title_savedFile_pdf, title_savedFile_csv,
881 # output_chimeras_tabular, onlyDuplicates, minFS=1, maxFS=0, nr_above_bars=True,
882 # subset=False, nproc=12, rel_freq=False):
699 parser = make_argparser() 883 parser = make_argparser()
700 args = parser.parse_args(argv[1:]) 884 args = parser.parse_args(argv[1:])
701
702 file1 = args.inputFile 885 file1 = args.inputFile
703 name1 = args.inputName1 886 name1 = args.inputName1
704
705 index_size = args.sample_size 887 index_size = args.sample_size
706 title_savedFile_pdf = args.output_pdf 888 title_savedFile_pdf = args.output_pdf
707 title_savedFile_csv = args.output_tabular 889 title_savedFile_csv = args.output_tabular
708 output_chimeras_tabular = args.output_chimeras_tabular 890 output_chimeras_tabular = args.output_chimeras_tabular
709
710 sep = "\t"
711 onlyDuplicates = args.only_DCS 891 onlyDuplicates = args.only_DCS
892 rel_freq = args.rel_freq
712 minFS = args.minFS 893 minFS = args.minFS
713 maxFS = args.maxFS 894 maxFS = args.maxFS
714 nr_above_bars = args.nr_above_bars 895 nr_above_bars = args.nr_above_bars
715
716 subset = args.subset_tag 896 subset = args.subset_tag
717 nproc = args.nproc 897 nproc = args.nproc
898 sep = "\t"
718 899
719 # input checks 900 # input checks
720 if index_size < 0: 901 if index_size < 0:
721 print("index_size is a negative integer.") 902 print("index_size is a negative integer.")
722 exit(2) 903 exit(2)
723
724 if nproc <= 0: 904 if nproc <= 0:
725 print("nproc is smaller or equal zero") 905 print("nproc is smaller or equal zero")
726 exit(3) 906 exit(3)
727
728 if subset < 0: 907 if subset < 0:
729 print("subset_tag is smaller or equal zero.") 908 print("subset_tag is smaller or equal zero.")
730 exit(5) 909 exit(5)
731 910
732 # PLOT 911 # PLOT
733 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color 912 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color
734 plt.rcParams['xtick.labelsize'] = 14 913 plt.rcParams['xtick.labelsize'] = 14
735 plt.rcParams['ytick.labelsize'] = 14 914 plt.rcParams['ytick.labelsize'] = 14
736 plt.rcParams['patch.edgecolor'] = "#000000" 915 plt.rcParams['patch.edgecolor'] = "#000000"
737 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 916 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
738
739 name1 = name1.split(".tabular")[0] 917 name1 = name1.split(".tabular")[0]
740 918
741 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: 919 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
742 print("dataset: ", name1) 920 print("dataset: ", name1)
743 integers, data_array = readFileReferenceFree(file1) 921 integers, data_array = readFileReferenceFree(file1)
744 data_array = numpy.array(data_array) 922 data_array = numpy.array(data_array)
745 print("total nr of tags with Ns:", len(data_array)) 923 print("total nr of tags:", len(data_array))
746 n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x] 924
747 if len(n) != 0: # delete tags with N in the tag from data 925 # filter tags out which contain any other character than ATCG
748 print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array)) 926 valid_bases = ["A", "T", "G", "C"]
927 tagsToDelete = []
928 for idx, t in enumerate(data_array[:, 1]):
929 for char in t:
930 if char not in valid_bases:
931 tagsToDelete.append(idx)
932 break
933
934 if len(tagsToDelete) != 0: # delete tags with N in the tag from data
935 print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete),
936 float(len(tagsToDelete)) / len(data_array))
749 index_whole_array = numpy.arange(0, len(data_array), 1) 937 index_whole_array = numpy.arange(0, len(data_array), 1)
750 index_withoutN_inTag = numpy.delete(index_whole_array, n) 938 index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete)
751 data_array = data_array[index_withoutN_inTag, :] 939 data_array = data_array[index_withoutN_inTag, :]
752 integers = integers[index_withoutN_inTag] 940 integers = integers[index_withoutN_inTag]
753 print("total nr of tags without Ns:", len(data_array)) 941 print("total nr of filtered tags:", len(data_array))
754 942
755 int_f = numpy.array(data_array[:, 0]).astype(int) 943 int_f = numpy.array(data_array[:, 0]).astype(int)
756 data_array = data_array[numpy.where(int_f >= minFS)] 944 data_array = data_array[numpy.where(int_f >= minFS)]
757 integers = integers[integers >= minFS] 945 integers = integers[integers >= minFS]
758 946
766 tags = data_array[:, 2] 954 tags = data_array[:, 2]
767 seq = data_array[:, 1] 955 seq = data_array[:, 1]
768 956
769 # find all unique tags and get the indices for ALL tags, but only once 957 # find all unique tags and get the indices for ALL tags, but only once
770 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) 958 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
771 d = u[c > 1] 959 d = u[c == 2]
772 960
773 # get family sizes, tag for duplicates 961 # get family sizes, tag for duplicates
774 duplTags_double = integers[numpy.in1d(seq, d)] 962 duplTags_double = integers[numpy.in1d(seq, d)]
775 duplTags = duplTags_double[0::2] # ab of DCS 963 duplTags = duplTags_double[0::2] # ab of DCS
776 duplTagsBA = duplTags_double[1::2] # ba of DCS 964 duplTagsBA = duplTags_double[1::2] # ba of DCS
777 965
778 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab 966 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab
779 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags 967 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags
780 968
781 if minFS > 1: 969 if minFS > 1:
782 duplTags_tag = duplTags_tag[(duplTags >= 3) & (duplTagsBA >= 3)] 970 duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)]
783 duplTags_seq = duplTags_seq[(duplTags >= 3) & (duplTagsBA >= 3)] 971 duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)]
784 duplTags = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 972 duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)] # ab+ba with FS>=3
785 973
786 data_array = numpy.column_stack((duplTags, duplTags_seq)) 974 data_array = numpy.column_stack((duplTags, duplTags_seq))
787 data_array = numpy.column_stack((data_array, duplTags_tag)) 975 data_array = numpy.column_stack((data_array, duplTags_tag))
788 integers = numpy.array(data_array[:, 0]).astype(int) 976 integers = numpy.array(data_array[:, 0]).astype(int)
789 print("DCS in whole dataset", len(data_array)) 977 print("DCS in whole dataset", len(data_array))
817 if index_size == 0: 1005 if index_size == 0:
818 result = numpy.arange(0, len(data_array), 1) 1006 result = numpy.arange(0, len(data_array), 1)
819 else: 1007 else:
820 numpy.random.shuffle(data_array) 1008 numpy.random.shuffle(data_array)
821 unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags 1009 unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags
822 result = numpy.random.choice(unique_indices, size=index_size, replace=False) # array of random sequences of size=index.size 1010 result = numpy.random.choice(unique_indices, size=index_size,
1011 replace=False) # array of random sequences of size=index.size
823 1012
824 # result = numpy.random.choice(len(integers), size=index_size, 1013 # result = numpy.random.choice(len(integers), size=index_size,
825 # replace=False) # array of random sequences of size=index.size 1014 # replace=False) # array of random sequences of size=index.size
826 # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0] 1015 # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0]
827 1016
828 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: 1017 # with open("index_result.pkl", "wb") as o:
829 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) 1018 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
1019
1020 # save counts
1021 # with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f:
1022 # pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
1023 # with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1:
1024 # pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL)
1025 #
1026 # with open(data_folder + "index_sampleTags100.pkl", "rb") as f:
1027 # result = pickle.load(f)
1028 #
1029 # with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1:
1030 # data_array = pickle.load(f1)
1031
1032 # with open(data_folder + "index_result.txt", "w") as t:
1033 # for text in result:
1034 # t.write("{}\n".format(text))
830 1035
831 # comparison random tags to whole dataset 1036 # comparison random tags to whole dataset
832 result1 = data_array[result, 1] # random tags 1037 result1 = data_array[result, 1] # random tags
833 result2 = data_array[:, 1] # all tags 1038 result2 = data_array[:, 1] # all tags
834 print("sample size= ", len(result1)) 1039 print("sample size= ", len(result1))
871 diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a]) 1076 diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a])
872 diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b]) 1077 diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b])
873 minHD_tags = numpy.concatenate([item[4] for item in diff_list_a]) 1078 minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
874 minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a]) 1079 minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
875 minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b]) 1080 minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])
876 chim_tags = [item[10] for item in diff_list_a] 1081
877 chim_tags2 = [item[10] for item in diff_list_b] 1082 chimera_tags1 = sum([item[10] for item in diff_list_a], [])
878 chimera_tags1 = [ii if isinstance(i, list) else i for i in chim_tags for ii in i] 1083 chimera_tags2 = numpy.concatenate([item[10] for item in diff_list_b])
879 chimera_tags2 = [ii if isinstance(i, list) else i for i in chim_tags2 for ii in i]
880 1084
881 rel_Diff = [] 1085 rel_Diff = []
882 diff_zeros = [] 1086 diff_zeros = []
883 minHD_tags_zeros = [] 1087 minHD_tags_zeros = []
884 diff = [] 1088 diff = []
885 chimera_tags = [] 1089 chimera_tags = []
886 for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \ 1090 for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
887 zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, chimera_tags1, chimera_tags2): 1091 zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2,
1092 chimera_tags1, chimera_tags2):
888 rel_Diff.append(max(rel1, rel2)) 1093 rel_Diff.append(max(rel1, rel2))
889 diff.append(max(d1, d2)) 1094 diff.append(max(d1, d2))
890 1095
891 if all(i is not None for i in [zeros1, zeros2]): 1096 if all(i is not None for i in [zeros1, zeros2]):
892 diff_zeros.append(max(zeros1, zeros2)) 1097 diff_zeros.append(max(zeros1, zeros2))
901 diff_zeros.append(zeros2) 1106 diff_zeros.append(zeros2)
902 minHD_tags_zeros.append(str(tag2)) 1107 minHD_tags_zeros.append(str(tag2))
903 chimera_tags.append(ctag2) 1108 chimera_tags.append(ctag2)
904 1109
905 chimera_tags_new = chimera_tags 1110 chimera_tags_new = chimera_tags
906 #data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new)) 1111 data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
907 # chimeras_dic = defaultdict(list) 1112 # chimeras_dic = defaultdict(list)
908 # 1113 #
909 # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): 1114 # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
910 # if len(t2) >1 and type(t2) is not numpy.ndarray: 1115 # if len(t2) >1 and type(t2) is not numpy.ndarray:
911 # t2 = numpy.concatenate(t2) 1116 # t2 = numpy.concatenate(t2)
912 # chimeras_dic[t1].append(t2) 1117 # chimeras_dic[t1].append(t2)
913 1118
1119 checked_tags = []
1120 stat_maxTags = []
1121
914 with open(output_chimeras_tabular, "w") as output_file1: 1122 with open(output_chimeras_tabular, "w") as output_file1:
915 output_file1.write("chimera tag\tsimilar tag with HD=0\n") 1123 output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with HD=0\n")
916 for i in range(len(minHD_tags_zeros)): 1124 for i in range(len(data_chimeraAnalysis)):
917 tag1 = minHD_tags_zeros[i] 1125 tag1 = data_chimeraAnalysis[i, 0]
1126
1127 info_tag1 = data_array[data_array[:, 1] == tag1, :]
1128 fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1]
1129
1130 if tag1 in checked_tags: # skip tag if already written to file
1131 continue
1132
918 sample_half_a = tag1[0:(len(tag1)) / 2] 1133 sample_half_a = tag1[0:(len(tag1)) / 2]
919 sample_half_b = tag1[len(tag1) / 2:len(tag1)] 1134 sample_half_b = tag1[len(tag1) / 2:len(tag1)]
920 1135
921 max_tags = chimera_tags_new[i] 1136 max_tags = data_chimeraAnalysis[i, 1]
922 if isinstance(max_tags, list) and len(max_tags) > 1: 1137 if len(max_tags) > 1 and type(max_tags) is not numpy.ndarray:
923 max_tags = numpy.concatenate(max_tags) 1138 max_tags = numpy.concatenate(max_tags)
924 #if isinstance(max_tags, list): #and type(max_tags) is not numpy.ndarray:
925 # print(max_tags)
926 # max_tags = numpy.concatenate(max_tags)
927 max_tags = numpy.unique(max_tags) 1139 max_tags = numpy.unique(max_tags)
928 1140 stat_maxTags.append(len(max_tags))
929 chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 1141
930 chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 1142 info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags]
1143
1144 chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags]) # mate1 part1
1145 chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags]) # mate1 part 2
931 1146
932 new_format = [] 1147 new_format = []
933 for j in range(len(max_tags)): 1148 for j in range(len(max_tags)):
1149 fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]]
1150
934 if sample_half_a == chimera_half_a[j]: 1151 if sample_half_a == chimera_half_a[j]:
935 max_tag = "*{}* {}".format(chimera_half_a[j], chimera_half_b[j]) 1152 max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
936 new_format.append(max_tag) 1153 new_format.append(max_tag)
937 1154
938 elif sample_half_b == chimera_half_b[j]: 1155 elif sample_half_b == chimera_half_b[j]:
939 max_tag = "{} *{}*".format(chimera_half_a[j], chimera_half_b[j]) 1156 max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
940 new_format.append(max_tag) 1157 new_format.append(max_tag)
941 1158 checked_tags.append(max_tags[j])
942 sample_tag = "{} {}".format(sample_half_a, sample_half_b) 1159
1160 sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1))
943 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) 1161 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
1162
1163 checked_tags.append(tag1)
1164
944 output_file1.write( 1165 output_file1.write(
945 "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " 1166 "This file contains all tags that were identified as chimeras as the first column and the "
1167 "corresponding tags which returned a Hamming distance of zero in either the first or the second "
1168 "half of the sample tag as the second column.\n"
946 "The tags were separated by an empty space into their halves and the * marks the identical half.") 1169 "The tags were separated by an empty space into their halves and the * marks the identical half.")
947 1170 output_file1.write("\n\nStatistics of nr. of tags that returned max. HD (2nd column)\n")
948 # unique_chimeras = numpy.array(minHD_tags_zeros) 1171 output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags))))
949 # 1172 output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags))))
950 # sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras]) # mate1 part1 1173 output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags))))
951 # sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras]) # mate1 part 2 1174 output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags))))
952 # 1175 output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags))))
953 # output_file1.write("sample tag\tsimilar tag\n")
954 # for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b):
955 # max_tags = numpy.concatenate(chimeras_dic.get(tag1))
956 # max_tags = numpy.unique(max_tags)
957 #
958 # chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1
959 # chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2
960 #
961 # new_format = []
962 # for i in range(len(max_tags)):
963 # if a == chimera_half_a[i]:
964 # max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i])
965 # new_format.append(max_tag)
966 #
967 # elif b == chimera_half_b[i]:
968 # max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i])
969 # new_format.append(max_tag)
970 #
971 # sample_tag = "{} {}".format(a, b)
972 # output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
973 # output_file1.write(
974 # "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
975 # "The tags were separated by an empty space into their halves and the * marks the identical half.")
976
977 nr_chimeric_tags = len(minHD_tags_zeros)
978 print("nr of unique chimeras", nr_chimeric_tags)
979 1176
980 lenTags = len(data_array) 1177 lenTags = len(data_array)
981 len_sample = len(result1) 1178 len_sample = len(result1)
982 1179
983 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags 1180 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags
989 seq = numpy.tile(seq, 2) 1186 seq = numpy.tile(seq, 2)
990 ham = numpy.tile(ham, 2) 1187 ham = numpy.tile(ham, 2)
991 diff = numpy.tile(diff, 2) 1188 diff = numpy.tile(diff, 2)
992 rel_Diff = numpy.tile(rel_Diff, 2) 1189 rel_Diff = numpy.tile(rel_Diff, 2)
993 diff_zeros = numpy.tile(diff_zeros, 2) 1190 diff_zeros = numpy.tile(diff_zeros, 2)
1191
1192 nr_chimeric_tags = len(data_chimeraAnalysis)
1193 print("nr of chimeras", nr_chimeric_tags)
994 1194
995 # prepare data for different kinds of plots 1195 # prepare data for different kinds of plots
996 # distribution of FSs separated after HD 1196 # distribution of FSs separated after HD
997 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) 1197 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
998 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS 1198 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS
1009 lst_minHD_tags = [] 1209 lst_minHD_tags = []
1010 for i in minHD_tags: 1210 for i in minHD_tags:
1011 lst_minHD_tags.append(seqDic.get(i)) 1211 lst_minHD_tags.append(seqDic.get(i))
1012 1212
1013 if onlyDuplicates: 1213 if onlyDuplicates:
1014 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], [item_b[1] for item_b in lst_minHD_tags])).astype(int) 1214 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
1015 1215 [item_b[1] for item_b in lst_minHD_tags])).astype(int)
1016 # histogram with absolute and relative difference between HDs of both parts of the tag 1216 # histogram with absolute and relative difference between HDs of both parts of the tag
1017 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) 1217 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
1018 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, 1218 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
1019 rel_Diff) 1219 rel_Diff)
1020 # chimeric read analysis: tags which have HD=0 in one of the halfs 1220 # chimeric read analysis: tags which have HD=0 in one of the halfs
1021 if len(minHD_tags_zeros) != 0: 1221 if len(minHD_tags_zeros) != 0:
1022 lst_minHD_tags_zeros = [] 1222 lst_minHD_tags_zeros = []
1023 for i in minHD_tags_zeros: 1223 for i in minHD_tags_zeros:
1024 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads 1224 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads
1025 if onlyDuplicates: 1225 if onlyDuplicates:
1026 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) 1226 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
1227 [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
1027 1228
1028 # histogram with HD of non-identical half 1229 # histogram with HD of non-identical half
1029 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros) 1230 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
1231 lst_minHD_tags_zeros, diff_zeros)
1232
1233 if onlyDuplicates is False:
1234 listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros,
1235 diff_zeros, data_array)
1030 1236
1031 # plot Hamming Distance with Family size distribution 1237 # plot Hamming Distance with Family size distribution
1032 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, 1238 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq,
1033 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, 1239 subtitle="Hamming distance separated by family size", lenTags=lenTags,
1034 xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample) 1240 xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample)
1035 1241
1036 # Plot FSD with separation after 1242 # Plot FSD with separation after
1037 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, 1243 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq,
1038 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", 1244 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
1039 pdf=pdf, relative=False, title_file1=name1, diff=False) 1245 pdf=pdf, relative=False, diff=False)
1040 1246
1041 # Plot HD within tags 1247 # Plot HD within tags
1042 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, 1248 plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
1043 title_file1=name1, len_sample=len_sample) 1249 rel_freq=rel_freq, len_sample=len_sample)
1044 1250
1045 # Plot difference between HD's separated after FSD 1251 # Plot difference between HD's separated after FSD
1046 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, 1252 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
1047 subtitle="Delta Hamming distance within tags", 1253 subtitle="Delta Hamming distance within tags", lenTags=lenTags, rel_freq=rel_freq,
1048 title_file1=name1, lenTags=lenTags,
1049 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample) 1254 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)
1050 1255
1051 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, 1256 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
1052 subtitle="Chimera Analysis: relative delta Hamming distances", 1257 subtitle="Chimera Analysis: relative delta Hamming distance", lenTags=lenTags, rel_freq=rel_freq,
1053 title_file1=name1, lenTags=lenTags, 1258 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars,
1054 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) 1259 nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
1055 1260
1056 # plots for chimeric reads 1261 # plots for chimeric reads
1057 if len(minHD_tags_zeros) != 0: 1262 if len(minHD_tags_zeros) != 0:
1058 # HD 1263 # HD
1059 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, subtitle="Hamming distance of chimeras", 1264 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
1060 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, 1265 subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq,
1266 lenTags=lenTags, xlabel="HD", relative=False,
1061 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) 1267 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
1268
1269 if onlyDuplicates is False:
1270 plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf,
1271 subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq,
1272 lenTags=lenTags, xlabel="HD", relative=False,
1273 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
1062 1274
1063 # print all data to a CSV file 1275 # print all data to a CSV file
1064 # HD 1276 # HD
1065 summary, sumCol = createTableHD(list1, "HD=") 1277 summary, sumCol = createTableHD(list1, "HD=")
1066 overallSum = sum(sumCol) # sum of columns in table 1278 overallSum = sum(sumCol) # sum of columns in table
1085 if len(minHD_tags_zeros) != 0: 1297 if len(minHD_tags_zeros) != 0:
1086 # absolute difference and tags where at least one half has HD=0 1298 # absolute difference and tags where at least one half has HD=0
1087 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") 1299 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")
1088 overallSum15 = sum(sumCol15) 1300 overallSum15 = sum(sumCol15)
1089 1301
1302 if onlyDuplicates is False:
1303 summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros)
1304 overallSum16 = sum(sumCol16)
1305
1090 output_file.write("{}\n".format(name1)) 1306 output_file.write("{}\n".format(name1))
1091 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( 1307 output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample))
1092 numpy.concatenate(list1)), lenTags, lenTags))
1093 1308
1094 # HD 1309 # HD
1095 createFileHD(summary, sumCol, overallSum, output_file, 1310 createFileHD(summary, sumCol, overallSum, output_file,
1096 "Hamming distance separated by family size", sep) 1311 "Hamming distance separated by family size", sep)
1097 # FSD 1312 # FSD
1107 output_file.write( 1322 output_file.write(
1108 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) 1323 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))
1109 1324
1110 # HD within tags 1325 # HD within tags
1111 output_file.write( 1326 output_file.write(
1112 "The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).\n" 1327 "The Hamming distances were calculated by comparing the first halve against all halves and selected the "
1113 "For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').\n" 1328 "minimum value (HD a).\nFor the second half of the tag, we compared them against all tags which resulted "
1114 "Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n" 1329 "in the minimum HD of the previous step and selected the maximum value (HD b').\nFinally, it was possible "
1115 "These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD For simplicity we used the maximum value between the delta values in the end.\n" 1330 "to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n"
1116 "When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.\n") 1331 "These calculations were repeated, but starting with the second half in the first step to find all "
1117 1332 "possible chimeras in the data (HD b and HD For simplicity we used the maximum value between the delta "
1118 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) 1333 "values in the end.\nWhen only tags that can form DCS were allowed in the analysis, family sizes for the "
1334 "forward and reverse (ab and ba) will be included in the plots.\n")
1335
1336 output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2))
1119 1337
1120 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, 1338 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
1121 "Hamming distance of each half in the tag", sep) 1339 "Hamming distance of each half in the tag", sep)
1122 createFileHD(summary11, sumCol11, overallSum11, output_file, 1340 createFileHD(summary11, sumCol11, overallSum11, output_file,
1123 "Absolute delta Hamming distances within the tag", sep) 1341 "Absolute delta Hamming distance within the tag", sep)
1124 1342
1125 createFileHD(summary13, sumCol13, overallSum13, output_file, 1343 createFileHD(summary13, sumCol13, overallSum13, output_file,
1126 "Chimera analysis: relative delta Hamming distances", sep) 1344 "Chimera analysis: relative delta Hamming distance", sep)
1127 1345
1128 if len(minHD_tags_zeros) != 0: 1346 if len(minHD_tags_zeros) != 0:
1129 output_file.write( 1347 output_file.write(
1130 "Chimeras:\nAll tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances of the chimeric tags are shown.\n") 1348 "All tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, "
1349 "had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances "
1350 "of the chimeric tags are shown.\n")
1131 createFileHD(summary15, sumCol15, overallSum15, output_file, 1351 createFileHD(summary15, sumCol15, overallSum15, output_file,
1132 "Hamming distances of chimeras", sep) 1352 "Hamming distance of chimeric families separated after FS", sep)
1353
1354 if onlyDuplicates is False:
1355 createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
1356 "Hamming distance of chimeric families separated after DCS and single SSCS", sep)
1133 1357
1134 output_file.write("\n") 1358 output_file.write("\n")
1135 1359
1136 1360
1137 if __name__ == '__main__': 1361 if __name__ == '__main__':