comparison hd.py @ 19:2e9f7ea7ae93 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit dfaab79252a858e8df16bbea3607ebf1b6962e5a-dirty
author mheinzl
date Mon, 08 Oct 2018 05:56:04 -0400
parents a8581bf627fd
children b084b6a8e3ac
comparison
equal deleted inserted replaced
18:a8581bf627fd 19:2e9f7ea7ae93
11 # In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads 11 # In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads
12 # and finally a CSV file with the data of the plots. 12 # and finally a CSV file with the data of the plots.
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. 13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input.
14 # The tool can run on a certain number of processors, which can be defined by the user. 14 # The tool can run on a certain number of processors, which can be defined by the user.
15 15
16 # USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False--output_csv outptufile_name_csv --output_pdf outptufile_name_pdf 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular --output_pdf outputfile_name_pdf
18 18
19 import numpy 19 import argparse
20 import itertools 20 import itertools
21 import operator 21 import operator
22 import sys
23 from collections import Counter
24 from functools import partial
25 from multiprocessing.pool import Pool
26
22 import matplotlib.pyplot as plt 27 import matplotlib.pyplot as plt
23 import os.path 28 import numpy
24 import cPickle as pickle
25 from multiprocessing.pool import Pool
26 from functools import partial
27 import argparse
28 import sys
29 import os
30 from matplotlib.backends.backend_pdf import PdfPages 29 from matplotlib.backends.backend_pdf import PdfPages
31 from collections import Counter 30
32 31 plt.switch_backend('agg')
33 def plotFSDwithHD2(familySizeList1,maximumXFS,minimumXFS, originalCounts, 32
34 title_file1, subtitle, pdf, relative=False, diff = True): 33
34 def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
35 title_file1, subtitle, pdf, relative=False, diff=True):
35 if diff is False: 36 if diff is False:
36 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] 37 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
37 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8","HD>8"] 38 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"]
38 else: 39 else:
39 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"] 40 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
40 if relative is True: 41 if relative is True:
41 labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"] 42 labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
42 else: 43 else:
43 labels = ["d=0","d=1", "d=2", "d=3", "d=4", "d=5-8","d>8"] 44 labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]
44 45
45 fig = plt.figure(figsize=(6, 7)) 46 fig = plt.figure(figsize=(6, 7))
46 ax = fig.add_subplot(111) 47 ax = fig.add_subplot(111)
47 plt.subplots_adjust(bottom=0.1) 48 plt.subplots_adjust(bottom=0.1)
48 p1 = numpy.bincount(numpy.concatenate((familySizeList1))) 49 p1 = numpy.bincount(numpy.concatenate((familySizeList1)))
52 range1 = range(minimumXFS - 1, minimumXFS + 2) 53 range1 = range(minimumXFS - 1, minimumXFS + 2)
53 else: 54 else:
54 range1 = range(0, maximumXFS + 2) 55 range1 = range(0, maximumXFS + 2)
55 counts = plt.hist(familySizeList1, label=labels, 56 counts = plt.hist(familySizeList1, label=labels,
56 color=colors, stacked=True, 57 color=colors, stacked=True,
57 rwidth=0.8,alpha=1, align="left", 58 rwidth=0.8, alpha=1, align="left",
58 edgecolor="None",bins=range1) 59 edgecolor="None", bins=range1)
59 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) 60 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
60 61
61 #plt.title(title_file1, fontsize=12) 62 # plt.title(title_file1, fontsize=12)
62 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) 63 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
63 plt.xlabel("Family size", fontsize=14) 64 plt.xlabel("Family size", fontsize=14)
64 plt.ylabel("Absolute Frequency", fontsize=14) 65 plt.ylabel("Absolute Frequency", fontsize=14)
65 66
66 ticks = numpy.arange(0, maximumXFS + 1, 1) 67 ticks = numpy.arange(0, maximumXFS + 1, 1)
77 plt.ylim((0, maximumY * 1.2)) 78 plt.ylim((0, maximumY * 1.2))
78 legend = "\nmax. family size: \nabsolute frequency: \nrelative frequency: " 79 legend = "\nmax. family size: \nabsolute frequency: \nrelative frequency: "
79 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) 80 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)
80 81
81 count = numpy.bincount(originalCounts) # original counts 82 count = numpy.bincount(originalCounts) # original counts
82 legend1 = "{}\n{}\n{:.5f}" \ 83 legend1 = "{}\n{}\n{:.5f}".format(max(originalCounts), count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
83 .format(max(originalCounts), count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
84 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) 84 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
85 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), 85 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), float(counts[0][len(counts[0]) - 1][1]) / sum(counts[0][len(counts[0]) - 1]))
86 float(counts[0][len(counts[0]) - 1][1]) / sum(
87 counts[0][len(counts[0]) - 1]))
88 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) 86 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
89 plt.grid(b=True, which='major', color='#424242', linestyle=':') 87 plt.grid(b=True, which='major', color='#424242', linestyle=':')
90 88
91 pdf.savefig(fig, bbox_inches="tight") 89 pdf.savefig(fig, bbox_inches="tight")
92 plt.close("all") 90 plt.close("all")
93 91
94 def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf, 92
95 xlabel,relative=False, nr_above_bars = True): 93 def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True):
96 if relative is True: 94 if relative is True:
97 step = 0.1 95 step = 0.1
98 else: 96 else:
99 step = 1 97 step = 1
100 98
118 range=(0, maximumX + 1)) 116 range=(0, maximumX + 1))
119 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) 117 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
120 bins = counts[1] # width of bins 118 bins = counts[1] # width of bins
121 counts = numpy.array(map(int, counts[0][5])) 119 counts = numpy.array(map(int, counts[0][5]))
122 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) 120 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
123 # plt.title(title_file1, fontsize=12) 121 # plt.title(title_file1, fontsize=12)
124 plt.xlabel(xlabel, fontsize=14) 122 plt.xlabel(xlabel, fontsize=14)
125 plt.ylabel("Absolute Frequency", fontsize=14) 123 plt.ylabel("Absolute Frequency", fontsize=14)
126 124
127 plt.grid(b=True, which='major', color='#424242', linestyle=':') 125 plt.grid(b=True, which='major', color='#424242', linestyle=':')
128 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) 126 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
136 if x_label == 0: 134 if x_label == 0:
137 continue 135 continue
138 else: 136 else:
139 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), 137 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1),
140 xy=(label, x_label + len(con_list1) * 0.01), 138 xy=(label, x_label + len(con_list1) * 0.01),
141 xycoords="data", color="#000066",fontsize=10) 139 xycoords="data", color="#000066", fontsize=10)
142 140
143 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) 141 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags)
144 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) 142 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
145 143
146 pdf.savefig(fig, bbox_inches="tight") 144 pdf.savefig(fig, bbox_inches="tight")
147 plt.close("all") 145 plt.close("all")
148 plt.clf() 146 plt.clf()
149 147
150 def plotHDwithinSeq_Sum2(sum1, sum2,sum1min, sum2min, min_value, lenTags, title_file1, pdf): 148
149 def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf):
151 fig = plt.figure(figsize=(6, 8)) 150 fig = plt.figure(figsize=(6, 8))
152 plt.subplots_adjust(bottom=0.1) 151 plt.subplots_adjust(bottom=0.1)
153 152
154 #ham = [sum1, sum2,numpy.array(min_value)] # new hd within tags 153 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags
155 ham = [sum1, sum2, sum1min, sum2min, numpy.array(min_value)] # new hd within tags 154
156 155 maximumX = numpy.amax(numpy.concatenate(ham_partial))
157 156 minimumX = numpy.amin(numpy.concatenate(ham_partial))
158 maximumX = numpy.amax(numpy.concatenate(ham)) 157 maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham_partial))))
159 minimumX = numpy.amin(numpy.concatenate(ham))
160 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham)))
161 158
162 if len(range(minimumX, maximumX)) == 0: 159 if len(range(minimumX, maximumX)) == 0:
163 range1 = minimumX 160 range1 = minimumX
164 else: 161 else:
165 range1 = range(minimumX, maximumX + 2) 162 range1 = range(minimumX, maximumX + 2)
166 163
167 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, 164 plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=[ "HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1)
168 # label=[ "HD a", "HD b","HD a+b"], 165
169 label=[ "HD a","HD b'", "HD b", "HD a'", "HD a+b"],
170 #bins=range1, color=[ "#58ACFA", "#FA5858","#585858"],
171 color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
172 edgecolor='black', linewidth=1)
173 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) 166 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
174 plt.suptitle('Hamming distances within tags', fontsize=14) 167 plt.suptitle('Hamming distances within tags', fontsize=14)
175 #plt.title(title_file1, fontsize=12) 168 # plt.title(title_file1, fontsize=12)
176 plt.xlabel("HD", fontsize=14) 169 plt.xlabel("HD", fontsize=14)
177 plt.ylabel("Absolute Frequency", fontsize=14) 170 plt.ylabel("Absolute Frequency", fontsize=14)
178 plt.grid(b=True, which='major', color='#424242', linestyle=':') 171 plt.grid(b=True, which='major', color='#424242', linestyle=':')
179 172
180 173 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
181 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1))
182 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) 174 plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
183 plt.ylim((0, maximumY * 1.2)) 175 # plt.ylim(0, maximumY * 1.2)
184 176
185 legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags) 177 legend = "sample size= {:,} against {:,}".format(sum(ham_partial[4]), lenTags)
186 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) 178 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
187 pdf.savefig(fig, bbox_inches="tight") 179 pdf.savefig(fig, bbox_inches="tight")
188 plt.close("all") 180 plt.close("all")
189 plt.clf() 181 plt.clf()
182
190 183
191 def createTableFSD2(list1, diff=True): 184 def createTableFSD2(list1, diff=True):
192 selfAB = numpy.concatenate(list1) 185 selfAB = numpy.concatenate(list1)
193 uniqueFS = numpy.unique(selfAB) 186 uniqueFS = numpy.unique(selfAB)
194 nr = numpy.arange(0, len(uniqueFS), 1) 187 nr = numpy.arange(0, len(uniqueFS), 1)
206 if len(table) == 0: 199 if len(table) == 0:
207 state = state + 1 200 state = state + 1
208 continue 201 continue
209 else: 202 else:
210 if state == 1: 203 if state == 1:
211 for i, l in zip(uniqueFS, nr): 204 for i, l in zip(uniqueFS, nr):
212 for j in table: 205 for j in table:
213 if j[0] == uniqueFS[l]: 206 if j[0] == uniqueFS[l]:
214 count[l, 0] = j[1] 207 count[l, 0] = j[1]
215 if state == 2: 208 if state == 2:
216 for i, l in zip(uniqueFS, nr): 209 for i, l in zip(uniqueFS, nr):
259 first = ["FS={}".format(i) for i in uniqueFS] 252 first = ["FS={}".format(i) for i in uniqueFS]
260 final = numpy.column_stack((first, count, sumRow)) 253 final = numpy.column_stack((first, count, sumRow))
261 254
262 return (final, sumCol) 255 return (final, sumCol)
263 256
264 def createFileFSD2(summary, sumCol, overallSum, output_file, name,sep, rel=False, diff=True): 257
258 def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True):
265 output_file.write(name) 259 output_file.write(name)
266 output_file.write("\n") 260 output_file.write("\n")
267 if diff is False: 261 if diff is False:
268 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep)) 262 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
269 else: 263 else:
270 if rel is False: 264 if rel is False:
271 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep)) 265 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
272 else: 266 else:
273 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep)) 267 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
274 268
275 for item in summary: 269 for item in summary:
276 for nr in item: 270 for nr in item:
277 if "FS" not in nr and "diff" not in nr: 271 if "FS" not in nr and "diff" not in nr:
278 nr = nr.astype(float) 272 nr = nr.astype(float)
279 nr = nr.astype(int) 273 nr = nr.astype(int)
280 output_file.write("{}{}".format(nr,sep)) 274 output_file.write("{}{}".format(nr, sep))
281 output_file.write("\n") 275 output_file.write("\n")
282 output_file.write("sum{}".format(sep)) 276 output_file.write("sum{}".format(sep))
283 sumCol = map(int, sumCol) 277 sumCol = map(int, sumCol)
284 for el in sumCol: 278 for el in sumCol:
285 output_file.write("{}{}".format(el,sep)) 279 output_file.write("{}{}".format(el, sep))
286 output_file.write("{}{}".format(overallSum.astype(int),sep)) 280 output_file.write("{}{}".format(overallSum.astype(int), sep))
287 output_file.write("\n\n") 281 output_file.write("\n\n")
282
288 283
289 def createTableHD(list1, row_label): 284 def createTableHD(list1, row_label):
290 selfAB = numpy.concatenate(list1) 285 selfAB = numpy.concatenate(list1)
291 uniqueHD = numpy.unique(selfAB) 286 uniqueHD = numpy.unique(selfAB)
292 nr = numpy.arange(0, len(uniqueHD), 1) 287 nr = numpy.arange(0, len(uniqueHD), 1)
300 if len(table) == 0: 295 if len(table) == 0:
301 state = state + 1 296 state = state + 1
302 continue 297 continue
303 else: 298 else:
304 if state == 1: 299 if state == 1:
305 for i, l in zip(uniqueHD, nr): 300 for i, l in zip(uniqueHD, nr):
306 for j in table: 301 for j in table:
307 if j[0] == uniqueHD[l]: 302 if j[0] == uniqueHD[l]:
308 count[l, 0] = j[1] 303 count[l, 0] = j[1]
309 if state == 2: 304 if state == 2:
310 for i, l in zip(uniqueHD, nr): 305 for i, l in zip(uniqueHD, nr):
337 count[l, 5] = j[1] 332 count[l, 5] = j[1]
338 state = state + 1 333 state = state + 1
339 334
340 sumRow = count.sum(axis=1) 335 sumRow = count.sum(axis=1)
341 sumCol = count.sum(axis=0) 336 sumCol = count.sum(axis=0)
342 first = ["{}{}".format(row_label,i) for i in uniqueHD] 337 first = ["{}{}".format(row_label, i) for i in uniqueHD]
343 final = numpy.column_stack((first, count, sumRow)) 338 final = numpy.column_stack((first, count, sumRow))
344 339
345 return (final, sumCol) 340 return (final, sumCol)
341
346 342
347 def createTableHDwithTags(list1): 343 def createTableHDwithTags(list1):
348 selfAB = numpy.concatenate(list1) 344 selfAB = numpy.concatenate(list1)
349 uniqueHD = numpy.unique(selfAB) 345 uniqueHD = numpy.unique(selfAB)
350 nr = numpy.arange(0, len(uniqueHD), 1) 346 nr = numpy.arange(0, len(uniqueHD), 1)
351 count = numpy.zeros((len(uniqueHD), 3)) 347 count = numpy.zeros((len(uniqueHD), 5))
352 348
353 state = 1 349 state = 1
354 for i in list1: 350 for i in list1:
355 counts = list(Counter(i).items()) 351 counts = list(Counter(i).items())
356 hd = [item[0] for item in counts] 352 hd = [item[0] for item in counts]
359 if len(table) == 0: 355 if len(table) == 0:
360 state = state + 1 356 state = state + 1
361 continue 357 continue
362 else: 358 else:
363 if state == 1: 359 if state == 1:
364 for i, l in zip(uniqueHD, nr): 360 for i, l in zip(uniqueHD, nr):
365 for j in table: 361 for j in table:
366 if j[0] == uniqueHD[l]: 362 if j[0] == uniqueHD[l]:
367 count[l, 0] = j[1] 363 count[l, 0] = j[1]
368 if state == 2: 364 if state == 2:
369 for i, l in zip(uniqueHD, nr): 365 for i, l in zip(uniqueHD, nr):
370 for j in table: 366 for j in table:
371 if j[0] == uniqueHD[l]: 367 if j[0] == uniqueHD[l]:
372 count[l, 1] = j[1] 368 count[l, 1] = j[1]
373
374 if state == 3: 369 if state == 3:
375 for i, l in zip(uniqueHD, nr): 370 for i, l in zip(uniqueHD, nr):
376 for j in table: 371 for j in table:
377 if j[0] == uniqueHD[l]: 372 if j[0] == uniqueHD[l]:
378 count[l, 2] = j[1] 373 count[l, 2] = j[1]
374 if state == 4:
375 for i, l in zip(uniqueHD, nr):
376 for j in table:
377 if j[0] == uniqueHD[l]:
378 count[l, 3] = j[1]
379 if state == 5:
380 for i, l in zip(uniqueHD, nr):
381 for j in table:
382 if j[0] == uniqueHD[l]:
383 count[l, 4] = j[1]
384
379 state = state + 1 385 state = state + 1
380 386
381 sumRow = count.sum(axis=1) 387 sumRow = count.sum(axis=1)
382 sumCol = count.sum(axis=0) 388 sumCol = count.sum(axis=0)
383 first = ["HD={}".format(i) for i in uniqueHD] 389 first = ["HD={}".format(i) for i in uniqueHD]
384 final = numpy.column_stack((first, count, sumRow)) 390 final = numpy.column_stack((first, count, sumRow))
385 391
386 return (final, sumCol) 392 return (final, sumCol)
387 393
388 def createFileHD(summary, sumCol, overallSum, output_file, name,sep): 394
395 def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
389 output_file.write(name) 396 output_file.write(name)
390 output_file.write("\n") 397 output_file.write("\n")
391 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep)) 398 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
392 for item in summary: 399 for item in summary:
393 for nr in item: 400 for nr in item:
394 if "HD" not in nr and "diff" not in nr: 401 if "HD" not in nr and "diff" not in nr:
395 nr = nr.astype(float) 402 nr = nr.astype(float)
396 nr = nr.astype(int) 403 nr = nr.astype(int)
397 output_file.write("{}{}".format(nr,sep)) 404 output_file.write("{}{}".format(nr, sep))
398 output_file.write("\n") 405 output_file.write("\n")
399 output_file.write("sum{}".format(sep)) 406 output_file.write("sum{}".format(sep))
400 sumCol = map(int, sumCol) 407 sumCol = map(int, sumCol)
401 for el in sumCol: 408 for el in sumCol:
402 output_file.write("{}{}".format(el,sep)) 409 output_file.write("{}{}".format(el, sep))
403 output_file.write("{}{}".format(overallSum.astype(int),sep)) 410 output_file.write("{}{}".format(overallSum.astype(int), sep))
404 output_file.write("\n\n") 411 output_file.write("\n\n")
405 412
406 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep): 413
414 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
407 output_file.write(name) 415 output_file.write(name)
408 output_file.write("\n") 416 output_file.write("\n")
409 output_file.write("{}HD a+b;HD a{}HD b{}sum{}\n".format(sep,sep,sep,sep)) 417 output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep))
410 for item in summary: 418 for item in summary:
411 for nr in item: 419 for nr in item:
412 if "HD" not in nr: 420 if "HD" not in nr:
413 nr = nr.astype(float) 421 nr = nr.astype(float)
414 nr = nr.astype(int) 422 nr = nr.astype(int)
415 output_file.write("{}{}".format(nr,sep)) 423 output_file.write("{}{}".format(nr, sep))
416 output_file.write("\n") 424 output_file.write("\n")
417 output_file.write("sum{}".format(sep)) 425 output_file.write("sum{}".format(sep))
418 sumCol = map(int, sumCol) 426 sumCol = map(int, sumCol)
419 for el in sumCol: 427 for el in sumCol:
420 output_file.write("{}{}".format(el,sep)) 428 output_file.write("{}{}".format(el, sep))
421 output_file.write("{}{}".format(overallSum.astype(int),sep)) 429 output_file.write("{}{}".format(overallSum.astype(int), sep))
422 output_file.write("\n\n") 430 output_file.write("\n\n")
423 431
432
424 def hamming(array1, array2): 433 def hamming(array1, array2):
425 res = 99 * numpy.ones(len(array1)) 434 res = 99 * numpy.ones(len(array1))
426 i = 0 435 i = 0
427 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time 436 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time
428 for a in array1: 437 for a in array1:
429 dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest 438 dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest
430 res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero 439 res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero
431 #print(i) 440 # print(i)
432 i += 1 441 i += 1
433 return res 442 return res
434 443
444
435 def hamming_difference(array1, array2, mate_b): 445 def hamming_difference(array1, array2, mate_b):
436 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time 446 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time
437 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 447 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1
438 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 448 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2
439 449
440 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 450 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1
441 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 451 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2
442 452
443 #diff11 = 999 * numpy.ones(len(array2)) 453 # diff11 = 999 * numpy.ones(len(array2))
444 #relativeDiffList = 999 * numpy.ones(len(array2)) 454 # relativeDiffList = 999 * numpy.ones(len(array2))
445 #ham1 = 999 * numpy.ones(len(array2)) 455 # ham1 = 999 * numpy.ones(len(array2))
446 #ham2 = 999 * numpy.ones(len(array2)) 456 # ham2 = 999 * numpy.ones(len(array2))
447 #min_valueList = 999 * numpy.ones(len(array2)) 457 # min_valueList = 999 * numpy.ones(len(array2))
448 #min_tagsList = 999 * numpy.ones(len(array2)) 458 # min_tagsList = 999 * numpy.ones(len(array2))
449 #diff11_zeros = 999 * numpy.ones(len(array2)) 459 # diff11_zeros = 999 * numpy.ones(len(array2))
450 #min_tagsList_zeros = 999 * numpy.ones(len(array2)) 460 # min_tagsList_zeros = 999 * numpy.ones(len(array2))
451 461
452
453 diff11 = [] 462 diff11 = []
454 relativeDiffList = [] 463 relativeDiffList = []
455 ham1 = [] 464 ham1 = []
456 ham2 = [] 465 ham2 = []
457 ham1min = [] 466 ham1min = []
458 ham2min = [] 467 ham2min = []
459 min_valueList = [] 468 min_valueList = []
460 min_tagsList = [] 469 min_tagsList = []
461 diff11_zeros = [] 470 diff11_zeros = []
462 min_tagsList_zeros = [] 471 min_tagsList_zeros = []
463 i = 0 # counter, only used to see how many HDs of tags were already calculated 472 i = 0 # counter, only used to see how many HDs of tags were already calculated
464 if mate_b is False: # HD calculation for all a's 473 if mate_b is False: # HD calculation for all a's
465 half1_mate1 = array1_half 474 half1_mate1 = array1_half
466 half2_mate1 = array1_half2 475 half2_mate1 = array1_half2
467 half1_mate2 = array2_half 476 half1_mate2 = array2_half
468 half2_mate2 = array2_half2 477 half2_mate2 = array2_half2
469 elif mate_b is True: # HD calculation for all b's 478 elif mate_b is True: # HD calculation for all b's
470 half1_mate1 = array1_half2 479 half1_mate1 = array1_half2
471 half2_mate1 = array1_half 480 half2_mate1 = array1_half
472 half1_mate2 = array2_half2 481 half1_mate2 = array2_half2
473 half2_mate2 = array2_half 482 half2_mate2 = array2_half
474 483
475 for a, b, tag in zip(half1_mate1, half2_mate1, array1): 484 for a, b, tag in zip(half1_mate1, half2_mate1, array1):
476 ## exclude identical tag from array2, to prevent comparison to itself 485 # exclude identical tag from array2, to prevent comparison to itself
477 sameTag = numpy.where(array2 == tag) 486 sameTag = numpy.where(array2 == tag)
478 indexArray2 = numpy.arange(0, len(array2), 1) 487 indexArray2 = numpy.arange(0, len(array2), 1)
479 index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data 488 index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data
480 489
481 # all tags without identical tag 490 # all tags without identical tag
482 array2_half_withoutSame = half1_mate2[index_withoutSame] 491 array2_half_withoutSame = half1_mate2[index_withoutSame]
483 array2_half2_withoutSame = half2_mate2[index_withoutSame] 492 array2_half2_withoutSame = half2_mate2[index_withoutSame]
484 #array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) 493 # array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs)
485 494
486 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in 495 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
487 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" 496 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
488 min_index = numpy.where(dist == dist.min()) # get index of min HD 497 min_index = numpy.where(dist == dist.min()) # get index of min HD
489 min_value = dist[min_index] # get minimum HDs 498 min_value = dist[min_index] # get minimum HDs
490 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD 499 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD
491 #min_tag = array2_withoutSame[min_index] # get whole tag with min HD 500 # min_tag = array2_withoutSame[min_index] # get whole tag with min HD
492 501
493 dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in 502 dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
494 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" 503 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's"
495 for d_1, d_2 in zip(min_value, dist2): 504 for d, d2 in zip(min_value, dist2):
496 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b 505 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b
497 d = d_2
498 d2 = d_1
499 ham2.append(d) 506 ham2.append(d)
500 ham2min.append(d2) 507 ham2min.append(d2)
501 else: # half1, corrects the variable of the HD from both halfs if it is a or b 508 else: # half1, corrects the variable of the HD from both halfs if it is a or b
502 d = d_1
503 d2 = d_2
504 ham1.append(d) 509 ham1.append(d)
505 ham1min.append(d2) 510 ham1min.append(d2)
506 511
507 min_valueList.append(d + d2) 512 min_valueList.append(d + d2)
508 min_tagsList.append(tag) 513 min_tagsList.append(tag)
509 # ham1.append(d)
510 # ham2.append(d2)
511 difference1 = abs(d - d2) 514 difference1 = abs(d - d2)
512 diff11.append(difference1) 515 diff11.append(difference1)
513 rel_difference = round(float(difference1) / (d + d2), 1) 516 rel_difference = round(float(difference1) / (d + d2), 1)
514 relativeDiffList.append(rel_difference) 517 relativeDiffList.append(rel_difference)
515 518
516 #### tags which have identical parts: 519 # tags which have identical parts:
517 if d == 0 or d2 == 0: 520 if d == 0 or d2 == 0:
518 min_tagsList_zeros.append(tag) 521 min_tagsList_zeros.append(tag)
519 difference1_zeros = abs(d - d2) 522 difference1_zeros = abs(d - d2)
520 diff11_zeros.append(difference1_zeros) 523 diff11_zeros.append(difference1_zeros)
521 i += 1 524 i += 1
522 525
523 #print(i) 526 # print(i)
524 #diff11 = [st for st in diff11 if st != 999] 527 # diff11 = [st for st in diff11 if st != 999]
525 #ham1 = [st for st in ham1 if st != 999] 528 # ham1 = [st for st in ham1 if st != 999]
526 #ham2 = [st for st in ham2 if st != 999] 529 # ham2 = [st for st in ham2 if st != 999]
527 #min_valueList = [st for st in min_valueList if st != 999] 530 # min_valueList = [st for st in min_valueList if st != 999]
528 #min_tagsList = [st for st in min_tagsList if st != 999] 531 # min_tagsList = [st for st in min_tagsList if st != 999]
529 #relativeDiffList = [st for st in relativeDiffList if st != 999] 532 # relativeDiffList = [st for st in relativeDiffList if st != 999]
530 #diff11_zeros = [st for st in diff11_zeros if st != 999] 533 # diff11_zeros = [st for st in diff11_zeros if st != 999]
531 #min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] 534 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
532 535
533 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min]) 536 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min])
537
534 538
535 def readFileReferenceFree(file): 539 def readFileReferenceFree(file):
536 with open(file, 'r') as dest_f: 540 with open(file, 'r') as dest_f:
537 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') 541 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
538 integers = numpy.array(data_array[:, 0]).astype(int) 542 integers = numpy.array(data_array[:, 0]).astype(int)
539 return(integers, data_array) 543 return(integers, data_array)
540 544
545
541 def hammingDistanceWithFS(fs, ham): 546 def hammingDistanceWithFS(fs, ham):
542 fs = numpy.asarray(fs) 547 fs = numpy.asarray(fs)
543 maximum = max(ham) 548 maximum = max(ham)
544 minimum = min(ham) 549 minimum = min(ham)
545 ham = numpy.asarray(ham) 550 ham = numpy.asarray(ham)
563 data6 = ham[hd6] 568 data6 = ham[hd6]
564 569
565 list1 = [data, data2, data3, data4, data5, data6] 570 list1 = [data, data2, data3, data4, data5, data6]
566 return(list1, maximum, minimum) 571 return(list1, maximum, minimum)
567 572
568 def familySizeDistributionWithHD(fs, ham, diff=False, rel = True): 573
574 def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
569 hammingDistances = numpy.unique(ham) 575 hammingDistances = numpy.unique(ham)
570 fs = numpy.asarray(fs) 576 fs = numpy.asarray(fs)
571 577
572 ham = numpy.asarray(ham) 578 ham = numpy.asarray(ham)
573 bigFamilies2 = numpy.where(fs > 19)[0] 579 bigFamilies2 = numpy.where(fs > 19)[0]
614 else: 620 else:
615 hd6 = numpy.where(ham > 8)[0] 621 hd6 = numpy.where(ham > 8)[0]
616 data6 = fs[hd6] 622 data6 = fs[hd6]
617 623
618 if diff is True: 624 if diff is True:
619 list1 = [data0,data, data2, data3, data4, data5, data6] 625 list1 = [data0, data, data2, data3, data4, data5, data6]
620 else: 626 else:
621 list1 = [data, data2, data3, data4, data5, data6] 627 list1 = [data, data2, data3, data4, data5, data6]
622 628
623 return(list1, hammingDistances, maximum, minimum) 629 return(list1, hammingDistances, maximum, minimum)
630
624 631
625 def make_argparser(): 632 def make_argparser():
626 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') 633 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
627 parser.add_argument('--inputFile', 634 parser.add_argument('--inputFile',
628 help='Tabular File with three columns: ab or ba, tag and family size.') 635 help='Tabular File with three columns: ab or ba, tag and family size.')
629 parser.add_argument('--inputName1') 636 parser.add_argument('--inputName1')
630 parser.add_argument('--inputFile2',default=None, 637 parser.add_argument('--inputFile2', default=None,
631 help='Tabular File with three columns: ab or ba, tag and family size.') 638 help='Tabular File with three columns: ab or ba, tag and family size.')
632 parser.add_argument('--inputName2') 639 parser.add_argument('--inputName2')
633 parser.add_argument('--sample_size', default=1000,type=int, 640 parser.add_argument('--sample_size', default=1000, type=int,
634 help='Sample size of Hamming distance analysis.') 641 help='Sample size of Hamming distance analysis.')
635 parser.add_argument('--sep', default=",", 642 parser.add_argument('--subset_tag', default=0, type=int,
636 help='Separator in the csv file.')
637 parser.add_argument('--subset_tag', default=0,type=int,
638 help='The tag is shortened to the given number.') 643 help='The tag is shortened to the given number.')
639 parser.add_argument('--nproc', default=4,type=int, 644 parser.add_argument('--nproc', default=4, type=int,
640 help='The tool runs with the given number of processors.') 645 help='The tool runs with the given number of processors.')
641 parser.add_argument('--only_DCS', action="store_false", # default=False, type=bool, 646 parser.add_argument('--only_DCS', action="store_false",
642 help='Only tags of the DCSs are included in the HD analysis') 647 help='Only tags of the DCSs are included in the HD analysis')
643 648
644 parser.add_argument('--minFS', default=1, type=int, 649 parser.add_argument('--minFS', default=1, type=int,
645 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') 650 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis')
646 parser.add_argument('--maxFS', default=0, type=int, 651 parser.add_argument('--maxFS', default=0, type=int,
647 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') 652 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis')
648 parser.add_argument('--nr_above_bars', action="store_true", # default=False, type=bool, 653 parser.add_argument('--nr_above_bars', action="store_true",
649 help='If no, values above bars in the histrograms are removed') 654 help='If no, values above bars in the histrograms are removed')
650 655
651 parser.add_argument('--output_csv', default="data.csv", type=str, 656 parser.add_argument('--output_tabular', default="data.tabular", type=str,
652 help='Name of the csv file.') 657 help='Name of the tabular file.')
653 parser.add_argument('--output_pdf', default="data.pdf", type=str, 658 parser.add_argument('--output_pdf', default="data.pdf", type=str,
654 help='Name of the pdf file.') 659 help='Name of the pdf file.')
655 parser.add_argument('--output_pdf2', default="data2.pdf", type=str, 660 parser.add_argument('--output_pdf2', default="data2.pdf", type=str,
656 help='Name of the pdf file.') 661 help='Name of the pdf file.')
657 parser.add_argument('--output_csv2', default="data2.csv", type=str, 662 parser.add_argument('--output_tabular2', default="data2.tabular", type=str,
658 help='Name of the csv file.') 663 help='Name of the tabular file.')
659 664
660 return parser 665 return parser
666
661 667
662 def Hamming_Distance_Analysis(argv): 668 def Hamming_Distance_Analysis(argv):
663 parser = make_argparser() 669 parser = make_argparser()
664 args = parser.parse_args(argv[1:]) 670 args = parser.parse_args(argv[1:])
665 671
671 677
672 index_size = args.sample_size 678 index_size = args.sample_size
673 title_savedFile_pdf = args.output_pdf 679 title_savedFile_pdf = args.output_pdf
674 title_savedFile_pdf2 = args.output_pdf2 680 title_savedFile_pdf2 = args.output_pdf2
675 681
676 title_savedFile_csv = args.output_csv 682 title_savedFile_csv = args.output_tabular
677 title_savedFile_csv2 = args.output_csv2 683 title_savedFile_csv2 = args.output_tabular2
678 684
679 sep = args.sep 685 sep = "\t"
680 onlyDuplicates = args.only_DCS 686 onlyDuplicates = args.only_DCS
681 minFS = args.minFS 687 minFS = args.minFS
682 maxFS = args.maxFS 688 maxFS = args.maxFS
683 nr_above_bars = args.nr_above_bars 689 nr_above_bars = args.nr_above_bars
684
685 690
686 subset = args.subset_tag 691 subset = args.subset_tag
687 nproc = args.nproc 692 nproc = args.nproc
688 693
689 ### input checks 694 # input checks
690 if index_size < 0: 695 if index_size < 0:
691 print("index_size is a negative integer.") 696 print("index_size is a negative integer.")
692 exit(2) 697 exit(2)
693 698
694 if nproc <= 0: 699 if nproc <= 0:
695 print("nproc is smaller or equal zero") 700 print("nproc is smaller or equal zero")
696 exit(3) 701 exit(3)
697 702
698 if type(sep) is not str or len(sep)>1:
699 print("sep must be a single character.")
700 exit(4)
701
702 if subset < 0: 703 if subset < 0:
703 print("subset_tag is smaller or equal zero.") 704 print("subset_tag is smaller or equal zero.")
704 exit(5) 705 exit(5)
705 706
706 ### PLOT ### 707 # PLOT
707 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color 708 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color
708 plt.rcParams['xtick.labelsize'] = 14 709 plt.rcParams['xtick.labelsize'] = 14
709 plt.rcParams['ytick.labelsize'] = 14 710 plt.rcParams['ytick.labelsize'] = 14
710 plt.rcParams['patch.edgecolor'] = "#000000" 711 plt.rcParams['patch.edgecolor'] = "#000000"
711 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 712 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
761 data_array = numpy.column_stack((duplTags, duplTags_seq)) 762 data_array = numpy.column_stack((duplTags, duplTags_seq))
762 data_array = numpy.column_stack((data_array, duplTags_tag)) 763 data_array = numpy.column_stack((data_array, duplTags_tag))
763 integers = numpy.array(data_array[:, 0]).astype(int) 764 integers = numpy.array(data_array[:, 0]).astype(int)
764 print("DCS in whole dataset", len(data_array)) 765 print("DCS in whole dataset", len(data_array))
765 766
766 ## HD analysis for a subset of the tag 767 # HD analysis for a subset of the tag
767 if subset > 0: 768 if subset > 0:
768 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) 769 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
769 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) 770 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])
770 771
771 flanking_region_float = float((len(tag1[0]) - subset)) / 2 772 flanking_region_float = float((len(tag1[0]) - subset)) / 2
787 print("length of tag= ", len(data_array[0, 1])) 788 print("length of tag= ", len(data_array[0, 1]))
788 # select sample: if no size given --> all vs. all comparison 789 # select sample: if no size given --> all vs. all comparison
789 if index_size == 0: 790 if index_size == 0:
790 result = numpy.arange(0, len(data_array), 1) 791 result = numpy.arange(0, len(data_array), 1)
791 else: 792 else:
792 result = numpy.random.choice(len(integers), size=index_size, 793 result = numpy.random.choice(len(integers), size=index_size, replace=False) # array of random sequences of size=index.size
793 replace=False) # array of random sequences of size=index.size 794
794 795 # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
795 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: 796 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
796 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
797 797
798 # comparison random tags to whole dataset 798 # comparison random tags to whole dataset
799 result1 = data_array[result, 1] # random tags 799 result1 = data_array[result, 1] # random tags
800 result2 = data_array[:, 1] # all tags 800 result2 = data_array[:, 1] # all tags
801 print("size of the whole dataset= ", len(result2)) 801 print("size of the whole dataset= ", len(result2))
806 chunks_sample = numpy.array_split(result1, nproc) 806 chunks_sample = numpy.array_split(result1, nproc)
807 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) 807 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
808 proc_pool.close() 808 proc_pool.close()
809 proc_pool.join() 809 proc_pool.join()
810 ham = numpy.concatenate(ham).astype(int) 810 ham = numpy.concatenate(ham).astype(int)
811 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: 811 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
812 # for h, tag in zip(ham, result1): 812 # for h, tag in zip(ham, result1):
813 # output_file1.write("{}\t{}\n".format(tag, h)) 813 # output_file1.write("{}\t{}\n".format(tag, h))
814 814
815 # HD analysis for chimeric reads 815 # HD analysis for chimeric reads
816 proc_pool_b = Pool(nproc) 816 proc_pool_b = Pool(nproc)
817 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) 817 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
818 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) 818 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
832 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) 832 numpy.concatenate([item_b[5] for item_b in diff_list_b])))
833 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), 833 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
834 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) 834 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
835 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), 835 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
836 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) 836 numpy.concatenate([item_b[7] for item_b in diff_list_b])))
837 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), 837 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
838 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
839 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), 838 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
840 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) 839 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
841 # with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2:
842 # for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags):
843 # output_file2.write(
844 # "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d))
845 840
846 lenTags = len(data_array) 841 lenTags = len(data_array)
847 842
848 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags 843 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags
849 seq = numpy.array(data_array[result, 1]) # tags of sample 844 seq = numpy.array(data_array[result, 1]) # tags of sample
854 seq = numpy.tile(seq, 2) 849 seq = numpy.tile(seq, 2)
855 ham = numpy.tile(ham, 2) 850 ham = numpy.tile(ham, 2)
856 851
857 # prepare data for different kinds of plots 852 # prepare data for different kinds of plots
858 # distribution of FSs separated after HD 853 # distribution of FSs separated after HD
859 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham,rel=False) 854 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
860 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS 855 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS
861 856
862 ## get FS for all tags with min HD of analysis of chimeric reads 857 # get FS for all tags with min HD of analysis of chimeric reads
863 # there are more tags than sample size in the plot, because one tag can have multiple minimas 858 # there are more tags than sample size in the plot, because one tag can have multiple minimas
864 seqDic = dict(zip(seq, quant)) 859 seqDic = dict(zip(seq, quant))
865 lst_minHD_tags = [] 860 lst_minHD_tags = []
866 for i in minHD_tags: 861 for i in minHD_tags:
867 lst_minHD_tags.append(seqDic.get(i)) 862 lst_minHD_tags.append(seqDic.get(i))
881 if len(minHD_tags_zeros) != 0: 876 if len(minHD_tags_zeros) != 0:
882 lst_minHD_tags_zeros = [] 877 lst_minHD_tags_zeros = []
883 for i in minHD_tags_zeros: 878 for i in minHD_tags_zeros:
884 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads 879 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads
885 880
886 # histogram with HD of non-identical half 881 # histogram with HD of non-identical half
887 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( 882 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros)
888 lst_minHD_tags_zeros, diff_zeros) 883 # family size distribution of non-identical half
889 # family size distribution of non-identical half 884 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)
890 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD( 885
891 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) 886 # plot Hamming Distance with Family size distribution
892 887 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, subtitle="Hamming distance separated by family size", title_file1=name_file, lenTags=lenTags, xlabel="HD", nr_above_bars=nr_above_bars)
893 ##################################################################################################################### 888
894 ################## plot Hamming Distance with Family size distribution ############################## 889 # Plot FSD with separation after
895 #####################################################################################################################
896 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
897 subtitle="Hamming distance separated by family size", title_file1=name_file,
898 lenTags=lenTags,xlabel="HD", nr_above_bars=nr_above_bars)
899
900 ########################## Plot FSD with separation after ###############################################
901 ######################################################################################################################
902 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, 890 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
903 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", 891 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
904 pdf=pdf,relative=False, title_file1=name_file, diff=False) 892 pdf=pdf, relative=False, title_file1=name_file, diff=False)
905 893
906 ########################## Plot HD within tags ######################################################## 894 # Plot HD within tags
907 ###################################################################################################################### 895 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)
908 # plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) 896
909 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2min , HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) 897 # Plot difference between HD's separated after FSD
910
911
912 ########################## Plot difference between HD's separated after FSD ####################################
913 ######################################################################################################################
914 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, 898 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
915 subtitle="Delta Hamming distance within tags", 899 subtitle="Delta Hamming distance within tags",
916 title_file1=name_file, lenTags=lenTags, 900 title_file1=name_file, lenTags=lenTags,
917 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) 901 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars)
918 902
919 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, 903 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
920 subtitle="Chimera Analysis: relative delta Hamming distances", 904 subtitle="Chimera Analysis: relative delta Hamming distances",
921 title_file1=name_file, lenTags=lenTags, 905 title_file1=name_file, lenTags=lenTags,
922 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) 906 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars)
923 907
924 #################### Plot FSD separated after difference between HD's #####################################
925 ########################################################################################################################
926 # plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff,
927 # subtitle="Family size distribution separated by delta Hamming distances within the tags",
928 # pdf=pdf,relative=False, diff=True, title_file1=name_file, originalCounts=quant)
929
930 # plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, originalCounts=quant, pdf=pdf,
931 # subtitle="Family size distribution separated by delta Hamming distances within the tags",
932 # relative=True, diff=True, title_file1=name_file)
933
934
935 # plots for chimeric reads 908 # plots for chimeric reads
936 if len(minHD_tags_zeros) != 0: 909 if len(minHD_tags_zeros) != 0:
937 ## HD 910 # HD
938 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, 911 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
939 subtitle="Hamming distance of the non-identical half of chimeras", 912 subtitle="Hamming distance of the non-identical half of chimeras",
940 title_file1=name_file, lenTags=lenTags,xlabel="HD", relative=False, nr_above_bars=nr_above_bars) 913 title_file1=name_file, lenTags=lenTags, xlabel="HD", relative=False, nr_above_bars=nr_above_bars)
941 914
942 ## FSD 915 # print all data to a CSV file
943 # plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, 916 # HD
944 # originalCounts=quant, pdf=pdf,
945 # subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras",
946 # relative=False, diff=False, title_file1=name_file)
947
948 ### print all data to a CSV file
949 #### HD ####
950 summary, sumCol = createTableHD(list1, "HD=") 917 summary, sumCol = createTableHD(list1, "HD=")
951 overallSum = sum(sumCol) # sum of columns in table 918 overallSum = sum(sumCol) # sum of columns in table
952 919
953 #### FSD #### 920 # FSD
954 summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) 921 summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
955 overallSum5 = sum(sumCol5) 922 overallSum5 = sum(sumCol5)
956 923
957 ### HD of both parts of the tag #### 924 # HD of both parts of the tag
958 summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf2,numpy.array(minHDs)]) 925 summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)])
959 overallSum9 = sum(sumCol9) 926 overallSum9 = sum(sumCol9)
960 927
961 ## HD 928 # HD
962 # absolute difference 929 # absolute difference
963 summary11, sumCol11 = createTableHD(listDifference1, "diff=") 930 summary11, sumCol11 = createTableHD(listDifference1, "diff=")
964 overallSum11 = sum(sumCol11) 931 overallSum11 = sum(sumCol11)
965 # relative difference and all tags 932 # relative difference and all tags
966 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") 933 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
967 overallSum13 = sum(sumCol13) 934 overallSum13 = sum(sumCol13)
968 935
969 ## FSD
970 # absolute difference
971 # summary19, sumCol19 = createTableFSD2(familySizeList1_diff)
972 # overallSum19 = sum(sumCol19)
973 # relative difference
974 # summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff)
975 # overallSum21 = sum(sumCol21)
976
977 # chimeric reads 936 # chimeric reads
978 if len(minHD_tags_zeros) != 0: 937 if len(minHD_tags_zeros) != 0:
979 # absolute difference and tags where at least one half has HD=0 938 # absolute difference and tags where at least one half has HD=0
980 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") 939 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")
981 overallSum15 = sum(sumCol15) 940 overallSum15 = sum(sumCol15)
982 # absolute difference and tags where at least one half has HD=0
983 # summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False)
984 # overallSum23 = sum(sumCol23)
985 941
986 output_file.write("{}\n".format(name_file)) 942 output_file.write("{}\n".format(name_file))
987 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( 943 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
988 numpy.concatenate(list1)), lenTags, lenTags)) 944 numpy.concatenate(list1)), lenTags, lenTags))
989 945
990 ### HD ### 946 # HD
991 createFileHD(summary, sumCol, overallSum, output_file, 947 createFileHD(summary, sumCol, overallSum, output_file,
992 "Hamming distance separated by family size", sep) 948 "Hamming distance separated by family size", sep)
993 ### FSD ### 949 # FSD
994 createFileFSD2(summary5, sumCol5, overallSum5, output_file, 950 createFileFSD2(summary5, sumCol5, overallSum5, output_file,
995 "Family size distribution separated by Hamming distance", sep, 951 "Family size distribution separated by Hamming distance", sep,
996 diff=False) 952 diff=False)
997 953
998 count = numpy.bincount(quant) 954 count = numpy.bincount(quant)
999 #output_file.write("{}{}\n".format(sep, name_file)) 955 # output_file.write("{}{}\n".format(sep, name_file))
1000 output_file.write("\n") 956 output_file.write("\n")
1001 output_file.write("max. family size:{}{}\n".format(sep, max(quant))) 957 output_file.write("max. family size:{}{}\n".format(sep, max(quant)))
1002 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) 958 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
1003 output_file.write( 959 output_file.write(
1004 "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) 960 "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
1005 961
1006 ### HD within tags ### 962 # HD within tags
1007 output_file.write( 963 output_file.write(
1008 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" 964 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
1009 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") 965 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
1010 output_file.write( 966 output_file.write(
1011 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( 967 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
1017 createFileHD(summary11, sumCol11, overallSum11, output_file, 973 createFileHD(summary11, sumCol11, overallSum11, output_file,
1018 "Absolute delta Hamming distances within the tag", sep) 974 "Absolute delta Hamming distances within the tag", sep)
1019 createFileHD(summary13, sumCol13, overallSum13, output_file, 975 createFileHD(summary13, sumCol13, overallSum13, output_file,
1020 "Chimera analysis: relative delta Hamming distances", sep) 976 "Chimera analysis: relative delta Hamming distances", sep)
1021 977
1022 # createFileFSD2(summary19, sumCol19, overallSum19, output_file,
1023 # "Family size distribution separated by absolute delta Hamming distance",
1024 # sep)
1025 # createFileFSD2(summary21, sumCol21, overallSum21, output_file,
1026 # "Family size distribution separated by relative delta Hamming distance",
1027 # sep, rel=True)
1028
1029 if len(minHD_tags_zeros) != 0: 978 if len(minHD_tags_zeros) != 0:
1030 output_file.write( 979 output_file.write(
1031 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") 980 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
1032 createFileHD(summary15, sumCol15, overallSum15, output_file, 981 createFileHD(summary15, sumCol15, overallSum15, output_file,
1033 "Hamming distances of non-zero half", sep) 982 "Hamming distances of non-zero half", sep)
1034 # createFileFSD2(summary23, sumCol23, overallSum23, output_file,
1035 # "Family size distribution separated by Hamming distance of non-zero half",
1036 # sep, diff=False)
1037 output_file.write("\n") 983 output_file.write("\n")
1038
1039 984
1040 985
1041 if __name__ == '__main__': 986 if __name__ == '__main__':
1042 sys.exit(Hamming_Distance_Analysis(sys.argv)) 987 sys.exit(Hamming_Distance_Analysis(sys.argv))