comparison fsd.py @ 42:321a4871564b draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit 033dd7b750f68e8aa68f327d7d72bd311ddbee4e-dirty
author mheinzl
date Wed, 14 Aug 2019 08:29:27 -0400
parents 54f0dac1c834
children f72593bcc8ee
comparison
equal deleted inserted replaced
41:54f0dac1c834 42:321a4871564b
272 272
273 for l in range(len(to_plot)): 273 for l in range(len(to_plot)):
274 ax = fig.add_subplot(2, 1, l+1) 274 ax = fig.add_subplot(2, 1, l+1)
275 ticks = numpy.arange(1, 22, 1) 275 ticks = numpy.arange(1, 22, 1)
276 ticks1 = map(str, ticks) 276 ticks1 = map(str, ticks)
277 if maximumX > 20: 277 ticks1[len(ticks1) - 1] = ">20"
278 ticks1[len(ticks1) - 1] = ">20"
279 278
280 if to_plot[l] == "Relative frequencies": 279 if to_plot[l] == "Relative frequencies":
281 w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2] 280 w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2]
282 counts_rel = ax.hist(list_to_plot2, weights=w, 281 counts_rel = ax.hist(list_to_plot2, weights=w,
283 bins=numpy.arange(1, 23), stacked=False, edgecolor="black", 282 bins=numpy.arange(1, 23), stacked=False, edgecolor="black",
284 linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8) 283 linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
285 ax.set_ylim(0, 1.07) 284 ax.set_ylim(0, 1.07)
286 else: 285 else:
287 counts = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8, color=colors) 286 counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8, color=colors)
288 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) 287 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
289 288
290 ax.set_xticks(numpy.array(ticks)) 289 ax.set_xticks(numpy.array(ticks))
291 ax.set_xticklabels(ticks1) 290 ax.set_xticklabels(ticks1)
292 291
301 300
302 fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14) 301 fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14)
303 for l in range(len(to_plot)): 302 for l in range(len(to_plot)):
304 ax = fig2.add_subplot(2, 1, l + 1) 303 ax = fig2.add_subplot(2, 1, l + 1)
305 ticks = numpy.arange(minimumX, maximumX + 1) 304 ticks = numpy.arange(minimumX, maximumX + 1)
305 ticks = numpy.arange(1, 22)
306 ticks1 = map(str, ticks) 306 ticks1 = map(str, ticks)
307 if maximumX > 20: 307 ticks1[len(ticks1) - 1] = ">20"
308 ticks1[len(ticks1) - 1] = ">20"
309 reads = [] 308 reads = []
310 reads_rel = [] 309 reads_rel = []
311 310
312 barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot) + 1) 311 barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot)+1)
313 312
314 for i in range(len(list_to_plot2)): 313 for i in range(len(list_to_plot2)):
314 x = list(numpy.arange(1, 22).astype(float))
315 unique, c = numpy.unique(list_to_plot2[i], return_counts=True) 315 unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
316 new_c = [] 316 y = unique * c
317 new_unique = []
318 for t in ticks:
319 if t not in unique:
320 new_c.append(0) # add zero count of not occuring
321 new_unique.append(t)
322 else:
323 c_idx = numpy.where(t == unique)[0]
324 new_c.append(c[c_idx])
325 new_unique.append(unique[c_idx])
326 y = numpy.array(new_unique) * numpy.array(new_c)
327 if sum(list_to_plot_original[i] > 20) > 0: 317 if sum(list_to_plot_original[i] > 20) > 0:
328 y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20]) 318 y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
329 # y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))] 319 y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
330 reads.append(y) 320 reads.append(y)
331 reads_rel.append(list(numpy.float_(y)) / sum(y)) 321 reads_rel.append(list(numpy.float_(y)) / sum(y))
332 322 #x = [xi + barWidth for xi in x]
333 x = list(numpy.arange(numpy.amin(unique), numpy.amax(unique) + 1).astype(float)) 323
334 if len(list_to_plot2) == 1: 324 if len(list_to_plot2) == 1:
335 x = [xi * 0.5 for xi in x] 325 x = [xi * 0.5 for xi in x]
336 w = 0.4 326 w = 0.4
337 else: 327 else:
338 x = [xi + barWidth for xi in x] 328 x = [xi + barWidth for xi in x]
339 w = 1./(len(list_to_plot) + 1) 329 w = 1./(len(list_to_plot) + 1)
340
341 if to_plot[l] == "Relative frequencies": 330 if to_plot[l] == "Relative frequencies":
342 counts2_rel = ax.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w, 331 counts2_rel = ax.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w,
343 edgecolor="black", label=label[i],linewidth=1, alpha=0.7, color=colors[i]) 332 edgecolor="black", label=label[i],linewidth=1, alpha=0.7, color=colors[i])
344 ax.set_ylim(0, 1.07) 333 ax.set_ylim(0, 1.07)
345 else: 334 else:
346 y = list(y.reshape((len(y)))) 335 #y = list(y.reshape((len(y))))
347
348 counts2 = ax.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1, 336 counts2 = ax.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1,
349 alpha=0.7, color=colors[i]) 337 alpha=0.7, color=colors[i])
350 if i == len(list_to_plot2): 338 if i == len(list_to_plot2)-1:
351 barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1) 339 barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
352 else: 340 else:
353 barWidth += 1. / (len(list_to_plot) + 1) 341 barWidth += 1. / (len(list_to_plot) + 1)
354 342
355 if to_plot[l] == "Absolute frequencies": 343 if to_plot[l] == "Absolute frequencies":
356 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) 344 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
357 else: 345 else:
358 ax.set_xlabel("Family size", fontsize=14) 346 ax.set_xlabel("Family size", fontsize=14)
359 347
405 for i in label: 393 for i in label:
406 output_file.write("{}{}".format(sep, i)) 394 output_file.write("{}{}".format(sep, i))
407 # output_file.write("{}sum".format(sep)) 395 # output_file.write("{}sum".format(sep))
408 output_file.write("\n") 396 output_file.write("\n")
409 j = 0 397 j = 0
398
410 for fs in bins: 399 for fs in bins:
411 if fs == 21: 400 if fs == 21:
412 fs = ">20" 401 fs = ">20"
413 else: 402 else:
414 fs = "={}".format(fs) 403 fs = "={}".format(fs)
419 for n in range(len(label)): 408 for n in range(len(label)):
420 output_file.write("{}{}".format(int(reads[n][j]), sep)) 409 output_file.write("{}{}".format(int(reads[n][j]), sep))
421 output_file.write("\n") 410 output_file.write("\n")
422 j += 1 411 j += 1
423 output_file.write("sum{}".format(sep)) 412 output_file.write("sum{}".format(sep))
424 if len(label) == 1: 413 if len(label) == 1:
425 output_file.write("{}{}".format(int(sum(numpy.concatenate(reads))), sep)) 414 output_file.write("{}{}".format(int(sum(numpy.concatenate(reads))), sep))
426 else: 415 else:
427 for i in reads: 416 for i in reads:
428 output_file.write("{}{}".format(int(sum(i)), sep)) 417 output_file.write("{}{}".format(int(sum(i)), sep))
429 output_file.write("\n") 418 output_file.write("\n")
492 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 481 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3
493 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 482 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3
494 483
495 fig = plt.figure() 484 fig = plt.figure()
496 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) 485 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
497 counts = plt.hist(list1, bins=numpy.arange(minimumX, maximumX + 2), stacked=True, label=["duplex", "ab", "ba"], 486 counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"],
498 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], 487 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
499 rwidth=0.8) 488 rwidth=0.8)
500 # tick labels of x axis 489 # tick labels of x axis
501 ticks = numpy.arange(1, 22, 1) 490 ticks = numpy.arange(1, 22, 1)
502 ticks1 = map(str, ticks) 491 ticks1 = map(str, ticks)
503 if maximumX > 20: 492 ticks1[len(ticks1) - 1] = ">20"
504 ticks1[len(ticks1) - 1] = ">20"
505 plt.xticks(numpy.array(ticks), ticks1) 493 plt.xticks(numpy.array(ticks), ticks1)
506 singl = counts[0][2][0] # singletons 494 singl = counts[0][2][0] # singletons
507 last = counts[0][2][len(counts[0][0]) - 1] # large families 495 last = counts[0][2][len(counts[0][0]) - 1] # large families
508 if log_axis: 496 if log_axis:
509 plt.yscale('log') 497 plt.yscale('log')
516 504
517 # extra information beneath the plot 505 # extra information beneath the plot
518 legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags=" 506 legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags="
519 plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure) 507 plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure)
520 508
521 legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,}".format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags))) 509 legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags)), (len(ab) + len(ba)))
522 plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure) 510 plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure)
523 511
524 legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,}".format(sum(dataAB_o), sum(dataBA_o), sum(duplTags_o), sum(duplTags_double_o), (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o))) 512 legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o), sum(duplTags_o), sum(duplTags_double_o), (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), (sum(ab_o) + sum(ba_o)))
525 plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure) 513 plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
526 514
527 legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), (len(dataAB) + len(dataBA) + len(duplTags))) 515 legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), (len(dataAB) + len(dataBA) + len(duplTags)))
528 plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure) 516 plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure)
529 517