comparison hd.py @ 11:7adc48c8a03d draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 8d9bdadb5e154dadc455720c99700afbd9aafae9
author mheinzl
date Tue, 15 May 2018 13:31:02 -0400
parents 69aa17354a6e
children 5b0a95f205ad
comparison
equal deleted inserted replaced
10:69aa17354a6e 11:7adc48c8a03d
28 import sys 28 import sys
29 import os 29 import os
30 from matplotlib.backends.backend_pdf import PdfPages 30 from matplotlib.backends.backend_pdf import PdfPages
31 from collections import Counter 31 from collections import Counter
32 32
33 def plotFSDwithHD2(familySizeList1,maximumXFS,minimumXFS, quant, 33 def plotFSDwithHD2(familySizeList1,maximumXFS,minimumXFS, originalCounts,
34 title_file1, subtitle, pdf, relative=False, diff = True): 34 title_file1, subtitle, pdf, relative=False, diff = True):
35 if diff is False: 35 if diff is False:
36 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] 36 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
37 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8","HD>8"] 37 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8","HD>8"]
38 else: 38 else:
76 76
77 plt.ylim((0, maximumY * 1.2)) 77 plt.ylim((0, maximumY * 1.2))
78 legend = "\nmax. family size: \nabsolute frequency: \nrelative frequency: " 78 legend = "\nmax. family size: \nabsolute frequency: \nrelative frequency: "
79 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) 79 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)
80 80
81 count = numpy.bincount(quant) # original counts 81 count = numpy.bincount(originalCounts) # original counts
82 legend1 = "{}\n{}\n{:.5f}" \ 82 legend1 = "{}\n{}\n{:.5f}" \
83 .format(max(quant), count[len(count) - 1], float(count[len(count) - 1]) / sum(count)) 83 .format(max(originalCounts), count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
84 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) 84 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
85 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), 85 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]),
86 float(counts[0][len(counts[0]) - 1][1]) / sum( 86 float(counts[0][len(counts[0]) - 1][1]) / sum(
87 counts[0][len(counts[0]) - 1])) 87 counts[0][len(counts[0]) - 1]))
88 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) 88 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
506 difference1_zeros = abs(d - d2) 506 difference1_zeros = abs(d - d2)
507 diff11_zeros.append(difference1_zeros) 507 diff11_zeros.append(difference1_zeros)
508 i += 1 508 i += 1
509 509
510 #print(i) 510 #print(i)
511 diff11 = [st for st in diff11 if st != 999] 511 #diff11 = [st for st in diff11 if st != 999]
512 ham1 = [st for st in ham1 if st != 999] 512 #ham1 = [st for st in ham1 if st != 999]
513 ham2 = [st for st in ham2 if st != 999] 513 #ham2 = [st for st in ham2 if st != 999]
514 min_valueList = [st for st in min_valueList if st != 999] 514 #min_valueList = [st for st in min_valueList if st != 999]
515 min_tagsList = [st for st in min_tagsList if st != 999] 515 #min_tagsList = [st for st in min_tagsList if st != 999]
516 relativeDiffList = [st for st in relativeDiffList if st != 999] 516 #relativeDiffList = [st for st in relativeDiffList if st != 999]
517 diff11_zeros = [st for st in diff11_zeros if st != 999] 517 #diff11_zeros = [st for st in diff11_zeros if st != 999]
518 min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] 518 #min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
519 519
520 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros]) 520 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros])
521 521
522 def readFileReferenceFree(file): 522 def readFileReferenceFree(file):
523 with open(file, 'r') as dest_f: 523 with open(file, 'r') as dest_f:
524 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') 524 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
525 integers = numpy.array(data_array[:, 0]).astype(int) 525 integers = numpy.array(data_array[:, 0]).astype(int)
526 return(integers, data_array) 526 return(integers, data_array)
527 527
528 def hammingDistanceWithFS(quant, ham): 528 def hammingDistanceWithFS(fs, ham):
529 quant = numpy.asarray(quant) 529 fs = numpy.asarray(fs)
530 maximum = max(ham) 530 maximum = max(ham)
531 minimum = min(ham) 531 minimum = min(ham)
532 ham = numpy.asarray(ham) 532 ham = numpy.asarray(ham)
533 533
534 singletons = numpy.where(quant == 1)[0] 534 singletons = numpy.where(fs == 1)[0]
535 data = ham[singletons] 535 data = ham[singletons]
536 536
537 hd2 = numpy.where(quant == 2)[0] 537 hd2 = numpy.where(fs == 2)[0]
538 data2 = ham[hd2] 538 data2 = ham[hd2]
539 539
540 hd3 = numpy.where(quant == 3)[0] 540 hd3 = numpy.where(fs == 3)[0]
541 data3 = ham[hd3] 541 data3 = ham[hd3]
542 542
543 hd4 = numpy.where(quant == 4)[0] 543 hd4 = numpy.where(fs == 4)[0]
544 data4 = ham[hd4] 544 data4 = ham[hd4]
545 545
546 hd5 = numpy.where((quant >= 5) & (quant <= 10))[0] 546 hd5 = numpy.where((fs >= 5) & (fs <= 10))[0]
547 data5 = ham[hd5] 547 data5 = ham[hd5]
548 548
549 hd6 = numpy.where(quant > 10)[0] 549 hd6 = numpy.where(fs > 10)[0]
550 data6 = ham[hd6] 550 data6 = ham[hd6]
551 551
552 list1 = [data, data2, data3, data4, data5, data6] 552 list1 = [data, data2, data3, data4, data5, data6]
553 return(list1, maximum, minimum) 553 return(list1, maximum, minimum)
554 554
882 lenTags=lenTags,xlabel="Hamming distance") 882 lenTags=lenTags,xlabel="Hamming distance")
883 883
884 ########################## Plot FSD with separation after HD ############################################### 884 ########################## Plot FSD with separation after HD ###############################################
885 ######################################################################################################################## 885 ########################################################################################################################
886 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, 886 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
887 quant=quant, subtitle="Family size distribution separated by Hamming distance", 887 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
888 pdf=pdf,relative=False, title_file1=name_file, diff=False) 888 pdf=pdf,relative=False, title_file1=name_file, diff=False)
889 889
890 ########################## Plot difference between HD's separated after FSD ########################################## 890 ########################## Plot difference between HD's separated after FSD ##########################################
891 ######################################################################################################################## 891 ########################################################################################################################
892 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, 892 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
901 901
902 #################### Plot FSD separated after difference between HD's ##################################### 902 #################### Plot FSD separated after difference between HD's #####################################
903 ######################################################################################################################## 903 ########################################################################################################################
904 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, 904 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff,
905 subtitle="Family size distribution separated by delta Hamming distances within the tags", 905 subtitle="Family size distribution separated by delta Hamming distances within the tags",
906 pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant) 906 pdf=pdf,relative=False, diff=True, title_file1=name_file, originalCounts=quant)
907 907
908 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf, 908 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, originalCounts=quant, pdf=pdf,
909 subtitle="Family size distribution separated by delta Hamming distances within the tags", 909 subtitle="Family size distribution separated by delta Hamming distances within the tags",
910 relative=True, diff=True, title_file1=name_file) 910 relative=True, diff=True, title_file1=name_file)
911 911
912 912
913 # plots for chimeric reads 913 # plots for chimeric reads
917 subtitle="Hamming distance of the non-identical half of chimeras", 917 subtitle="Hamming distance of the non-identical half of chimeras",
918 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False) 918 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False)
919 919
920 ## FSD 920 ## FSD
921 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, 921 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros,
922 quant=quant, pdf=pdf, 922 originalCounts=quant, pdf=pdf,
923 subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras", 923 subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras",
924 relative=False, diff=False, title_file1=name_file) 924 relative=False, diff=False, title_file1=name_file)
925 925
926 ### print all data to a CSV file 926 ### print all data to a CSV file
927 #### HD #### 927 #### HD ####