comparison hd.py @ 2:316fbf91dd12 draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit f9d5547849dabb59a33a5e998bda4730323d62a9
author mheinzl
date Tue, 15 May 2018 10:36:34 -0400
parents 7414792e1cb8
children 82eaf30dd089
comparison
equal deleted inserted replaced
1:7414792e1cb8 2:316fbf91dd12
61 color=colors, stacked=True, 61 color=colors, stacked=True,
62 rwidth=0.8,alpha=1, align="left", 62 rwidth=0.8,alpha=1, align="left",
63 edgecolor="None",bins=range1) 63 edgecolor="None",bins=range1)
64 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) 64 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
65 65
66 plt.title(title_file1, fontsize=12) 66 #plt.title(title_file1, fontsize=12)
67 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) 67 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
68 plt.xlabel("No. of Family Members", fontsize=12) 68 plt.xlabel("Family size", fontsize=14)
69 plt.ylabel("Absolute Frequency", fontsize=12) 69 plt.ylabel("Absolute Frequency", fontsize=14)
70 70
71 ticks = numpy.arange(0, maximumXFS + 1, 1) 71 ticks = numpy.arange(0, maximumXFS + 1, 1)
72 ticks1 = map(str, ticks) 72 ticks1 = map(str, ticks)
73 if maximumXFS >= 20: 73 if maximumXFS >= 20:
74 ticks1[len(ticks1) - 1] = ">=20" 74 ticks1[len(ticks1) - 1] = ">=20"
123 range=(0, maximumX + 1)) 123 range=(0, maximumX + 1))
124 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) 124 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
125 bins = counts[1] # width of bins 125 bins = counts[1] # width of bins
126 counts = numpy.array(map(int, counts[0][5])) 126 counts = numpy.array(map(int, counts[0][5]))
127 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) 127 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
128 plt.title(title_file1, fontsize=12) 128 # plt.title(title_file1, fontsize=12)
129 plt.xlabel(xlabel, fontsize=12) 129 plt.xlabel(xlabel, fontsize=14)
130 plt.ylabel("Absolute Frequency", fontsize=12) 130 plt.ylabel("Absolute Frequency", fontsize=14)
131 131
132 plt.grid(b=True, which='major', color='#424242', linestyle=':') 132 plt.grid(b=True, which='major', color='#424242', linestyle=':')
133 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) 133 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
134 plt.xticks(numpy.arange(0, maximumX + step, step)) 134 plt.xticks(numpy.arange(0, maximumX + step, step))
135 135
153 153
154 def plotHDwithinSeq_Sum2(sum1, sum2,min_value, lenTags, title_file1, pdf): 154 def plotHDwithinSeq_Sum2(sum1, sum2,min_value, lenTags, title_file1, pdf):
155 fig = plt.figure(figsize=(6, 8)) 155 fig = plt.figure(figsize=(6, 8))
156 plt.subplots_adjust(bottom=0.1) 156 plt.subplots_adjust(bottom=0.1)
157 157
158 ham = [numpy.array(min_value), sum1, sum2] # new hd within tags 158 ham = [sum1, sum2,numpy.array(min_value)] # new hd within tags
159 159
160 maximumX = numpy.amax(numpy.concatenate(ham)) 160 maximumX = numpy.amax(numpy.concatenate(ham))
161 minimumX = numpy.amin(numpy.concatenate(ham)) 161 minimumX = numpy.amin(numpy.concatenate(ham))
162 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham))) 162 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham)))
163 163
165 range1 = minimumX 165 range1 = minimumX
166 else: 166 else:
167 range1 = range(minimumX, maximumX + 2) 167 range1 = range(minimumX, maximumX + 2)
168 168
169 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, 169 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False,
170 label=["HD of whole tag", "tag1 - a\nvs. tag2 - a", "tag1 - b\nvs. tag2 - b"], 170 label=[ "HD a", "HD b","HD a+b"],
171 bins=range1, color=["#585858", "#58ACFA", "#FA5858"], edgecolor='black', linewidth=1) 171 bins=range1, color=["#585858", "#58ACFA", "#FA5858"], edgecolor='black', linewidth=1)
172 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) 172 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
173 plt.suptitle('Hamming distances within tags', fontsize=14) 173 plt.suptitle('Hamming distances within tags', fontsize=14)
174 plt.title(title_file1, fontsize=12) 174 #plt.title(title_file1, fontsize=12)
175 plt.xlabel("Hamming Distance", fontsize=12) 175 plt.xlabel("Hamming Distance", fontsize=14)
176 plt.ylabel("Absolute Frequency", fontsize=12) 176 plt.ylabel("Absolute Frequency", fontsize=14)
177 plt.grid(b=True, which='major', color='#424242', linestyle=':') 177 plt.grid(b=True, which='major', color='#424242', linestyle=':')
178 178
179 179
180 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1)) 180 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1))
181 plt.xticks(numpy.arange(minimumX - 1, maximumX + 1, 1.0)) 181 plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
182 plt.ylim((0, maximumY * 1.1)) 182 plt.ylim((0, maximumY * 1.1))
183 183
184 legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags) 184 legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags)
185 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) 185 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
186 pdf.savefig(fig, bbox_inches="tight") 186 pdf.savefig(fig, bbox_inches="tight")
403 output_file.write("\n\n") 403 output_file.write("\n\n")
404 404
405 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep): 405 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep):
406 output_file.write(name) 406 output_file.write(name)
407 output_file.write("\n") 407 output_file.write("\n")
408 output_file.write("{}HD of whole tag;tag1-half1 vs. tag2-half1{}tag1-half2 vs. tag2-half2{}sum{}\n".format(sep,sep,sep,sep)) 408 output_file.write("{}HD a+b;HD a{}HD b{}sum{}\n".format(sep,sep,sep,sep))
409 for item in summary: 409 for item in summary:
410 for nr in item: 410 for nr in item:
411 if "HD" not in nr: 411 if "HD" not in nr:
412 nr = nr.astype(float) 412 nr = nr.astype(float)
413 nr = nr.astype(int) 413 nr = nr.astype(int)
417 sumCol = map(int, sumCol) 417 sumCol = map(int, sumCol)
418 for el in sumCol: 418 for el in sumCol:
419 output_file.write("{}{}".format(el,sep)) 419 output_file.write("{}{}".format(el,sep))
420 output_file.write("{}{}".format(overallSum.astype(int),sep)) 420 output_file.write("{}{}".format(overallSum.astype(int),sep))
421 output_file.write("\n\n") 421 output_file.write("\n\n")
422
423
424 422
425 def hamming(array1, array2): 423 def hamming(array1, array2):
426 res = 99 * numpy.ones(len(array1)) 424 res = 99 * numpy.ones(len(array1))
427 i = 0 425 i = 0
428 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time 426 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time
439 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 437 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2
440 438
441 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 439 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1
442 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 440 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2
443 441
444 diff11 = [] 442 diff11 = 999 * numpy.ones(len(array2))
445 relativeDiffList = [] 443 relativeDiffList = 999 * numpy.ones(len(array2))
446 ham1 = [] 444 ham1 = 999 * numpy.ones(len(array2))
447 ham2 = [] 445 ham2 = 999 * numpy.ones(len(array2))
448 min_valueList = [] 446 min_valueList = 999 * numpy.ones(len(array2))
449 min_tagsList = [] 447 min_tagsList = 999 * numpy.ones(len(array2))
450 diff11_zeros = [] 448 diff11_zeros = 999 * numpy.ones(len(array2))
451 min_tagsList_zeros = [] 449 min_tagsList_zeros = 999 * numpy.ones(len(array2))
450
451
452 #diff11 = []
453 #relativeDiffList = []
454 #ham1 = []
455 #ham2 = []
456 #min_valueList = []
457 #min_tagsList = []
458 #diff11_zeros = []
459 #min_tagsList_zeros = []
452 i = 0 # counter, only used to see how many HDs of tags were already calculated 460 i = 0 # counter, only used to see how many HDs of tags were already calculated
453 if mate_b is False: # HD calculation for all a's 461 if mate_b is False: # HD calculation for all a's
454 half1_mate1 = array1_half 462 half1_mate1 = array1_half
455 half2_mate1 = array1_half2 463 half2_mate1 = array1_half2
456 half1_mate2 = array2_half 464 half1_mate2 = array2_half
486 d = d_2 494 d = d_2
487 d2 = d_1 495 d2 = d_1
488 else: # half1, corrects the variable of the HD from both halfs if it is a or b 496 else: # half1, corrects the variable of the HD from both halfs if it is a or b
489 d = d_1 497 d = d_1
490 d2 = d_2 498 d2 = d_2
491 min_valueList.append(d + d2) 499 min_valueList[i] = d + d2
492 min_tagsList.append(tag) 500 min_tagsList[i] = tag
493 ham1.append(d) 501 ham1.append[i] = d
494 ham2.append(d2) 502 ham2.append[i] = d2
495 difference1 = abs(d - d2) 503 difference1 = abs(d - d2)
496 diff11.append(difference1) 504 diff11[i] = difference1
497 rel_difference = round(float(difference1) / (d + d2), 1) 505 rel_difference = round(float(difference1) / (d + d2), 1)
498 relativeDiffList.append(rel_difference) 506 relativeDiffList[i] = rel_difference
499 507
500 #### tags which have identical parts: 508 #### tags which have identical parts:
501 if d == 0 or d2 == 0: 509 if d == 0 or d2 == 0:
502 min_tagsList_zeros.append(tag) 510 min_tagsList_zeros[i] = tag
503 difference1_zeros = abs(d - d2) 511 difference1_zeros = abs(d - d2)
504 diff11_zeros.append(difference1_zeros) 512 diff11_zeros[i] = difference1_zeros
513 i += 1
514
505 #print(i) 515 #print(i)
506 i += 1 516 diff11 = [st for st in diff11 if st != 999]
517 ham1 = [st for st in ham1 if st != 999]
518 ham2 = [st for st in ham2 if st != 999]
519 min_valueList = [st for st in min_valueList if st != 999]
520 min_tagsList = [st for st in min_tagsList if st != 999]
521 relativeDiffList = [st for st in relativeDiffList if st != 999]
522 diff11_zeros = [st for st in diff11_zeros if st != 999]
523 min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
524
507 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros]) 525 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros])
508 526
509 def readFileReferenceFree(file): 527 def readFileReferenceFree(file):
510 with open(file, 'r') as dest_f: 528 with open(file, 'r') as dest_f:
511 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') 529 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
823 quant = numpy.concatenate((quant, duplTagsBA[result])) 841 quant = numpy.concatenate((quant, duplTagsBA[result]))
824 seq = numpy.tile(seq, 2) 842 seq = numpy.tile(seq, 2)
825 ham = numpy.tile(ham, 2) 843 ham = numpy.tile(ham, 2)
826 844
827 # prepare data for different kinds of plots 845 # prepare data for different kinds of plots
846 # distribution of FSs separated after HD
847 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham,rel=False)
828 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS 848 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS
829 # distribution of FSs separated after HD 849
830 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham,
831 rel=False)
832
833 ## get FS for all tags with min HD of analysis of chimeric reads 850 ## get FS for all tags with min HD of analysis of chimeric reads
834 # there are more tags than sample size in the plot, because one tag can have multiple minimas 851 # there are more tags than sample size in the plot, because one tag can have multiple minimas
835 seqDic = dict(zip(seq, quant)) 852 seqDic = dict(zip(seq, quant))
836 lst_minHD_tags = [] 853 lst_minHD_tags = []
837 for i in minHD_tags: 854 for i in minHD_tags:
867 884
868 ##################################################################################################################### 885 #####################################################################################################################
869 ################## plot Hamming Distance with Family size distribution ############################## 886 ################## plot Hamming Distance with Family size distribution ##############################
870 ##################################################################################################################### 887 #####################################################################################################################
871 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, 888 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
872 subtitle="Overall hamming distance with separation after family size", title_file1=name_file, 889 subtitle="Hamming distance separated by family size", title_file1=name_file,
873 lenTags=lenTags,xlabel="Hamming distance") 890 lenTags=lenTags,xlabel="Hamming distance")
874 891
875 ########################## Plot FSD with separation after HD ############################################### 892 ########################## Plot FSD with separation after HD ###############################################
876 ######################################################################################################################## 893 ########################################################################################################################
877 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, 894 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
878 quant=quant, subtitle="Family size distribution with separation after hamming distance", 895 quant=quant, subtitle="Family size distribution separated by Hamming distance",
879 pdf=pdf,relative=False, title_file1=name_file, diff=False) 896 pdf=pdf,relative=False, title_file1=name_file, diff=False)
880 897
881 ########################## Plot difference between HD's separated after FSD ########################################## 898 ########################## Plot difference between HD's separated after FSD ##########################################
882 ######################################################################################################################## 899 ########################################################################################################################
883 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, 900 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
884 subtitle="Delta Hamming distances within tags with separation after family size", 901 subtitle="Delta Hamming distance within tags",
885 title_file1=name_file, lenTags=lenTags, 902 title_file1=name_file, lenTags=lenTags,
886 xlabel="absolute delta Hamming distance", relative=False) 903 xlabel="abs delta Hamming distance", relative=False)
887 904
888 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, 905 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
889 subtitle="Relative delta Hamming distances within tags with separation after family size", 906 subtitle="Relative delta Hamming distances within tags",
890 title_file1=name_file, lenTags=lenTags, 907 title_file1=name_file, lenTags=lenTags,
891 xlabel="relative delta Hamming distance", relative=True) 908 xlabel="rel delta Hamming distance", relative=True)
892 909
893 #################### Plot FSD separated after difference between HD's ##################################### 910 #################### Plot FSD separated after difference between HD's #####################################
894 ######################################################################################################################## 911 ########################################################################################################################
895 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, 912 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff,
896 subtitle="Family size distribution with separation after delta Hamming distances within the tags", 913 subtitle="Family size distribution with delta Hamming distances within the tags",
897 pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant) 914 pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant)
898 915
899 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf, 916 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf,
900 subtitle="Family size distribution with separation after delta Hamming distances within the tags", 917 subtitle="Family size distribution with delta Hamming distances within the tags",
901 relative=True, diff=True, title_file1=name_file) 918 relative=True, diff=True, title_file1=name_file)
902 919
903 920
904 # plots for chimeric reads 921 # plots for chimeric reads
905 if len(minHD_tags_zeros) != 0: 922 if len(minHD_tags_zeros) != 0:
906 ## HD 923 ## HD
907 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, 924 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
908 subtitle="Hamming Distance of the non-identical half with separation after family size" 925 subtitle="Hamming distance of the non-identical half of chimeras",
909 "\n(at least one half is identical with the half of the min. tag)\n",
910 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False) 926 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False)
911 927
912 ## FSD 928 ## FSD
913 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, 929 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros,
914 quant=quant, pdf=pdf, 930 quant=quant, pdf=pdf,
915 subtitle="Family size distribution with separation after hamming distances from the non-identical half\n" 931 subtitle="Family size distribution with Hamming distance from the non-identical half of chimeras",
916 "(at least one half is identical with the half of the min. tag)\n",
917 relative=False, diff=False, title_file1=name_file) 932 relative=False, diff=False, title_file1=name_file)
918 933
919 ### print all data to a CSV file 934 ### print all data to a CSV file
920 #### HD #### 935 #### HD ####
921 summary, sumCol = createTableHD(list1, "HD=") 936 summary, sumCol = createTableHD(list1, "HD=")
952 overallSum15 = sum(sumCol15) 967 overallSum15 = sum(sumCol15)
953 # absolute difference and tags where at least one half has HD=0 968 # absolute difference and tags where at least one half has HD=0
954 summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False) 969 summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False)
955 overallSum23 = sum(sumCol23) 970 overallSum23 = sum(sumCol23)
956 971
957 output_file.write("{}\n".format(f)) 972 output_file.write("{}\n".format(name_file))
958 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( 973 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
959 numpy.concatenate(list1)), lenTags, lenTags)) 974 numpy.concatenate(list1)), lenTags, lenTags))
960 975
961 ### HD ### 976 ### HD ###
962 createFileHD(summary, sumCol, overallSum, output_file, 977 createFileHD(summary, sumCol, overallSum, output_file,
963 "Hamming distance with separation after family size: file1", sep) 978 "Hamming distance separated by family size", sep)
964 ### FSD ### 979 ### FSD ###
965 createFileFSD2(summary5, sumCol5, overallSum5, output_file, 980 createFileFSD2(summary5, sumCol5, overallSum5, output_file,
966 "Family size distribution with separation after hamming distances: file1", sep, 981 "Family size distribution separated by Hamming distance", sep,
967 diff=False) 982 diff=False)
968 983
969 count = numpy.bincount(quant) 984 count = numpy.bincount(quant)
970 output_file.write("{}{}\n".format(sep, f)) 985 output_file.write("{}{}\n".format(sep, f))
971 output_file.write("max. family size:{}{}\n".format(sep, max(quant))) 986 output_file.write("max. family size:{}{}\n".format(sep, max(quant)))
976 ### HD within tags ### 991 ### HD within tags ###
977 output_file.write( 992 output_file.write(
978 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" 993 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
979 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") 994 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
980 output_file.write( 995 output_file.write(
981 "file 1: actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( 996 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
982 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1)))) 997 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
983 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) 998 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
984 999
985 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, 1000 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
986 "Hamming distance of each half in the tag: file1", sep) 1001 "Hamming distance of each half in the tag", sep)
987 createFileHD(summary11, sumCol11, overallSum11, output_file, 1002 createFileHD(summary11, sumCol11, overallSum11, output_file,
988 "Absolute delta Hamming distances within the tag: file1", sep) 1003 "Absolute delta Hamming distances within the tag", sep)
989 createFileHD(summary13, sumCol13, overallSum13, output_file, 1004 createFileHD(summary13, sumCol13, overallSum13, output_file,
990 "Relative delta Hamming distances within the tag: file1", sep) 1005 "Relative delta Hamming distances within the tag", sep)
991 1006
992 createFileFSD2(summary19, sumCol19, overallSum19, output_file, 1007 createFileFSD2(summary19, sumCol19, overallSum19, output_file,
993 "Family size distribution with separation after absolute delta Hamming distances: file1", 1008 "Family size distribution separated by absolute delta Hamming distance",
994 sep) 1009 sep)
995 createFileFSD2(summary21, sumCol21, overallSum21, output_file, 1010 createFileFSD2(summary21, sumCol21, overallSum21, output_file,
996 "Family size distribution with separation after relative delta Hamming distances: file1", 1011 "Family size distribution separated by relative delta Hamming distance",
997 sep, rel=True) 1012 sep, rel=True)
998 1013
999 if len(minHD_tags_zeros) != 0: 1014 if len(minHD_tags_zeros) != 0:
1000 output_file.write( 1015 output_file.write(
1001 "All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") 1016 "Identifiaction of chimeric reads:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
1002 createFileHD(summary15, sumCol15, overallSum15, output_file, 1017 createFileHD(summary15, sumCol15, overallSum15, output_file,
1003 "Hamming distances of non-zero half: file1", sep) 1018 "Hamming distances of non-zero half", sep)
1004 createFileFSD2(summary23, sumCol23, overallSum23, output_file, 1019 createFileFSD2(summary23, sumCol23, overallSum23, output_file,
1005 "Family size distribution with separation after Hamming distances of non-zero half: file1", 1020 "Family size distribution separated by Hamming distance of non-zero half",
1006 sep, diff=False) 1021 sep, diff=False)
1007 output_file.write("\n") 1022 output_file.write("\n")
1008 1023
1009 1024
1010 1025