comparison hd.py @ 14:883e6381ba29 draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 38f5c032262361131c645812dd3dc639be6a5f4e
author mheinzl
date Wed, 23 May 2018 14:14:10 -0400
parents 5b0a95f205ad
children cf7874bb4934
comparison
equal deleted inserted replaced
13:5b0a95f205ad 14:883e6381ba29
12 # and finally a CSV file with the data of the plots. 12 # and finally a CSV file with the data of the plots.
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. 13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input.
14 # The tool can run on a certain number of processors, which can be defined by the user. 14 # The tool can run on a certain number of processors, which can be defined by the user.
15 15
16 # USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / 16 # USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --output_csv outputfile_name_csv --output_pdf outputfile_name_pdf 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_csv outputfile_name_csv --output_pdf outputfile_name_pdf
18 18
19 import numpy 19 import numpy
20 import itertools 20 import itertools
21 import operator 21 import operator
22 import matplotlib.pyplot as plt 22 import matplotlib.pyplot as plt
90 90
91 pdf.savefig(fig, bbox_inches="tight") 91 pdf.savefig(fig, bbox_inches="tight")
92 plt.close("all") 92 plt.close("all")
93 93
94 def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf, 94 def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf,
95 xlabel,relative=False): 95 xlabel,relative=False, nr_above_bars = True):
96 if relative is True: 96 if relative is True:
97 step = 0.1 97 step = 0.1
98 else: 98 else:
99 step = 1 99 step = 1
100 100
128 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) 128 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
129 plt.xticks(numpy.arange(0, maximumX + step, step)) 129 plt.xticks(numpy.arange(0, maximumX + step, step))
130 130
131 plt.ylim((0, maximumY * 1.2)) 131 plt.ylim((0, maximumY * 1.2))
132 132
133 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] 133 if nr_above_bars is True:
134 for x_label, label in zip(counts, bin_centers): # labels for values 134 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
135 if x_label == 0: 135 for x_label, label in zip(counts, bin_centers): # labels for values
136 continue 136 if x_label == 0:
137 else: 137 continue
138 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), 138 else:
139 xy=(label, x_label + len(con_list1) * 0.01), 139 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1),
140 xycoords="data", color="#000066",fontsize=10) 140 xy=(label, x_label + len(con_list1) * 0.01),
141 141 xycoords="data", color="#000066",fontsize=10)
142
142 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) 143 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags)
143 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) 144 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
144 145
145 pdf.savefig(fig, bbox_inches="tight") 146 pdf.savefig(fig, bbox_inches="tight")
146 plt.close("all") 147 plt.close("all")
147 plt.clf() 148 plt.clf()
148 149
149 def plotHDwithinSeq_Sum2(sum1, sum2,min_value, lenTags, title_file1, pdf): 150 def plotHDwithinSeq_Sum2(sum1, sum2,sum1min, sum2min, min_value, lenTags, title_file1, pdf):
150 fig = plt.figure(figsize=(6, 8)) 151 fig = plt.figure(figsize=(6, 8))
151 plt.subplots_adjust(bottom=0.1) 152 plt.subplots_adjust(bottom=0.1)
152 153
153 ham = [sum1, sum2,numpy.array(min_value)] # new hd within tags 154 #ham = [sum1, sum2,numpy.array(min_value)] # new hd within tags
155 ham = [sum1, sum2, sum1min, sum2min, numpy.array(min_value)] # new hd within tags
156
154 157
155 maximumX = numpy.amax(numpy.concatenate(ham)) 158 maximumX = numpy.amax(numpy.concatenate(ham))
156 minimumX = numpy.amin(numpy.concatenate(ham)) 159 minimumX = numpy.amin(numpy.concatenate(ham))
157 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham))) 160 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham)))
158 161
160 range1 = minimumX 163 range1 = minimumX
161 else: 164 else:
162 range1 = range(minimumX, maximumX + 2) 165 range1 = range(minimumX, maximumX + 2)
163 166
164 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, 167 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False,
165 label=[ "HD a", "HD b","HD a+b"], 168 # label=[ "HD a", "HD b","HD a+b"],
166 bins=range1, color=[ "#58ACFA", "#FA5858","#585858"], edgecolor='black', linewidth=1) 169 label=[ "HD a","HD b'", "HD b", "HD a'", "HD a+b"],
170 #bins=range1, color=[ "#58ACFA", "#FA5858","#585858"],
171 color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
172 edgecolor='black', linewidth=1)
167 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) 173 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
168 plt.suptitle('Hamming distances within tags', fontsize=14) 174 plt.suptitle('Hamming distances within tags', fontsize=14)
169 #plt.title(title_file1, fontsize=12) 175 #plt.title(title_file1, fontsize=12)
170 plt.xlabel("Hamming Distance", fontsize=14) 176 plt.xlabel("HD", fontsize=14)
171 plt.ylabel("Absolute Frequency", fontsize=14) 177 plt.ylabel("Absolute Frequency", fontsize=14)
172 plt.grid(b=True, which='major', color='#424242', linestyle=':') 178 plt.grid(b=True, which='major', color='#424242', linestyle=':')
173 179
174 180
175 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1)) 181 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1))
446 452
447 diff11 = [] 453 diff11 = []
448 relativeDiffList = [] 454 relativeDiffList = []
449 ham1 = [] 455 ham1 = []
450 ham2 = [] 456 ham2 = []
457 ham1min = []
458 ham2min = []
451 min_valueList = [] 459 min_valueList = []
452 min_tagsList = [] 460 min_tagsList = []
453 diff11_zeros = [] 461 diff11_zeros = []
454 min_tagsList_zeros = [] 462 min_tagsList_zeros = []
455 i = 0 # counter, only used to see how many HDs of tags were already calculated 463 i = 0 # counter, only used to see how many HDs of tags were already calculated
486 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" 494 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's"
487 for d_1, d_2 in zip(min_value, dist2): 495 for d_1, d_2 in zip(min_value, dist2):
488 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b 496 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b
489 d = d_2 497 d = d_2
490 d2 = d_1 498 d2 = d_1
499 ham2.append(d)
500 ham2min.append(d2)
491 else: # half1, corrects the variable of the HD from both halfs if it is a or b 501 else: # half1, corrects the variable of the HD from both halfs if it is a or b
492 d = d_1 502 d = d_1
493 d2 = d_2 503 d2 = d_2
504 ham1.append(d)
505 ham1min.append(d2)
506
494 min_valueList.append(d + d2) 507 min_valueList.append(d + d2)
495 min_tagsList.append(tag) 508 min_tagsList.append(tag)
496 ham1.append(d) 509 # ham1.append(d)
497 ham2.append(d2) 510 # ham2.append(d2)
498 difference1 = abs(d - d2) 511 difference1 = abs(d - d2)
499 diff11.append(difference1) 512 diff11.append(difference1)
500 rel_difference = round(float(difference1) / (d + d2), 1) 513 rel_difference = round(float(difference1) / (d + d2), 1)
501 relativeDiffList.append(rel_difference) 514 relativeDiffList.append(rel_difference)
502 515
515 #min_tagsList = [st for st in min_tagsList if st != 999] 528 #min_tagsList = [st for st in min_tagsList if st != 999]
516 #relativeDiffList = [st for st in relativeDiffList if st != 999] 529 #relativeDiffList = [st for st in relativeDiffList if st != 999]
517 #diff11_zeros = [st for st in diff11_zeros if st != 999] 530 #diff11_zeros = [st for st in diff11_zeros if st != 999]
518 #min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] 531 #min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
519 532
520 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros]) 533 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min])
521 534
522 def readFileReferenceFree(file): 535 def readFileReferenceFree(file):
523 with open(file, 'r') as dest_f: 536 with open(file, 'r') as dest_f:
524 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') 537 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
525 integers = numpy.array(data_array[:, 0]).astype(int) 538 integers = numpy.array(data_array[:, 0]).astype(int)
630 643
631 parser.add_argument('--minFS', default=1, type=int, 644 parser.add_argument('--minFS', default=1, type=int,
632 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') 645 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis')
633 parser.add_argument('--maxFS', default=0, type=int, 646 parser.add_argument('--maxFS', default=0, type=int,
634 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') 647 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis')
635 648 parser.add_argument('--nr_above_bars', action="store_true", # default=False, type=bool,
649 help='If no, values above bars in the histrograms are removed')
650
636 parser.add_argument('--output_csv', default="data.csv", type=str, 651 parser.add_argument('--output_csv', default="data.csv", type=str,
637 help='Name of the csv file.') 652 help='Name of the csv file.')
638 parser.add_argument('--output_pdf', default="data.pdf", type=str, 653 parser.add_argument('--output_pdf', default="data.pdf", type=str,
639 help='Name of the pdf file.') 654 help='Name of the pdf file.')
640 parser.add_argument('--output_pdf2', default="data2.pdf", type=str, 655 parser.add_argument('--output_pdf2', default="data2.pdf", type=str,
663 678
664 sep = args.sep 679 sep = args.sep
665 onlyDuplicates = args.only_DCS 680 onlyDuplicates = args.only_DCS
666 minFS = args.minFS 681 minFS = args.minFS
667 maxFS = args.maxFS 682 maxFS = args.maxFS
683 nr_above_bars = args.nr_above_bars
684
668 685
669 subset = args.subset_tag 686 subset = args.subset_tag
670 nproc = args.nproc 687 nproc = args.nproc
671 688
672 ### input checks 689 ### input checks
815 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) 832 numpy.concatenate([item_b[5] for item_b in diff_list_b])))
816 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), 833 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
817 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) 834 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
818 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), 835 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
819 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) 836 numpy.concatenate([item_b[7] for item_b in diff_list_b])))
820 837 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
838 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
839 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
840 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
821 # with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2: 841 # with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2:
822 # for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags): 842 # for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags):
823 # output_file2.write( 843 # output_file2.write(
824 # "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d)) 844 # "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d))
825 845
868 lst_minHD_tags_zeros, diff_zeros) 888 lst_minHD_tags_zeros, diff_zeros)
869 # family size distribution of non-identical half 889 # family size distribution of non-identical half
870 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD( 890 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(
871 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) 891 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)
872 892
873 ########################## Plot HD within tags ########################################################
874 ######################################################################################################################
875 plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)
876
877 ##################################################################################################################### 893 #####################################################################################################################
878 ################## plot Hamming Distance with Family size distribution ############################## 894 ################## plot Hamming Distance with Family size distribution ##############################
879 ##################################################################################################################### 895 #####################################################################################################################
880 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, 896 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
881 subtitle="Hamming distance separated by family size", title_file1=name_file, 897 subtitle="Hamming distance separated by family size", title_file1=name_file,
882 lenTags=lenTags,xlabel="Hamming distance") 898 lenTags=lenTags,xlabel="HD", nr_above_bars=nr_above_bars)
883 899
884 ########################## Plot FSD with separation after HD ############################################### 900 ########################## Plot FSD with separation after ###############################################
885 ######################################################################################################################## 901 ######################################################################################################################
886 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, 902 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
887 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", 903 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
888 pdf=pdf,relative=False, title_file1=name_file, diff=False) 904 pdf=pdf,relative=False, title_file1=name_file, diff=False)
889 905
890 ########################## Plot difference between HD's separated after FSD ########################################## 906 ########################## Plot HD within tags ########################################################
891 ######################################################################################################################## 907 ######################################################################################################################
908 # plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)
909 plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, HDhalf1min, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)
910
911
912 ########################## Plot difference between HD's separated after FSD ####################################
913 ######################################################################################################################
892 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, 914 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
893 subtitle="Delta Hamming distance within tags", 915 subtitle="Delta Hamming distance within tags",
894 title_file1=name_file, lenTags=lenTags, 916 title_file1=name_file, lenTags=lenTags,
895 xlabel="absolute delta Hamming distance", relative=False) 917 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars)
896 918
897 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, 919 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
898 subtitle="Relative delta Hamming distances within tags", 920 subtitle="Chimera Analysis: relative delta Hamming distances",
899 title_file1=name_file, lenTags=lenTags, 921 title_file1=name_file, lenTags=lenTags,
900 xlabel="relative delta Hamming distance", relative=True) 922 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars)
901 923
902 #################### Plot FSD separated after difference between HD's ##################################### 924 #################### Plot FSD separated after difference between HD's #####################################
903 ######################################################################################################################## 925 ########################################################################################################################
904 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, 926 # plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff,
905 subtitle="Family size distribution separated by delta Hamming distances within the tags", 927 # subtitle="Family size distribution separated by delta Hamming distances within the tags",
906 pdf=pdf,relative=False, diff=True, title_file1=name_file, originalCounts=quant) 928 # pdf=pdf,relative=False, diff=True, title_file1=name_file, originalCounts=quant)
907 929
908 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, originalCounts=quant, pdf=pdf, 930 # plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, originalCounts=quant, pdf=pdf,
909 subtitle="Family size distribution separated by delta Hamming distances within the tags", 931 # subtitle="Family size distribution separated by delta Hamming distances within the tags",
910 relative=True, diff=True, title_file1=name_file) 932 # relative=True, diff=True, title_file1=name_file)
911 933
912 934
913 # plots for chimeric reads 935 # plots for chimeric reads
914 if len(minHD_tags_zeros) != 0: 936 if len(minHD_tags_zeros) != 0:
915 ## HD 937 ## HD
916 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, 938 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
917 subtitle="Hamming distance of the non-identical half of chimeras", 939 subtitle="Hamming distance of the non-identical half of chimeras",
918 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False) 940 title_file1=name_file, lenTags=lenTags,xlabel="HD", relative=False, nr_above_bars=nr_above_bars)
919 941
920 ## FSD 942 ## FSD
921 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, 943 # plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros,
922 originalCounts=quant, pdf=pdf, 944 # originalCounts=quant, pdf=pdf,
923 subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras", 945 # subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras",
924 relative=False, diff=False, title_file1=name_file) 946 # relative=False, diff=False, title_file1=name_file)
925 947
926 ### print all data to a CSV file 948 ### print all data to a CSV file
927 #### HD #### 949 #### HD ####
928 summary, sumCol = createTableHD(list1, "HD=") 950 summary, sumCol = createTableHD(list1, "HD=")
929 overallSum = sum(sumCol) # sum of columns in table 951 overallSum = sum(sumCol) # sum of columns in table
944 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") 966 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
945 overallSum13 = sum(sumCol13) 967 overallSum13 = sum(sumCol13)
946 968
947 ## FSD 969 ## FSD
948 # absolute difference 970 # absolute difference
949 summary19, sumCol19 = createTableFSD2(familySizeList1_diff) 971 # summary19, sumCol19 = createTableFSD2(familySizeList1_diff)
950 overallSum19 = sum(sumCol19) 972 # overallSum19 = sum(sumCol19)
951 # relative difference 973 # relative difference
952 summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff) 974 # summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff)
953 overallSum21 = sum(sumCol21) 975 # overallSum21 = sum(sumCol21)
954 976
955 # chimeric reads 977 # chimeric reads
956 if len(minHD_tags_zeros) != 0: 978 if len(minHD_tags_zeros) != 0:
957 # absolute difference and tags where at least one half has HD=0 979 # absolute difference and tags where at least one half has HD=0
958 summary15, sumCol15 = createTableHD(listDifference1_zeros, "diff=") 980 summary15, sumCol15 = createTableHD(listDifference1_zeros, "diff=")
959 overallSum15 = sum(sumCol15) 981 overallSum15 = sum(sumCol15)
960 # absolute difference and tags where at least one half has HD=0 982 # absolute difference and tags where at least one half has HD=0
961 summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False) 983 # summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False)
962 overallSum23 = sum(sumCol23) 984 # overallSum23 = sum(sumCol23)
963 985
964 output_file.write("{}\n".format(name_file)) 986 output_file.write("{}\n".format(name_file))
965 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( 987 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
966 numpy.concatenate(list1)), lenTags, lenTags)) 988 numpy.concatenate(list1)), lenTags, lenTags))
967 989
992 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, 1014 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
993 "Hamming distance of each half in the tag", sep) 1015 "Hamming distance of each half in the tag", sep)
994 createFileHD(summary11, sumCol11, overallSum11, output_file, 1016 createFileHD(summary11, sumCol11, overallSum11, output_file,
995 "Absolute delta Hamming distances within the tag", sep) 1017 "Absolute delta Hamming distances within the tag", sep)
996 createFileHD(summary13, sumCol13, overallSum13, output_file, 1018 createFileHD(summary13, sumCol13, overallSum13, output_file,
997 "Relative delta Hamming distances within the tag", sep) 1019 "Chimera analysis: relative delta Hamming distances", sep)
998 1020
999 createFileFSD2(summary19, sumCol19, overallSum19, output_file, 1021 # createFileFSD2(summary19, sumCol19, overallSum19, output_file,
1000 "Family size distribution separated by absolute delta Hamming distance", 1022 # "Family size distribution separated by absolute delta Hamming distance",
1001 sep) 1023 # sep)
1002 createFileFSD2(summary21, sumCol21, overallSum21, output_file, 1024 # createFileFSD2(summary21, sumCol21, overallSum21, output_file,
1003 "Family size distribution separated by relative delta Hamming distance", 1025 # "Family size distribution separated by relative delta Hamming distance",
1004 sep, rel=True) 1026 # sep, rel=True)
1005 1027
1006 if len(minHD_tags_zeros) != 0: 1028 if len(minHD_tags_zeros) != 0:
1007 output_file.write( 1029 output_file.write(
1008 "Identifiaction of chimeric reads:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") 1030 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
1009 createFileHD(summary15, sumCol15, overallSum15, output_file, 1031 createFileHD(summary15, sumCol15, overallSum15, output_file,
1010 "Hamming distances of non-zero half", sep) 1032 "Hamming distances of non-zero half", sep)
1011 createFileFSD2(summary23, sumCol23, overallSum23, output_file, 1033 # createFileFSD2(summary23, sumCol23, overallSum23, output_file,
1012 "Family size distribution separated by Hamming distance of non-zero half", 1034 # "Family size distribution separated by Hamming distance of non-zero half",
1013 sep, diff=False) 1035 # sep, diff=False)
1014 output_file.write("\n") 1036 output_file.write("\n")
1015 1037
1016 1038
1017 1039
1018 if __name__ == '__main__': 1040 if __name__ == '__main__':