comparison hd.py @ 25:9e384b0741f1 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author mheinzl
date Tue, 14 May 2019 03:29:37 -0400
parents 7e570ba56b83
children df1fc5cedc8b
comparison
24:3bc67ac46740 25:9e384b0741f1
12 # and finally a CSV file with the data of the plots. 12 # and finally a CSV file with the data of the plots.
13 # It is also possible to perform the HD analysis on shortened tags of a given size. 13 # It is also possible to perform the HD analysis on shortened tags of a given size.
14 # The tool can run on a user-defined number of processors. 14 # The tool can run on a user-defined number of processors.
15 15
16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int / 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int /
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_pdf outputfile_name_pdf --output_tabular outputfile_name_tabular --output_chimeras_tabular outputfile_name_chimeras_tabular 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_pdf outputfile_name_pdf --output_tabular outputfile_name_tabular --output_chimeras_tabular outputfile_name_chimeras_tabular
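As a concrete illustration of the usage line above, a hypothetical invocation could look as follows; all file names and parameter values are placeholders, not files shipped with the tool:

# python hd.py --inputFile tag_counts.tabular --inputName1 tag_counts.tabular --sample_size 1000 \
#   --nproc 8 --minFS 1 --maxFS 0 --nr_above_bars \
#   --output_pdf hd_analysis.pdf --output_tabular hd_analysis.tabular --output_chimeras_tabular hd_chimeras.tabular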
18 18
19 import argparse 19 import argparse
20 import itertools 20 import itertools
21 import operator 21 import operator
22 import sys 22 import sys
23 from collections import Counter, defaultdict 23 from collections import Counter, defaultdict
24 from functools import partial 24 from functools import partial
25 from multiprocessing.pool import Pool 25 from multiprocessing.pool import Pool
26 import random
27 import os
26 28
27 import matplotlib.pyplot as plt 29 import matplotlib.pyplot as plt
28 import numpy 30 import numpy
29 from matplotlib.backends.backend_pdf import PdfPages 31 from matplotlib.backends.backend_pdf import PdfPages
30 32
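For reference, a minimal sketch (not part of this commit) of the per-position Hamming distance idiom the script relies on throughout, written in the same Python 2 style (itertools.imap); the example tag halves are invented:

import itertools
import operator


def hamming_distance(a, b):
    # count the positions at which two equally long (sub)tags differ
    return sum(itertools.imap(operator.ne, a, b))


# two hypothetical 12 nt tag halves differing at one position
print(hamming_distance("AAACCCGGGTTT", "AAACCCGGGTAT"))  # -> 1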
140 else: 142 else:
141 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), 143 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1),
142 xy=(label, x_label + len(con_list1) * 0.01), 144 xy=(label, x_label + len(con_list1) * 0.01),
143 xycoords="data", color="#000066", fontsize=10) 145 xycoords="data", color="#000066", fontsize=10)
144 146
145 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) 147 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, sum(counts))
146 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) 148 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
147 if nr_unique_chimeras != 0 and len_sample != 0: 149
148 if relative == True: 150 # if nr_unique_chimeras != 0 and len_sample != 0:
149 legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras, 151 # if relative == True:
150 int(nr_unique_chimeras) / float(len_sample)) 152 # legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras,
151 else: 153 # int(nr_unique_chimeras) / float(len_sample))
152 legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample)) 154 # else:
153 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) 155 # legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample))
156 # plt.text(0.14, -0.09, legend, size=12, transform=plt.gcf().transFigure)
154 157
155 pdf.savefig(fig, bbox_inches="tight") 158 pdf.savefig(fig, bbox_inches="tight")
156 plt.close("all") 159 plt.close("all")
157 plt.clf() 160 plt.clf()
158 161
159 162
160 def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf): 163 def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf, len_sample):
161 fig = plt.figure(figsize=(6, 8)) 164 fig = plt.figure(figsize=(6, 8))
162 plt.subplots_adjust(bottom=0.1) 165 plt.subplots_adjust(bottom=0.1)
163 166
164 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags 167 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags
165 168
170 if len(range(minimumX, maximumX)) == 0: 173 if len(range(minimumX, maximumX)) == 0:
171 range1 = minimumX 174 range1 = minimumX
172 else: 175 else:
173 range1 = range(minimumX, maximumX + 2) 176 range1 = range(minimumX, maximumX + 2)
174 177
175 plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=[ "HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1) 178 plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1)
176 179
177 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) 180 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
178 plt.suptitle('Hamming distances within tags', fontsize=14) 181 plt.suptitle('Hamming distances within tags', fontsize=14)
179 # plt.title(title_file1, fontsize=12) 182 # plt.title(title_file1, fontsize=12)
180 plt.xlabel("HD", fontsize=14) 183 plt.xlabel("HD", fontsize=14)
182 plt.grid(b=True, which='major', color='#424242', linestyle=':') 185 plt.grid(b=True, which='major', color='#424242', linestyle=':')
183 186
184 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) 187 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
185 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) 188 plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
186 # plt.ylim(0, maximumY * 1.2) 189 # plt.ylim(0, maximumY * 1.2)
187 190 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, len(numpy.concatenate(ham_partial)))
188 legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags) 191
189 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) 192 # legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags)
193 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
190 pdf.savefig(fig, bbox_inches="tight") 194 pdf.savefig(fig, bbox_inches="tight")
191 plt.close("all") 195 plt.close("all")
192 plt.clf() 196 plt.clf()
193 197
194 198
491 elif mate_b is True: # HD calculation for all b's 495 elif mate_b is True: # HD calculation for all b's
492 half1_mate1 = array1_half2 496 half1_mate1 = array1_half2
493 half2_mate1 = array1_half 497 half2_mate1 = array1_half
494 half1_mate2 = array2_half2 498 half1_mate2 = array2_half2
495 half2_mate2 = array2_half 499 half2_mate2 = array2_half
500 # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True)
501 # print(len(half1_mate1))
502 # half2_mate1 = half2_mate1[index_halves]
503 # array1 = array1[index_halves]
496 504
497 for a, b, tag in zip(half1_mate1, half2_mate1, array1): 505 for a, b, tag in zip(half1_mate1, half2_mate1, array1):
498 # exclude identical tag from array2, to prevent comparison to itself 506 # exclude identical tag from array2, to prevent comparison to itself
499 sameTag = numpy.where(array2 == tag)[0] 507 sameTag = numpy.where(array2 == tag)[0]
500 indexArray2 = numpy.arange(0, len(array2), 1) 508 indexArray2 = numpy.arange(0, len(array2), 1)
506 array2_withoutSame = array2[index_withoutSame] # whole tag (= not split into 2 halves) 514 array2_withoutSame = array2[index_withoutSame] # whole tag (= not split into 2 halves)
507 515
508 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in 516 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
509 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" 517 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
510 min_index = numpy.where(dist == dist.min())[0] # get index of min HD 518 min_index = numpy.where(dist == dist.min())[0] # get index of min HD
511 min_value = dist[min_index] # get minimum HDs 519 min_value = dist.min()
520 # min_value = dist[min_index] # get minimum HDs
512 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD 521 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD
513 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD 522 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD
514 523
515 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in 524 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's"
516 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" 525 max_value = dist_second_half.max()
517 dist2 = [dist_second_half.max()]
518 min_value = [dist.min()]
519 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD 526 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
520 max_tag = min_tag_array2[max_index] 527 max_tag = min_tag_array2[max_index]
521 528
522 for d, d2 in zip(min_value, dist2): 529 # for d, d2 in zip(min_value, max_value):
523 if mate_b is True: # half2, corrects the variable of the HD from both halves if it is a or b 530 if mate_b is True: # half2, corrects the variable of the HD from both halves if it is a or b
524 ham2.append(d) 531 ham2.append(min_value)
525 ham2min.append(d2) 532 ham2min.append(max_value)
526 else: # half1, corrects the variable of the HD from both halves if it is a or b 533 else: # half1, corrects the variable of the HD from both halves if it is a or b
527 ham1.append(d) 534 ham1.append(min_value)
528 ham1min.append(d2) 535 ham1min.append(max_value)
529 536
530 min_valueList.append(d + d2) 537 min_valueList.append(min_value + max_value)
531 min_tagsList.append(tag) 538 min_tagsList.append(tag)
532 difference1 = abs(d - d2) 539 difference1 = abs(min_value - max_value)
533 diff11.append(difference1) 540 diff11.append(difference1)
534 rel_difference = round(float(difference1) / (d + d2), 1) 541 rel_difference = round(float(difference1) / (min_value + max_value), 1)
535 relativeDiffList.append(rel_difference) 542 relativeDiffList.append(rel_difference)
536 543
537 # tags which have identical parts: 544 # tags which have identical parts:
538 if d == 0 or d2 == 0: 545 if min_value == 0 or max_value == 0:
539 min_tagsList_zeros.append(numpy.array(tag)) 546 min_tagsList_zeros.append(numpy.array(tag))
540 difference1_zeros = abs(d - d2) # hd of non-identical part 547 difference1_zeros = abs(min_value - max_value) # hd of non-identical part
541 diff11_zeros.append(difference1_zeros) 548 diff11_zeros.append(difference1_zeros)
542 max_tag_list.append(numpy.array(max_tag)) 549 max_tag_list.append(max_tag)
550 else:
551 min_tagsList_zeros.append(None)
552 diff11_zeros.append(None)
553 max_tag_list.append(numpy.array(["None"]))
554
555 # max_tag_list.append(numpy.array(max_tag))
543 556
544 i += 1 557 i += 1
545 558
546 # print(i) 559 # print(i)
547 # diff11 = [st for st in diff11 if st != 999] 560 # diff11 = [st for st in diff11 if st != 999]
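A self-contained toy sketch of the per-tag logic in this hunk (hypothetical tags, plain Python 2; not taken from the commit): half a of a sample tag is matched against all other a halves (minimum HD), half b is then compared against those same candidate tags (maximum HD), and an HD of zero on either side flags the tag as a potential chimera:

import itertools
import operator


def hd(x, y):
    return sum(itertools.imap(operator.ne, x, y))


tags = ["AAAATTTT", "AAAACCCC", "GGGGTTTT"]  # hypothetical 8 nt tags, halves of length 4
sample = "AAAATTTT"
others = [t for t in tags if t != sample]    # exclude the identical tag, as above

a, b = sample[:4], sample[4:]
dist_a = [hd(a, t[:4]) for t in others]      # HD of half a against all other a halves
min_value = min(dist_a)
candidates = [t for t, d in zip(others, dist_a) if d == min_value]
max_value = max(hd(b, t[4:]) for t in candidates)  # HD of half b against those candidates

delta = abs(min_value - max_value)
rel_delta = round(float(delta) / (min_value + max_value), 1)
is_chimera = min_value == 0 or max_value == 0
print("HD a={} HD b'={} delta={} rel={} chimera={}".format(
    min_value, max_value, delta, rel_delta, is_chimera))
# -> HD a=0 HD b'=4 delta=4 rel=1.0 chimera=True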
665 parser.add_argument('--minFS', default=1, type=int, 678 parser.add_argument('--minFS', default=1, type=int,
666 help='Only tags with a family size greater than or equal to the specified value are included in the HD analysis') 679 help='Only tags with a family size greater than or equal to the specified value are included in the HD analysis')
667 parser.add_argument('--maxFS', default=0, type=int, 680 parser.add_argument('--maxFS', default=0, type=int,
668 help='Only tags with a family size smaller than or equal to the specified value are included in the HD analysis') 681 help='Only tags with a family size smaller than or equal to the specified value are included in the HD analysis')
669 parser.add_argument('--nr_above_bars', action="store_true", 682 parser.add_argument('--nr_above_bars', action="store_true",
670 help='If no, values above bars in the histrograms are removed') 683 help='If not set, the values above the bars in the histograms are not shown')
671 684
672 parser.add_argument('--output_tabular', default="data.tabular", type=str, 685 parser.add_argument('--output_tabular', default="data.tabular", type=str,
673 help='Name of the tabular file.') 686 help='Name of the tabular file.')
674 parser.add_argument('--output_pdf', default="data.pdf", type=str, 687 parser.add_argument('--output_pdf', default="data.pdf", type=str,
675 help='Name of the pdf file.') 688 help='Name of the pdf file.')
676 parser.add_argument('--output_chimeras_tabular', default="data_chimeras.tabular", type=str, 689 parser.add_argument('--output_chimeras_tabular', default="data.tabular", type=str,
677 help='Name of the tabular file with all chimeric tags.') 690 help='Name of the tabular file with all chimeric tags.')
691
678 return parser 692 return parser
679 693
680 694
681 def Hamming_Distance_Analysis(argv): 695 def Hamming_Distance_Analysis(argv):
696
682 parser = make_argparser() 697 parser = make_argparser()
683 args = parser.parse_args(argv[1:]) 698 args = parser.parse_args(argv[1:])
699
684 file1 = args.inputFile 700 file1 = args.inputFile
685 name1 = args.inputName1 701 name1 = args.inputName1
702
686 index_size = args.sample_size 703 index_size = args.sample_size
687 title_savedFile_pdf = args.output_pdf 704 title_savedFile_pdf = args.output_pdf
688 title_savedFile_csv = args.output_tabular 705 title_savedFile_csv = args.output_tabular
689 output_chimeras_tabular = args.output_chimeras_tabular 706 output_chimeras_tabular = args.output_chimeras_tabular
690 707
691 sep = "\t" 708 sep = "\t"
692 onlyDuplicates = args.only_DCS 709 onlyDuplicates = args.only_DCS
693 minFS = args.minFS 710 minFS = args.minFS
694 maxFS = args.maxFS 711 maxFS = args.maxFS
695 nr_above_bars = args.nr_above_bars 712 nr_above_bars = args.nr_above_bars
713
696 subset = args.subset_tag 714 subset = args.subset_tag
697 nproc = args.nproc 715 nproc = args.nproc
698 716
699 # input checks 717 # input checks
700 if index_size < 0: 718 if index_size < 0:
720 738
721 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: 739 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
722 print("dataset: ", name1) 740 print("dataset: ", name1)
723 integers, data_array = readFileReferenceFree(file1) 741 integers, data_array = readFileReferenceFree(file1)
724 data_array = numpy.array(data_array) 742 data_array = numpy.array(data_array)
725 print("total nr of tags:", len(data_array)) 743 print("total nr of tags with Ns:", len(data_array))
726 n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x] 744 n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x]
727 if len(n) != 0: # delete tags with N in the tag from data 745 if len(n) != 0: # delete tags with N in the tag from data
728 print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array)) 746 print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array))
729 index_whole_array = numpy.arange(0, len(data_array), 1) 747 index_whole_array = numpy.arange(0, len(data_array), 1)
730 index_withoutN_inTag = numpy.delete(index_whole_array, n) 748 index_withoutN_inTag = numpy.delete(index_whole_array, n)
731 data_array = data_array[index_withoutN_inTag, :] 749 data_array = data_array[index_withoutN_inTag, :]
732 integers = integers[index_withoutN_inTag] 750 integers = integers[index_withoutN_inTag]
733 print("total nr of tags without Ns:", len(data_array)) 751 print("total nr of tags without Ns:", len(data_array))
734 752
735 data_array_whole_dataset = data_array
736
737 int_f = numpy.array(data_array[:, 0]).astype(int) 753 int_f = numpy.array(data_array[:, 0]).astype(int)
738 data_array = data_array[numpy.where(int_f >= minFS)] 754 data_array = data_array[numpy.where(int_f >= minFS)]
739 integers = integers[integers >= minFS] 755 integers = integers[integers >= minFS]
740 756
741 # select family size for tags 757 # select family size for tags
742 if maxFS > 0: 758 if maxFS > 0:
743 int_f2 = numpy.array(data_array[:, 0]).astype(int) 759 int_f2 = numpy.array(data_array[:, 0]).astype(int)
744 data_array = data_array[numpy.where(int_f2 <= maxFS)] 760 data_array = data_array[numpy.where(int_f2 <= maxFS)]
745 integers = integers[integers <= maxFS] 761 integers = integers[integers <= maxFS]
746 762
747 print("min FS", min(integers))
748 print("max FS", max(integers))
749
750 tags = data_array[:, 2]
751 seq = data_array[:, 1]
752
753 if onlyDuplicates is True: 763 if onlyDuplicates is True:
764 tags = data_array[:, 2]
765 seq = data_array[:, 1]
766
754 # find all unique tags and get the indices for ALL tags, but only once 767 # find all unique tags and get the indices for ALL tags, but only once
755 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) 768 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
756 d = u[c > 1] 769 d = u[c > 1]
757 770
758 # get family sizes, tag for duplicates 771 # get family sizes, tag for duplicates
761 duplTagsBA = duplTags_double[1::2] # ba of DCS 774 duplTagsBA = duplTags_double[1::2] # ba of DCS
762 775
763 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab 776 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab
764 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags 777 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags
765 778
779 if minFS > 1:
780 duplTags_tag = duplTags_tag[(duplTags >= 3) & (duplTagsBA >= 3)]
781 duplTags_seq = duplTags_seq[(duplTags >= 3) & (duplTagsBA >= 3)]
782 duplTags = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3
783
766 data_array = numpy.column_stack((duplTags, duplTags_seq)) 784 data_array = numpy.column_stack((duplTags, duplTags_seq))
767 data_array = numpy.column_stack((data_array, duplTags_tag)) 785 data_array = numpy.column_stack((data_array, duplTags_tag))
768 integers = numpy.array(data_array[:, 0]).astype(int) 786 integers = numpy.array(data_array[:, 0]).astype(int)
769 print("DCS in whole dataset", len(data_array)) 787 print("DCS in whole dataset", len(data_array))
788
789 print("min FS", min(integers))
790 print("max FS", max(integers))
770 791
771 # HD analysis for a subset of the tag 792 # HD analysis for a subset of the tag
772 if subset > 0: 793 if subset > 0:
773 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) 794 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
774 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) 795 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])
787 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) 808 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])
788 809
789 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) 810 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
790 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) 811 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))
791 812
792 print("length of tag:", len(data_array[0, 1])) 813 print("length of tag= ", len(data_array[0, 1]))
793 # select sample: if no size given --> all vs. all comparison 814 # select sample: if no size given --> all vs. all comparison
794 if index_size == 0: 815 if index_size == 0:
795 result = numpy.arange(0, len(data_array), 1) 816 result = numpy.arange(0, len(data_array), 1)
796 else: 817 else:
797 result = numpy.random.choice(len(integers), size=index_size, 818 numpy.random.shuffle(data_array)
798 replace=False) # array of random sequences of size=index.size 819 unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags
799 # unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags 820 result = numpy.random.choice(unique_indices, size=index_size, replace=False) # array of random sequences of size=index.size
800 # result = numpy.random.choice(unique_indices, size=index_size, 821
801 # replace=False) # array of random sequences of size=index.size 822 # result = numpy.random.choice(len(integers), size=index_size,
823 # replace=False) # array of random sequences of size=index.size
824 # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0]
802 825
803 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: 826 # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
804 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) 827 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
805 828
806 # comparison random tags to whole dataset 829 # comparison random tags to whole dataset
807 result1 = data_array[result, 1] # random tags 830 result1 = data_array[result, 1] # random tags
808 result2 = data_array[:, 1] # all tags 831 result2 = data_array[:, 1] # all tags
809 print("sample size:", len(result1)) 832 print("sample size= ", len(result1))
810 833
811 # HD analysis of whole tag 834 # HD analysis of whole tag
812 proc_pool = Pool(nproc) 835 proc_pool = Pool(nproc)
813 chunks_sample = numpy.array_split(result1, nproc) 836 chunks_sample = numpy.array_split(result1, nproc)
814 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) 837 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
817 ham = numpy.concatenate(ham).astype(int) 840 ham = numpy.concatenate(ham).astype(int)
818 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: 841 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
819 # for h, tag in zip(ham, result1): 842 # for h, tag in zip(ham, result1):
820 # output_file1.write("{}\t{}\n".format(tag, h)) 843 # output_file1.write("{}\t{}\n".format(tag, h))
821 844
822 # HD analysis for chimeric reads 845 # # HD analysis for chimeric reads
823 # result2 = data_array_whole_dataset[:,1] 846 # result2 = data_array_whole_dataset[:,1]
824 847
825 proc_pool_b = Pool(nproc) 848 proc_pool_b = Pool(nproc)
826 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) 849 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
827 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) 850 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
828 proc_pool_b.close() 851 proc_pool_b.close()
829 proc_pool_b.join() 852 proc_pool_b.join()
830 diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]),
831 numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int)
832 HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), 853 HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
833 numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) 854 numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
834 HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), 855 HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
835 numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) 856 numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
836 minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), 857 minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
837 numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) 858 numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
838 minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]),
839 numpy.concatenate([item_b[4] for item_b in diff_list_b])))
840 rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]),
841 numpy.concatenate([item_b[5] for item_b in diff_list_b])))
842 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
843 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
844 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
845 numpy.concatenate([item_b[7] for item_b in diff_list_b])))
846 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), 859 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
847 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) 860 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
848 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), 861 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
849 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) 862 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
850 863
851 chimera_tags = numpy.concatenate(([item[10] for item in diff_list_a], 864 rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a])
852 [item_b[10] for item_b in diff_list_b])) 865 rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b])
853 866 diff1 = numpy.concatenate([item[0] for item in diff_list_a])
854 chimera_tags = [x for x in chimera_tags if x != []] 867 diff2 = numpy.concatenate([item[0] for item in diff_list_b])
855 chimera_tags_new = [] 868
856 869 diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a])
857 for i in chimera_tags: 870 diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b])
858 if len(i) > 1: 871 minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
859 for t in i: 872 minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
860 chimera_tags_new.append(t) 873 minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])
861 else: 874 chim_tags = [item[10] for item in diff_list_a]
862 chimera_tags_new.extend(i) 875 chim_tags2 = [item[10] for item in diff_list_b]
863 876 chimera_tags1 = [ii if isinstance(i, list) else i for i in chim_tags for ii in i]
864 chimeras_dic = defaultdict(list) 877 chimera_tags2 = [ii if isinstance(i, list) else i for i in chim_tags2 for ii in i]
865 for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): 878
866 chimeras_dic[t1].append(t2) 879 rel_Diff = []
867 880 diff_zeros = []
868 lst_unique_chimeras = [] 881 minHD_tags_zeros = []
882 diff = []
883 chimera_tags = []
884 for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
885 zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, chimera_tags1, chimera_tags2):
886 rel_Diff.append(max(rel1, rel2))
887 diff.append(max(d1, d2))
888
889 if all(i is not None for i in [zeros1, zeros2]):
890 diff_zeros.append(max(zeros1, zeros2))
891 minHD_tags_zeros.append(str(tag1))
892 tags = [ctag1, ctag2]
893 chimera_tags.append(tags)
894 elif zeros1 is not None and zeros2 is None:
895 diff_zeros.append(zeros1)
896 minHD_tags_zeros.append(str(tag1))
897 chimera_tags.append(ctag1)
898 elif zeros1 is None and zeros2 is not None:
899 diff_zeros.append(zeros2)
900 minHD_tags_zeros.append(str(tag2))
901 chimera_tags.append(ctag2)
902
903 chimera_tags_new = chimera_tags
904 #data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
905 # chimeras_dic = defaultdict(list)
906 #
907 # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
908 # if len(t2) >1 and type(t2) is not numpy.ndarray:
909 # t2 = numpy.concatenate(t2)
910 # chimeras_dic[t1].append(t2)
911
869 with open(output_chimeras_tabular, "w") as output_file1: 912 with open(output_chimeras_tabular, "w") as output_file1:
870 unique_chimeras = numpy.unique(minHD_tags_zeros) 913 output_file1.write("chimera tag\tsimilar tag with HD=0\n")
871 sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras]) # mate1 part1 914 for i in range(len(minHD_tags_zeros)):
872 sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras]) # mate1 part 2 915 tag1 = minHD_tags_zeros[i]
873 916 sample_half_a = tag1[0:(len(tag1)) / 2]
874 output_file1.write("sample tag\tsimilar tag\n") 917 sample_half_b = tag1[len(tag1) / 2:len(tag1)]
875 for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b): 918
876 max_tags = numpy.concatenate(chimeras_dic.get(tag1)) 919 max_tags = chimera_tags_new[i]
877 920 if isinstance(max_tags, list) and len(max_tags) > 1:
878 if tag1 in chimeras_dic.values(): 921 max_tags = numpy.concatenate(max_tags)
879 continue 922 #if isinstance(max_tags, list): #and type(max_tags) is not numpy.ndarray:
880 else: 923 # print(max_tags)
881 lst_unique_chimeras.append(tag1) 924 # max_tags = numpy.concatenate(max_tags)
925 max_tags = numpy.unique(max_tags)
882 926
883 chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 927 chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1
884 chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 928 chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2
885 929
886 new_format = [] 930 new_format = []
887 for i in range(len(max_tags)): 931 for j in range(len(max_tags)):
888 if a == chimera_half_a[i]: 932 if sample_half_a == chimera_half_a[j]:
889 max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i]) 933 max_tag = "*{}* {}".format(chimera_half_a[j], chimera_half_b[j])
890 new_format.append(max_tag) 934 new_format.append(max_tag)
891 935
892 elif b == chimera_half_b[i]: 936 elif sample_half_b == chimera_half_b[j]:
893 max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i]) 937 max_tag = "{} *{}*".format(chimera_half_a[j], chimera_half_b[j])
894 new_format.append(max_tag) 938 new_format.append(max_tag)
895 939
896 sample_tag = "{} {}".format(a, b) 940 sample_tag = "{} {}".format(sample_half_a, sample_half_b)
897 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) 941 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
898 output_file1.write( 942 output_file1.write(
899 "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " 943 "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
900 "The tags were separated by an empty space into their halves and the * marks the identical half.") 944 "The tags were separated by an empty space into their halves and the * marks the identical half.")
901 945
902 nr_chimeric_tags = len(lst_unique_chimeras) 946 # unique_chimeras = numpy.array(minHD_tags_zeros)
903 print("nr of unique chimeras:", nr_chimeric_tags) 947 #
948 # sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras]) # mate1 part1
949 # sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras]) # mate1 part 2
950 #
951 # output_file1.write("sample tag\tsimilar tag\n")
952 # for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b):
953 # max_tags = numpy.concatenate(chimeras_dic.get(tag1))
954 # max_tags = numpy.unique(max_tags)
955 #
956 # chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1
957 # chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2
958 #
959 # new_format = []
960 # for i in range(len(max_tags)):
961 # if a == chimera_half_a[i]:
962 # max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i])
963 # new_format.append(max_tag)
964 #
965 # elif b == chimera_half_b[i]:
966 # max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i])
967 # new_format.append(max_tag)
968 #
969 # sample_tag = "{} {}".format(a, b)
970 # output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
971 # output_file1.write(
972 # "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
973 # "The tags were separated by an empty space into their halves and the * marks the identical half.")
974
975 nr_chimeric_tags = len(minHD_tags_zeros)
976 print("nr of unique chimeras", nr_chimeric_tags)
904 977
905 lenTags = len(data_array) 978 lenTags = len(data_array)
906 len_sample = len(result1) 979 len_sample = len(result1)
907 980
908 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags 981 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags
929 for s, q in zip(seq, quant): 1002 for s, q in zip(seq, quant):
930 seqDic[s].append(q) 1003 seqDic[s].append(q)
931 else: 1004 else:
932 seqDic = dict(zip(seq, quant)) 1005 seqDic = dict(zip(seq, quant))
933 1006
934
935 lst_minHD_tags = [] 1007 lst_minHD_tags = []
936 for i in minHD_tags: 1008 for i in minHD_tags:
937 lst_minHD_tags.append(seqDic.get(i)) 1009 lst_minHD_tags.append(seqDic.get(i))
938 1010
939 if onlyDuplicates: 1011 if onlyDuplicates:
940 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], 1012 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], [item_b[1] for item_b in lst_minHD_tags])).astype(int)
941 [item_b[1] for item_b in lst_minHD_tags])).astype(int)
942 # else:
943 # lst_minHD_tags = numpy.concatenate(lst_minHD_tags)
944 1013
945 # histogram with absolute and relative difference between HDs of both parts of the tag 1014 # histogram with absolute and relative difference between HDs of both parts of the tag
946 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) 1015 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
947 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, 1016 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
948 rel_Diff) 1017 rel_Diff)
949
950 # family size distribution separated after the difference between HDs of both parts of the tag
951 # familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
952 # lst_minHD_tags, diff, diff=True, rel=False)
953 # familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
954 # lst_minHD_tags, rel_Diff, diff=True, rel=True)
955
956 # chimeric read analysis: tags which have HD=0 in one of the halves 1018 # chimeric read analysis: tags which have HD=0 in one of the halves
957 if len(minHD_tags_zeros) != 0: 1019 if len(minHD_tags_zeros) != 0:
958 lst_minHD_tags_zeros = [] 1020 lst_minHD_tags_zeros = []
959 for i in minHD_tags_zeros: 1021 for i in minHD_tags_zeros:
960 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads 1022 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads
961 if onlyDuplicates: 1023 if onlyDuplicates:
962 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], 1024 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
963 [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) 1025
964
965 # histogram with HD of non-identical half 1026 # histogram with HD of non-identical half
966 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( 1027 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros)
967 lst_minHD_tags_zeros, diff_zeros) 1028
968
969 # plot Hamming Distance with Family size distribution 1029 # plot Hamming Distance with Family size distribution
970 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, 1030 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
971 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, 1031 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags,
972 xlabel="HD", nr_above_bars=nr_above_bars) 1032 xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample)
973 1033
974 # Plot FSD separated by Hamming distance 1034 # Plot FSD separated by Hamming distance
975 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, 1035 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
976 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", 1036 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
977 pdf=pdf, relative=False, title_file1=name1, diff=False) 1037 pdf=pdf, relative=False, title_file1=name1, diff=False)
978 1038
979 # Plot HD within tags 1039 # Plot HD within tags
980 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, 1040 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
981 title_file1=name1) 1041 title_file1=name1, len_sample=len_sample)
982 1042
983 # Plot difference between HDs separated by family size 1043 # Plot difference between HDs separated by family size
984 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, 1044 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
985 subtitle="Delta Hamming distance within tags", 1045 subtitle="Delta Hamming distance within tags",
986 title_file1=name1, lenTags=lenTags, 1046 title_file1=name1, lenTags=lenTags,
987 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) 1047 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)
988 1048
989 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, 1049 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
990 subtitle="Chimera Analysis: relative delta Hamming distances", 1050 subtitle="Chimera Analysis: relative delta Hamming distances",
991 title_file1=name1, lenTags=lenTags, 1051 title_file1=name1, lenTags=lenTags,
992 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) 1052 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
993 1053
994 # plots for chimeric reads 1054 # plots for chimeric reads
995 if len(minHD_tags_zeros) != 0: 1055 if len(minHD_tags_zeros) != 0:
996 # HD 1056 # HD
997 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,subtitle="Hamming distance of the non-identical half of chimeras", 1057 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, subtitle="Hamming distance of chimeras",
998 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, 1058 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False,
999 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) 1059 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
1000 1060
1001 # print all data to a CSV file 1061 # print all data to a CSV file
1002 # HD 1062 # HD
1045 output_file.write( 1105 output_file.write(
1046 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) 1106 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))
1047 1107
1048 # HD within tags 1108 # HD within tags
1049 output_file.write( 1109 output_file.write(
1050 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" 1110 "The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).\n"
1051 "Since this calculation was repeated, but starting with the second half to find all possible chimeras in the data, the actual number of tags in the plots differs from the sample size entered by the user.\n" 1111 "For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').\n"
1052 "In addition, both family sizes of one tag will be included in the plots if only tags of reads that can form a DCS were allowed.\n") 1112 "Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n"
1113 "These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD For simplicity we used the maximum value between the delta values in the end.\n"
1114 "When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.\n")
1053 1115
1054 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) 1116 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
1055 1117
1056 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, 1118 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
1057 "Hamming distance of each half in the tag", sep) 1119 "Hamming distance of each half in the tag", sep)
1061 createFileHD(summary13, sumCol13, overallSum13, output_file, 1123 createFileHD(summary13, sumCol13, overallSum13, output_file,
1062 "Chimera analysis: relative delta Hamming distances", sep) 1124 "Chimera analysis: relative delta Hamming distances", sep)
1063 1125
1064 if len(minHD_tags_zeros) != 0: 1126 if len(minHD_tags_zeros) != 0:
1065 output_file.write( 1127 output_file.write(
1066 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the Hamming distance of the non-identical half is shown.\n") 1128 "Chimeras:\nAll tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances of the chimeric tags are shown.\n")
1067 output_file.write(
1068 "Be aware that the real number of chimeric tags (where rel. diff = 1) is not shown in the plot because of the above reasons.\n")
1069 output_file.write("real number of chimeric tags{}{}{}{}\n".format(sep, nr_chimeric_tags, sep, int(nr_chimeric_tags) / float(len_sample)))
1070 createFileHD(summary15, sumCol15, overallSum15, output_file, 1129 createFileHD(summary15, sumCol15, overallSum15, output_file,
1071 "Hamming distances of non-zero half", sep) 1130 "Hamming distances of chimeras", sep)
1072 1131
1073 output_file.write("\n") 1132 output_file.write("\n")
1074 1133
1075 1134
1076 if __name__ == '__main__': 1135 if __name__ == '__main__':
1077 sys.exit(Hamming_Distance_Analysis(sys.argv)) 1136 sys.exit(Hamming_Distance_Analysis(sys.argv))
1078