comparison hd.py @ 22:7e570ba56b83 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author mheinzl
date Wed, 27 Feb 2019 04:50:56 -0500
parents 9919024d7778
children 9e384b0741f1
11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads 11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads
12 # and finally a CSV file with the data of the plots. 12 # and finally a CSV file with the data of the plots.
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. 13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input.
14 # The tool can run on a certain number of processors, which can be defined by the user. 14 # The tool can run on a certain number of processors, which can be defined by the user.
15 15
16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int /
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_pdf outputfile_name_pdf --output_tabular outputfile_name_tabular --output_chimeras_tabular outputfile_name_chimeras_tabular
18 18
19 import argparse 19 import argparse
20 import itertools 20 import itertools
21 import operator 21 import operator
22 import sys 22 import sys
23 from collections import Counter 23 from collections import Counter, defaultdict
24 from functools import partial 24 from functools import partial
25 from multiprocessing.pool import Pool 25 from multiprocessing.pool import Pool
26 26
27 import matplotlib.pyplot as plt 27 import matplotlib.pyplot as plt
28 import numpy 28 import numpy
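
The USAGE comment above lists the command-line flags of the updated tool. As a minimal sketch, assuming placeholder file names, the analysis entry point could be driven from Python like this (the flags are the ones defined in make_argparser() further down; argv[0] is skipped by parse_args(argv[1:])):

# Hedged sketch: driving the tool as in the USAGE line; all file names are placeholders.
argv = ["hd.py",
        "--inputFile", "tags.tabular",                      # hypothetical input file
        "--inputName1", "tags.tabular",
        "--sample_size", "1000",
        "--nproc", "4",
        "--output_pdf", "hd_plots.pdf",                     # hypothetical output names
        "--output_tabular", "hd_data.tabular",
        "--output_chimeras_tabular", "hd_chimeras.tabular"]
# sys.exit(Hamming_Distance_Analysis(argv))                 # same call as in the __main__ guard below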
92 92
93 pdf.savefig(fig, bbox_inches="tight") 93 pdf.savefig(fig, bbox_inches="tight")
94 plt.close("all") 94 plt.close("all")
95 95
96 96
97 def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True): 97 def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True, nr_unique_chimeras=0, len_sample=0):
98 if relative is True: 98 if relative is True:
99 step = 0.1 99 step = 0.1
100 else: 100 else:
101 step = 1 101 step = 1
102 102
142 xy=(label, x_label + len(con_list1) * 0.01), 142 xy=(label, x_label + len(con_list1) * 0.01),
143 xycoords="data", color="#000066", fontsize=10) 143 xycoords="data", color="#000066", fontsize=10)
144 144
145 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) 145 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags)
146 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) 146 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
147 if nr_unique_chimeras != 0 and len_sample != 0:
148 if relative == True:
149 legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras,
150 int(nr_unique_chimeras) / float(len_sample))
151 else:
152 legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample))
153 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
147 154
148 pdf.savefig(fig, bbox_inches="tight") 155 pdf.savefig(fig, bbox_inches="tight")
149 plt.close("all") 156 plt.close("all")
150 plt.clf() 157 plt.clf()
151 158
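
The lines added here extend the plot legend with the number of unique chimeric tags and their fraction of the sample. A minimal sketch of that formatting, with made-up counts (the real call receives nr_unique_chimeras and len_sample as the new keyword arguments of plotHDwithFSD):

# Sketch of the added legend text; the counts below are made up.
nr_unique_chimeras = 35
len_sample = 1000
legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(
    nr_unique_chimeras, nr_unique_chimeras / float(len_sample))
# legend == "nr. of unique chimeric tags= 35 (0.03500)"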
156 163
157 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags 164 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags
158 165
159 maximumX = numpy.amax(numpy.concatenate(ham_partial)) 166 maximumX = numpy.amax(numpy.concatenate(ham_partial))
160 minimumX = numpy.amin(numpy.concatenate(ham_partial)) 167 minimumX = numpy.amin(numpy.concatenate(ham_partial))
161 maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham_partial)))) 168 maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial))))
162 169
163 if len(range(minimumX, maximumX)) == 0: 170 if len(range(minimumX, maximumX)) == 0:
164 range1 = minimumX 171 range1 = minimumX
165 else: 172 else:
166 range1 = range(minimumX, maximumX + 2) 173 range1 = range(minimumX, maximumX + 2)
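
Besides replacing the Python-2-only lambda (x) with lambda x, this hunk derives the y-axis maximum from numpy.bincount over every HD list. A short sketch of that idea with made-up HD values:

# Sketch: tallest histogram bar across several HD lists via numpy.bincount
# (hd_a and hd_b are made-up examples, not data from the tool).
import numpy
hd_a = numpy.array([1, 2, 2, 3])
hd_b = numpy.array([0, 1, 1, 1])
maximumY = max(numpy.bincount(h).max() for h in [hd_a, hd_b])   # -> 3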
446 return res 453 return res
447 454
448 455
449 def hamming_difference(array1, array2, mate_b): 456 def hamming_difference(array1, array2, mate_b):
450 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time 457 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time
458
451 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 459 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1
452 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 460 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2
453 461
454 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 462 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1
455 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 463 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2
471 ham2min = [] 479 ham2min = []
472 min_valueList = [] 480 min_valueList = []
473 min_tagsList = [] 481 min_tagsList = []
474 diff11_zeros = [] 482 diff11_zeros = []
475 min_tagsList_zeros = [] 483 min_tagsList_zeros = []
484 max_tag_list = []
476 i = 0 # counter, only used to see how many HDs of tags were already calculated 485 i = 0 # counter, only used to see how many HDs of tags were already calculated
477 if mate_b is False: # HD calculation for all a's 486 if mate_b is False: # HD calculation for all a's
478 half1_mate1 = array1_half 487 half1_mate1 = array1_half
479 half2_mate1 = array1_half2 488 half2_mate1 = array1_half2
480 half1_mate2 = array2_half 489 half1_mate2 = array2_half
485 half1_mate2 = array2_half2 494 half1_mate2 = array2_half2
486 half2_mate2 = array2_half 495 half2_mate2 = array2_half
487 496
488 for a, b, tag in zip(half1_mate1, half2_mate1, array1): 497 for a, b, tag in zip(half1_mate1, half2_mate1, array1):
489 # exclude identical tag from array2, to prevent comparison to itself 498 # exclude identical tag from array2, to prevent comparison to itself
490 sameTag = numpy.where(array2 == tag) 499 sameTag = numpy.where(array2 == tag)[0]
491 indexArray2 = numpy.arange(0, len(array2), 1) 500 indexArray2 = numpy.arange(0, len(array2), 1)
492 index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data 501 index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data
493 502
494 # all tags without identical tag 503 # all tags without identical tag
495 array2_half_withoutSame = half1_mate2[index_withoutSame] 504 array2_half_withoutSame = half1_mate2[index_withoutSame]
496 array2_half2_withoutSame = half2_mate2[index_withoutSame] 505 array2_half2_withoutSame = half2_mate2[index_withoutSame]
497 # array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) 506 array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs)
498 507
499 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in 508 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
500 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" 509 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
501 min_index = numpy.where(dist == dist.min()) # get index of min HD 510 min_index = numpy.where(dist == dist.min())[0] # get index of min HD
502 min_value = dist[min_index] # get minimum HDs 511 min_value = dist[min_index] # get minimum HDs
503 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD 512 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD
504 # min_tag = array2_withoutSame[min_index] # get whole tag with min HD 513 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD
505 514
506 dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in 515 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
507 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" 516 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's"
517 dist2 = [dist_second_half.max()]
518 min_value = [dist.min()]
519 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
520 max_tag = min_tag_array2[max_index]
521
508 for d, d2 in zip(min_value, dist2): 522 for d, d2 in zip(min_value, dist2):
509 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b 523 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b
510 ham2.append(d) 524 ham2.append(d)
511 ham2min.append(d2) 525 ham2min.append(d2)
512 else: # half1, corrects the variable of the HD from both halfs if it is a or b 526 else: # half1, corrects the variable of the HD from both halfs if it is a or b
520 rel_difference = round(float(difference1) / (d + d2), 1) 534 rel_difference = round(float(difference1) / (d + d2), 1)
521 relativeDiffList.append(rel_difference) 535 relativeDiffList.append(rel_difference)
522 536
523 # tags which have identical parts: 537 # tags which have identical parts:
524 if d == 0 or d2 == 0: 538 if d == 0 or d2 == 0:
525 min_tagsList_zeros.append(tag) 539 min_tagsList_zeros.append(numpy.array(tag))
526 difference1_zeros = abs(d - d2) 540 difference1_zeros = abs(d - d2) # hd of non-identical part
527 diff11_zeros.append(difference1_zeros) 541 diff11_zeros.append(difference1_zeros)
542 max_tag_list.append(numpy.array(max_tag))
543
528 i += 1 544 i += 1
529 545
530 # print(i) 546 # print(i)
531 # diff11 = [st for st in diff11 if st != 999] 547 # diff11 = [st for st in diff11 if st != 999]
532 # ham1 = [st for st in ham1 if st != 999] 548 # ham1 = [st for st in ham1 if st != 999]
534 # min_valueList = [st for st in min_valueList if st != 999] 550 # min_valueList = [st for st in min_valueList if st != 999]
535 # min_tagsList = [st for st in min_tagsList if st != 999] 551 # min_tagsList = [st for st in min_tagsList if st != 999]
536 # relativeDiffList = [st for st in relativeDiffList if st != 999] 552 # relativeDiffList = [st for st in relativeDiffList if st != 999]
537 # diff11_zeros = [st for st in diff11_zeros if st != 999] 553 # diff11_zeros = [st for st in diff11_zeros if st != 999]
538 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] 554 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
539 555 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min, max_tag_list])
540 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min])
541 556
542 557
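
hamming_difference() splits every tag of the sample into two halves, finds for each half the minimum Hamming distance against the corresponding halves of all other tags, and (new in this revision) also records the partner tag behind that minimum so chimeras can be reported later. A self-contained sketch of the per-half idea in plain Python, with hypothetical 8-nt tags (the tool itself works on numpy arrays and itertools.imap):

# Hedged sketch of the per-half Hamming distance used in hamming_difference():
# a tag is a chimera candidate if one of its halves matches another tag exactly.
def hd(s1, s2):
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

tags = ["AAAACCCC", "AAAAGGGG", "TTTTCCCC"]               # hypothetical tags
for tag in tags:
    half_a, half_b = tag[:len(tag) // 2], tag[len(tag) // 2:]
    others = [t for t in tags if t != tag]
    min_a = min(hd(half_a, t[:len(t) // 2]) for t in others)
    min_b = min(hd(half_b, t[len(t) // 2:]) for t in others)
    if min_a == 0 or min_b == 0:
        print("{}: chimera candidate (min HD per half: {}, {})".format(tag, min_a, min_b))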
543 def readFileReferenceFree(file): 558 def readFileReferenceFree(file):
544 with open(file, 'r') as dest_f: 559 with open(file, 'r') as dest_f:
545 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') 560 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
636 def make_argparser(): 651 def make_argparser():
637 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') 652 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
638 parser.add_argument('--inputFile', 653 parser.add_argument('--inputFile',
639 help='Tabular File with three columns: ab or ba, tag and family size.') 654 help='Tabular File with three columns: ab or ba, tag and family size.')
640 parser.add_argument('--inputName1') 655 parser.add_argument('--inputName1')
641 # parser.add_argument('--inputFile2', default=None,
642 # help='Tabular File with three columns: ab or ba, tag and family size.')
643 # parser.add_argument('--inputName2')
644 parser.add_argument('--sample_size', default=1000, type=int, 656 parser.add_argument('--sample_size', default=1000, type=int,
645 help='Sample size of Hamming distance analysis.') 657 help='Sample size of Hamming distance analysis.')
646 parser.add_argument('--subset_tag', default=0, type=int, 658 parser.add_argument('--subset_tag', default=0, type=int,
647 help='The tag is shortened to the given number.') 659 help='The tag is shortened to the given number.')
648 parser.add_argument('--nproc', default=4, type=int, 660 parser.add_argument('--nproc', default=4, type=int,
659 671
660 parser.add_argument('--output_tabular', default="data.tabular", type=str, 672 parser.add_argument('--output_tabular', default="data.tabular", type=str,
661 help='Name of the tabular file.') 673 help='Name of the tabular file.')
662 parser.add_argument('--output_pdf', default="data.pdf", type=str, 674 parser.add_argument('--output_pdf', default="data.pdf", type=str,
663 help='Name of the pdf file.') 675 help='Name of the pdf file.')
664 # parser.add_argument('--output_pdf2', default="data2.pdf", type=str, 676 parser.add_argument('--output_chimeras_tabular', default="data_chimeras.tabular", type=str,
665 # help='Name of the pdf file.') 677 help='Name of the tabular file with all chimeric tags.')
666 # parser.add_argument('--output_tabular2', default="data2.tabular", type=str,
667 # help='Name of the tabular file.')
668
669 return parser 678 return parser
670 679
671 680
672 def Hamming_Distance_Analysis(argv): 681 def Hamming_Distance_Analysis(argv):
673 parser = make_argparser() 682 parser = make_argparser()
674 args = parser.parse_args(argv[1:]) 683 args = parser.parse_args(argv[1:])
675
676 file1 = args.inputFile 684 file1 = args.inputFile
677 name1 = args.inputName1 685 name1 = args.inputName1
678
679 # file2 = args.inputFile2
680 # name2 = args.inputName2
681
682 index_size = args.sample_size 686 index_size = args.sample_size
683 title_savedFile_pdf = args.output_pdf 687 title_savedFile_pdf = args.output_pdf
684 # title_savedFile_pdf2 = args.output_pdf2
685
686 title_savedFile_csv = args.output_tabular 688 title_savedFile_csv = args.output_tabular
687 # title_savedFile_csv2 = args.output_tabular2 689 output_chimeras_tabular = args.output_chimeras_tabular
688 690
689 sep = "\t" 691 sep = "\t"
690 onlyDuplicates = args.only_DCS 692 onlyDuplicates = args.only_DCS
691 minFS = args.minFS 693 minFS = args.minFS
692 maxFS = args.maxFS 694 maxFS = args.maxFS
693 nr_above_bars = args.nr_above_bars 695 nr_above_bars = args.nr_above_bars
694
695 subset = args.subset_tag 696 subset = args.subset_tag
696 nproc = args.nproc 697 nproc = args.nproc
697 698
698 # input checks 699 # input checks
699 if index_size < 0: 700 if index_size < 0:
713 plt.rcParams['xtick.labelsize'] = 14 714 plt.rcParams['xtick.labelsize'] = 14
714 plt.rcParams['ytick.labelsize'] = 14 715 plt.rcParams['ytick.labelsize'] = 14
715 plt.rcParams['patch.edgecolor'] = "#000000" 716 plt.rcParams['patch.edgecolor'] = "#000000"
716 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 717 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
717 718
718 # if file2 != str(None):
719 # files = [file1, file2]
720 # name1 = name1.split(".tabular")[0]
721 # name2 = name2.split(".tabular")[0]
722 # names = [name1, name2]
723 # pdf_files = [title_savedFile_pdf, title_savedFile_pdf2]
724 # csv_files = [title_savedFile_csv, title_savedFile_csv2]
725 # else:
726 # f = file1
727 name1 = name1.split(".tabular")[0] 719 name1 = name1.split(".tabular")[0]
728 # name_file = name1 720
729 # pdf_f = title_savedFile_pdf
730 # csv_f = title_savedFile_csv
731
732 #for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files):
733 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: 721 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
734 print("dataset: ", name1) 722 print("dataset: ", name1)
735 integers, data_array = readFileReferenceFree(file1) 723 integers, data_array = readFileReferenceFree(file1)
736 data_array = numpy.array(data_array) 724 data_array = numpy.array(data_array)
725 print("total nr of tags:", len(data_array))
726 n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x]
727 if len(n) != 0: # delete tags with N in the tag from data
728 print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array))
729 index_whole_array = numpy.arange(0, len(data_array), 1)
730 index_withoutN_inTag = numpy.delete(index_whole_array, n)
731 data_array = data_array[index_withoutN_inTag, :]
732 integers = integers[index_withoutN_inTag]
733 print("total nr of tags without Ns:", len(data_array))
734
735 data_array_whole_dataset = data_array
736
737 int_f = numpy.array(data_array[:, 0]).astype(int) 737 int_f = numpy.array(data_array[:, 0]).astype(int)
738 data_array = data_array[numpy.where(int_f >= minFS)] 738 data_array = data_array[numpy.where(int_f >= minFS)]
739 integers = integers[integers >= minFS] 739 integers = integers[integers >= minFS]
740 740
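
The new lines above first drop every tag that contains an N and then apply the minimum family size filter. A small sketch of the N filter on a toy array (toy rows ordered as family size, tag, ab/ba, matching how the columns are indexed below):

# Sketch of the N filter added above (toy rows: family size, tag, ab/ba).
import numpy
data_array = numpy.array([["10", "AAAACCCC", "ab"],
                          ["3",  "AANACCCC", "ab"],
                          ["7",  "TTTTGGGG", "ba"]])
n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x]
index_withoutN = numpy.delete(numpy.arange(0, len(data_array), 1), n)
data_array = data_array[index_withoutN, :]        # the row with "AANACCCC" is removed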
741 # select family size for tags 741 # select family size for tags
787 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) 787 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])
788 788
789 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) 789 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
790 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) 790 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))
791 791
792 print("length of tag= ", len(data_array[0, 1])) 792 print("length of tag:", len(data_array[0, 1]))
793 # select sample: if no size given --> all vs. all comparison 793 # select sample: if no size given --> all vs. all comparison
794 if index_size == 0: 794 if index_size == 0:
795 result = numpy.arange(0, len(data_array), 1) 795 result = numpy.arange(0, len(data_array), 1)
796 else: 796 else:
797 result = numpy.random.choice(len(integers), size=index_size, 797 result = numpy.random.choice(len(integers), size=index_size,
798 replace=False) # array of random sequences of size=index.size 798 replace=False) # array of random sequences of size=index.size
799 # unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags
800 # result = numpy.random.choice(unique_indices, size=index_size,
801 # replace=False) # array of random sequences of size=index.size
799 802
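
With sample_size set to 0 every tag is kept (all vs. all comparison); otherwise numpy.random.choice draws that many tag indices without replacement. A sketch with made-up sizes:

# Sketch of the sampling step (sizes are made up).
import numpy
n_tags, index_size = 5000, 1000
if index_size == 0:
    result = numpy.arange(0, n_tags, 1)                       # all vs. all comparison
else:
    result = numpy.random.choice(n_tags, size=index_size, replace=False)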
800 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: 803 # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
801 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) 804 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
802 805
803 # comparison random tags to whole dataset 806 # comparison random tags to whole dataset
804 result1 = data_array[result, 1] # random tags 807 result1 = data_array[result, 1] # random tags
805 result2 = data_array[:, 1] # all tags 808 result2 = data_array[:, 1] # all tags
806 print("size of the whole dataset= ", len(result2)) 809 print("sample size:", len(result1))
807 print("sample size= ", len(result1))
808 810
809 # HD analysis of whole tag 811 # HD analysis of whole tag
810 proc_pool = Pool(nproc) 812 proc_pool = Pool(nproc)
811 chunks_sample = numpy.array_split(result1, nproc) 813 chunks_sample = numpy.array_split(result1, nproc)
812 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) 814 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
816 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: 818 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
817 # for h, tag in zip(ham, result1): 819 # for h, tag in zip(ham, result1):
818 # output_file1.write("{}\t{}\n".format(tag, h)) 820 # output_file1.write("{}\t{}\n".format(tag, h))
819 821
820 # HD analysis for chimeric reads 822 # HD analysis for chimeric reads
823 # result2 = data_array_whole_dataset[:,1]
824
821 proc_pool_b = Pool(nproc) 825 proc_pool_b = Pool(nproc)
822 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) 826 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
823 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) 827 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
824 proc_pool_b.close() 828 proc_pool_b.close()
825 proc_pool_b.join() 829 proc_pool_b.join()
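
Both the whole-tag HD analysis and the chimera analysis split the sampled tags into nproc chunks and hand each chunk to a worker process; functools.partial pins the shared second argument so Pool.map only has to pass the chunk. A toy sketch of that pattern (the worker below is a stand-in, not the tool's hamming() or hamming_difference()):

# Hedged sketch of the Pool/partial pattern; count_matches is a toy worker.
from functools import partial
from multiprocessing.pool import Pool
import numpy

def count_matches(chunk, array2):
    return [int(numpy.sum(array2 == t)) for t in chunk]

if __name__ == "__main__":
    sample = numpy.array(["AAAA", "CCCC", "GGGG", "TTTT"])
    dataset = numpy.array(["AAAA", "AAAA", "TTTT"])
    nproc = 2
    proc_pool = Pool(nproc)
    chunks_sample = numpy.array_split(sample, nproc)
    per_chunk = proc_pool.map(partial(count_matches, array2=dataset), chunks_sample)
    proc_pool.close()
    proc_pool.join()
    result = numpy.concatenate(per_chunk)                     # -> array([2, 0, 0, 1])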
842 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), 846 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
843 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) 847 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
844 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), 848 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
845 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) 849 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
846 850
851 chimera_tags = numpy.concatenate(([item[10] for item in diff_list_a],
852 [item_b[10] for item_b in diff_list_b]))
853
854 chimera_tags = [x for x in chimera_tags if x != []]
855 chimera_tags_new = []
856
857 for i in chimera_tags:
858 if len(i) > 1:
859 for t in i:
860 chimera_tags_new.append(t)
861 else:
862 chimera_tags_new.extend(i)
863
864 chimeras_dic = defaultdict(list)
865 for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
866 chimeras_dic[t1].append(t2)
867
868 lst_unique_chimeras = []
869 with open(output_chimeras_tabular, "w") as output_file1:
870 unique_chimeras = numpy.unique(minHD_tags_zeros)
871 sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras]) # mate1 part1
872 sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras]) # mate1 part 2
873
874 output_file1.write("sample tag\tsimilar tag\n")
875 for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b):
876 max_tags = numpy.concatenate(chimeras_dic.get(tag1))
877
878 if tag1 in chimeras_dic.values():
879 continue
880 else:
881 lst_unique_chimeras.append(tag1)
882
883 chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1
884 chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2
885
886 new_format = []
887 for i in range(len(max_tags)):
888 if a == chimera_half_a[i]:
889 max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i])
890 new_format.append(max_tag)
891
892 elif b == chimera_half_b[i]:
893 max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i])
894 new_format.append(max_tag)
895
896 sample_tag = "{} {}".format(a, b)
897 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
898 output_file1.write(
899 "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
900 "The tags were separated by an empty space into their halves and the * marks the identical half.")
901
902 nr_chimeric_tags = len(lst_unique_chimeras)
903 print("nr of unique chimeras:", nr_chimeric_tags)
904
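
The block above is new: for every sample tag that had a Hamming distance of zero in one half, it collects all partner tags behind that zero in a defaultdict and writes them to the chimeras tabular file with the identical half wrapped in asterisks. A short sketch of the grouping and formatting with hypothetical tags:

# Sketch of the chimera reporting added above: group partner tags per chimeric
# sample tag and mark the identical half with asterisks (tags are hypothetical).
from collections import defaultdict

chimeras_dic = defaultdict(list)
chimeras_dic["AAAACCCC"].extend(["AAAAGGGG", "TTTTCCCC"])

for tag, partners in chimeras_dic.items():
    a, b = tag[:len(tag) // 2], tag[len(tag) // 2:]
    formatted = []
    for p in partners:
        pa, pb = p[:len(p) // 2], p[len(p) // 2:]
        if a == pa:
            formatted.append("*{}* {}".format(pa, pb))
        elif b == pb:
            formatted.append("{} *{}*".format(pa, pb))
    print("{} {}\t{}".format(a, b, ", ".join(formatted)))
    # -> "AAAA CCCC    *AAAA* GGGG, TTTT *CCCC*"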
847 lenTags = len(data_array) 905 lenTags = len(data_array)
906 len_sample = len(result1)
848 907
849 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags 908 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags
850 seq = numpy.array(data_array[result, 1]) # tags of sample 909 seq = numpy.array(data_array[result, 1]) # tags of sample
851 ham = numpy.asarray(ham) # HD for sample of tags 910 ham = numpy.asarray(ham) # HD for sample of tags
852 911
853 if onlyDuplicates is True: # ab and ba strands of DCSs 912 if onlyDuplicates is True: # ab and ba strands of DCSs
854 quant = numpy.concatenate((quant, duplTagsBA[result])) 913 quant = numpy.concatenate((quant, duplTagsBA[result]))
855 seq = numpy.tile(seq, 2) 914 seq = numpy.tile(seq, 2)
856 ham = numpy.tile(ham, 2) 915 ham = numpy.tile(ham, 2)
916 diff = numpy.tile(diff, 2)
917 rel_Diff = numpy.tile(rel_Diff, 2)
918 diff_zeros = numpy.tile(diff_zeros, 2)
857 919
858 # prepare data for different kinds of plots 920 # prepare data for different kinds of plots
859 # distribution of FSs separated after HD 921 # distribution of FSs separated after HD
860 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) 922 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
861 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS 923 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS
862 924
863 # get FS for all tags with min HD of analysis of chimeric reads 925 # get FS for all tags with min HD of analysis of chimeric reads
864 # there are more tags than sample size in the plot, because one tag can have multiple minimas 926 # there are more tags than sample size in the plot, because one tag can have multiple minimas
865 seqDic = dict(zip(seq, quant)) 927 if onlyDuplicates:
928 seqDic = defaultdict(list)
929 for s, q in zip(seq, quant):
930 seqDic[s].append(q)
931 else:
932 seqDic = dict(zip(seq, quant))
933
934
866 lst_minHD_tags = [] 935 lst_minHD_tags = []
867 for i in minHD_tags: 936 for i in minHD_tags:
868 lst_minHD_tags.append(seqDic.get(i)) 937 lst_minHD_tags.append(seqDic.get(i))
938
939 if onlyDuplicates:
940 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
941 [item_b[1] for item_b in lst_minHD_tags])).astype(int)
942 # else:
943 # lst_minHD_tags = numpy.concatenate(lst_minHD_tags)
869 944
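
When only DCS-forming tags are kept, each tag occurs twice in seq with its ab and its ba family size, so the tag-to-family-size lookup becomes a defaultdict of lists and both values are carried into the plots; otherwise a plain dict suffices. A sketch of the two lookups with made-up tags and family sizes:

# Sketch of the two tag -> family size lookups (made-up tags and sizes):
# with onlyDuplicates, every tag keeps both its ab and ba family size.
from collections import defaultdict

seq = ["AAAACCCC", "TTTTGGGG", "AAAACCCC", "TTTTGGGG"]    # each tag twice (ab, ba)
quant = [10, 7, 4, 12]
seqDic = defaultdict(list)
for s, q in zip(seq, quant):
    seqDic[s].append(q)
# seqDic["AAAACCCC"] -> [10, 4]; a plain dict(zip(seq, quant)) would keep only 4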
870 # histogram with absolute and relative difference between HDs of both parts of the tag 945 # histogram with absolute and relative difference between HDs of both parts of the tag
871 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) 946 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
872 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, 947 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
873 rel_Diff) 948 rel_Diff)
874 949
875 # family size distribution separated after the difference between HDs of both parts of the tag 950 # family size distribution separated after the difference between HDs of both parts of the tag
876 familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( 951 # familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
877 lst_minHD_tags, diff, diff=True, rel=False) 952 # lst_minHD_tags, diff, diff=True, rel=False)
878 familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( 953 # familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
879 lst_minHD_tags, rel_Diff, diff=True, rel=True) 954 # lst_minHD_tags, rel_Diff, diff=True, rel=True)
880 955
881 # chimeric read analysis: tags which have HD=0 in one of the halfs 956 # chimeric read analysis: tags which have HD=0 in one of the halfs
882 if len(minHD_tags_zeros) != 0: 957 if len(minHD_tags_zeros) != 0:
883 lst_minHD_tags_zeros = [] 958 lst_minHD_tags_zeros = []
884 for i in minHD_tags_zeros: 959 for i in minHD_tags_zeros:
885 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads 960 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads
886 961 if onlyDuplicates:
887 # histogram with HD of non-identical half 962 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
888 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( 963 [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
964
965 # histogram with HD of non-identical half
966 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
889 lst_minHD_tags_zeros, diff_zeros) 967 lst_minHD_tags_zeros, diff_zeros)
890 # family size distribution of non-identical half 968
891 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(
892 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)
893
894 # plot Hamming Distance with Family size distribution 969 # plot Hamming Distance with Family size distribution
895 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, 970 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
896 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, 971 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags,
897 xlabel="HD", nr_above_bars=nr_above_bars) 972 xlabel="HD", nr_above_bars=nr_above_bars)
898 973
912 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) 987 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars)
913 988
914 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, 989 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
915 subtitle="Chimera Analysis: relative delta Hamming distances", 990 subtitle="Chimera Analysis: relative delta Hamming distances",
916 title_file1=name1, lenTags=lenTags, 991 title_file1=name1, lenTags=lenTags,
917 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) 992 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
918 993
919 # plots for chimeric reads 994 # plots for chimeric reads
920 if len(minHD_tags_zeros) != 0: 995 if len(minHD_tags_zeros) != 0:
921 # HD 996 # HD
922 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, 997 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,subtitle="Hamming distance of the non-identical half of chimeras",
923 subtitle="Hamming distance of the non-identical half of chimeras",
924 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, 998 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False,
925 nr_above_bars=nr_above_bars) 999 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
926 1000
927 # print all data to a CSV file 1001 # print all data to a CSV file
928 # HD 1002 # HD
929 summary, sumCol = createTableHD(list1, "HD=") 1003 summary, sumCol = createTableHD(list1, "HD=")
930 overallSum = sum(sumCol) # sum of columns in table 1004 overallSum = sum(sumCol) # sum of columns in table
972 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) 1046 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))
973 1047
974 # HD within tags 1048 # HD within tags
975 output_file.write( 1049 output_file.write(
976 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" 1050 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
977 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") 1051 "Since this calculation was repeated, but starting with the second half to find all possible chimeras in the data, the actual number of tags in the plots differs from the sample size entered by the user.\n"
978 output_file.write( 1052 "In addition, both family sizes of one tag will be included in the plots if only tags of reads that can form a DCS were allowed.\n")
979 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( 1053
980 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
981 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) 1054 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
982 1055
983 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, 1056 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
984 "Hamming distance of each half in the tag", sep) 1057 "Hamming distance of each half in the tag", sep)
985 createFileHD(summary11, sumCol11, overallSum11, output_file, 1058 createFileHD(summary11, sumCol11, overallSum11, output_file,
986 "Absolute delta Hamming distances within the tag", sep) 1059 "Absolute delta Hamming distances within the tag", sep)
1060
987 createFileHD(summary13, sumCol13, overallSum13, output_file, 1061 createFileHD(summary13, sumCol13, overallSum13, output_file,
988 "Chimera analysis: relative delta Hamming distances", sep) 1062 "Chimera analysis: relative delta Hamming distances", sep)
989 1063
990 if len(minHD_tags_zeros) != 0: 1064 if len(minHD_tags_zeros) != 0:
991 output_file.write( 1065 output_file.write(
992 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") 1066 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the Hamming distance of the non-identical half is shown.\n")
1067 output_file.write(
1068 "Be aware that the real number of chimeric tags (where rel. diff = 1) is not shown in the plot because of the above reasons.\n")
1069 output_file.write("real number of chimeric tags{}{}{}{}\n".format(sep, nr_chimeric_tags, sep, int(nr_chimeric_tags) / float(len_sample)))
993 createFileHD(summary15, sumCol15, overallSum15, output_file, 1070 createFileHD(summary15, sumCol15, overallSum15, output_file,
994 "Hamming distances of non-zero half", sep) 1071 "Hamming distances of non-zero half", sep)
1072
995 output_file.write("\n") 1073 output_file.write("\n")
996 1074
997 1075
998 if __name__ == '__main__': 1076 if __name__ == '__main__':
999 sys.exit(Hamming_Distance_Analysis(sys.argv)) 1077 sys.exit(Hamming_Distance_Analysis(sys.argv))
1078