comparison hd.py @ 20:b084b6a8e3ac draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit e76960d95c059a78d880ed5ecd6202f54b091025
author mheinzl
date Fri, 14 Dec 2018 04:31:21 -0500
parents 2e9f7ea7ae93
children 9919024d7778
comparison
equal deleted inserted replaced
19:2e9f7ea7ae93 20:b084b6a8e3ac
11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads 11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads
12 # and finally a CSV file with the data of the plots. 12 # and finally a CSV file with the data of the plots.
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. 13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input.
14 # The tool can run on a certain number of processors, which can be defined by the user. 14 # The tool can run on a certain number of processors, which can be defined by the user.
15 15
16 # USAGE: python hd.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outputfile_name_tabular
18 18
19 import argparse 19 import argparse
20 import itertools 20 import itertools
21 import operator 21 import operator
22 import sys 22 import sys
172 172
173 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) 173 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
174 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) 174 plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
175 # plt.ylim(0, maximumY * 1.2) 175 # plt.ylim(0, maximumY * 1.2)
176 176
177 legend = "sample size= {:,} against {:,}".format(sum(ham_partial[4]), lenTags) 177 legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags)
178 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) 178 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
179 pdf.savefig(fig, bbox_inches="tight") 179 pdf.savefig(fig, bbox_inches="tight")
180 plt.close("all") 180 plt.close("all")
181 plt.clf() 181 plt.clf()
182 182
632 def make_argparser(): 632 def make_argparser():
633 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') 633 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
634 parser.add_argument('--inputFile', 634 parser.add_argument('--inputFile',
635 help='Tabular File with three columns: ab or ba, tag and family size.') 635 help='Tabular File with three columns: ab or ba, tag and family size.')
636 parser.add_argument('--inputName1') 636 parser.add_argument('--inputName1')
637 parser.add_argument('--inputFile2', default=None, 637 # parser.add_argument('--inputFile2', default=None,
638 help='Tabular File with three columns: ab or ba, tag and family size.') 638 # help='Tabular File with three columns: ab or ba, tag and family size.')
639 parser.add_argument('--inputName2') 639 # parser.add_argument('--inputName2')
640 parser.add_argument('--sample_size', default=1000, type=int, 640 parser.add_argument('--sample_size', default=1000, type=int,
641 help='Sample size of Hamming distance analysis.') 641 help='Sample size of Hamming distance analysis.')
642 parser.add_argument('--subset_tag', default=0, type=int, 642 parser.add_argument('--subset_tag', default=0, type=int,
643 help='The tag is shortened to the given number.') 643 help='The tag is shortened to the given number.')
644 parser.add_argument('--nproc', default=4, type=int, 644 parser.add_argument('--nproc', default=4, type=int,
655 655
656 parser.add_argument('--output_tabular', default="data.tabular", type=str, 656 parser.add_argument('--output_tabular', default="data.tabular", type=str,
657 help='Name of the tabular file.') 657 help='Name of the tabular file.')
658 parser.add_argument('--output_pdf', default="data.pdf", type=str, 658 parser.add_argument('--output_pdf', default="data.pdf", type=str,
659 help='Name of the pdf file.') 659 help='Name of the pdf file.')
660 parser.add_argument('--output_pdf2', default="data2.pdf", type=str, 660 # parser.add_argument('--output_pdf2', default="data2.pdf", type=str,
661 help='Name of the pdf file.') 661 # help='Name of the pdf file.')
662 parser.add_argument('--output_tabular2', default="data2.tabular", type=str, 662 # parser.add_argument('--output_tabular2', default="data2.tabular", type=str,
663 help='Name of the tabular file.') 663 # help='Name of the tabular file.')
664 664
665 return parser 665 return parser
666 666
667 667
668 def Hamming_Distance_Analysis(argv): 668 def Hamming_Distance_Analysis(argv):
670 args = parser.parse_args(argv[1:]) 670 args = parser.parse_args(argv[1:])
671 671
672 file1 = args.inputFile 672 file1 = args.inputFile
673 name1 = args.inputName1 673 name1 = args.inputName1
674 674
675 file2 = args.inputFile2 675 # file2 = args.inputFile2
676 name2 = args.inputName2 676 # name2 = args.inputName2
677 677
678 index_size = args.sample_size 678 index_size = args.sample_size
679 title_savedFile_pdf = args.output_pdf 679 title_savedFile_pdf = args.output_pdf
680 title_savedFile_pdf2 = args.output_pdf2 680 # title_savedFile_pdf2 = args.output_pdf2
681 681
682 title_savedFile_csv = args.output_tabular 682 title_savedFile_csv = args.output_tabular
683 title_savedFile_csv2 = args.output_tabular2 683 # title_savedFile_csv2 = args.output_tabular2
684 684
685 sep = "\t" 685 sep = "\t"
686 onlyDuplicates = args.only_DCS 686 onlyDuplicates = args.only_DCS
687 minFS = args.minFS 687 minFS = args.minFS
688 maxFS = args.maxFS 688 maxFS = args.maxFS
709 plt.rcParams['xtick.labelsize'] = 14 709 plt.rcParams['xtick.labelsize'] = 14
710 plt.rcParams['ytick.labelsize'] = 14 710 plt.rcParams['ytick.labelsize'] = 14
711 plt.rcParams['patch.edgecolor'] = "#000000" 711 plt.rcParams['patch.edgecolor'] = "#000000"
712 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 712 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
713 713
714 if file2 != str(None): 714 # if file2 != str(None):
715 files = [file1, file2] 715 # files = [file1, file2]
716 name1 = name1.split(".tabular")[0] 716 # name1 = name1.split(".tabular")[0]
717 name2 = name2.split(".tabular")[0] 717 # name2 = name2.split(".tabular")[0]
718 names = [name1, name2] 718 # names = [name1, name2]
719 pdf_files = [title_savedFile_pdf, title_savedFile_pdf2] 719 # pdf_files = [title_savedFile_pdf, title_savedFile_pdf2]
720 csv_files = [title_savedFile_csv, title_savedFile_csv2] 720 # csv_files = [title_savedFile_csv, title_savedFile_csv2]
721 else: 721 # else:
722 files = [file1] 722 # f = file1
723 name1 = name1.split(".tabular")[0] 723 name1 = name1.split(".tabular")[0]
724 names = [name1] 724 # name_file = name1
725 pdf_files = [title_savedFile_pdf] 725 # pdf_f = title_savedFile_pdf
726 csv_files = [title_savedFile_csv] 726 # csv_f = title_savedFile_csv
727 727
728 for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files): 728 #for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files):
729 with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf: 729 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
730 print("dataset: ", name_file) 730 print("dataset: ", name1)
731 integers, data_array = readFileReferenceFree(f) 731 integers, data_array = readFileReferenceFree(file1)
732 data_array = numpy.array(data_array) 732 data_array = numpy.array(data_array)
733 int_f = numpy.array(data_array[:, 0]).astype(int) 733 int_f = numpy.array(data_array[:, 0]).astype(int)
734 data_array = data_array[numpy.where(int_f >= minFS)] 734 data_array = data_array[numpy.where(int_f >= minFS)]
735 integers = integers[integers >= minFS] 735 integers = integers[integers >= minFS]
736 736
737 # select family size for tags 737 # select family size for tags
738 if maxFS > 0: 738 if maxFS > 0:
739 int_f2 = numpy.array(data_array[:, 0]).astype(int) 739 int_f2 = numpy.array(data_array[:, 0]).astype(int)
740 data_array = data_array[numpy.where(int_f2 <= maxFS)] 740 data_array = data_array[numpy.where(int_f2 <= maxFS)]
741 integers = integers[integers <= maxFS] 741 integers = integers[integers <= maxFS]
742 742
743 print("min FS", min(integers)) 743 print("min FS", min(integers))
744 print("max FS", max(integers)) 744 print("max FS", max(integers))
745 745
746 tags = data_array[:, 2] 746 tags = data_array[:, 2]
747 seq = data_array[:, 1] 747 seq = data_array[:, 1]
748 748
749 if onlyDuplicates is True: 749 if onlyDuplicates is True:
750 # find all unique tags and get the indices for ALL tags, but only once 750 # find all unique tags and get the indices for ALL tags, but only once
751 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) 751 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
752 d = u[c > 1] 752 d = u[c > 1]
753 753
754 # get family sizes, tag for duplicates 754 # get family sizes, tag for duplicates
755 duplTags_double = integers[numpy.in1d(seq, d)] 755 duplTags_double = integers[numpy.in1d(seq, d)]
756 duplTags = duplTags_double[0::2] # ab of DCS 756 duplTags = duplTags_double[0::2] # ab of DCS
757 duplTagsBA = duplTags_double[1::2] # ba of DCS 757 duplTagsBA = duplTags_double[1::2] # ba of DCS
758 758
759 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab 759 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab
760 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags 760 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags
761 761
762 data_array = numpy.column_stack((duplTags, duplTags_seq)) 762 data_array = numpy.column_stack((duplTags, duplTags_seq))
763 data_array = numpy.column_stack((data_array, duplTags_tag)) 763 data_array = numpy.column_stack((data_array, duplTags_tag))
764 integers = numpy.array(data_array[:, 0]).astype(int) 764 integers = numpy.array(data_array[:, 0]).astype(int)
765 print("DCS in whole dataset", len(data_array)) 765 print("DCS in whole dataset", len(data_array))
766 766
767 # HD analysis for a subset of the tag 767 # HD analysis for a subset of the tag
768 if subset > 0: 768 if subset > 0:
769 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) 769 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
770 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) 770 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])
771 771
772 flanking_region_float = float((len(tag1[0]) - subset)) / 2 772 flanking_region_float = float((len(tag1[0]) - subset)) / 2
773 flanking_region = int(flanking_region_float) 773 flanking_region = int(flanking_region_float)
774 if flanking_region_float % 2 == 0: 774 if flanking_region_float % 2 == 0:
775 tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1]) 775 tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
776 tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2]) 776 tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
777 else:
778 flanking_region_rounded = int(round(flanking_region, 1))
779 flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
780 tag1_shorten = numpy.array(
781 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
782 tag2_shorten = numpy.array(
783 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])
784
785 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
786 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))
787
788 print("length of tag= ", len(data_array[0, 1]))
789 # select sample: if no size given --> all vs. all comparison
790 if index_size == 0:
791 result = numpy.arange(0, len(data_array), 1)
792 else: 777 else:
793 result = numpy.random.choice(len(integers), size=index_size, replace=False) # array of random sequences of size=index.size 778 flanking_region_rounded = int(round(flanking_region, 1))
794 779 flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
795 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: 780 tag1_shorten = numpy.array(
796 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) 781 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
797 782 tag2_shorten = numpy.array(
798 # comparison random tags to whole dataset 783 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])
799 result1 = data_array[result, 1] # random tags 784
800 result2 = data_array[:, 1] # all tags 785 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
801 print("size of the whole dataset= ", len(result2)) 786 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))
802 print("sample size= ", len(result1)) 787
803 788 print("length of tag= ", len(data_array[0, 1]))
804 # HD analysis of whole tag 789 # select sample: if no size given --> all vs. all comparison
805 proc_pool = Pool(nproc) 790 if index_size == 0:
806 chunks_sample = numpy.array_split(result1, nproc) 791 result = numpy.arange(0, len(data_array), 1)
807 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) 792 else:
808 proc_pool.close() 793 result = numpy.random.choice(len(integers), size=index_size,
809 proc_pool.join() 794 replace=False) # array of random sequences of size=index.size
810 ham = numpy.concatenate(ham).astype(int) 795
811 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: 796 # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
812 # for h, tag in zip(ham, result1): 797 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
813 # output_file1.write("{}\t{}\n".format(tag, h)) 798
814 799 # comparison random tags to whole dataset
815 # HD analysis for chimeric reads 800 result1 = data_array[result, 1] # random tags
816 proc_pool_b = Pool(nproc) 801 result2 = data_array[:, 1] # all tags
817 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) 802 print("size of the whole dataset= ", len(result2))
818 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) 803 print("sample size= ", len(result1))
819 proc_pool_b.close() 804
820 proc_pool_b.join() 805 # HD analysis of whole tag
821 diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]), 806 proc_pool = Pool(nproc)
822 numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int) 807 chunks_sample = numpy.array_split(result1, nproc)
823 HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), 808 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
824 numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) 809 proc_pool.close()
825 HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), 810 proc_pool.join()
826 numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) 811 ham = numpy.concatenate(ham).astype(int)
827 minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), 812 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
828 numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) 813 # for h, tag in zip(ham, result1):
829 minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]), 814 # output_file1.write("{}\t{}\n".format(tag, h))
830 numpy.concatenate([item_b[4] for item_b in diff_list_b]))) 815
831 rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]), 816 # HD analysis for chimeric reads
832 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) 817 proc_pool_b = Pool(nproc)
833 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), 818 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
834 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) 819 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
835 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), 820 proc_pool_b.close()
836 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) 821 proc_pool_b.join()
837 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) 822 diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]),
838 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), 823 numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int)
839 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) 824 HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
840 825 numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
841 lenTags = len(data_array) 826 HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
842 827 numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
843 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags 828 minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
844 seq = numpy.array(data_array[result, 1]) # tags of sample 829 numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
845 ham = numpy.asarray(ham) # HD for sample of tags 830 minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]),
846 831 numpy.concatenate([item_b[4] for item_b in diff_list_b])))
847 if onlyDuplicates is True: # ab and ba strands of DCSs 832 rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]),
848 quant = numpy.concatenate((quant, duplTagsBA[result])) 833 numpy.concatenate([item_b[5] for item_b in diff_list_b])))
849 seq = numpy.tile(seq, 2) 834 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
850 ham = numpy.tile(ham, 2) 835 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
851 836 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
852 # prepare data for different kinds of plots 837 numpy.concatenate([item_b[7] for item_b in diff_list_b])))
853 # distribution of FSs separated after HD 838 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
854 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) 839 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
855 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS 840 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
856 841 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
857 # get FS for all tags with min HD of analysis of chimeric reads 842
858 # there are more tags than sample size in the plot, because one tag can have multiple minima 860 # there are more tags than sample size in the plot, because one tag can have multiple minima
859 seqDic = dict(zip(seq, quant)) 844
860 lst_minHD_tags = [] 845 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags
861 for i in minHD_tags: 846 seq = numpy.array(data_array[result, 1]) # tags of sample
862 lst_minHD_tags.append(seqDic.get(i)) 847 ham = numpy.asarray(ham) # HD for sample of tags
863 848
864 # histogram with absolute and relative difference between HDs of both parts of the tag 849 if onlyDuplicates is True: # ab and ba strands of DCSs
865 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) 850 quant = numpy.concatenate((quant, duplTagsBA[result]))
866 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, 851 seq = numpy.tile(seq, 2)
867 rel_Diff) 852 ham = numpy.tile(ham, 2)
868 853
869 # family size distribution separated after the difference between HDs of both parts of the tag 854 # prepare data for different kinds of plots
870 familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( 855 # distribution of FSs separated after HD
871 lst_minHD_tags, diff, diff=True, rel=False) 856 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
872 familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( 857 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS
873 lst_minHD_tags, rel_Diff, diff=True, rel=True) 858
874 859 # get FS for all tags with min HD of analysis of chimeric reads
875 # chimeric read analysis: tags which have HD=0 in one of the halfs 860 # there are more tags than sample size in the plot, because one tag can have multiple minimas
876 if len(minHD_tags_zeros) != 0: 861 seqDic = dict(zip(seq, quant))
877 lst_minHD_tags_zeros = [] 862 lst_minHD_tags = []
878 for i in minHD_tags_zeros: 863 for i in minHD_tags:
879 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads 864 lst_minHD_tags.append(seqDic.get(i))
880 865
881 # histogram with HD of non-identical half 866 # histogram with absolute and relative difference between HDs of both parts of the tag
882 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros) 867 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
883 # family size distribution of non-identical half 868 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
884 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) 869 rel_Diff)
885 870
886 # plot Hamming Distance with Family size distribution 871 # family size distribution separated after the difference between HDs of both parts of the tag
887 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, subtitle="Hamming distance separated by family size", title_file1=name_file, lenTags=lenTags, xlabel="HD", nr_above_bars=nr_above_bars) 872 familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
888 873 lst_minHD_tags, diff, diff=True, rel=False)
889 # Plot FSD with separation after 874 familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
890 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, 875 lst_minHD_tags, rel_Diff, diff=True, rel=True)
891 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", 876
892 pdf=pdf, relative=False, title_file1=name_file, diff=False) 877 # chimeric read analysis: tags which have HD=0 in one of the halfs
893 878 if len(minHD_tags_zeros) != 0:
894 # Plot HD within tags 879 lst_minHD_tags_zeros = []
895 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) 880 for i in minHD_tags_zeros:
896 881 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads
897 # Plot difference between HD's separated after FSD 882
898 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, 883 # histogram with HD of non-identical half
899 subtitle="Delta Hamming distance within tags", 884 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
900 title_file1=name_file, lenTags=lenTags, 885 lst_minHD_tags_zeros, diff_zeros)
901 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) 886 # family size distribution of non-identical half
902 887 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(
903 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, 888 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)
904 subtitle="Chimera Analysis: relative delta Hamming distances", 889
905 title_file1=name_file, lenTags=lenTags, 890 # plot Hamming Distance with Family size distribution
906 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) 891 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
907 892 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags,
908 # plots for chimeric reads 893 xlabel="HD", nr_above_bars=nr_above_bars)
909 if len(minHD_tags_zeros) != 0: 894
910 # HD 895 # Plot FSD with separation after
911 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, 896 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
912 subtitle="Hamming distance of the non-identical half of chimeras", 897 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance",
913 title_file1=name_file, lenTags=lenTags, xlabel="HD", relative=False, nr_above_bars=nr_above_bars) 898 pdf=pdf, relative=False, title_file1=name1, diff=False)
914 899
915 # print all data to a CSV file 900 # Plot HD within tags
901 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
902 title_file1=name1)
903
904 # Plot difference between HD's separated after FSD
905 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
906 subtitle="Delta Hamming distance within tags",
907 title_file1=name1, lenTags=lenTags,
908 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars)
909
910 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
911 subtitle="Chimera Analysis: relative delta Hamming distances",
912 title_file1=name1, lenTags=lenTags,
913 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars)
914
915 # plots for chimeric reads
916 if len(minHD_tags_zeros) != 0:
916 # HD 917 # HD
917 summary, sumCol = createTableHD(list1, "HD=") 918 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
918 overallSum = sum(sumCol) # sum of columns in table 919 subtitle="Hamming distance of the non-identical half of chimeras",
919 920 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False,
920 # FSD 921 nr_above_bars=nr_above_bars)
921 summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) 922
922 overallSum5 = sum(sumCol5) 923 # print all data to a CSV file
923 924 # HD
924 # HD of both parts of the tag 925 summary, sumCol = createTableHD(list1, "HD=")
925 summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)]) 926 overallSum = sum(sumCol) # sum of columns in table
926 overallSum9 = sum(sumCol9) 927
927 928 # FSD
928 # HD 929 summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
929 # absolute difference 930 overallSum5 = sum(sumCol5)
930 summary11, sumCol11 = createTableHD(listDifference1, "diff=") 931
931 overallSum11 = sum(sumCol11) 932 # HD of both parts of the tag
932 # relative difference and all tags 933 summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)])
933 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") 934 overallSum9 = sum(sumCol9)
934 overallSum13 = sum(sumCol13) 935
935 936 # HD
936 # chimeric reads 937 # absolute difference
937 if len(minHD_tags_zeros) != 0: 938 summary11, sumCol11 = createTableHD(listDifference1, "diff=")
938 # absolute difference and tags where at least one half has HD=0 939 overallSum11 = sum(sumCol11)
939 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") 940 # relative difference and all tags
940 overallSum15 = sum(sumCol15) 941 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
941 942 overallSum13 = sum(sumCol13)
942 output_file.write("{}\n".format(name_file)) 943
943 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( 944 # chimeric reads
944 numpy.concatenate(list1)), lenTags, lenTags)) 945 if len(minHD_tags_zeros) != 0:
945 946 # absolute difference and tags where at least one half has HD=0
946 # HD 947 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")
947 createFileHD(summary, sumCol, overallSum, output_file, 948 overallSum15 = sum(sumCol15)
948 "Hamming distance separated by family size", sep) 949
949 # FSD 950 output_file.write("{}\n".format(name1))
950 createFileFSD2(summary5, sumCol5, overallSum5, output_file, 951 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
951 "Family size distribution separated by Hamming distance", sep, 952 numpy.concatenate(list1)), lenTags, lenTags))
952 diff=False) 953
953 954 # HD
954 count = numpy.bincount(quant) 955 createFileHD(summary, sumCol, overallSum, output_file,
955 # output_file.write("{}{}\n".format(sep, name_file)) 956 "Hamming distance separated by family size", sep)
956 output_file.write("\n") 957 # FSD
957 output_file.write("max. family size:{}{}\n".format(sep, max(quant))) 958 createFileFSD2(summary5, sumCol5, overallSum5, output_file,
958 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) 959 "Family size distribution separated by Hamming distance", sep,
960 diff=False)
961
962 count = numpy.bincount(quant)
963 # output_file.write("{}{}\n".format(sep, name1))
964 output_file.write("\n")
965 output_file.write("max. family size:{}{}\n".format(sep, max(quant)))
966 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
967 output_file.write(
968 "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
969
970 # HD within tags
971 output_file.write(
972 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
973 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
974 output_file.write(
975 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
976 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
977 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
978
979 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
980 "Hamming distance of each half in the tag", sep)
981 createFileHD(summary11, sumCol11, overallSum11, output_file,
982 "Absolute delta Hamming distances within the tag", sep)
983 createFileHD(summary13, sumCol13, overallSum13, output_file,
984 "Chimera analysis: relative delta Hamming distances", sep)
985
986 if len(minHD_tags_zeros) != 0:
959 output_file.write( 987 output_file.write(
960 "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) 988 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
961 989 createFileHD(summary15, sumCol15, overallSum15, output_file,
962 # HD within tags 990 "Hamming distances of non-zero half", sep)
963 output_file.write( 991 output_file.write("\n")
964 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
965 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
966 output_file.write(
967 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
968 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
969 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
970
971 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
972 "Hamming distance of each half in the tag", sep)
973 createFileHD(summary11, sumCol11, overallSum11, output_file,
974 "Absolute delta Hamming distances within the tag", sep)
975 createFileHD(summary13, sumCol13, overallSum13, output_file,
976 "Chimera analysis: relative delta Hamming distances", sep)
977
978 if len(minHD_tags_zeros) != 0:
979 output_file.write(
980 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
981 createFileHD(summary15, sumCol15, overallSum15, output_file,
982 "Hamming distances of non-zero half", sep)
983 output_file.write("\n")
984 992
985 993
986 if __name__ == '__main__': 994 if __name__ == '__main__':
987 sys.exit(Hamming_Distance_Analysis(sys.argv)) 995 sys.exit(Hamming_Distance_Analysis(sys.argv))