# HG changeset patch # User mheinzl # Date 1544779881 18000 # Node ID b084b6a8e3ac6aecd89d4ea3b4b2ea4d147602cd # Parent 2e9f7ea7ae938f40059b344dc2c134e000640e5d planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit e76960d95c059a78d880ed5ecd6202f54b091025 diff -r 2e9f7ea7ae93 -r b084b6a8e3ac hd.py --- a/hd.py Mon Oct 08 05:56:04 2018 -0400 +++ b/hd.py Fri Dec 14 04:31:21 2018 -0500 @@ -13,8 +13,8 @@ # It is also possible to perform the HD analysis with shortened tags with given sizes as input. # The tool can run on a certain number of processors, which can be defined by the user. -# USAGE: python hd.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / -# --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular --output_pdf outputfile_name_pdf +# USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / +# --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular import argparse import itertools @@ -174,7 +174,7 @@ plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) # plt.ylim(0, maximumY * 1.2) - legend = "sample size= {:,} against {:,}".format(sum(ham_partial[4]), lenTags) + legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags) plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) pdf.savefig(fig, bbox_inches="tight") plt.close("all") @@ -634,9 +634,9 @@ parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.') parser.add_argument('--inputName1') - parser.add_argument('--inputFile2', default=None, - help='Tabular File with three columns: ab or ba, tag and family size.') - parser.add_argument('--inputName2') + # parser.add_argument('--inputFile2', default=None, + # help='Tabular File with three columns: ab or ba, tag and family size.') + # parser.add_argument('--inputName2') parser.add_argument('--sample_size', default=1000, type=int, help='Sample size of Hamming distance analysis.') parser.add_argument('--subset_tag', default=0, type=int, @@ -657,10 +657,10 @@ help='Name of the tabular file.') parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') - parser.add_argument('--output_pdf2', default="data2.pdf", type=str, - help='Name of the pdf file.') - parser.add_argument('--output_tabular2', default="data2.tabular", type=str, - help='Name of the tabular file.') + # parser.add_argument('--output_pdf2', default="data2.pdf", type=str, + # help='Name of the pdf file.') + # parser.add_argument('--output_tabular2', default="data2.tabular", type=str, + # help='Name of the tabular file.') return parser @@ -672,15 +672,15 @@ file1 = args.inputFile name1 = args.inputName1 - file2 = args.inputFile2 - name2 = args.inputName2 + # file2 = args.inputFile2 + # name2 = args.inputName2 index_size = args.sample_size title_savedFile_pdf = args.output_pdf - title_savedFile_pdf2 = args.output_pdf2 + # title_savedFile_pdf2 = args.output_pdf2 title_savedFile_csv = args.output_tabular - title_savedFile_csv2 = args.output_tabular2 + # title_savedFile_csv2 = args.output_tabular2 sep = "\t" onlyDuplicates = args.only_DCS @@ -711,276 +711,284 @@ plt.rcParams['patch.edgecolor'] = "#000000" plt.rc('figure', figsize=(11.69, 8.27)) # A4 format - if file2 != str(None): - files = [file1, file2] - name1 = name1.split(".tabular")[0] - name2 = name2.split(".tabular")[0] - names = [name1, name2] - pdf_files = [title_savedFile_pdf, title_savedFile_pdf2] - csv_files = [title_savedFile_csv, title_savedFile_csv2] - else: - files = [file1] - name1 = name1.split(".tabular")[0] - names = [name1] - pdf_files = [title_savedFile_pdf] - csv_files = [title_savedFile_csv] + # if file2 != str(None): + # files = [file1, file2] + # name1 = name1.split(".tabular")[0] + # name2 = name2.split(".tabular")[0] + # names = [name1, name2] + # pdf_files = [title_savedFile_pdf, title_savedFile_pdf2] + # csv_files = [title_savedFile_csv, title_savedFile_csv2] + # else: + # f = file1 + name1 = name1.split(".tabular")[0] + # name_file = name1 + # pdf_f = title_savedFile_pdf + # csv_f = title_savedFile_csv - for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files): - with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf: - print("dataset: ", name_file) - integers, data_array = readFileReferenceFree(f) - data_array = numpy.array(data_array) - int_f = numpy.array(data_array[:, 0]).astype(int) - data_array = data_array[numpy.where(int_f >= minFS)] - integers = integers[integers >= minFS] + #for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files): + with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: + print("dataset: ", name1) + integers, data_array = readFileReferenceFree(file1) + data_array = numpy.array(data_array) + int_f = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f >= minFS)] + integers = integers[integers >= minFS] - # select family size for tags - if maxFS > 0: - int_f2 = numpy.array(data_array[:, 0]).astype(int) - data_array = data_array[numpy.where(int_f2 <= maxFS)] - integers = integers[integers <= maxFS] + # select family size for tags + if maxFS > 0: + int_f2 = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f2 <= maxFS)] + integers = integers[integers <= maxFS] - print("min FS", min(integers)) - print("max FS", max(integers)) + print("min FS", min(integers)) + print("max FS", max(integers)) - tags = data_array[:, 2] - seq = data_array[:, 1] + tags = data_array[:, 2] + seq = data_array[:, 1] - if onlyDuplicates is True: - # find all unique tags and get the indices for ALL tags, but only once - u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) - d = u[c > 1] + if onlyDuplicates is True: + # find all unique tags and get the indices for ALL tags, but only once + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + d = u[c > 1] - # get family sizes, tag for duplicates - duplTags_double = integers[numpy.in1d(seq, d)] - duplTags = duplTags_double[0::2] # ab of DCS - duplTagsBA = duplTags_double[1::2] # ba of DCS + # get family sizes, tag for duplicates + duplTags_double = integers[numpy.in1d(seq, d)] + duplTags = duplTags_double[0::2] # ab of DCS + duplTagsBA = duplTags_double[1::2] # ba of DCS - duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab - duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags + duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab + duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags - data_array = numpy.column_stack((duplTags, duplTags_seq)) - data_array = numpy.column_stack((data_array, duplTags_tag)) - integers = numpy.array(data_array[:, 0]).astype(int) - print("DCS in whole dataset", len(data_array)) + data_array = numpy.column_stack((duplTags, duplTags_seq)) + data_array = numpy.column_stack((data_array, duplTags_tag)) + integers = numpy.array(data_array[:, 0]).astype(int) + print("DCS in whole dataset", len(data_array)) - # HD analysis for a subset of the tag - if subset > 0: - tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) - tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) + # HD analysis for a subset of the tag + if subset > 0: + tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) + tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) - flanking_region_float = float((len(tag1[0]) - subset)) / 2 - flanking_region = int(flanking_region_float) - if flanking_region_float % 2 == 0: - tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1]) - tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2]) - else: - flanking_region_rounded = int(round(flanking_region, 1)) - flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded - tag1_shorten = numpy.array( - [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1]) - tag2_shorten = numpy.array( - [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) + flanking_region_float = float((len(tag1[0]) - subset)) / 2 + flanking_region = int(flanking_region_float) + if flanking_region_float % 2 == 0: + tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1]) + tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2]) + else: + flanking_region_rounded = int(round(flanking_region, 1)) + flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded + tag1_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1]) + tag2_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) - data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) - data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) + data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) + data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) - print("length of tag= ", len(data_array[0, 1])) - # select sample: if no size given --> all vs. all comparison - if index_size == 0: - result = numpy.arange(0, len(data_array), 1) - else: - result = numpy.random.choice(len(integers), size=index_size, replace=False) # array of random sequences of size=index.size + print("length of tag= ", len(data_array[0, 1])) + # select sample: if no size given --> all vs. all comparison + if index_size == 0: + result = numpy.arange(0, len(data_array), 1) + else: + result = numpy.random.choice(len(integers), size=index_size, + replace=False) # array of random sequences of size=index.size - # with open("index_result1_{}.pkl".format(app_f), "wb") as o: - # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) + # with open("index_result1_{}.pkl".format(app_f), "wb") as o: + # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) - # comparison random tags to whole dataset - result1 = data_array[result, 1] # random tags - result2 = data_array[:, 1] # all tags - print("size of the whole dataset= ", len(result2)) - print("sample size= ", len(result1)) + # comparison random tags to whole dataset + result1 = data_array[result, 1] # random tags + result2 = data_array[:, 1] # all tags + print("size of the whole dataset= ", len(result2)) + print("sample size= ", len(result1)) - # HD analysis of whole tag - proc_pool = Pool(nproc) - chunks_sample = numpy.array_split(result1, nproc) - ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) - proc_pool.close() - proc_pool.join() - ham = numpy.concatenate(ham).astype(int) - # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: - # for h, tag in zip(ham, result1): - # output_file1.write("{}\t{}\n".format(tag, h)) + # HD analysis of whole tag + proc_pool = Pool(nproc) + chunks_sample = numpy.array_split(result1, nproc) + ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) + proc_pool.close() + proc_pool.join() + ham = numpy.concatenate(ham).astype(int) + # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: + # for h, tag in zip(ham, result1): + # output_file1.write("{}\t{}\n".format(tag, h)) - # HD analysis for chimeric reads - proc_pool_b = Pool(nproc) - diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) - diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) - proc_pool_b.close() - proc_pool_b.join() - diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]), - numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int) - HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), - numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) - HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), - numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) - minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), - numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) - minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]), - numpy.concatenate([item_b[4] for item_b in diff_list_b]))) - rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]), - numpy.concatenate([item_b[5] for item_b in diff_list_b]))) - diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), - numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) - minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), - numpy.concatenate([item_b[7] for item_b in diff_list_b]))) - HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) - HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), - numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) + # HD analysis for chimeric reads + proc_pool_b = Pool(nproc) + diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) + diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) + proc_pool_b.close() + proc_pool_b.join() + diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]), + numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int) + HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), + numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) + HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), + numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) + minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), + numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) + minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]), + numpy.concatenate([item_b[4] for item_b in diff_list_b]))) + rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]), + numpy.concatenate([item_b[5] for item_b in diff_list_b]))) + diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), + numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) + minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), + numpy.concatenate([item_b[7] for item_b in diff_list_b]))) + HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), + numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) + HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), + numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) - lenTags = len(data_array) + lenTags = len(data_array) - quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags - seq = numpy.array(data_array[result, 1]) # tags of sample - ham = numpy.asarray(ham) # HD for sample of tags + quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags + seq = numpy.array(data_array[result, 1]) # tags of sample + ham = numpy.asarray(ham) # HD for sample of tags - if onlyDuplicates is True: # ab and ba strands of DCSs - quant = numpy.concatenate((quant, duplTagsBA[result])) - seq = numpy.tile(seq, 2) - ham = numpy.tile(ham, 2) + if onlyDuplicates is True: # ab and ba strands of DCSs + quant = numpy.concatenate((quant, duplTagsBA[result])) + seq = numpy.tile(seq, 2) + ham = numpy.tile(ham, 2) - # prepare data for different kinds of plots - # distribution of FSs separated after HD - familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) - list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS - - # get FS for all tags with min HD of analysis of chimeric reads - # there are more tags than sample size in the plot, because one tag can have multiple minimas - seqDic = dict(zip(seq, quant)) - lst_minHD_tags = [] - for i in minHD_tags: - lst_minHD_tags.append(seqDic.get(i)) + # prepare data for different kinds of plots + # distribution of FSs separated after HD + familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) + list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS - # histogram with absolute and relative difference between HDs of both parts of the tag - listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) - listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, - rel_Diff) + # get FS for all tags with min HD of analysis of chimeric reads + # there are more tags than sample size in the plot, because one tag can have multiple minimas + seqDic = dict(zip(seq, quant)) + lst_minHD_tags = [] + for i in minHD_tags: + lst_minHD_tags.append(seqDic.get(i)) - # family size distribution separated after the difference between HDs of both parts of the tag - familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( - lst_minHD_tags, diff, diff=True, rel=False) - familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( - lst_minHD_tags, rel_Diff, diff=True, rel=True) + # histogram with absolute and relative difference between HDs of both parts of the tag + listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) + listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, + rel_Diff) - # chimeric read analysis: tags which have HD=0 in one of the halfs - if len(minHD_tags_zeros) != 0: - lst_minHD_tags_zeros = [] - for i in minHD_tags_zeros: - lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads + # family size distribution separated after the difference between HDs of both parts of the tag + familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( + lst_minHD_tags, diff, diff=True, rel=False) + familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( + lst_minHD_tags, rel_Diff, diff=True, rel=True) - # histogram with HD of non-identical half - listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros) - # family size distribution of non-identical half - familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) - - # plot Hamming Distance with Family size distribution - plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, subtitle="Hamming distance separated by family size", title_file1=name_file, lenTags=lenTags, xlabel="HD", nr_above_bars=nr_above_bars) + # chimeric read analysis: tags which have HD=0 in one of the halfs + if len(minHD_tags_zeros) != 0: + lst_minHD_tags_zeros = [] + for i in minHD_tags_zeros: + lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads - # Plot FSD with separation after - plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, - originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", - pdf=pdf, relative=False, title_file1=name_file, diff=False) + # histogram with HD of non-identical half + listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( + lst_minHD_tags_zeros, diff_zeros) + # family size distribution of non-identical half + familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD( + lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) - # Plot HD within tags - plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) + # plot Hamming Distance with Family size distribution + plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, + subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, + xlabel="HD", nr_above_bars=nr_above_bars) - # Plot difference between HD's separated after FSD - plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, - subtitle="Delta Hamming distance within tags", - title_file1=name_file, lenTags=lenTags, - xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) + # Plot FSD with separation after + plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, + originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", + pdf=pdf, relative=False, title_file1=name1, diff=False) + + # Plot HD within tags + plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, + title_file1=name1) - plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, - subtitle="Chimera Analysis: relative delta Hamming distances", - title_file1=name_file, lenTags=lenTags, - xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) + # Plot difference between HD's separated after FSD + plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, + subtitle="Delta Hamming distance within tags", + title_file1=name1, lenTags=lenTags, + xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) - # plots for chimeric reads - if len(minHD_tags_zeros) != 0: - # HD - plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, - subtitle="Hamming distance of the non-identical half of chimeras", - title_file1=name_file, lenTags=lenTags, xlabel="HD", relative=False, nr_above_bars=nr_above_bars) + plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, + subtitle="Chimera Analysis: relative delta Hamming distances", + title_file1=name1, lenTags=lenTags, + xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) - # print all data to a CSV file - # HD - summary, sumCol = createTableHD(list1, "HD=") - overallSum = sum(sumCol) # sum of columns in table - - # FSD - summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) - overallSum5 = sum(sumCol5) - - # HD of both parts of the tag - summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)]) - overallSum9 = sum(sumCol9) - - # HD - # absolute difference - summary11, sumCol11 = createTableHD(listDifference1, "diff=") - overallSum11 = sum(sumCol11) - # relative difference and all tags - summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") - overallSum13 = sum(sumCol13) - - # chimeric reads - if len(minHD_tags_zeros) != 0: - # absolute difference and tags where at least one half has HD=0 - summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") - overallSum15 = sum(sumCol15) - - output_file.write("{}\n".format(name_file)) - output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( - numpy.concatenate(list1)), lenTags, lenTags)) - + # plots for chimeric reads + if len(minHD_tags_zeros) != 0: # HD - createFileHD(summary, sumCol, overallSum, output_file, - "Hamming distance separated by family size", sep) - # FSD - createFileFSD2(summary5, sumCol5, overallSum5, output_file, - "Family size distribution separated by Hamming distance", sep, - diff=False) + plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, + subtitle="Hamming distance of the non-identical half of chimeras", + title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, + nr_above_bars=nr_above_bars) + + # print all data to a CSV file + # HD + summary, sumCol = createTableHD(list1, "HD=") + overallSum = sum(sumCol) # sum of columns in table + + # FSD + summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) + overallSum5 = sum(sumCol5) + + # HD of both parts of the tag + summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)]) + overallSum9 = sum(sumCol9) + + # HD + # absolute difference + summary11, sumCol11 = createTableHD(listDifference1, "diff=") + overallSum11 = sum(sumCol11) + # relative difference and all tags + summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") + overallSum13 = sum(sumCol13) + + # chimeric reads + if len(minHD_tags_zeros) != 0: + # absolute difference and tags where at least one half has HD=0 + summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") + overallSum15 = sum(sumCol15) - count = numpy.bincount(quant) - # output_file.write("{}{}\n".format(sep, name_file)) - output_file.write("\n") - output_file.write("max. family size:{}{}\n".format(sep, max(quant))) - output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) - output_file.write( - "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) + output_file.write("{}\n".format(name1)) + output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( + numpy.concatenate(list1)), lenTags, lenTags)) + + # HD + createFileHD(summary, sumCol, overallSum, output_file, + "Hamming distance separated by family size", sep) + # FSD + createFileFSD2(summary5, sumCol5, overallSum5, output_file, + "Family size distribution separated by Hamming distance", sep, + diff=False) - # HD within tags + count = numpy.bincount(quant) + # output_file.write("{}{}\n".format(sep, name1)) + output_file.write("\n") + output_file.write("max. family size:{}{}\n".format(sep, max(quant))) + output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) + output_file.write( + "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) + + # HD within tags + output_file.write( + "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" + "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") + output_file.write( + "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( + len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1)))) + output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) + + createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, + "Hamming distance of each half in the tag", sep) + createFileHD(summary11, sumCol11, overallSum11, output_file, + "Absolute delta Hamming distances within the tag", sep) + createFileHD(summary13, sumCol13, overallSum13, output_file, + "Chimera analysis: relative delta Hamming distances", sep) + + if len(minHD_tags_zeros) != 0: output_file.write( - "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" - "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") - output_file.write( - "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( - len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1)))) - output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) - - createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, - "Hamming distance of each half in the tag", sep) - createFileHD(summary11, sumCol11, overallSum11, output_file, - "Absolute delta Hamming distances within the tag", sep) - createFileHD(summary13, sumCol13, overallSum13, output_file, - "Chimera analysis: relative delta Hamming distances", sep) - - if len(minHD_tags_zeros) != 0: - output_file.write( - "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") - createFileHD(summary15, sumCol15, overallSum15, output_file, - "Hamming distances of non-zero half", sep) - output_file.write("\n") + "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") + createFileHD(summary15, sumCol15, overallSum15, output_file, + "Hamming distances of non-zero half", sep) + output_file.write("\n") if __name__ == '__main__': diff -r 2e9f7ea7ae93 -r b084b6a8e3ac hd.xml --- a/hd.xml Mon Oct 08 05:56:04 2018 -0400 +++ b/hd.xml Fri Dec 14 04:31:21 2018 -0500 @@ -1,20 +1,16 @@ - - Hamming distance (HD) analysis of tags + + hamming distance analysis of duplex tags python matplotlib - python2 '$__tool_directory__/hd.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --inputFile2 '$inputFile2' --inputName2 '$inputFile2.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS - $nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular - #if $inputFile2: - --output_pdf2 $output_pdf2 --output_tabular2 $output_tabular2 - #end if + python2 '$__tool_directory__/hd.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS + $nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular - - + @@ -26,51 +22,65 @@ - - inputFile2 - - - inputFile2 - - - - `_ is the **Make Families** tool that produces output in this form:: + + 1 2 3 4 + ------------------------------------------------------ + AAAAAAAAAAAAAAATAGCTCGAT ba read1 CGCTACGTGACTGGGTCATG + AAAAAAAAAAAAAAATAGCTCGAT ba read2 CGCTACGTGACTGGGTCATG + AAAAAAAAAAAAAAATAGCTCGAT ba read3 CGCTACGTGACTGGGTCATG + + we only need columns 1 and 2. These two columns can be extracted from this dataset using **Cut** tool:: + + 1 2 + --------------------------- + AAAAAAAAAAAAAAATAGCTCGAT ba + AAAAAAAAAAAAAAATAGCTCGAT ba + AAAAAAAAAAAAAAATAGCTCGAT ba + + now one needs to count the number of unique occurencies of each tag. This is done using **Unique lines** tool, which would add an additional column containg counts (column 1):: + + + 1 2 3 + ----------------------------- + 3 AAAAAAAAAAAAAAATAGCTCGAT ba + + these data can now be used in this tool. **Output** - The output is one PDF file with the plots of the Hamming distance and a tabular file with the data of the plot for each dataset. +The output is one PDF file with the plots of the Hamming distance and a tabular file with the data of the plot for each dataset. **About Author** diff -r 2e9f7ea7ae93 -r b084b6a8e3ac test-data/Test_data2.tabular --- a/test-data/Test_data2.tabular Mon Oct 08 05:56:04 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ -1 AAAAAAAACCGCCCAACTGCCGGT ab -5 AAAAAAAACCTCTCAACCCCAAAT ba -7 AAAAAAAACCTCTTGCGATGTTGT ab -1 AAAAAAAACCTCTTGCGCTGTTGT ab -1 AAAAAAAACCTCTTGTGATGTTGT ab -12 AAAAAAAACCTGAGCAATGGTTCC ab -3 AAAAAAAACCTTGACCCTCACATG ba -6 AAAAAAAACCTTGCACTCGTCCTA ba -9 AAAAAAAACGAAATAAAAAAACCT ba -1 AAAAAAAACGACCGGCCTTAGACA ba -4 AAAAAAAACGCCACCACCCCCTTT ab -12 AAAAAAAACGCCACGGGCACTATT ba -13 AAAAAAAACGTATCAGTAGATCCT ab -1 AAAAAAAACTAGTAGGATTTCATG ba -3 AAAAAAAACTATAGAAAATCCATT ba -1 AAAAAAAACTATTCTATTTCCGAT ba -13 AAAAAAAACTGATCTGCTTGGCGG ba -8 AAAAAAAACTTGCGAATAGCATCG ba -4 AAAAAAAACTTGTTATCAAAACGT ab -1 AAAAAAAAGAAAAGTTCAACACGC ba \ No newline at end of file diff -r 2e9f7ea7ae93 -r b084b6a8e3ac test-data/output_file2.pdf Binary file test-data/output_file2.pdf has changed diff -r 2e9f7ea7ae93 -r b084b6a8e3ac test-data/output_file2.tabular --- a/test-data/output_file2.tabular Mon Oct 08 05:56:04 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ -Test_data2 -number of tags per file 20 (from 20) against 20 - -Hamming distance separated by family size - FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum -HD=1 2 0 0 0 1 0 3 -HD=6 0 0 0 1 0 1 2 -HD=7 2 0 1 1 2 1 7 -HD=8 1 0 1 0 2 1 5 -HD=9 1 0 0 0 0 1 2 -HD=10 1 0 0 0 0 0 1 -sum 7 0 2 2 5 4 20 - -Family size distribution separated by Hamming distance - HD=1 HD=2 HD=3 HD=4 HD=5-8 HD>8 sum -FS=1 2 0 0 0 3 2 7 -FS=3 0 0 0 0 2 0 2 -FS=4 0 0 0 0 2 0 2 -FS=5 0 0 0 0 1 0 1 -FS=6 0 0 0 0 1 0 1 -FS=7 1 0 0 0 0 0 1 -FS=8 0 0 0 0 1 0 1 -FS=9 0 0 0 0 1 0 1 -FS=12 0 0 0 0 2 0 2 -FS=13 0 0 0 0 1 1 2 -sum 3 0 0 0 14 3 20 - - -max. family size: 13 -absolute frequency: 2 -relative frequency: 0.1 - -The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half. -It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user. -actual number of tags with min HD = 79 (sample size by user = 20) -length of one part of the tag = 12 - -Hamming distance of each half in the tag - HD a HD b' HD b HD a' HD a+b sum -HD=0 20 0 0 5 0 25 -HD=1 22 4 4 3 8 41 -HD=2 9 2 0 9 2 22 -HD=3 0 0 0 10 0 10 -HD=4 0 0 2 1 0 3 -HD=5 0 0 5 0 0 5 -HD=6 0 5 7 0 3 15 -HD=7 0 7 10 0 10 27 -HD=8 0 6 0 0 10 16 -HD=9 0 7 0 0 17 24 -HD=10 0 11 0 0 13 24 -HD=11 0 8 0 0 7 15 -HD=12 0 1 0 0 5 6 -HD=13 0 0 0 0 4 4 -sum 51 51 28 28 79 237 - -Absolute delta Hamming distances within the tag - FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum -diff=1 5 0 0 1 5 0 11 -diff=2 4 0 0 0 0 0 4 -diff=3 1 0 2 1 1 0 5 -diff=4 1 0 1 0 2 1 5 -diff=5 2 0 0 0 4 6 12 -diff=6 1 0 0 1 1 7 10 -diff=7 2 0 1 0 0 0 3 -diff=8 0 0 1 0 1 3 5 -diff=9 6 0 0 1 3 4 14 -diff=10 4 0 0 0 3 2 9 -diff=11 0 0 0 0 0 1 1 -sum 26 0 5 4 20 24 79 - -Chimera analysis: relative delta Hamming distances - FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum -diff=0.1 1 0 0 1 1 0 3 -diff=0.3 3 0 2 0 0 0 5 -diff=0.4 1 0 0 1 3 0 5 -diff=0.5 0 0 1 0 0 1 2 -diff=0.6 1 0 0 0 3 7 11 -diff=0.7 1 0 0 0 1 5 7 -diff=0.8 10 0 0 0 2 9 21 -diff=1.0 9 0 2 2 10 2 25 -sum 26 0 5 4 20 24 79 - -Chimeras: -All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept. -So the hamming distance of the non-identical half is compared. -Hamming distances of non-zero half - FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum -HD=1 4 0 0 0 4 0 8 -HD=2 2 0 0 0 0 0 2 -HD=6 0 0 0 1 0 2 3 -HD=7 1 0 1 0 0 0 2 -HD=8 0 0 1 0 1 0 2 -HD=9 1 0 0 1 2 0 4 -HD=10 1 0 0 0 3 0 4 -sum 9 0 2 2 10 2 25 - -