diff hd.py @ 22:7e570ba56b83 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author mheinzl
date Wed, 27 Feb 2019 04:50:56 -0500
parents 9919024d7778
children 9e384b0741f1
line wrap: on
line diff
--- a/hd.py	Fri Dec 14 05:03:24 2018 -0500
+++ b/hd.py	Wed Feb 27 04:50:56 2019 -0500
@@ -13,14 +13,14 @@
 # It is also possible to perform the HD analysis with shortened tags with given sizes as input.
 # The tool can run on a certain number of processors, which can be defined by the user.
 
-# USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
-#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular
+# USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int /
+#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_pdf outputfile_name_pdf --output_tabular outputfile_name_tabular --output_chimeras_tabular outputfile_name_chimeras_tabular
 
 import argparse
 import itertools
 import operator
 import sys
-from collections import Counter
+from collections import Counter, defaultdict
 from functools import partial
 from multiprocessing.pool import Pool
 
@@ -94,7 +94,7 @@
     plt.close("all")
 
 
-def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True):
+def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True, nr_unique_chimeras=0, len_sample=0):
     if relative is True:
         step = 0.1
     else:
@@ -144,6 +144,13 @@
 
     legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags)
     plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
+    if nr_unique_chimeras != 0 and len_sample != 0:
+        if relative == True:
+            legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras,
+                                                                         int(nr_unique_chimeras) / float(len_sample))
+        else:
+            legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample))
+        plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
 
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
@@ -158,7 +165,7 @@
 
     maximumX = numpy.amax(numpy.concatenate(ham_partial))
     minimumX = numpy.amin(numpy.concatenate(ham_partial))
-    maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham_partial))))
+    maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial))))
 
     if len(range(minimumX, maximumX)) == 0:
         range1 = minimumX
@@ -448,6 +455,7 @@
 
 def hamming_difference(array1, array2, mate_b):
     array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
+
     array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1])  # mate1 part1
     array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1])  # mate1 part 2
 
@@ -473,6 +481,7 @@
     min_tagsList = []
     diff11_zeros = []
     min_tagsList_zeros = []
+    max_tag_list = []
     i = 0  # counter, only used to see how many HDs of tags were already calculated
     if mate_b is False:  # HD calculation for all a's
         half1_mate1 = array1_half
@@ -487,24 +496,29 @@
 
     for a, b, tag in zip(half1_mate1, half2_mate1, array1):
         # exclude identical tag from array2, to prevent comparison to itself
-        sameTag = numpy.where(array2 == tag)
+        sameTag = numpy.where(array2 == tag)[0]
         indexArray2 = numpy.arange(0, len(array2), 1)
         index_withoutSame = numpy.delete(indexArray2, sameTag)  # delete identical tag from the data
 
         # all tags without identical tag
         array2_half_withoutSame = half1_mate2[index_withoutSame]
         array2_half2_withoutSame = half2_mate2[index_withoutSame]
-        # array2_withoutSame = array2[index_withoutSame]  # whole tag (=not splitted into 2 halfs)
+        array2_withoutSame = array2[index_withoutSame]  # whole tag (=not splitted into 2 halfs)
 
         dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
                             array2_half_withoutSame])  # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
-        min_index = numpy.where(dist == dist.min())  # get index of min HD
+        min_index = numpy.where(dist == dist.min())[0]  # get index of min HD
         min_value = dist[min_index]  # get minimum HDs
         min_tag_half2 = array2_half2_withoutSame[min_index]  # get all "b's" of the tag or all "a's" of the tag with minimum HD
-        # min_tag = array2_withoutSame[min_index]  # get whole tag with min HD
+        min_tag_array2 = array2_withoutSame[min_index]  # get whole tag with min HD
 
-        dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
+        dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
                              min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
+        dist2 = [dist_second_half.max()]
+        min_value = [dist.min()]
+        max_index = numpy.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
+        max_tag = min_tag_array2[max_index]
+
         for d, d2 in zip(min_value, dist2):
             if mate_b is True:  # half2, corrects the variable of the HD from both halfs if it is a or b
                 ham2.append(d)
@@ -522,9 +536,11 @@
 
             # tags which have identical parts:
             if d == 0 or d2 == 0:
-                min_tagsList_zeros.append(tag)
-                difference1_zeros = abs(d - d2)
+                min_tagsList_zeros.append(numpy.array(tag))
+                difference1_zeros = abs(d - d2)  # hd of non-identical part
                 diff11_zeros.append(difference1_zeros)
+                max_tag_list.append(numpy.array(max_tag))
+
         i += 1
 
     # print(i)
@@ -536,8 +552,7 @@
     # relativeDiffList = [st for st in relativeDiffList if st != 999]
     # diff11_zeros = [st for st in diff11_zeros if st != 999]
     # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999]
-
-    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min])
+    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min, max_tag_list])
 
 
 def readFileReferenceFree(file):
@@ -638,9 +653,6 @@
     parser.add_argument('--inputFile',
                         help='Tabular File with three columns: ab or ba, tag and family size.')
     parser.add_argument('--inputName1')
-    # parser.add_argument('--inputFile2', default=None,
-    #                     help='Tabular File with three columns: ab or ba, tag and family size.')
-    # parser.add_argument('--inputName2')
     parser.add_argument('--sample_size', default=1000, type=int,
                         help='Sample size of Hamming distance analysis.')
     parser.add_argument('--subset_tag', default=0, type=int,
@@ -661,37 +673,26 @@
                         help='Name of the tabular file.')
     parser.add_argument('--output_pdf', default="data.pdf", type=str,
                         help='Name of the pdf file.')
-    # parser.add_argument('--output_pdf2', default="data2.pdf", type=str,
-    #                     help='Name of the pdf file.')
-    # parser.add_argument('--output_tabular2', default="data2.tabular", type=str,
-    #                     help='Name of the tabular file.')
-
+    parser.add_argument('--output_chimeras_tabular', default="data_chimeras.tabular", type=str,
+                        help='Name of the tabular file with all chimeric tags.')
     return parser
 
 
 def Hamming_Distance_Analysis(argv):
     parser = make_argparser()
     args = parser.parse_args(argv[1:])
-
     file1 = args.inputFile
     name1 = args.inputName1
-
-    # file2 = args.inputFile2
-    # name2 = args.inputName2
-
     index_size = args.sample_size
     title_savedFile_pdf = args.output_pdf
-    # title_savedFile_pdf2 = args.output_pdf2
-
     title_savedFile_csv = args.output_tabular
-    # title_savedFile_csv2 = args.output_tabular2
+    output_chimeras_tabular = args.output_chimeras_tabular
 
     sep = "\t"
     onlyDuplicates = args.only_DCS
     minFS = args.minFS
     maxFS = args.maxFS
     nr_above_bars = args.nr_above_bars
-
     subset = args.subset_tag
     nproc = args.nproc
 
@@ -715,25 +716,24 @@
     plt.rcParams['patch.edgecolor'] = "#000000"
     plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
 
-    # if file2 != str(None):
-    #     files = [file1, file2]
-    #     name1 = name1.split(".tabular")[0]
-    #     name2 = name2.split(".tabular")[0]
-    #     names = [name1, name2]
-    #     pdf_files = [title_savedFile_pdf, title_savedFile_pdf2]
-    #     csv_files = [title_savedFile_csv, title_savedFile_csv2]
-    # else:
-    # f = file1
     name1 = name1.split(".tabular")[0]
-    # name_file = name1
-    # pdf_f = title_savedFile_pdf
-    # csv_f = title_savedFile_csv
 
-    #for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files):
     with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
         print("dataset: ", name1)
         integers, data_array = readFileReferenceFree(file1)
         data_array = numpy.array(data_array)
+        print("total nr of tags:", len(data_array))
+        n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x]
+        if len(n) != 0:  # delete tags with N in the tag from data
+            print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array))
+            index_whole_array = numpy.arange(0, len(data_array), 1)
+            index_withoutN_inTag = numpy.delete(index_whole_array, n)
+            data_array = data_array[index_withoutN_inTag, :]
+            integers = integers[index_withoutN_inTag]
+            print("total nr of tags without Ns:", len(data_array))
+
+        data_array_whole_dataset = data_array
+
         int_f = numpy.array(data_array[:, 0]).astype(int)
         data_array = data_array[numpy.where(int_f >= minFS)]
         integers = integers[integers >= minFS]
@@ -789,13 +789,16 @@
             data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
             data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))
 
-        print("length of tag= ", len(data_array[0, 1]))
+        print("length of tag:", len(data_array[0, 1]))
         # select sample: if no size given --> all vs. all comparison
         if index_size == 0:
             result = numpy.arange(0, len(data_array), 1)
         else:
             result = numpy.random.choice(len(integers), size=index_size,
                                          replace=False)  # array of random sequences of size=index.size
+            # unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags
+            # result = numpy.random.choice(unique_indices, size=index_size,
+            #                        replace=False)  # array of random sequences of size=index.size
 
         # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
         #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
@@ -803,8 +806,7 @@
         # comparison random tags to whole dataset
         result1 = data_array[result, 1]  # random tags
         result2 = data_array[:, 1]  # all tags
-        print("size of the whole dataset= ", len(result2))
-        print("sample size= ", len(result1))
+        print("sample size:", len(result1))
 
         # HD analysis of whole tag
         proc_pool = Pool(nproc)
@@ -818,6 +820,8 @@
         #     output_file1.write("{}\t{}\n".format(tag, h))
 
         # HD analysis for chimeric reads
+        # result2 = data_array_whole_dataset[:,1]
+
         proc_pool_b = Pool(nproc)
         diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
         diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
@@ -844,7 +848,62 @@
         HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
                                         numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
 
+        chimera_tags = numpy.concatenate(([item[10] for item in diff_list_a],
+                                       [item_b[10] for item_b in diff_list_b]))
+
+        chimera_tags = [x for x in chimera_tags if x != []]
+        chimera_tags_new = []
+
+        for i in chimera_tags:
+            if len(i) > 1:
+                for t in i:
+                    chimera_tags_new.append(t)
+            else:
+                chimera_tags_new.extend(i)
+
+        chimeras_dic = defaultdict(list)
+        for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
+            chimeras_dic[t1].append(t2)
+
+        lst_unique_chimeras = []
+        with open(output_chimeras_tabular, "w") as output_file1:
+            unique_chimeras = numpy.unique(minHD_tags_zeros)
+            sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras])  # mate1 part1
+            sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras])  # mate1 part 2
+
+            output_file1.write("sample tag\tsimilar tag\n")
+            for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b):
+                max_tags = numpy.concatenate(chimeras_dic.get(tag1))
+
+                if tag1 in chimeras_dic.values():
+                    continue
+                else:
+                    lst_unique_chimeras.append(tag1)
+
+                chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags])  # mate1 part1
+                chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags])  # mate1 part 2
+
+                new_format = []
+                for i in range(len(max_tags)):
+                    if a == chimera_half_a[i]:
+                        max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i])
+                        new_format.append(max_tag)
+
+                    elif b == chimera_half_b[i]:
+                        max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i])
+                        new_format.append(max_tag)
+
+                sample_tag = "{} {}".format(a, b)
+                output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
+            output_file1.write(
+                "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
+                "The tags were separated by an empty space into their halves and the * marks the identical half.")
+
+        nr_chimeric_tags = len(lst_unique_chimeras)
+        print("nr of unique chimeras:", nr_chimeric_tags)
+
         lenTags = len(data_array)
+        len_sample = len(result1)
 
         quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
         seq = numpy.array(data_array[result, 1])  # tags of sample
@@ -854,6 +913,9 @@
             quant = numpy.concatenate((quant, duplTagsBA[result]))
             seq = numpy.tile(seq, 2)
             ham = numpy.tile(ham, 2)
+            diff = numpy.tile(diff, 2)
+            rel_Diff = numpy.tile(rel_Diff, 2)
+            diff_zeros = numpy.tile(diff_zeros, 2)
 
         # prepare data for different kinds of plots
         # distribution of FSs separated after HD
@@ -862,35 +924,48 @@
 
         # get FS for all tags with min HD of analysis of chimeric reads
         # there are more tags than sample size in the plot, because one tag can have multiple minimas
-        seqDic = dict(zip(seq, quant))
+        if onlyDuplicates:
+            seqDic = defaultdict(list)
+            for s, q in zip(seq, quant):
+                seqDic[s].append(q)
+        else:
+            seqDic = dict(zip(seq, quant))
+
+
         lst_minHD_tags = []
         for i in minHD_tags:
             lst_minHD_tags.append(seqDic.get(i))
 
+        if onlyDuplicates:
+            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
+                                        [item_b[1] for item_b in lst_minHD_tags])).astype(int)
+        # else:
+        #     lst_minHD_tags = numpy.concatenate(lst_minHD_tags)
+
         # histogram with absolute and relative difference between HDs of both parts of the tag
         listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
         listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
                                                                                                  rel_Diff)
 
         # family size distribution separated after the difference between HDs of both parts of the tag
-        familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
-            lst_minHD_tags, diff, diff=True, rel=False)
-        familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
-            lst_minHD_tags, rel_Diff, diff=True, rel=True)
+        # familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
+        #   lst_minHD_tags, diff, diff=True, rel=False)
+        # familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
+        #    lst_minHD_tags, rel_Diff, diff=True, rel=True)
 
         # chimeric read analysis: tags which have HD=0 in one of the halfs
         if len(minHD_tags_zeros) != 0:
             lst_minHD_tags_zeros = []
             for i in minHD_tags_zeros:
                 lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
-
-        # histogram with HD of non-identical half
-        listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
+            if onlyDuplicates:
+                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
+                                                [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
+            
+            # histogram with HD of non-identical half
+            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
             lst_minHD_tags_zeros, diff_zeros)
-        # family size distribution of non-identical half
-        familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(
-            lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)
-
+            
         # plot Hamming Distance with Family size distribution
         plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
                       subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags,
@@ -914,15 +989,14 @@
         plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
                       subtitle="Chimera Analysis: relative delta Hamming distances",
                       title_file1=name1, lenTags=lenTags,
-                      xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars)
+                      xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
 
         # plots for chimeric reads
         if len(minHD_tags_zeros) != 0:
             # HD
-            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
-                          subtitle="Hamming distance of the non-identical half of chimeras",
+            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,subtitle="Hamming distance of the non-identical half of chimeras",
                           title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False,
-                          nr_above_bars=nr_above_bars)
+                          nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
 
         # print all data to a CSV file
         # HD
@@ -974,26 +1048,31 @@
         # HD within tags
         output_file.write(
             "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
-            "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
-        output_file.write(
-            "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
-                len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
+            "Since this calculation was repeated, but starting with the second half to find all possible chimeras in the data, the actual number of tags in the plots differs from the sample size entered by the user.\n"
+            "In addition, both family sizes of one tag will be included in the plots if only tags of reads that can form a DCS were allowed.\n")
+
         output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))
 
         createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
                               "Hamming distance of each half in the tag", sep)
         createFileHD(summary11, sumCol11, overallSum11, output_file,
                      "Absolute delta Hamming distances within the tag", sep)
+
         createFileHD(summary13, sumCol13, overallSum13, output_file,
                      "Chimera analysis: relative delta Hamming distances", sep)
 
         if len(minHD_tags_zeros) != 0:
             output_file.write(
-                "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
+                "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the Hamming distance of the non-identical half is shown.\n")
+            output_file.write(
+                "Be aware that the real number of chimeric tags (where rel. diff = 1) is not shown in the plot because of the above reasons.\n")
+            output_file.write("real number of chimeric tags{}{}{}{}\n".format(sep, nr_chimeric_tags, sep, int(nr_chimeric_tags) / float(len_sample)))
             createFileHD(summary15, sumCol15, overallSum15, output_file,
                          "Hamming distances of non-zero half", sep)
+
         output_file.write("\n")
 
 
 if __name__ == '__main__':
     sys.exit(Hamming_Distance_Analysis(sys.argv))
+